-rw-r--r--.gitignore2
-rw-r--r--CMakeLists.txt31
-rw-r--r--CREDITS.TXT16
-rw-r--r--Makefile10
-rw-r--r--Makefile.config.in1
-rw-r--r--Makefile.rules38
-rw-r--r--autoconf/configure.ac73
-rw-r--r--bindings/ocaml/executionengine/executionengine_ocaml.c3
-rw-r--r--bindings/ocaml/executionengine/llvm_executionengine.ml2
-rw-r--r--bindings/ocaml/executionengine/llvm_executionengine.mli2
-rw-r--r--bindings/ocaml/llvm/llvm.mli2
-rw-r--r--bindings/ocaml/llvm/llvm_ocaml.c3
-rw-r--r--bindings/ocaml/target/llvm_target.ml26
-rw-r--r--bindings/ocaml/target/llvm_target.mli56
-rw-r--r--bindings/ocaml/target/target_ocaml.c30
-rwxr-xr-xcmake/config-ix.cmake23
-rwxr-xr-xcmake/modules/AddLLVM.cmake25
-rw-r--r--cmake/modules/LLVMProcessSources.cmake2
-rw-r--r--cmake/platforms/Android.cmake21
-rwxr-xr-xconfigure369
-rw-r--r--docs/AliasAnalysis.rst2
-rw-r--r--docs/BitCodeFormat.rst49
-rw-r--r--docs/CMake.rst5
-rw-r--r--docs/CodeGenerator.rst33
-rw-r--r--docs/CodingStandards.rst253
-rw-r--r--docs/CommandGuide/FileCheck.rst10
-rw-r--r--docs/CommandGuide/lit.rst13
-rw-r--r--docs/CompilerWriterInfo.html267
-rw-r--r--docs/CompilerWriterInfo.rst118
-rw-r--r--docs/DebuggingJITedCode.html184
-rw-r--r--docs/DebuggingJITedCode.rst147
-rw-r--r--docs/DeveloperPolicy.rst5
-rw-r--r--docs/ExtendingLLVM.html379
-rw-r--r--docs/ExtendingLLVM.rst306
-rw-r--r--docs/GarbageCollection.html6
-rw-r--r--docs/GettingStarted.html1760
-rw-r--r--docs/GettingStarted.rst1304
-rw-r--r--docs/GoldPlugin.html227
-rw-r--r--docs/GoldPlugin.rst186
-rw-r--r--docs/HowToAddABuilder.html142
-rw-r--r--docs/HowToAddABuilder.rst90
-rw-r--r--docs/HowToBuildOnARM.rst47
-rw-r--r--docs/HowToSetUpLLVMStyleRTTI.rst332
-rw-r--r--docs/HowToSubmitABug.html345
-rw-r--r--docs/HowToSubmitABug.rst233
-rwxr-xr-xdocs/HowToUseInstrMappings.rst179
-rw-r--r--docs/LangRef.html107
-rw-r--r--docs/Lexicon.rst13
-rw-r--r--docs/LinkTimeOptimization.rst2
-rw-r--r--docs/Makefile.sphinx4
-rw-r--r--docs/MarkedUpDisassembly.rst88
-rw-r--r--docs/Passes.html39
-rw-r--r--docs/Phabricator.rst100
-rw-r--r--docs/ProgrammersManual.html29
-rw-r--r--docs/README.txt2
-rw-r--r--docs/ReleaseNotes.html101
-rw-r--r--docs/SourceLevelDebugging.html8
-rw-r--r--docs/SphinxQuickstartTemplate.rst125
-rw-r--r--docs/TestingGuide.html23
-rw-r--r--docs/WritingAnLLVMBackend.html36
-rw-r--r--docs/_themes/llvm-theme/layout.html (renamed from docs/llvm-theme/layout.html)0
-rw-r--r--docs/_themes/llvm-theme/static/contents.png (renamed from docs/llvm-theme/static/contents.png)bin202 -> 202 bytes
-rw-r--r--docs/_themes/llvm-theme/static/llvm-theme.css (renamed from docs/llvm-theme/static/llvm-theme.css)3
-rw-r--r--docs/_themes/llvm-theme/static/logo.png (renamed from docs/llvm-theme/static/logo.png)bin9864 -> 9864 bytes
-rw-r--r--docs/_themes/llvm-theme/static/navigation.png (renamed from docs/llvm-theme/static/navigation.png)bin218 -> 218 bytes
-rw-r--r--docs/_themes/llvm-theme/theme.conf (renamed from docs/llvm-theme/theme.conf)0
-rw-r--r--docs/conf.py21
-rw-r--r--docs/index.rst46
-rw-r--r--docs/programming.rst19
-rw-r--r--docs/subsystems.rst19
-rw-r--r--docs/tutorial/LangImpl4.html8
-rw-r--r--docs/tutorial/LangImpl5.html6
-rw-r--r--docs/tutorial/LangImpl6.html6
-rw-r--r--docs/tutorial/LangImpl7.html8
-rw-r--r--docs/tutorial/OCamlLangImpl4.html6
-rw-r--r--docs/tutorial/OCamlLangImpl5.html4
-rw-r--r--docs/tutorial/OCamlLangImpl6.html4
-rw-r--r--docs/tutorial/OCamlLangImpl7.html6
-rw-r--r--docs/userguides.rst23
-rw-r--r--docs/yaml2obj.rst4
-rw-r--r--examples/ExceptionDemo/ExceptionDemo.cpp1181
-rw-r--r--examples/Fibonacci/fibonacci.cpp2
-rw-r--r--examples/Kaleidoscope/Chapter4/toy.cpp4
-rw-r--r--examples/Kaleidoscope/Chapter5/toy.cpp4
-rw-r--r--examples/Kaleidoscope/Chapter6/toy.cpp4
-rw-r--r--examples/Kaleidoscope/Chapter7/toy.cpp4
-rw-r--r--examples/OCaml-Kaleidoscope/Chapter4/toy.ml2
-rw-r--r--examples/OCaml-Kaleidoscope/Chapter5/toy.ml2
-rw-r--r--examples/OCaml-Kaleidoscope/Chapter6/toy.ml2
-rw-r--r--examples/OCaml-Kaleidoscope/Chapter7/toy.ml2
-rw-r--r--include/llvm-c/Core.h39
-rw-r--r--include/llvm-c/Disassembler.h9
-rw-r--r--include/llvm-c/Target.h46
-rw-r--r--include/llvm-c/TargetMachine.h2
-rw-r--r--include/llvm-c/Transforms/Vectorize.h3
-rw-r--r--include/llvm/ADT/APFloat.h11
-rw-r--r--include/llvm/ADT/APInt.h29
-rw-r--r--include/llvm/ADT/ArrayRef.h13
-rw-r--r--include/llvm/ADT/BitVector.h86
-rw-r--r--include/llvm/ADT/DAGDeltaAlgorithm.h15
-rw-r--r--include/llvm/ADT/DeltaAlgorithm.h14
-rw-r--r--include/llvm/ADT/DenseMap.h11
-rw-r--r--include/llvm/ADT/DenseMapInfo.h6
-rw-r--r--include/llvm/ADT/EquivalenceClasses.h2
-rw-r--r--include/llvm/ADT/FoldingSet.h9
-rw-r--r--include/llvm/ADT/Hashing.h3
-rw-r--r--include/llvm/ADT/ImmutableList.h5
-rw-r--r--include/llvm/ADT/ImmutableMap.h4
-rw-r--r--include/llvm/ADT/ImmutableSet.h81
-rw-r--r--include/llvm/ADT/MapVector.h90
-rw-r--r--include/llvm/ADT/Optional.h9
-rw-r--r--include/llvm/ADT/OwningPtr.h27
-rw-r--r--include/llvm/ADT/PackedVector.h27
-rw-r--r--include/llvm/ADT/PointerIntPair.h4
-rw-r--r--include/llvm/ADT/ScopedHashTable.h4
-rw-r--r--include/llvm/ADT/SetVector.h92
-rw-r--r--include/llvm/ADT/SmallBitVector.h30
-rw-r--r--include/llvm/ADT/SmallPtrSet.h7
-rw-r--r--include/llvm/ADT/SmallString.h100
-rw-r--r--include/llvm/ADT/SmallVector.h156
-rw-r--r--include/llvm/ADT/SparseBitVector.h18
-rw-r--r--include/llvm/ADT/SparseSet.h10
-rw-r--r--include/llvm/ADT/StringExtras.h23
-rw-r--r--include/llvm/ADT/StringRef.h163
-rw-r--r--include/llvm/ADT/StringSet.h9
-rw-r--r--include/llvm/ADT/Trie.h334
-rw-r--r--include/llvm/ADT/Triple.h34
-rw-r--r--include/llvm/ADT/Twine.h28
-rw-r--r--include/llvm/ADT/ValueMap.h4
-rw-r--r--include/llvm/ADT/ilist.h5
-rw-r--r--include/llvm/AddressingMode.h41
-rw-r--r--include/llvm/Analysis/AliasAnalysis.h26
-rw-r--r--include/llvm/Analysis/AliasSetTracker.h5
-rw-r--r--include/llvm/Analysis/BranchProbabilityInfo.h38
-rw-r--r--include/llvm/Analysis/CallGraph.h6
-rw-r--r--include/llvm/Analysis/CaptureTracking.h2
-rw-r--r--include/llvm/Analysis/CodeMetrics.h8
-rw-r--r--include/llvm/Analysis/ConstantFolding.h16
-rw-r--r--include/llvm/Analysis/DependenceAnalysis.h885
-rw-r--r--include/llvm/Analysis/Dominators.h2
-rw-r--r--include/llvm/Analysis/IVUsers.h4
-rw-r--r--include/llvm/Analysis/InlineCost.h11
-rw-r--r--include/llvm/Analysis/InstructionSimplify.h54
-rw-r--r--include/llvm/Analysis/IntervalPartition.h4
-rw-r--r--include/llvm/Analysis/LazyValueInfo.h8
-rw-r--r--include/llvm/Analysis/Loads.h4
-rw-r--r--include/llvm/Analysis/LoopDependenceAnalysis.h124
-rw-r--r--include/llvm/Analysis/LoopInfo.h15
-rw-r--r--include/llvm/Analysis/LoopInfoImpl.h1
-rw-r--r--include/llvm/Analysis/MemoryBuiltins.h70
-rw-r--r--include/llvm/Analysis/MemoryDependenceAnalysis.h6
-rw-r--r--include/llvm/Analysis/PHITransAddr.h6
-rw-r--r--include/llvm/Analysis/Passes.h23
-rw-r--r--include/llvm/Analysis/ProfileDataLoader.h139
-rw-r--r--include/llvm/Analysis/ProfileDataTypes.h39
-rw-r--r--include/llvm/Analysis/ProfileInfoTypes.h10
-rw-r--r--include/llvm/Analysis/RegionInfo.h39
-rw-r--r--include/llvm/Analysis/ScalarEvolution.h10
-rw-r--r--include/llvm/Analysis/ScalarEvolutionExpressions.h14
-rw-r--r--include/llvm/Analysis/SparsePropagation.h6
-rw-r--r--include/llvm/Analysis/ValueTracking.h26
-rw-r--r--include/llvm/Argument.h5
-rw-r--r--include/llvm/Attributes.h553
-rw-r--r--include/llvm/BasicBlock.h5
-rw-r--r--include/llvm/Bitcode/Archive.h14
-rw-r--r--include/llvm/Bitcode/BitstreamReader.h8
-rw-r--r--include/llvm/Bitcode/BitstreamWriter.h4
-rw-r--r--include/llvm/Bitcode/LLVMBitCodes.h7
-rw-r--r--include/llvm/CallingConv.h24
-rw-r--r--include/llvm/CodeGen/AsmPrinter.h10
-rw-r--r--include/llvm/CodeGen/CallingConvLower.h2
-rw-r--r--include/llvm/CodeGen/CommandFlags.h228
-rw-r--r--include/llvm/CodeGen/FastISel.h4
-rw-r--r--include/llvm/CodeGen/GCMetadata.h5
-rw-r--r--include/llvm/CodeGen/GCMetadataPrinter.h7
-rw-r--r--include/llvm/CodeGen/ISDOpcodes.h4
-rw-r--r--include/llvm/CodeGen/IntrinsicLowering.h6
-rw-r--r--include/llvm/CodeGen/LiveInterval.h35
-rw-r--r--include/llvm/CodeGen/LiveIntervalAnalysis.h49
-rw-r--r--include/llvm/CodeGen/LiveVariables.h6
-rw-r--r--include/llvm/CodeGen/MachineBasicBlock.h26
-rw-r--r--include/llvm/CodeGen/MachineBranchProbabilityInfo.h9
-rw-r--r--include/llvm/CodeGen/MachineConstantPool.h6
-rw-r--r--include/llvm/CodeGen/MachineFrameInfo.h27
-rw-r--r--include/llvm/CodeGen/MachineFunction.h12
-rw-r--r--include/llvm/CodeGen/MachineInstr.h83
-rw-r--r--include/llvm/CodeGen/MachineInstrBuilder.h17
-rw-r--r--include/llvm/CodeGen/MachineInstrBundle.h40
-rw-r--r--include/llvm/CodeGen/MachineJumpTableInfo.h6
-rw-r--r--include/llvm/CodeGen/MachineLoopInfo.h4
-rw-r--r--include/llvm/CodeGen/MachineMemOperand.h9
-rw-r--r--include/llvm/CodeGen/MachineModuleInfoImpls.h4
-rw-r--r--include/llvm/CodeGen/MachineOperand.h49
-rw-r--r--include/llvm/CodeGen/MachinePostDominators.h87
-rw-r--r--include/llvm/CodeGen/MachineRegisterInfo.h80
-rw-r--r--include/llvm/CodeGen/MachineSSAUpdater.h6
-rw-r--r--include/llvm/CodeGen/MachineScheduler.h237
-rw-r--r--include/llvm/CodeGen/PBQP/Graph.h21
-rw-r--r--include/llvm/CodeGen/PBQP/HeuristicBase.h5
-rw-r--r--include/llvm/CodeGen/Passes.h4
-rw-r--r--include/llvm/CodeGen/PseudoSourceValue.h4
-rw-r--r--include/llvm/CodeGen/RegAllocPBQP.h4
-rw-r--r--include/llvm/CodeGen/RegisterClassInfo.h19
-rw-r--r--include/llvm/CodeGen/RegisterPressure.h3
-rw-r--r--include/llvm/CodeGen/RegisterScavenging.h9
-rw-r--r--include/llvm/CodeGen/ScheduleDAG.h99
-rw-r--r--include/llvm/CodeGen/ScheduleDAGILP.h86
-rw-r--r--include/llvm/CodeGen/ScheduleDAGInstrs.h114
-rw-r--r--include/llvm/CodeGen/SchedulerRegistry.h5
-rw-r--r--include/llvm/CodeGen/SelectionDAG.h16
-rw-r--r--include/llvm/CodeGen/SelectionDAGNodes.h65
-rw-r--r--include/llvm/CodeGen/TargetSchedule.h167
-rw-r--r--include/llvm/CodeGen/ValueTypes.h128
-rw-r--r--include/llvm/CodeGen/ValueTypes.td62
-rw-r--r--include/llvm/Config/AsmParsers.def.in42
-rw-r--r--include/llvm/Config/AsmPrinters.def.in42
-rw-r--r--include/llvm/Config/Disassemblers.def.in42
-rw-r--r--include/llvm/Config/config.h.cmake9
-rw-r--r--include/llvm/Config/config.h.in3
-rw-r--r--include/llvm/Constant.h9
-rw-r--r--include/llvm/Constants.h56
-rw-r--r--include/llvm/DIBuilder.h43
-rw-r--r--include/llvm/DataLayout.h (renamed from include/llvm/Target/TargetData.h)170
-rw-r--r--include/llvm/DebugInfo.h12
-rw-r--r--include/llvm/DebugInfo/DIContext.h36
-rw-r--r--include/llvm/DefaultPasses.h2
-rw-r--r--include/llvm/DerivedTypes.h35
-rw-r--r--include/llvm/ExecutionEngine/ExecutionEngine.h19
-rw-r--r--include/llvm/ExecutionEngine/JITEventListener.h15
-rw-r--r--include/llvm/ExecutionEngine/JITMemoryManager.h31
-rw-r--r--include/llvm/ExecutionEngine/ObjectBuffer.h80
-rw-r--r--include/llvm/ExecutionEngine/ObjectImage.h61
-rw-r--r--include/llvm/ExecutionEngine/RuntimeDyld.h47
-rw-r--r--include/llvm/Function.h85
-rw-r--r--include/llvm/GlobalAlias.h5
-rw-r--r--include/llvm/GlobalValue.h32
-rw-r--r--include/llvm/GlobalVariable.h7
-rw-r--r--include/llvm/IRBuilder.h69
-rw-r--r--include/llvm/InitializePasses.h13
-rw-r--r--include/llvm/InlineAsm.h30
-rw-r--r--include/llvm/InstrTypes.h29
-rw-r--r--include/llvm/Instruction.h5
-rw-r--r--include/llvm/Instructions.h437
-rw-r--r--include/llvm/IntrinsicInst.h58
-rw-r--r--include/llvm/Intrinsics.h4
-rw-r--r--include/llvm/Intrinsics.td12
-rw-r--r--include/llvm/IntrinsicsARM.td437
-rw-r--r--include/llvm/IntrinsicsMips.td125
-rw-r--r--include/llvm/IntrinsicsX86.td82
-rw-r--r--include/llvm/LLVMContext.h10
-rw-r--r--include/llvm/LinkAllPasses.h8
-rw-r--r--include/llvm/MC/MCAsmBackend.h24
-rw-r--r--include/llvm/MC/MCAsmInfo.h20
-rw-r--r--include/llvm/MC/MCAssembler.h38
-rw-r--r--include/llvm/MC/MCCodeEmitter.h10
-rw-r--r--include/llvm/MC/MCContext.h5
-rw-r--r--include/llvm/MC/MCDwarf.h11
-rw-r--r--include/llvm/MC/MCELFObjectWriter.h9
-rw-r--r--include/llvm/MC/MCExpr.h21
-rw-r--r--include/llvm/MC/MCInst.h2
-rw-r--r--include/llvm/MC/MCInstPrinter.h13
-rw-r--r--include/llvm/MC/MCInstrDesc.h2
-rw-r--r--include/llvm/MC/MCLabel.h8
-rw-r--r--include/llvm/MC/MCMachObjectWriter.h6
-rw-r--r--include/llvm/MC/MCObjectFileInfo.h3
-rw-r--r--include/llvm/MC/MCObjectStreamer.h10
-rw-r--r--include/llvm/MC/MCObjectWriter.h5
-rw-r--r--include/llvm/MC/MCParser/AsmLexer.h4
-rw-r--r--include/llvm/MC/MCParser/MCAsmLexer.h14
-rw-r--r--include/llvm/MC/MCParser/MCAsmParser.h38
-rw-r--r--include/llvm/MC/MCParser/MCAsmParserExtension.h8
-rw-r--r--include/llvm/MC/MCParser/MCParsedAsmOperand.h54
-rw-r--r--include/llvm/MC/MCRegisterInfo.h9
-rw-r--r--include/llvm/MC/MCSchedule.h141
-rw-r--r--include/llvm/MC/MCSection.h8
-rw-r--r--include/llvm/MC/MCSectionCOFF.h1
-rw-r--r--include/llvm/MC/MCSectionELF.h1
-rw-r--r--include/llvm/MC/MCSectionMachO.h1
-rw-r--r--include/llvm/MC/MCStreamer.h24
-rw-r--r--include/llvm/MC/MCSubtargetInfo.h65
-rw-r--r--include/llvm/MC/MCSymbol.h11
-rw-r--r--include/llvm/MC/MCTargetAsmLexer.h10
-rw-r--r--include/llvm/MC/MCTargetAsmParser.h77
-rw-r--r--include/llvm/MC/MCValue.h2
-rw-r--r--include/llvm/MC/SubtargetFeature.h6
-rw-r--r--include/llvm/MDBuilder.h21
-rw-r--r--include/llvm/Metadata.h10
-rw-r--r--include/llvm/Object/Archive.h1
-rw-r--r--include/llvm/Object/Binary.h5
-rw-r--r--include/llvm/Object/COFF.h3
-rw-r--r--include/llvm/Object/ELF.h370
-rw-r--r--include/llvm/Object/MachO.h3
-rw-r--r--include/llvm/Object/MachOFormat.h10
-rw-r--r--include/llvm/Object/ObjectFile.h43
-rw-r--r--include/llvm/Object/RelocVisitor.h131
-rw-r--r--include/llvm/Operator.h50
-rw-r--r--include/llvm/Pass.h5
-rw-r--r--include/llvm/PassAnalysisSupport.h2
-rw-r--r--include/llvm/PassSupport.h4
-rw-r--r--include/llvm/Support/AlignOf.h65
-rw-r--r--include/llvm/Support/Allocator.h8
-rw-r--r--include/llvm/Support/CallSite.h36
-rw-r--r--include/llvm/Support/Casting.h16
-rw-r--r--include/llvm/Support/CommandLine.h17
-rw-r--r--include/llvm/Support/Compiler.h35
-rw-r--r--include/llvm/Support/DataExtractor.h5
-rw-r--r--include/llvm/Support/ELF.h60
-rw-r--r--include/llvm/Support/FileOutputBuffer.h7
-rw-r--r--include/llvm/Support/FileSystem.h78
-rw-r--r--include/llvm/Support/Format.h40
-rw-r--r--include/llvm/Support/FormattedStream.h11
-rw-r--r--include/llvm/Support/GCOV.h32
-rw-r--r--include/llvm/Support/InstVisitor.h6
-rw-r--r--include/llvm/Support/IntegersSubset.h8
-rw-r--r--include/llvm/Support/IntegersSubsetMapping.h18
-rw-r--r--include/llvm/Support/LEB128.h2
-rw-r--r--include/llvm/Support/LockFileManager.h4
-rw-r--r--include/llvm/Support/MathExtras.h31
-rw-r--r--include/llvm/Support/Memory.h65
-rw-r--r--include/llvm/Support/MemoryBuffer.h5
-rw-r--r--include/llvm/Support/Mutex.h5
-rw-r--r--include/llvm/Support/MutexGuard.h4
-rw-r--r--include/llvm/Support/PathV1.h4
-rw-r--r--include/llvm/Support/PathV2.h113
-rw-r--r--include/llvm/Support/PrettyStackTrace.h10
-rw-r--r--include/llvm/Support/Program.h4
-rw-r--r--include/llvm/Support/RWMutex.h5
-rw-r--r--include/llvm/Support/Regex.h8
-rw-r--r--include/llvm/Support/Registry.h7
-rw-r--r--include/llvm/Support/SourceMgr.h8
-rw-r--r--include/llvm/Support/StreamableMemoryObject.h20
-rw-r--r--include/llvm/Support/TargetFolder.h15
-rw-r--r--include/llvm/Support/TargetRegistry.h44
-rw-r--r--include/llvm/Support/Threading.h4
-rw-r--r--include/llvm/Support/TimeValue.h7
-rw-r--r--include/llvm/Support/Timer.h7
-rw-r--r--include/llvm/Support/ValueHandle.h4
-rw-r--r--include/llvm/Support/YAMLParser.h13
-rw-r--r--include/llvm/Support/circular_raw_ostream.h4
-rw-r--r--include/llvm/Support/raw_os_ostream.h10
-rw-r--r--include/llvm/Support/raw_ostream.h64
-rw-r--r--include/llvm/Support/system_error.h8
-rw-r--r--include/llvm/Support/type_traits.h10
-rw-r--r--include/llvm/SymbolTableListTraits.h1
-rw-r--r--include/llvm/TableGen/Error.h19
-rw-r--r--include/llvm/TableGen/Main.h9
-rw-r--r--include/llvm/TableGen/Record.h486
-rw-r--r--include/llvm/TableGen/TableGenAction.h35
-rw-r--r--include/llvm/Target/Mangler.h9
-rw-r--r--include/llvm/Target/Target.td104
-rw-r--r--include/llvm/Target/TargetCallingConv.h27
-rw-r--r--include/llvm/Target/TargetELFWriterInfo.h121
-rw-r--r--include/llvm/Target/TargetInstrInfo.h37
-rw-r--r--include/llvm/Target/TargetIntrinsicInfo.h5
-rw-r--r--include/llvm/Target/TargetLibraryInfo.h103
-rw-r--r--include/llvm/Target/TargetLowering.h131
-rw-r--r--include/llvm/Target/TargetLoweringObjectFile.h9
-rw-r--r--include/llvm/Target/TargetMachine.h20
-rw-r--r--include/llvm/Target/TargetOpcodes.h6
-rw-r--r--include/llvm/Target/TargetOptions.h4
-rw-r--r--include/llvm/Target/TargetRegisterInfo.h62
-rw-r--r--include/llvm/Target/TargetSchedule.td340
-rw-r--r--include/llvm/Target/TargetSelectionDAG.td4
-rw-r--r--include/llvm/Target/TargetSelectionDAGInfo.h10
-rw-r--r--include/llvm/Target/TargetSubtargetInfo.h23
-rw-r--r--include/llvm/Target/TargetTransformImpl.h98
-rw-r--r--include/llvm/TargetTransformInfo.h204
-rw-r--r--include/llvm/Transforms/IPO.h25
-rw-r--r--include/llvm/Transforms/IPO/InlinerPass.h2
-rw-r--r--include/llvm/Transforms/IPO/PassManagerBuilder.h1
-rw-r--r--include/llvm/Transforms/Instrumentation.h2
-rw-r--r--include/llvm/Transforms/Scalar.h6
-rw-r--r--include/llvm/Transforms/Utils/AddrModeMatcher.h3
-rw-r--r--include/llvm/Transforms/Utils/BasicBlockUtils.h28
-rw-r--r--include/llvm/Transforms/Utils/BuildLibCalls.h32
-rw-r--r--include/llvm/Transforms/Utils/BypassSlowDivision.h33
-rw-r--r--include/llvm/Transforms/Utils/Cloning.h15
-rw-r--r--include/llvm/Transforms/Utils/IntegerDivision.h48
-rw-r--r--include/llvm/Transforms/Utils/Local.h30
-rw-r--r--include/llvm/Transforms/Utils/SSAUpdater.h4
-rw-r--r--include/llvm/Transforms/Utils/SimplifyIndVar.h2
-rw-r--r--include/llvm/Transforms/Utils/SimplifyLibCalls.h52
-rw-r--r--include/llvm/Transforms/Utils/ValueMapper.h2
-rw-r--r--include/llvm/Transforms/Vectorize.h6
-rw-r--r--include/llvm/Type.h23
-rw-r--r--include/llvm/Use.h3
-rw-r--r--include/llvm/User.h46
-rw-r--r--include/llvm/Value.h12
-rw-r--r--lib/Analysis/AliasAnalysis.cpp10
-rw-r--r--lib/Analysis/AliasSetTracker.cpp6
-rw-r--r--lib/Analysis/Analysis.cpp4
-rw-r--r--lib/Analysis/BasicAliasAnalysis.cpp170
-rw-r--r--lib/Analysis/BranchProbabilityInfo.cpp134
-rw-r--r--lib/Analysis/CMakeLists.txt5
-rw-r--r--lib/Analysis/CaptureTracking.cpp4
-rw-r--r--lib/Analysis/CodeMetrics.cpp10
-rw-r--r--lib/Analysis/ConstantFolding.cpp265
-rw-r--r--lib/Analysis/CostModel.cpp193
-rw-r--r--lib/Analysis/DependenceAnalysis.cpp3786
-rw-r--r--lib/Analysis/DominanceFrontier.cpp2
-rw-r--r--lib/Analysis/IPA/CallGraph.cpp13
-rw-r--r--lib/Analysis/IPA/GlobalsModRef.cpp6
-rw-r--r--lib/Analysis/IVUsers.cpp6
-rw-r--r--lib/Analysis/InlineCost.cpp95
-rw-r--r--lib/Analysis/InstructionSimplify.cpp89
-rw-r--r--lib/Analysis/LazyValueInfo.cpp52
-rw-r--r--lib/Analysis/Lint.cpp56
-rw-r--r--lib/Analysis/Loads.cpp8
-rw-r--r--lib/Analysis/LoopDependenceAnalysis.cpp362
-rw-r--r--lib/Analysis/LoopInfo.cpp11
-rw-r--r--lib/Analysis/MemoryBuiltins.cpp184
-rw-r--r--lib/Analysis/MemoryDependenceAnalysis.cpp30
-rw-r--r--lib/Analysis/NoAliasAnalysis.cpp4
-rw-r--r--lib/Analysis/PHITransAddr.cpp2
-rw-r--r--lib/Analysis/ProfileDataLoader.cpp155
-rw-r--r--lib/Analysis/ProfileDataLoaderPass.cpp188
-rw-r--r--lib/Analysis/ProfileEstimatorPass.cpp2
-rw-r--r--lib/Analysis/ProfileInfo.cpp26
-rw-r--r--lib/Analysis/RegionInfo.cpp26
-rw-r--r--lib/Analysis/RegionPass.cpp5
-rw-r--r--lib/Analysis/ScalarEvolution.cpp116
-rw-r--r--lib/Analysis/ScalarEvolutionExpander.cpp23
-rw-r--r--lib/Analysis/Trace.cpp2
-rw-r--r--lib/Analysis/ValueTracking.cpp61
-rw-r--r--lib/Archive/ArchiveInternals.h2
-rw-r--r--lib/Archive/ArchiveReader.cpp4
-rw-r--r--lib/AsmParser/LLLexer.cpp9
-rw-r--r--lib/AsmParser/LLParser.cpp257
-rw-r--r--lib/AsmParser/LLParser.h2
-rw-r--r--lib/AsmParser/LLToken.h11
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.cpp159
-rw-r--r--lib/Bitcode/Reader/BitcodeReader.h67
-rw-r--r--lib/Bitcode/Writer/BitcodeWriter.cpp130
-rw-r--r--lib/Bitcode/Writer/ValueEnumerator.h6
-rw-r--r--lib/CodeGen/AggressiveAntiDepBreaker.cpp4
-rw-r--r--lib/CodeGen/AllocationOrder.cpp5
-rw-r--r--lib/CodeGen/Analysis.cpp18
-rw-r--r--lib/CodeGen/AsmPrinter/ARMException.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinter.cpp171
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp4
-rw-r--r--lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp234
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.cpp18
-rw-r--r--lib/CodeGen/AsmPrinter/DIE.h8
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp4
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfAccelTable.h4
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCFIException.cpp2
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp67
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfCompileUnit.h7
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.cpp203
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfDebug.h27
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfException.cpp4
-rw-r--r--lib/CodeGen/AsmPrinter/DwarfException.h40
-rw-r--r--lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp4
-rw-r--r--lib/CodeGen/AsmPrinter/Win64Exception.cpp2
-rw-r--r--lib/CodeGen/BranchFolding.cpp25
-rw-r--r--lib/CodeGen/CMakeLists.txt3
-rw-r--r--lib/CodeGen/CalcSpillWeights.cpp6
-rw-r--r--lib/CodeGen/CallingConvLower.cpp4
-rw-r--r--lib/CodeGen/CodeGen.cpp2
-rw-r--r--lib/CodeGen/CodePlacementOpt.cpp2
-rw-r--r--lib/CodeGen/CriticalAntiDepBreaker.cpp2
-rw-r--r--lib/CodeGen/DeadMachineInstructionElim.cpp8
-rw-r--r--lib/CodeGen/EarlyIfConversion.cpp10
-rw-r--r--lib/CodeGen/ExecutionDepsFix.cpp11
-rw-r--r--lib/CodeGen/ExpandPostRAPseudos.cpp4
-rw-r--r--lib/CodeGen/GCStrategy.cpp14
-rw-r--r--lib/CodeGen/IfConversion.cpp6
-rw-r--r--lib/CodeGen/InlineSpiller.cpp6
-rw-r--r--lib/CodeGen/IntrinsicLowering.cpp8
-rw-r--r--lib/CodeGen/LLVMTargetMachine.cpp6
-rw-r--r--lib/CodeGen/LiveDebugVariables.cpp3
-rw-r--r--lib/CodeGen/LiveInterval.cpp97
-rw-r--r--lib/CodeGen/LiveIntervalAnalysis.cpp831
-rw-r--r--lib/CodeGen/LiveIntervalUnion.h4
-rw-r--r--lib/CodeGen/LiveRangeCalc.cpp6
-rw-r--r--lib/CodeGen/LiveRangeEdit.cpp11
-rw-r--r--lib/CodeGen/LiveRegMatrix.cpp4
-rw-r--r--lib/CodeGen/LiveRegMatrix.h2
-rw-r--r--lib/CodeGen/LiveStackAnalysis.cpp5
-rw-r--r--lib/CodeGen/LiveVariables.cpp42
-rw-r--r--lib/CodeGen/MachineBasicBlock.cpp90
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp6
-rw-r--r--lib/CodeGen/MachineBranchProbabilityInfo.cpp20
-rw-r--r--lib/CodeGen/MachineCSE.cpp70
-rw-r--r--lib/CodeGen/MachineCopyPropagation.cpp13
-rw-r--r--lib/CodeGen/MachineFunction.cpp49
-rw-r--r--lib/CodeGen/MachineFunctionPrinterPass.cpp2
-rw-r--r--lib/CodeGen/MachineInstr.cpp337
-rw-r--r--lib/CodeGen/MachineInstrBundle.cpp62
-rw-r--r--lib/CodeGen/MachineLICM.cpp2
-rw-r--r--lib/CodeGen/MachineLoopInfo.cpp2
-rw-r--r--lib/CodeGen/MachineModuleInfo.cpp2
-rw-r--r--lib/CodeGen/MachineModuleInfoImpls.cpp4
-rw-r--r--lib/CodeGen/MachinePostDominators.cpp55
-rw-r--r--lib/CodeGen/MachineRegisterInfo.cpp18
-rw-r--r--lib/CodeGen/MachineScheduler.cpp1458
-rw-r--r--lib/CodeGen/MachineSink.cpp2
-rw-r--r--lib/CodeGen/MachineTraceMetrics.cpp74
-rw-r--r--lib/CodeGen/MachineTraceMetrics.h13
-rw-r--r--lib/CodeGen/MachineVerifier.cpp157
-rw-r--r--lib/CodeGen/Passes.cpp17
-rw-r--r--lib/CodeGen/PeepholeOptimizer.cpp5
-rw-r--r--lib/CodeGen/PostRASchedulerList.cpp9
-rw-r--r--lib/CodeGen/ProcessImplicitDefs.cpp3
-rw-r--r--lib/CodeGen/PrologEpilogInserter.cpp8
-rw-r--r--lib/CodeGen/RegAllocBasic.cpp3
-rw-r--r--lib/CodeGen/RegAllocFast.cpp73
-rw-r--r--lib/CodeGen/RegAllocGreedy.cpp8
-rw-r--r--lib/CodeGen/RegAllocPBQP.cpp17
-rw-r--r--lib/CodeGen/RegisterClassInfo.cpp10
-rw-r--r--lib/CodeGen/RegisterCoalescer.cpp1077
-rw-r--r--lib/CodeGen/RegisterCoalescer.h7
-rw-r--r--lib/CodeGen/RegisterPressure.cpp35
-rw-r--r--lib/CodeGen/RegisterScavenging.cpp7
-rw-r--r--lib/CodeGen/ScheduleDAG.cpp2
-rw-r--r--lib/CodeGen/ScheduleDAGInstrs.cpp346
-rw-r--r--lib/CodeGen/ScheduleDAGPrinter.cpp3
-rw-r--r--lib/CodeGen/ScoreboardHazardRecognizer.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp1190
-rw-r--r--lib/CodeGen/SelectionDAG/FastISel.cpp4
-rw-r--r--lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp8
-rw-r--r--lib/CodeGen/SelectionDAG/InstrEmitter.cpp55
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeDAG.cpp147
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp50
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp43
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypes.h3
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp48
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp73
-rw-r--r--lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp27
-rw-r--r--lib/CodeGen/SelectionDAG/SDNodeOrdering.h4
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp182
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp44
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp18
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h9
-rw-r--r--lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp2
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAG.cpp284
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp372
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h19
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp11
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp31
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp7
-rw-r--r--lib/CodeGen/SelectionDAG/TargetLowering.cpp43
-rw-r--r--lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp2
-rw-r--r--lib/CodeGen/ShrinkWrapping.cpp22
-rw-r--r--lib/CodeGen/SjLjEHPrepare.cpp86
-rw-r--r--lib/CodeGen/SlotIndexes.cpp4
-rw-r--r--lib/CodeGen/SplitKit.cpp2
-rw-r--r--lib/CodeGen/StackColoring.cpp783
-rw-r--r--lib/CodeGen/StackProtector.cpp69
-rw-r--r--lib/CodeGen/StackSlotColoring.cpp6
-rw-r--r--lib/CodeGen/StrongPHIElimination.cpp11
-rw-r--r--lib/CodeGen/TailDuplication.cpp3
-rw-r--r--lib/CodeGen/TargetInstrInfoImpl.cpp120
-rw-r--r--lib/CodeGen/TargetLoweringObjectFileImpl.cpp12
-rw-r--r--lib/CodeGen/TargetSchedule.cpp306
-rw-r--r--lib/CodeGen/TwoAddressInstructionPass.cpp499
-rw-r--r--lib/CodeGen/VirtRegMap.cpp16
-rw-r--r--lib/CodeGen/VirtRegMap.h4
-rw-r--r--lib/DebugInfo/CMakeLists.txt1
-rw-r--r--lib/DebugInfo/DIContext.cpp7
-rw-r--r--lib/DebugInfo/DWARFCompileUnit.cpp48
-rw-r--r--lib/DebugInfo/DWARFCompileUnit.h16
-rw-r--r--lib/DebugInfo/DWARFContext.cpp220
-rw-r--r--lib/DebugInfo/DWARFContext.h40
-rw-r--r--lib/DebugInfo/DWARFDebugAranges.cpp1
-rw-r--r--lib/DebugInfo/DWARFDebugInfoEntry.cpp184
-rw-r--r--lib/DebugInfo/DWARFDebugInfoEntry.h54
-rw-r--r--lib/DebugInfo/DWARFDebugLine.cpp27
-rw-r--r--lib/DebugInfo/DWARFDebugLine.h8
-rw-r--r--lib/DebugInfo/DWARFDebugRangeList.cpp67
-rw-r--r--lib/DebugInfo/DWARFDebugRangeList.h78
-rw-r--r--lib/DebugInfo/DWARFFormValue.cpp76
-rw-r--r--lib/DebugInfo/DWARFFormValue.h2
-rw-r--r--lib/ExecutionEngine/ExecutionEngine.cpp43
-rw-r--r--lib/ExecutionEngine/ExecutionEngineBindings.cpp2
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt9
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp32
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h (renamed from include/llvm/ExecutionEngine/IntelJITEventsWrapper.h)2
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/Makefile5
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h454
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h70
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/jitprofiling.c481
-rw-r--r--lib/ExecutionEngine/IntelJITEvents/jitprofiling.h259
-rw-r--r--lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp8
-rw-r--r--lib/ExecutionEngine/Interpreter/Interpreter.cpp2
-rw-r--r--lib/ExecutionEngine/Interpreter/Interpreter.h4
-rw-r--r--lib/ExecutionEngine/JIT/JIT.cpp14
-rw-r--r--lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp4
-rw-r--r--lib/ExecutionEngine/JIT/JITDwarfEmitter.h4
-rw-r--r--lib/ExecutionEngine/JIT/JITEmitter.cpp36
-rw-r--r--lib/ExecutionEngine/MCJIT/CMakeLists.txt1
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJIT.cpp113
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJIT.h26
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJITMemoryManager.cpp14
-rw-r--r--lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h50
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp8
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h6
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h (renamed from lib/ExecutionEngine/RuntimeDyld/ObjectImage.h)135
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp145
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp555
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h56
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h96
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp28
-rw-r--r--lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h6
-rw-r--r--lib/ExecutionEngine/TargetSelect.cpp9
-rw-r--r--lib/MC/ELFObjectWriter.cpp28
-rw-r--r--lib/MC/MCAsmBackend.cpp7
-rw-r--r--lib/MC/MCAsmInfo.cpp2
-rw-r--r--lib/MC/MCAsmInfoCOFF.cpp4
-rw-r--r--lib/MC/MCAsmInfoDarwin.cpp1
-rw-r--r--lib/MC/MCAsmStreamer.cpp27
-rw-r--r--lib/MC/MCAssembler.cpp15
-rw-r--r--lib/MC/MCContext.cpp6
-rw-r--r--lib/MC/MCDisassembler/Disassembler.cpp14
-rw-r--r--lib/MC/MCDisassembler/EDDisassembler.cpp3
-rw-r--r--lib/MC/MCDwarf.cpp2
-rw-r--r--lib/MC/MCELFObjectTargetWriter.cpp8
-rw-r--r--lib/MC/MCELFStreamer.cpp52
-rw-r--r--lib/MC/MCExpr.cpp11
-rw-r--r--lib/MC/MCInst.cpp4
-rw-r--r--lib/MC/MCInstPrinter.cpp14
-rw-r--r--lib/MC/MCLabel.cpp2
-rw-r--r--lib/MC/MCMachOStreamer.cpp59
-rw-r--r--lib/MC/MCObjectFileInfo.cpp33
-rw-r--r--lib/MC/MCObjectStreamer.cpp45
-rw-r--r--lib/MC/MCParser/AsmLexer.cpp13
-rw-r--r--lib/MC/MCParser/AsmParser.cpp578
-rw-r--r--lib/MC/MCParser/ELFAsmParser.cpp2
-rw-r--r--lib/MC/MCParser/MCAsmLexer.cpp3
-rw-r--r--lib/MC/MCParser/MCAsmParser.cpp2
-rw-r--r--lib/MC/MCParser/MCTargetAsmParser.cpp2
-rw-r--r--lib/MC/MCRegisterInfo.cpp3
-rw-r--r--lib/MC/MCStreamer.cpp4
-rw-r--r--lib/MC/MCSubtargetInfo.cpp57
-rw-r--r--lib/MC/MCSymbol.cpp4
-rw-r--r--lib/MC/MCValue.cpp2
-rw-r--r--lib/MC/MachObjectWriter.cpp53
-rw-r--r--lib/MC/SubtargetFeature.cpp35
-rw-r--r--lib/MC/WinCOFFStreamer.cpp42
-rw-r--r--lib/Object/COFFObjectFile.cpp14
-rw-r--r--lib/Object/MachOObjectFile.cpp21
-rw-r--r--lib/Support/APFloat.cpp230
-rw-r--r--lib/Support/Atomic.cpp14
-rw-r--r--lib/Support/CMakeLists.txt6
-rw-r--r--lib/Support/CommandLine.cpp26
-rw-r--r--lib/Support/DAGDeltaAlgorithm.cpp10
-rw-r--r--lib/Support/DataExtractor.cpp6
-rw-r--r--lib/Support/DataStream.cpp2
-rw-r--r--lib/Support/DynamicLibrary.cpp2
-rw-r--r--lib/Support/Errno.cpp15
-rw-r--r--lib/Support/FoldingSet.cpp18
-rw-r--r--lib/Support/Host.cpp4
-rw-r--r--lib/Support/LockFileManager.cpp2
-rw-r--r--lib/Support/Makefile3
-rw-r--r--lib/Support/Memory.cpp56
-rw-r--r--lib/Support/MemoryBuffer.cpp58
-rw-r--r--lib/Support/SmallVector.cpp6
-rw-r--r--lib/Support/StreamableMemoryObject.cpp24
-rw-r--r--lib/Support/StringMap.cpp9
-rw-r--r--lib/Support/StringRef.cpp4
-rw-r--r--lib/Support/Triple.cpp59
-rw-r--r--lib/Support/Unix/Memory.inc206
-rw-r--r--lib/Support/Unix/Path.inc17
-rw-r--r--lib/Support/Unix/Signals.inc38
-rw-r--r--lib/Support/Windows/Memory.inc165
-rw-r--r--lib/Support/Windows/PathV2.inc2
-rw-r--r--lib/Support/YAMLParser.cpp7
-rw-r--r--lib/Support/raw_ostream.cpp12
-rw-r--r--lib/Support/regexec.c2
-rw-r--r--lib/Support/system_error.cpp10
-rw-r--r--lib/TableGen/CMakeLists.txt5
-rw-r--r--lib/TableGen/Error.cpp35
-rw-r--r--lib/TableGen/Main.cpp138
-rw-r--r--lib/TableGen/Makefile4
-rw-r--r--lib/TableGen/Record.cpp615
-rw-r--r--lib/TableGen/TGParser.cpp94
-rw-r--r--lib/TableGen/TGParser.h23
-rw-r--r--lib/TableGen/TableGenAction.cpp15
-rw-r--r--lib/Target/ARM/ARM.h1
-rw-r--r--lib/Target/ARM/ARM.td31
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.cpp98
-rw-r--r--lib/Target/ARM/ARMAsmPrinter.h40
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.cpp943
-rw-r--r--lib/Target/ARM/ARMBaseInstrInfo.h16
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.cpp173
-rw-r--r--lib/Target/ARM/ARMBaseRegisterInfo.h13
-rw-r--r--lib/Target/ARM/ARMCallingConv.td2
-rw-r--r--lib/Target/ARM/ARMCodeEmitter.cpp8
-rw-r--r--lib/Target/ARM/ARMConstantIslandPass.cpp9
-rw-r--r--lib/Target/ARM/ARMConstantPoolValue.h5
-rw-r--r--lib/Target/ARM/ARMELFWriterInfo.cpp78
-rw-r--r--lib/Target/ARM/ARMELFWriterInfo.h59
-rw-r--r--lib/Target/ARM/ARMExpandPseudoInsts.cpp16
-rw-r--r--lib/Target/ARM/ARMFastISel.cpp233
-rw-r--r--lib/Target/ARM/ARMFrameLowering.cpp41
-rw-r--r--lib/Target/ARM/ARMHazardRecognizer.cpp2
-rw-r--r--lib/Target/ARM/ARMISelDAGToDAG.cpp174
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp679
-rw-r--r--lib/Target/ARM/ARMISelLowering.h19
-rw-r--r--lib/Target/ARM/ARMInstrFormats.td21
-rw-r--r--lib/Target/ARM/ARMInstrInfo.cpp62
-rw-r--r--lib/Target/ARM/ARMInstrInfo.td212
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td125
-rw-r--r--lib/Target/ARM/ARMInstrThumb.td6
-rw-r--r--lib/Target/ARM/ARMInstrThumb2.td138
-rw-r--r--lib/Target/ARM/ARMInstrVFP.td12
-rw-r--r--lib/Target/ARM/ARMJITInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMLoadStoreOptimizer.cpp6
-rw-r--r--lib/Target/ARM/ARMMachineFunctionInfo.h12
-rw-r--r--lib/Target/ARM/ARMRegisterInfo.td23
-rw-r--r--lib/Target/ARM/ARMSchedule.td2
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td597
-rw-r--r--lib/Target/ARM/ARMScheduleSwift.td1085
-rw-r--r--lib/Target/ARM/ARMSelectionDAGInfo.cpp2
-rw-r--r--lib/Target/ARM/ARMSubtarget.cpp9
-rw-r--r--lib/Target/ARM/ARMSubtarget.h14
-rw-r--r--lib/Target/ARM/ARMTargetMachine.cpp19
-rw-r--r--lib/Target/ARM/ARMTargetMachine.h35
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp280
-rw-r--r--lib/Target/ARM/CMakeLists.txt1
-rw-r--r--lib/Target/ARM/Disassembler/ARMDisassembler.cpp92
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp584
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h3
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp15
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp7
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp1
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp10
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCExpr.h5
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp8
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h2
-rw-r--r--lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp51
-rw-r--r--lib/Target/ARM/MLxExpansionPass.cpp74
-rw-r--r--lib/Target/CMakeLists.txt3
-rw-r--r--lib/Target/CellSPU/SPUAsmPrinter.cpp6
-rw-r--r--lib/Target/CellSPU/SPUFrameLowering.cpp2
-rw-r--r--lib/Target/CellSPU/SPUISelDAGToDAG.cpp43
-rw-r--r--lib/Target/CellSPU/SPUSubtarget.h4
-rw-r--r--lib/Target/CellSPU/SPUTargetMachine.cpp5
-rw-r--r--lib/Target/CellSPU/SPUTargetMachine.h17
-rw-r--r--lib/Target/CppBackend/CPPBackend.cpp35
-rw-r--r--lib/Target/CppBackend/CPPTargetMachine.h4
-rw-r--r--lib/Target/Hexagon/CMakeLists.txt1
-rw-r--r--lib/Target/Hexagon/HexagonAsmPrinter.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonCallingConvLower.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonISelLowering.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonInstrFormats.td10
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.cpp28
-rw-r--r--lib/Target/Hexagon/HexagonInstrInfo.td312
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.cpp681
-rw-r--r--lib/Target/Hexagon/HexagonMachineScheduler.h244
-rw-r--r--lib/Target/Hexagon/HexagonNewValueJump.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonPeephole.cpp35
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.cpp52
-rw-r--r--lib/Target/Hexagon/HexagonRegisterInfo.h5
-rw-r--r--lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp2
-rw-r--r--lib/Target/Hexagon/HexagonSchedule.td1
-rw-r--r--lib/Target/Hexagon/HexagonScheduleV4.td1
-rw-r--r--lib/Target/Hexagon/HexagonSubtarget.cpp33
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.cpp28
-rw-r--r--lib/Target/Hexagon/HexagonTargetMachine.h17
-rw-r--r--lib/Target/Hexagon/HexagonTargetObjectFile.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonVLIWPacketizer.cpp4
-rw-r--r--lib/Target/Hexagon/HexagonVarargsCallingConvention.h8
-rw-r--r--lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp2
-rw-r--r--lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp29
-rw-r--r--lib/Target/MBlaze/CMakeLists.txt1
-rw-r--r--lib/Target/MBlaze/MBlazeAsmPrinter.cpp2
-rw-r--r--lib/Target/MBlaze/MBlazeELFWriterInfo.cpp107
-rw-r--r--lib/Target/MBlaze/MBlazeELFWriterInfo.h59
-rw-r--r--lib/Target/MBlaze/MBlazeFrameLowering.cpp2
-rw-r--r--lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp5
-rw-r--r--lib/Target/MBlaze/MBlazeRegisterInfo.cpp2
-rw-r--r--lib/Target/MBlaze/MBlazeTargetMachine.cpp7
-rw-r--r--lib/Target/MBlaze/MBlazeTargetMachine.h20
-rw-r--r--lib/Target/MBlaze/MBlazeTargetObjectFile.cpp4
-rw-r--r--lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp3
-rw-r--r--lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp4
-rw-r--r--lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h3
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.cpp16
-rw-r--r--lib/Target/MSP430/MSP430FrameLowering.h1
-rw-r--r--lib/Target/MSP430/MSP430ISelDAGToDAG.cpp4
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.cpp4
-rw-r--r--lib/Target/MSP430/MSP430ISelLowering.h2
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.cpp14
-rw-r--r--lib/Target/MSP430/MSP430RegisterInfo.h2
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.cpp6
-rw-r--r--lib/Target/MSP430/MSP430TargetMachine.h16
-rw-r--r--lib/Target/Mangler.cpp9
-rw-r--r--lib/Target/Mips/AsmParser/CMakeLists.txt1
-rw-r--r--lib/Target/Mips/AsmParser/MipsAsmParser.cpp1289
-rw-r--r--lib/Target/Mips/CMakeLists.txt2
-rw-r--r--lib/Target/Mips/Disassembler/MipsDisassembler.cpp29
-rw-r--r--lib/Target/Mips/MCTargetDesc/CMakeLists.txt1
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp16
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h6
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp81
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h28
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp10
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp60
-rw-r--r--lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h12
-rw-r--r--lib/Target/Mips/Makefile4
-rw-r--r--lib/Target/Mips/Mips.td19
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.cpp56
-rw-r--r--lib/Target/Mips/Mips16FrameLowering.h5
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.cpp93
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.h4
-rw-r--r--lib/Target/Mips/Mips16InstrInfo.td1225
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.cpp122
-rw-r--r--lib/Target/Mips/Mips16RegisterInfo.h5
-rw-r--r--lib/Target/Mips/Mips64InstrInfo.td95
-rw-r--r--lib/Target/Mips/MipsAnalyzeImmediate.cpp2
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.cpp42
-rw-r--r--lib/Target/Mips/MipsAsmPrinter.h8
-rw-r--r--lib/Target/Mips/MipsCallingConv.td12
-rw-r--r--lib/Target/Mips/MipsCodeEmitter.cpp46
-rw-r--r--lib/Target/Mips/MipsDSPInstrFormats.td309
-rw-r--r--lib/Target/Mips/MipsDSPInstrInfo.td1319
-rw-r--r--lib/Target/Mips/MipsDelaySlotFiller.cpp11
-rw-r--r--lib/Target/Mips/MipsFrameLowering.cpp36
-rw-r--r--lib/Target/Mips/MipsFrameLowering.h3
-rw-r--r--lib/Target/Mips/MipsISelDAGToDAG.cpp210
-rw-r--r--lib/Target/Mips/MipsISelLowering.cpp1204
-rw-r--r--lib/Target/Mips/MipsISelLowering.h164
-rw-r--r--lib/Target/Mips/MipsInstrFPU.td41
-rw-r--r--lib/Target/Mips/MipsInstrFormats.td29
-rw-r--r--lib/Target/Mips/MipsInstrInfo.cpp55
-rw-r--r--lib/Target/Mips/MipsInstrInfo.h12
-rw-r--r--lib/Target/Mips/MipsInstrInfo.td208
-rw-r--r--lib/Target/Mips/MipsLongBranch.cpp232
-rw-r--r--lib/Target/Mips/MipsMCInstLower.cpp29
-rw-r--r--lib/Target/Mips/MipsMCInstLower.h3
-rw-r--r--lib/Target/Mips/MipsMachineFunction.cpp13
-rw-r--r--lib/Target/Mips/MipsMachineFunction.h50
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.cpp50
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.h4
-rw-r--r--lib/Target/Mips/MipsRegisterInfo.td27
-rw-r--r--lib/Target/Mips/MipsSEFrameLowering.cpp16
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.cpp51
-rw-r--r--lib/Target/Mips/MipsSEInstrInfo.h8
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.cpp45
-rw-r--r--lib/Target/Mips/MipsSERegisterInfo.h9
-rw-r--r--lib/Target/Mips/MipsSubtarget.cpp9
-rw-r--r--lib/Target/Mips/MipsSubtarget.h11
-rw-r--r--lib/Target/Mips/MipsTargetMachine.cpp7
-rw-r--r--lib/Target/Mips/MipsTargetMachine.h18
-rw-r--r--lib/Target/Mips/MipsTargetObjectFile.cpp10
-rw-r--r--lib/Target/NVPTX/NVPTX.td34
-rw-r--r--lib/Target/NVPTX/NVPTXAllocaHoisting.h4
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp105
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.cpp68
-rw-r--r--lib/Target/NVPTX/NVPTXISelLowering.h3
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXLowerAggrCopies.h4
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.cpp20
-rw-r--r--lib/Target/NVPTX/NVPTXSubtarget.h12
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp7
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.h16
-rw-r--r--lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp8
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp15
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp92
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h10
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp4
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp56
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp2
-rw-r--r--lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h2
-rw-r--r--lib/Target/PowerPC/PPC.td10
-rw-r--r--lib/Target/PowerPC/PPCAsmPrinter.cpp101
-rw-r--r--lib/Target/PowerPC/PPCCallingConv.td7
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.cpp266
-rw-r--r--lib/Target/PowerPC/PPCFrameLowering.h71
-rw-r--r--lib/Target/PowerPC/PPCISelDAGToDAG.cpp156
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp1111
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.h60
-rw-r--r--lib/Target/PowerPC/PPCInstr64Bit.td115
-rw-r--r--lib/Target/PowerPC/PPCInstrAltivec.td32
-rw-r--r--lib/Target/PowerPC/PPCInstrFormats.td41
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.cpp24
-rw-r--r--lib/Target/PowerPC/PPCInstrInfo.td200
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.cpp36
-rw-r--r--lib/Target/PowerPC/PPCRegisterInfo.h5
-rw-r--r--lib/Target/PowerPC/PPCSchedule.td88
-rw-r--r--lib/Target/PowerPC/PPCSchedule440.td60
-rw-r--r--lib/Target/PowerPC/PPCScheduleA2.td81
-rw-r--r--lib/Target/PowerPC/PPCScheduleE500mc.td265
-rw-r--r--lib/Target/PowerPC/PPCScheduleE5500.td309
-rw-r--r--lib/Target/PowerPC/PPCScheduleG3.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4.td7
-rw-r--r--lib/Target/PowerPC/PPCScheduleG4Plus.td8
-rw-r--r--lib/Target/PowerPC/PPCScheduleG5.td10
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.cpp15
-rw-r--r--lib/Target/PowerPC/PPCSubtarget.h57
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.cpp5
-rw-r--r--lib/Target/PowerPC/PPCTargetMachine.h15
-rw-r--r--lib/Target/README.txt7
-rw-r--r--lib/Target/Sparc/SparcFrameLowering.cpp2
-rw-r--r--lib/Target/Sparc/SparcISelLowering.cpp2
-rw-r--r--lib/Target/Sparc/SparcInstrInfo.td2
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.cpp4
-rw-r--r--lib/Target/Sparc/SparcTargetMachine.h15
-rw-r--r--lib/Target/Target.cpp19
-rw-r--r--lib/Target/TargetELFWriterInfo.cpp25
-rw-r--r--lib/Target/TargetLibraryInfo.cpp75
-rw-r--r--lib/Target/TargetLoweringObjectFile.cpp4
-rw-r--r--lib/Target/TargetMachineC.cpp12
-rw-r--r--lib/Target/TargetRegisterInfo.cpp6
-rw-r--r--lib/Target/TargetTransformImpl.cpp353
-rw-r--r--lib/Target/X86/AsmParser/X86AsmLexer.cpp43
-rw-r--r--lib/Target/X86/AsmParser/X86AsmParser.cpp445
-rw-r--r--lib/Target/X86/CMakeLists.txt1
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.cpp12
-rw-r--r--lib/Target/X86/Disassembler/X86Disassembler.h2
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.c16
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoder.h12
-rw-r--r--lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h5
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp66
-rw-r--r--lib/Target/X86/InstPrinter/X86ATTInstPrinter.h3
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp48
-rw-r--r--lib/Target/X86/InstPrinter/X86IntelInstPrinter.h5
-rw-r--r--lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp70
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h16
-rw-r--r--lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp25
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp5
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp37
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp115
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h11
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp48
-rw-r--r--lib/Target/X86/README-SSE.txt12
-rw-r--r--lib/Target/X86/X86.td14
-rw-r--r--lib/Target/X86/X86AsmPrinter.cpp113
-rw-r--r--lib/Target/X86/X86AsmPrinter.h37
-rw-r--r--lib/Target/X86/X86COFFMachineModuleInfo.h2
-rw-r--r--lib/Target/X86/X86CallingConv.td59
-rw-r--r--lib/Target/X86/X86CodeEmitter.cpp61
-rw-r--r--lib/Target/X86/X86ELFWriterInfo.cpp147
-rw-r--r--lib/Target/X86/X86ELFWriterInfo.h59
-rw-r--r--lib/Target/X86/X86FastISel.cpp56
-rw-r--r--lib/Target/X86/X86FloatingPoint.cpp6
-rw-r--r--lib/Target/X86/X86FrameLowering.cpp13
-rw-r--r--lib/Target/X86/X86ISelDAGToDAG.cpp609
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp3026
-rw-r--r--lib/Target/X86/X86ISelLowering.h154
-rw-r--r--lib/Target/X86/X86InstrCompiler.td475
-rw-r--r--lib/Target/X86/X86InstrControl.td7
-rw-r--r--lib/Target/X86/X86InstrFMA.td432
-rw-r--r--lib/Target/X86/X86InstrFormats.td91
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td31
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp536
-rw-r--r--lib/Target/X86/X86InstrInfo.h3
-rw-r--r--lib/Target/X86/X86InstrInfo.td83
-rw-r--r--lib/Target/X86/X86InstrMMX.td75
-rw-r--r--lib/Target/X86/X86InstrSSE.td1488
-rw-r--r--lib/Target/X86/X86InstrShiftRotate.td78
-rw-r--r--lib/Target/X86/X86InstrTSX.td32
-rw-r--r--lib/Target/X86/X86InstrXOP.td17
-rw-r--r--lib/Target/X86/X86JITInfo.cpp17
-rw-r--r--lib/Target/X86/X86MCInstLower.cpp54
-rw-r--r--lib/Target/X86/X86MCInstLower.h52
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp66
-rw-r--r--lib/Target/X86/X86RegisterInfo.h9
-rw-r--r--lib/Target/X86/X86RegisterInfo.td496
-rw-r--r--lib/Target/X86/X86SelectionDAGInfo.cpp2
-rw-r--r--lib/Target/X86/X86Subtarget.cpp27
-rw-r--r--lib/Target/X86/X86Subtarget.h31
-rw-r--r--lib/Target/X86/X86TargetMachine.cpp19
-rw-r--r--lib/Target/X86/X86TargetMachine.h32
-rw-r--r--lib/Target/X86/X86VZeroUpper.cpp12
-rw-r--r--lib/Target/XCore/XCoreAsmPrinter.cpp4
-rw-r--r--lib/Target/XCore/XCoreFrameLowering.cpp13
-rw-r--r--lib/Target/XCore/XCoreISelLowering.cpp16
-rw-r--r--lib/Target/XCore/XCoreInstrInfo.td4
-rw-r--r--lib/Target/XCore/XCoreRegisterInfo.cpp2
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.cpp4
-rw-r--r--lib/Target/XCore/XCoreTargetMachine.h15
-rw-r--r--lib/Transforms/IPO/ArgumentPromotion.cpp64
-rw-r--r--lib/Transforms/IPO/BarrierNoopPass.cpp47
-rw-r--r--lib/Transforms/IPO/CMakeLists.txt1
-rw-r--r--lib/Transforms/IPO/ConstantMerge.cpp8
-rw-r--r--lib/Transforms/IPO/DeadArgumentElimination.cpp107
-rw-r--r--lib/Transforms/IPO/ExtractGV.cpp63
-rw-r--r--lib/Transforms/IPO/FunctionAttrs.cpp34
-rw-r--r--lib/Transforms/IPO/GlobalOpt.cpp93
-rw-r--r--lib/Transforms/IPO/IPO.cpp7
-rw-r--r--lib/Transforms/IPO/InlineAlways.cpp6
-rw-r--r--lib/Transforms/IPO/InlineSimple.cpp4
-rw-r--r--lib/Transforms/IPO/Inliner.cpp28
-rw-r--r--lib/Transforms/IPO/Internalize.cpp38
-rw-r--r--lib/Transforms/IPO/MergeFunctions.cpp26
-rw-r--r--lib/Transforms/IPO/PassManagerBuilder.cpp56
-rw-r--r--lib/Transforms/IPO/PruneEH.cpp10
-rw-r--r--lib/Transforms/InstCombine/InstCombine.h14
-rw-r--r--lib/Transforms/InstCombine/InstCombineAddSub.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstCombineCalls.cpp172
-rw-r--r--lib/Transforms/InstCombine/InstCombineCasts.cpp25
-rw-r--r--lib/Transforms/InstCombine/InstCombineCompares.cpp70
-rw-r--r--lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp201
-rw-r--r--lib/Transforms/InstCombine/InstCombineMulDivRem.cpp17
-rw-r--r--lib/Transforms/InstCombine/InstCombinePHI.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstCombineSelect.cpp30
-rw-r--r--lib/Transforms/InstCombine/InstCombineShifts.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp2
-rw-r--r--lib/Transforms/InstCombine/InstCombineVectorOps.cpp7
-rw-r--r--lib/Transforms/InstCombine/InstCombineWorklist.h4
-rw-r--r--lib/Transforms/InstCombine/InstructionCombining.cpp401
-rw-r--r--lib/Transforms/Instrumentation/AddressSanitizer.cpp437
-rw-r--r--lib/Transforms/Instrumentation/BlackList.cpp105
-rw-r--r--lib/Transforms/Instrumentation/BlackList.h57
-rw-r--r--lib/Transforms/Instrumentation/BoundsChecking.cpp16
-rw-r--r--lib/Transforms/Instrumentation/CMakeLists.txt2
-rw-r--r--lib/Transforms/Instrumentation/FunctionBlackList.cpp79
-rw-r--r--lib/Transforms/Instrumentation/FunctionBlackList.h37
-rw-r--r--lib/Transforms/Instrumentation/GCOVProfiling.cpp73
-rw-r--r--lib/Transforms/Instrumentation/MaximumSpanningTree.h53
-rw-r--r--lib/Transforms/Instrumentation/ThreadSanitizer.cpp137
-rw-r--r--lib/Transforms/Scalar/CMakeLists.txt1
-rw-r--r--lib/Transforms/Scalar/CodeGenPrepare.cpp80
-rw-r--r--lib/Transforms/Scalar/ConstantProp.cpp4
-rw-r--r--lib/Transforms/Scalar/CorrelatedValuePropagation.cpp5
-rw-r--r--lib/Transforms/Scalar/DCE.cpp17
-rw-r--r--lib/Transforms/Scalar/DeadStoreElimination.cpp203
-rw-r--r--lib/Transforms/Scalar/EarlyCSE.cpp120
-rw-r--r--lib/Transforms/Scalar/GVN.cpp99
-rw-r--r--lib/Transforms/Scalar/GlobalMerge.cpp10
-rw-r--r--lib/Transforms/Scalar/IndVarSimplify.cpp42
-rw-r--r--lib/Transforms/Scalar/JumpThreading.cpp8
-rw-r--r--lib/Transforms/Scalar/LICM.cpp45
-rw-r--r--lib/Transforms/Scalar/LoopIdiomRecognize.cpp37
-rw-r--r--lib/Transforms/Scalar/LoopInstSimplify.cpp6
-rw-r--r--lib/Transforms/Scalar/LoopRotation.cpp65
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp58
-rw-r--r--lib/Transforms/Scalar/LoopUnrollPass.cpp9
-rw-r--r--lib/Transforms/Scalar/LoopUnswitch.cpp13
-rw-r--r--lib/Transforms/Scalar/MemCpyOptimizer.cpp72
-rw-r--r--lib/Transforms/Scalar/ObjCARC.cpp150
-rw-r--r--lib/Transforms/Scalar/Reassociate.cpp105
-rw-r--r--lib/Transforms/Scalar/SCCP.cpp10
-rw-r--r--lib/Transforms/Scalar/SROA.cpp3697
-rw-r--r--lib/Transforms/Scalar/Scalar.cpp3
-rw-r--r--lib/Transforms/Scalar/ScalarReplAggregates.cpp198
-rw-r--r--lib/Transforms/Scalar/SimplifyCFGPass.cpp62
-rw-r--r--lib/Transforms/Scalar/SimplifyLibCalls.cpp980
-rw-r--r--lib/Transforms/Utils/AddrModeMatcher.cpp6
-rw-r--r--lib/Transforms/Utils/BasicBlockUtils.cpp45
-rw-r--r--lib/Transforms/Utils/BuildLibCalls.cpp154
-rw-r--r--lib/Transforms/Utils/BypassSlowDivision.cpp262
-rw-r--r--lib/Transforms/Utils/CMakeLists.txt4
-rw-r--r--lib/Transforms/Utils/CloneFunction.cpp14
-rw-r--r--lib/Transforms/Utils/CodeExtractor.cpp2
-rw-r--r--lib/Transforms/Utils/InlineFunction.cpp4
-rw-r--r--lib/Transforms/Utils/IntegerDivision.cpp420
-rw-r--r--lib/Transforms/Utils/LCSSA.cpp15
-rw-r--r--lib/Transforms/Utils/Local.cpp82
-rw-r--r--lib/Transforms/Utils/LoopSimplify.cpp7
-rw-r--r--lib/Transforms/Utils/MetaRenamer.cpp132
-rw-r--r--lib/Transforms/Utils/PromoteMemoryToRegister.cpp18
-rw-r--r--lib/Transforms/Utils/SSAUpdater.cpp2
-rw-r--r--lib/Transforms/Utils/SimplifyCFG.cpp1507
-rw-r--r--lib/Transforms/Utils/SimplifyIndVar.cpp6
-rw-r--r--lib/Transforms/Utils/SimplifyInstructions.cpp6
-rw-r--r--lib/Transforms/Utils/SimplifyLibCalls.cpp1149
-rw-r--r--lib/Transforms/Utils/Utils.cpp1
-rw-r--r--lib/Transforms/Utils/ValueMapper.cpp2
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp1012
-rw-r--r--lib/Transforms/Vectorize/CMakeLists.txt1
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp1941
-rw-r--r--lib/Transforms/Vectorize/Vectorize.cpp8
-rw-r--r--lib/VMCore/AsmWriter.cpp129
-rw-r--r--lib/VMCore/Attributes.cpp518
-rw-r--r--lib/VMCore/AttributesImpl.h71
-rw-r--r--lib/VMCore/AutoUpgrade.cpp3
-rw-r--r--lib/VMCore/CMakeLists.txt6
-rw-r--r--lib/VMCore/ConstantFold.cpp28
-rw-r--r--lib/VMCore/Constants.cpp27
-rw-r--r--lib/VMCore/ConstantsContext.h35
-rw-r--r--lib/VMCore/Core.cpp62
-rw-r--r--lib/VMCore/DIBuilder.cpp40
-rw-r--r--lib/VMCore/DataLayout.cpp (renamed from lib/Target/TargetData.cpp)240
-rw-r--r--lib/VMCore/DebugInfo.cpp10
-rw-r--r--lib/VMCore/Dominators.cpp10
-rw-r--r--lib/VMCore/Function.cpp21
-rw-r--r--lib/VMCore/GCOV.cpp30
-rw-r--r--lib/VMCore/IRBuilder.cpp6
-rw-r--r--lib/VMCore/InlineAsm.cpp13
-rw-r--r--lib/VMCore/Instructions.cpp63
-rw-r--r--lib/VMCore/LLVMContext.cpp5
-rw-r--r--lib/VMCore/LLVMContextImpl.cpp18
-rw-r--r--lib/VMCore/LLVMContextImpl.h9
-rw-r--r--lib/VMCore/Makefile1
-rw-r--r--lib/VMCore/PassManager.cpp2
-rw-r--r--lib/VMCore/TargetTransformInfo.cpp31
-rw-r--r--lib/VMCore/Type.cpp41
-rw-r--r--lib/VMCore/User.cpp9
-rw-r--r--lib/VMCore/Value.cpp2
-rw-r--r--lib/VMCore/ValueTypes.cpp30
-rw-r--r--lib/VMCore/Verifier.cpp181
-rw-r--r--projects/CMakeLists.txt7
-rw-r--r--projects/sample/Makefile.llvm.rules4
-rw-r--r--projects/sample/autoconf/configure.ac4
-rwxr-xr-xprojects/sample/configure4
-rw-r--r--runtime/libprofile/CMakeLists.txt3
-rw-r--r--runtime/libprofile/CommonProfiling.c22
-rw-r--r--runtime/libprofile/Makefile9
-rw-r--r--runtime/libprofile/Profiling.h2
-rw-r--r--test/Analysis/BasicAA/noalias-geps.ll54
-rw-r--r--test/Analysis/BasicAA/nocapture.ll21
-rw-r--r--test/Analysis/BasicAA/phi-speculation.ll33
-rw-r--r--test/Analysis/BranchProbabilityInfo/basic.ll27
-rw-r--r--test/Analysis/CallGraph/do-nothing-intrinsic.ll13
-rw-r--r--test/Analysis/CostModel/X86/arith.ll42
-rw-r--r--test/Analysis/CostModel/X86/cast.ll69
-rw-r--r--test/Analysis/CostModel/X86/cmp.ll42
-rw-r--r--test/Analysis/CostModel/X86/i32.ll9
-rw-r--r--test/Analysis/CostModel/X86/insert-extract-at-zero.ll40
-rw-r--r--test/Analysis/CostModel/X86/lit.local.cfg6
-rw-r--r--test/Analysis/CostModel/X86/loop_v2.ll43
-rw-r--r--test/Analysis/CostModel/X86/tiny.ll11
-rw-r--r--test/Analysis/CostModel/X86/vectorized-loop.ll78
-rw-r--r--test/Analysis/CostModel/lit.local.cfg (renamed from test/Analysis/LoopDependenceAnalysis/lit.local.cfg)0
-rw-r--r--test/Analysis/CostModel/no_info.ll15
-rw-r--r--test/Analysis/DependenceAnalysis/Banerjee.ll595
-rw-r--r--test/Analysis/DependenceAnalysis/Coupled.ll509
-rw-r--r--test/Analysis/DependenceAnalysis/ExactRDIV.ll508
-rw-r--r--test/Analysis/DependenceAnalysis/ExactSIV.ll428
-rw-r--r--test/Analysis/DependenceAnalysis/GCD.ll597
-rw-r--r--test/Analysis/DependenceAnalysis/Preliminary.ll469
-rw-r--r--test/Analysis/DependenceAnalysis/Propagating.ll467
-rw-r--r--test/Analysis/DependenceAnalysis/Separability.ll267
-rw-r--r--test/Analysis/DependenceAnalysis/StrongSIV.ll342
-rw-r--r--test/Analysis/DependenceAnalysis/SymbolicRDIV.ll312
-rw-r--r--test/Analysis/DependenceAnalysis/SymbolicSIV.ll330
-rw-r--r--test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll220
-rw-r--r--test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll212
-rw-r--r--test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll212
-rw-r--r--test/Analysis/DependenceAnalysis/ZIV.ll53
-rw-r--r--test/Analysis/DependenceAnalysis/lit.local.cfg1
-rw-r--r--test/Analysis/LoopDependenceAnalysis/alias.ll44
-rw-r--r--test/Analysis/LoopDependenceAnalysis/siv-strong.ll110
-rw-r--r--test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll118
-rw-r--r--test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll56
-rw-r--r--test/Analysis/LoopDependenceAnalysis/ziv.ll63
-rw-r--r--test/Analysis/Profiling/load-branch-weights-ifs.ll122
-rw-r--r--test/Analysis/Profiling/load-branch-weights-loops.ll188
-rw-r--r--test/Analysis/Profiling/load-branch-weights-switches.ll165
-rw-r--r--test/Assembler/2008-09-02-FunctionNotes2.ll2
-rw-r--r--test/Assembler/global-addrspace-forwardref.ll8
-rw-r--r--test/Assembler/invalid-fwdref1.ll4
-rw-r--r--test/Bindings/Ocaml/ipo_opts.ml6
-rw-r--r--test/Bindings/Ocaml/scalar_opts.ml6
-rw-r--r--test/Bindings/Ocaml/target.ml6
-rw-r--r--test/Bindings/Ocaml/vmcore.ml38
-rw-r--r--test/Bitcode/blockaddress.ll15
-rw-r--r--test/Bitcode/function-encoding-rel-operands.ll49
-rw-r--r--test/BugPoint/crash-narrowfunctiontest.ll1
-rw-r--r--test/BugPoint/metadata.ll1
-rw-r--r--test/BugPoint/remove_arguments_test.ll1
-rw-r--r--test/CMakeLists.txt15
-rw-r--r--test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll4
-rw-r--r--test/CodeGen/ARM/2010-12-07-PEIBug.ll2
-rw-r--r--test/CodeGen/ARM/2011-06-16-TailCallByVal.ll5
-rw-r--r--test/CodeGen/ARM/2011-10-26-memset-with-neon.ll4
-rw-r--r--test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll135
-rw-r--r--test/CodeGen/ARM/2012-05-04-vmov.ll11
-rw-r--r--test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll14
-rw-r--r--test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll129
-rw-r--r--test/CodeGen/ARM/2012-08-30-select.ll18
-rw-r--r--test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll11
-rw-r--r--test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll11
-rw-r--r--test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll11
-rw-r--r--test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll56
-rw-r--r--test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll19
-rw-r--r--test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll16
-rw-r--r--test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll29
-rw-r--r--test/CodeGen/ARM/a15-mla.ll12
-rw-r--r--test/CodeGen/ARM/a15.ll6
-rw-r--r--test/CodeGen/ARM/atomic-op.ll10
-rw-r--r--test/CodeGen/ARM/atomicrmw_minmax.ll21
-rw-r--r--test/CodeGen/ARM/avoid-cpsr-rmw.ll1
-rw-r--r--test/CodeGen/ARM/call-noret-minsize.ll27
-rw-r--r--test/CodeGen/ARM/call-noret.ll31
-rw-r--r--test/CodeGen/ARM/carry.ll13
-rw-r--r--test/CodeGen/ARM/coalesce-subregs.ll294
-rw-r--r--test/CodeGen/ARM/constants.ll12
-rw-r--r--test/CodeGen/ARM/crash-shufflevector.ll10
-rw-r--r--test/CodeGen/ARM/darwin-section-order.ll21
-rw-r--r--test/CodeGen/ARM/deps-fix.ll22
-rw-r--r--test/CodeGen/ARM/div.ll17
-rw-r--r--test/CodeGen/ARM/divmod.ll46
-rw-r--r--test/CodeGen/ARM/domain-conv-vmovs.ll100
-rw-r--r--test/CodeGen/ARM/fabss.ll4
-rw-r--r--test/CodeGen/ARM/fadds.ll8
-rw-r--r--test/CodeGen/ARM/fast-isel-pic.ll61
-rw-r--r--test/CodeGen/ARM/fast-isel.ll66
-rw-r--r--test/CodeGen/ARM/fdivs.ll8
-rw-r--r--test/CodeGen/ARM/fmuls.ll8
-rw-r--r--test/CodeGen/ARM/fp-fast.ll60
-rw-r--r--test/CodeGen/ARM/fp_convert.ll4
-rw-r--r--test/CodeGen/ARM/fsubs.ll6
-rw-r--r--test/CodeGen/ARM/ifcvt1.ll12
-rw-r--r--test/CodeGen/ARM/ifcvt12.ll15
-rw-r--r--test/CodeGen/ARM/ifcvt5.ll12
-rw-r--r--test/CodeGen/ARM/indirectbr-2.ll46
-rw-r--r--test/CodeGen/ARM/integer_insertelement.ll35
-rw-r--r--test/CodeGen/ARM/ldr_post.ll1
-rw-r--r--test/CodeGen/ARM/ldr_pre.ll1
-rw-r--r--test/CodeGen/ARM/longMAC.ll44
-rw-r--r--test/CodeGen/ARM/mls.ll12
-rw-r--r--test/CodeGen/ARM/neon-fma.ll22
-rw-r--r--test/CodeGen/ARM/neon_ld2.ll37
-rw-r--r--test/CodeGen/ARM/opt-shuff-tstore.ll4
-rw-r--r--test/CodeGen/ARM/reg_sequence.ll11
-rw-r--r--test/CodeGen/ARM/select.ll2
-rw-r--r--test/CodeGen/ARM/select_xform.ll65
-rw-r--r--test/CodeGen/ARM/struct_byval.ll44
-rw-r--r--test/CodeGen/ARM/sub-cmp-peephole.ll21
-rw-r--r--test/CodeGen/ARM/sub.ll2
-rw-r--r--test/CodeGen/ARM/subreg-remat.ll4
-rw-r--r--test/CodeGen/ARM/trap.ll12
-rw-r--r--test/CodeGen/ARM/twoaddrinstr.ll16
-rw-r--r--test/CodeGen/ARM/unaligned_load_store.ll18
-rw-r--r--test/CodeGen/ARM/unaligned_load_store_vector.ll487
-rw-r--r--test/CodeGen/ARM/vbsl-constant.ll20
-rw-r--r--test/CodeGen/ARM/vbsl.ll97
-rw-r--r--test/CodeGen/ARM/vdup.ll70
-rw-r--r--test/CodeGen/ARM/vector-extend-narrow.ll11
-rw-r--r--test/CodeGen/ARM/vext.ll33
-rw-r--r--test/CodeGen/ARM/vget_lane.ll2
-rw-r--r--test/CodeGen/ARM/vselect_imax.ll12
-rw-r--r--test/CodeGen/CellSPU/icmp16.ll4
-rw-r--r--test/CodeGen/Generic/MachineBranchProb.ll32
-rw-r--r--test/CodeGen/Hexagon/args.ll4
-rw-r--r--test/CodeGen/Hexagon/newvaluestore.ll2
-rw-r--r--test/CodeGen/Hexagon/remove_lsr.ll80
-rw-r--r--test/CodeGen/Hexagon/static.ll2
-rw-r--r--test/CodeGen/MSP430/fp.ll17
-rw-r--r--test/CodeGen/Mips/alloca16.ll75
-rw-r--r--test/CodeGen/Mips/atomic.ll5
-rw-r--r--test/CodeGen/Mips/atomicops.ll40
-rw-r--r--test/CodeGen/Mips/brconeq.ll38
-rw-r--r--test/CodeGen/Mips/brconeqk.ll22
-rw-r--r--test/CodeGen/Mips/brconeqz.ll20
-rw-r--r--test/CodeGen/Mips/brconge.ll37
-rw-r--r--test/CodeGen/Mips/brcongt.ll25
-rw-r--r--test/CodeGen/Mips/brconle.ll37
-rw-r--r--test/CodeGen/Mips/brconlt.ll27
-rw-r--r--test/CodeGen/Mips/brconne.ll26
-rw-r--r--test/CodeGen/Mips/brconnek.ll25
-rw-r--r--test/CodeGen/Mips/brconnez.ll24
-rw-r--r--test/CodeGen/Mips/brdelayslot.ll34
-rw-r--r--test/CodeGen/Mips/brind.ll40
-rw-r--r--test/CodeGen/Mips/check-noat.ll11
-rw-r--r--test/CodeGen/Mips/div.ll18
-rw-r--r--test/CodeGen/Mips/div_rem.ll21
-rw-r--r--test/CodeGen/Mips/divu.ll18
-rw-r--r--test/CodeGen/Mips/divu_remu.ll23
-rw-r--r--test/CodeGen/Mips/dsp-r1.ll1241
-rw-r--r--test/CodeGen/Mips/dsp-r2.ll568
-rw-r--r--test/CodeGen/Mips/eh-dwarf-cfa.ll63
-rw-r--r--test/CodeGen/Mips/helloworld.ll4
-rw-r--r--test/CodeGen/Mips/i32k.ll17
-rw-r--r--test/CodeGen/Mips/init-array.ll14
-rw-r--r--test/CodeGen/Mips/largeimm1.ll4
-rw-r--r--test/CodeGen/Mips/largeimmprinting.ll22
-rw-r--r--test/CodeGen/Mips/llcarry.ll51
-rw-r--r--test/CodeGen/Mips/longbranch.ll10
-rw-r--r--test/CodeGen/Mips/mips64-sret.ll16
-rw-r--r--test/CodeGen/Mips/misha.ll69
-rw-r--r--test/CodeGen/Mips/mul.ll17
-rw-r--r--test/CodeGen/Mips/mulll.ll21
-rw-r--r--test/CodeGen/Mips/mulull.ll21
-rw-r--r--test/CodeGen/Mips/null.ll2
-rw-r--r--test/CodeGen/Mips/o32_cc_byval.ll10
-rw-r--r--test/CodeGen/Mips/rem.ll19
-rw-r--r--test/CodeGen/Mips/remat-immed-load.ll51
-rw-r--r--test/CodeGen/Mips/remu.ll18
-rw-r--r--test/CodeGen/Mips/return-vector.ll244
-rw-r--r--test/CodeGen/Mips/selpat.ll350
-rw-r--r--test/CodeGen/Mips/seteq.ll21
-rw-r--r--test/CodeGen/Mips/seteqz.ll24
-rw-r--r--test/CodeGen/Mips/setge.ll27
-rw-r--r--test/CodeGen/Mips/setgek.ll18
-rw-r--r--test/CodeGen/Mips/setle.ll26
-rw-r--r--test/CodeGen/Mips/setlt.ll21
-rw-r--r--test/CodeGen/Mips/setltk.ll20
-rw-r--r--test/CodeGen/Mips/setne.ll20
-rw-r--r--test/CodeGen/Mips/setuge.ll26
-rw-r--r--test/CodeGen/Mips/setugt.ll21
-rw-r--r--test/CodeGen/Mips/setule.ll26
-rw-r--r--test/CodeGen/Mips/setult.ll21
-rw-r--r--test/CodeGen/Mips/setultk.ll20
-rw-r--r--test/CodeGen/Mips/small-section-reserve-gp.ll12
-rw-r--r--test/CodeGen/Mips/stchar.ll90
-rw-r--r--test/CodeGen/Mips/stldst.ll41
-rw-r--r--test/CodeGen/Mips/tailcall.ll245
-rw-r--r--test/CodeGen/Mips/tls-alias.ll2
-rw-r--r--test/CodeGen/Mips/tls.ll12
-rw-r--r--test/CodeGen/Mips/tls16.ll13
-rw-r--r--test/CodeGen/Mips/tls16_2.ll15
-rw-r--r--test/CodeGen/Mips/uitofp.ll12
-rw-r--r--test/CodeGen/Mips/ul1.ll15
-rw-r--r--test/CodeGen/Mips/vector-load-store.ll27
-rw-r--r--test/CodeGen/NVPTX/global-ordering.ll20
-rw-r--r--test/CodeGen/NVPTX/param-align.ll25
-rw-r--r--test/CodeGen/NVPTX/pr13291-i1-store.ll26
-rw-r--r--test/CodeGen/NVPTX/ptx-version-30.ll6
-rw-r--r--test/CodeGen/NVPTX/ptx-version-31.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-10.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-11.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-12.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-13.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-20.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-21.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-30.ll6
-rw-r--r--test/CodeGen/NVPTX/sm-version-35.ll6
-rw-r--r--test/CodeGen/PowerPC/2010-03-09-indirect-call.ll5
-rw-r--r--test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll27
-rw-r--r--test/CodeGen/PowerPC/2012-10-11-dynalloc.ll18
-rw-r--r--test/CodeGen/PowerPC/2012-10-12-bitcast.ll20
-rw-r--r--test/CodeGen/PowerPC/asm-Zy.ll14
-rw-r--r--test/CodeGen/PowerPC/big-endian-formal-args.ll4
-rw-r--r--test/CodeGen/PowerPC/bl8_elf_nop.ll16
-rw-r--r--test/CodeGen/PowerPC/coalesce-ext.ll3
-rw-r--r--test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll26
-rw-r--r--test/CodeGen/PowerPC/crsave.ll49
-rw-r--r--test/CodeGen/PowerPC/emptystruct.ll51
-rw-r--r--test/CodeGen/PowerPC/floatPSA.ll97
-rw-r--r--test/CodeGen/PowerPC/fsl-e500mc.ll22
-rw-r--r--test/CodeGen/PowerPC/fsl-e5500.ll22
-rw-r--r--test/CodeGen/PowerPC/i64_fp_round.ll27
-rw-r--r--test/CodeGen/PowerPC/inlineasm-copy.ll9
-rw-r--r--test/CodeGen/PowerPC/int-fp-conv-1.ll3
-rw-r--r--test/CodeGen/PowerPC/jaggedstructs.ll48
-rw-r--r--test/CodeGen/PowerPC/misched.ll45
-rw-r--r--test/CodeGen/PowerPC/novrsave.ll15
-rw-r--r--test/CodeGen/PowerPC/ppc64-abi-extend.ll97
-rw-r--r--test/CodeGen/PowerPC/ppc64-align-long-double.ll26
-rw-r--r--test/CodeGen/PowerPC/ppc64-calls.ll63
-rw-r--r--test/CodeGen/PowerPC/ppc64-ind-call.ll16
-rw-r--r--test/CodeGen/PowerPC/ppc64-linux-func-size.ll1
-rw-r--r--test/CodeGen/PowerPC/ppc64-toc.ll68
-rw-r--r--test/CodeGen/PowerPC/ppc64-zext.ll11
-rw-r--r--test/CodeGen/PowerPC/pr12757.ll14
-rw-r--r--test/CodeGen/PowerPC/pr13641.ll11
-rw-r--r--test/CodeGen/PowerPC/pr13891.ll27
-rw-r--r--test/CodeGen/PowerPC/remat-imm.ll16
-rw-r--r--test/CodeGen/PowerPC/structsinmem.ll227
-rw-r--r--test/CodeGen/PowerPC/structsinregs.ll213
-rw-r--r--test/CodeGen/PowerPC/varargs-struct-float.ll23
-rw-r--r--test/CodeGen/PowerPC/vec_cmp.ll527
-rw-r--r--test/CodeGen/PowerPC/vec_conv.ll57
-rw-r--r--test/CodeGen/PowerPC/vec_extload.ll155
-rw-r--r--test/CodeGen/PowerPC/vec_sqrt.ll71
-rw-r--r--test/CodeGen/PowerPC/vrspill.ll19
-rwxr-xr-xtest/CodeGen/SPARC/2011-01-11-CC.ll2
-rw-r--r--test/CodeGen/Thumb2/buildvector-crash.ll4
-rw-r--r--test/CodeGen/Thumb2/carry.ll13
-rw-r--r--test/CodeGen/Thumb2/cortex-fp.ll6
-rw-r--r--test/CodeGen/Thumb2/div.ll10
-rw-r--r--test/CodeGen/Thumb2/longMACt.ll44
-rw-r--r--test/CodeGen/Thumb2/thumb2-mla.ll7
-rw-r--r--test/CodeGen/Thumb2/thumb2-select_xform.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-smla.ll4
-rw-r--r--test/CodeGen/Thumb2/thumb2-uxtb.ll4
-rw-r--r--test/CodeGen/X86/2010-01-08-Atomic64Bug.ll19
-rw-r--r--test/CodeGen/X86/2012-01-18-vbitcast.ll4
-rw-r--r--test/CodeGen/X86/2012-03-15-build_vector_wl.ll2
-rw-r--r--test/CodeGen/X86/2012-04-26-sdglue.ll2
-rw-r--r--test/CodeGen/X86/2012-07-10-extload64.ll4
-rw-r--r--test/CodeGen/X86/2012-08-16-setcc.ll45
-rw-r--r--test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll20
-rw-r--r--test/CodeGen/X86/2012-09-13-dagco-fneg.ll21
-rw-r--r--test/CodeGen/X86/2012-09-28-CGPBug.ll53
-rw-r--r--test/CodeGen/X86/2012-10-02-DAGCycle.ll52
-rw-r--r--test/CodeGen/X86/2012-10-03-DAGCycle.ll31
-rw-r--r--test/CodeGen/X86/2012-10-18-crash-dagco.ll61
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll305
-rw-r--r--test/CodeGen/X86/StackColoring-dbg.ll30
-rw-r--r--test/CodeGen/X86/StackColoring.ll410
-rw-r--r--test/CodeGen/X86/add-of-carry.ll13
-rw-r--r--test/CodeGen/X86/atom-bypass-slow-division.ll112
-rw-r--r--test/CodeGen/X86/atom-shuf.ll9
-rw-r--r--test/CodeGen/X86/atomic-minmax-i6432.ll67
-rw-r--r--test/CodeGen/X86/atomic-pointer.ll22
-rw-r--r--test/CodeGen/X86/atomic16.ll250
-rw-r--r--test/CodeGen/X86/atomic32.ll250
-rw-r--r--test/CodeGen/X86/atomic64.ll216
-rw-r--r--test/CodeGen/X86/atomic6432.ll208
-rw-r--r--test/CodeGen/X86/atomic8.ll250
-rw-r--r--test/CodeGen/X86/atomic_add.ll3
-rw-r--r--test/CodeGen/X86/atomic_op.ll11
-rw-r--r--test/CodeGen/X86/avx-basic.ll4
-rw-r--r--test/CodeGen/X86/avx-intel-ocl.ll107
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll52
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll10
-rw-r--r--test/CodeGen/X86/avx-vextractf128.ll88
-rw-r--r--test/CodeGen/X86/avx2-shuffle.ll34
-rw-r--r--test/CodeGen/X86/bitcast-i256.ll11
-rw-r--r--test/CodeGen/X86/bool-simplify.ll18
-rw-r--r--test/CodeGen/X86/buildvec-insertvec.ll15
-rw-r--r--test/CodeGen/X86/cmov-fp.ll451
-rw-r--r--test/CodeGen/X86/crash.ll147
-rw-r--r--test/CodeGen/X86/cvtv2f32.ll25
-rw-r--r--test/CodeGen/X86/early-ifcvt-crash.ll32
-rw-r--r--test/CodeGen/X86/early-ifcvt.ll77
-rw-r--r--test/CodeGen/X86/extract-concat.ll17
-rw-r--r--test/CodeGen/X86/fast-cc-callee-pops.ll4
-rw-r--r--test/CodeGen/X86/fast-cc-merge-stack-adj.ll2
-rw-r--r--test/CodeGen/X86/fast-cc-pass-in-regs.ll4
-rw-r--r--test/CodeGen/X86/fast-isel-x86-64.ll21
-rw-r--r--test/CodeGen/X86/fma.ll16
-rwxr-xr-xtest/CodeGen/X86/fma3-intrinsics.ll4
-rw-r--r--test/CodeGen/X86/fma4-intrinsics-x86_64.ll1
-rw-r--r--test/CodeGen/X86/fma_patterns.ll103
-rw-r--r--test/CodeGen/X86/fold-load.ll4
-rw-r--r--test/CodeGen/X86/fp-fast.ll57
-rw-r--r--test/CodeGen/X86/fp-load-trunc.ll61
-rw-r--r--test/CodeGen/X86/fp-trunc.ll53
-rw-r--r--test/CodeGen/X86/handle-move.ll74
-rw-r--r--test/CodeGen/X86/inline-asm-tied.ll9
-rw-r--r--test/CodeGen/X86/inline-asm.ll7
-rw-r--r--test/CodeGen/X86/inlineasm-sched-bug.ll13
-rw-r--r--test/CodeGen/X86/jump_sign.ll56
-rw-r--r--test/CodeGen/X86/misched-balance.ll230
-rw-r--r--test/CodeGen/X86/misched-ilp.ll25
-rw-r--r--test/CodeGen/X86/misched-new.ll28
-rw-r--r--test/CodeGen/X86/mmx-builtins.ll14
-rw-r--r--test/CodeGen/X86/ms-inline-asm.ll63
-rw-r--r--test/CodeGen/X86/mulx32.ll22
-rw-r--r--test/CodeGen/X86/mulx64.ll22
-rw-r--r--test/CodeGen/X86/phys_subreg_coalesce-3.ll6
-rw-r--r--test/CodeGen/X86/pic_jumptable.ll12
-rw-r--r--test/CodeGen/X86/pmovext.ll22
-rw-r--r--test/CodeGen/X86/pointer-vector.ll5
-rw-r--r--test/CodeGen/X86/pr11334.ll8
-rw-r--r--test/CodeGen/X86/pr11985.ll19
-rw-r--r--test/CodeGen/X86/pr12312.ll155
-rw-r--r--test/CodeGen/X86/pr12359.ll10
-rw-r--r--test/CodeGen/X86/pr13458.ll14
-rw-r--r--test/CodeGen/X86/pr13859.ll28
-rw-r--r--test/CodeGen/X86/pr13899.ll58
-rw-r--r--test/CodeGen/X86/pr14088.ll25
-rw-r--r--test/CodeGen/X86/pr14090.ll76
-rw-r--r--test/CodeGen/X86/pr14098.ll23
-rw-r--r--test/CodeGen/X86/pr14161.ll38
-rw-r--r--test/CodeGen/X86/pr14204.ll15
-rw-r--r--test/CodeGen/X86/pr14314.ll13
-rw-r--r--test/CodeGen/X86/pr14333.ll12
-rw-r--r--test/CodeGen/X86/pr5145.ll35
-rw-r--r--test/CodeGen/X86/promote.ll2
-rw-r--r--test/CodeGen/X86/ptr-rotate.ll2
-rw-r--r--test/CodeGen/X86/red-zone2.ll7
-rw-r--r--test/CodeGen/X86/rot32.ll29
-rw-r--r--test/CodeGen/X86/rot64.ll31
-rw-r--r--test/CodeGen/X86/rotate2.ll2
-rw-r--r--test/CodeGen/X86/rtm.ll30
-rw-r--r--test/CodeGen/X86/select.ll13
-rw-r--r--test/CodeGen/X86/select_const.ll16
-rw-r--r--test/CodeGen/X86/shift-bmi2.ll178
-rw-r--r--test/CodeGen/X86/sincos.ll13
-rw-r--r--test/CodeGen/X86/sjlj.ll60
-rw-r--r--test/CodeGen/X86/smul-with-overflow.ll14
-rw-r--r--test/CodeGen/X86/sse-intel-ocl.ll93
-rw-r--r--test/CodeGen/X86/sse-minmax.ll80
-rw-r--r--test/CodeGen/X86/sse_partial_update.ll36
-rw-r--r--test/CodeGen/X86/tailcall-64.ll67
-rw-r--r--test/CodeGen/X86/targetLoweringGeneric.ll2
-rw-r--r--test/CodeGen/X86/tls-pic.ll12
-rw-r--r--test/CodeGen/X86/trunc-ext-ld-st.ll15
-rw-r--r--test/CodeGen/X86/vec_compare-2.ll3
-rw-r--r--test/CodeGen/X86/vec_fabs.ll38
-rw-r--r--test/CodeGen/X86/vec_floor.ll38
-rw-r--r--test/CodeGen/X86/vec_fpext.ll32
-rw-r--r--test/CodeGen/X86/vec_shuffle-26.ll45
-rw-r--r--test/CodeGen/X86/vec_shuffle-30.ll14
-rw-r--r--test/CodeGen/X86/widen_cast-1.ll2
-rw-r--r--test/CodeGen/X86/widen_load-1.ll13
-rw-r--r--test/CodeGen/X86/widen_load-2.ll2
-rw-r--r--test/CodeGen/X86/xmulo.ll50
-rwxr-xr-xtest/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64bin0 -> 7468 bytes
-rw-r--r--test/DebugInfo/X86/2010-04-13-PubType.ll (renamed from test/DebugInfo/2010-04-13-PubType.ll)6
-rw-r--r--test/DebugInfo/X86/DW_AT_byte_size.ll3
-rw-r--r--test/DebugInfo/X86/DW_AT_object_pointer.ll79
-rw-r--r--test/DebugInfo/X86/concrete_out_of_line.ll7
-rw-r--r--test/DebugInfo/X86/elf-names.ll109
-rw-r--r--test/DebugInfo/X86/enum-fwd-decl.ll16
-rw-r--r--test/DebugInfo/X86/linkage-name.ll56
-rw-r--r--test/DebugInfo/X86/pr13303.ll28
-rw-r--r--test/DebugInfo/X86/prologue-stack.ll35
-rw-r--r--test/DebugInfo/X86/stringpool.ll4
-rw-r--r--test/DebugInfo/bug_null_debuginfo.ll3
-rw-r--r--test/DebugInfo/dwarfdump-inlining.test28
-rw-r--r--test/DebugInfo/dwarfdump-test.test10
-rw-r--r--test/ExecutionEngine/2002-12-16-ArgTest.ll1
-rw-r--r--test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/fpbitcast.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/hello.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/hello2.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/lit.local.cfg10
-rw-r--r--test/ExecutionEngine/MCJIT/pr13727.ll88
-rw-r--r--test/ExecutionEngine/MCJIT/simplesttest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/simpletest.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/stubs.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-arith.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-branch.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-call.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-cast.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll32
-rw-r--r--test/ExecutionEngine/MCJIT/test-common-symbols.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-constantexpr.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-data-align.ll15
-rw-r--r--test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-fp.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-global.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-loadstore.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-local.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-logical.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-loop.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-phi.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-ptr-reloc.ll16
-rw-r--r--test/ExecutionEngine/MCJIT/test-ret.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-return.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-setcond-fp.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-setcond-int.ll2
-rw-r--r--test/ExecutionEngine/MCJIT/test-shift.ll2
-rw-r--r--test/ExecutionEngine/lit.local.cfg11
-rw-r--r--test/ExecutionEngine/test-fp-no-external-funcs.ll1
-rw-r--r--test/ExecutionEngine/test-fp.ll1
-rw-r--r--test/Feature/linker_private_linkages.ll1
-rw-r--r--test/Feature/minsize_attr.ll7
-rw-r--r--test/Instrumentation/AddressSanitizer/basic.ll20
-rw-r--r--test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll19
-rw-r--r--test/Instrumentation/AddressSanitizer/instrument_global.ll2
-rw-r--r--test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll36
-rw-r--r--test/Instrumentation/ThreadSanitizer/atomic.ll1544
-rw-r--r--test/MC/ARM/arm-arithmetic-aliases.s4
-rw-r--r--test/MC/ARM/arm-shift-encoding.s119
-rw-r--r--test/MC/ARM/basic-thumb-instructions.s4
-rw-r--r--test/MC/ARM/diagnostics.s50
-rw-r--r--test/MC/ARM/elf-jump24-fixup.s9
-rw-r--r--test/MC/ARM/thumb-shift-encoding.s45
-rw-r--r--test/MC/ARM/thumb2-b.w-encodingT4.s12
-rw-r--r--test/MC/AsmParser/bad-macro.s9
-rw-r--r--test/MC/AsmParser/directive_lcomm.s11
-rw-r--r--test/MC/AsmParser/labels.s2
-rw-r--r--test/MC/AsmParser/macro-args.s12
-rw-r--r--test/MC/AsmParser/macro-rept-err1.s2
-rw-r--r--test/MC/AsmParser/macros-darwin.s9
-rw-r--r--test/MC/AsmParser/macros.s64
-rw-r--r--test/MC/COFF/comm.ll13
-rw-r--r--test/MC/COFF/global_ctors_dtors.ll (renamed from test/MC/COFF/global_ctors.ll)23
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt4
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt4
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt4
-rw-r--r--test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt4
-rw-r--r--test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt4
-rw-r--r--test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt4
-rw-r--r--test/MC/Disassembler/ARM/marked-up-thumb.txt7
-rw-r--r--test/MC/Disassembler/ARM/neont-VLD-reencoding.txt77
-rw-r--r--test/MC/Disassembler/ARM/neont-VST-reencoding.txt77
-rw-r--r--test/MC/Disassembler/ARM/thumb-printf.txt6
-rw-r--r--test/MC/Disassembler/ARM/thumb-tests.txt2
-rw-r--r--test/MC/Disassembler/ARM/thumb1.txt2
-rw-r--r--test/MC/Disassembler/ARM/thumb2.txt3
-rw-r--r--test/MC/Disassembler/Mips/mips64.txt12
-rw-r--r--test/MC/Disassembler/Mips/mips64_le.txt12
-rw-r--r--test/MC/Disassembler/Mips/mips64r2.txt12
-rw-r--r--test/MC/Disassembler/Mips/mips64r2_le.txt12
-rw-r--r--test/MC/Disassembler/X86/marked-up.txt6
-rw-r--r--test/MC/ELF/cfi-reg.s18
-rw-r--r--test/MC/ELF/lcomm.s21
-rw-r--r--test/MC/MachO/ARM/long-call-branch-island-relocation.s43
-rw-r--r--test/MC/MachO/absolute.s158
-rw-r--r--test/MC/MachO/gen-dwarf-cpp.s22
-rw-r--r--test/MC/MachO/gen-dwarf-macro-cpp.s17
-rw-r--r--test/MC/MachO/i386-large-relocations.s36
-rw-r--r--test/MC/MachO/lit.local.cfg2
-rw-r--r--test/MC/MachO/x86-data-in-code.ll108
-rw-r--r--test/MC/Markup/basic-markup.mc16
-rw-r--r--test/MC/Markup/lit.local.cfg2
-rw-r--r--test/MC/Mips/do_switch.ll39
-rw-r--r--test/MC/Mips/elf-N64.ll2
-rw-r--r--test/MC/Mips/higher_highest.ll7
-rw-r--r--test/MC/Mips/mips-alu-instructions.s100
-rw-r--r--test/MC/Mips/mips-coprocessor-encodings.s37
-rw-r--r--test/MC/Mips/mips-expansions.s27
-rw-r--r--test/MC/Mips/mips-fpu-instructions.s178
-rw-r--r--test/MC/Mips/mips-jump-instructions.s72
-rw-r--r--test/MC/Mips/mips-memory-instructions.s45
-rw-r--r--test/MC/Mips/mips-register-names.s71
-rw-r--r--test/MC/Mips/mips-relocations.s41
-rw-r--r--test/MC/Mips/mips64-register-names.s70
-rw-r--r--test/MC/Mips/mips64extins.ll57
-rw-r--r--test/MC/Mips/mips64shift.ll5
-rw-r--r--test/MC/Mips/mips_directives.s16
-rw-r--r--test/MC/Mips/multi-64bit-func.ll4
-rw-r--r--test/MC/Mips/sext_64_32.ll4
-rw-r--r--test/MC/PowerPC/lit.local.cfg5
-rw-r--r--test/MC/PowerPC/ppc64-initial-cfa.ll41
-rw-r--r--test/MC/PowerPC/ppc64-relocs-01.ll66
-rw-r--r--test/MC/PowerPC/ppc64-tls-relocs-01.ll28
-rw-r--r--test/MC/X86/intel-syntax-2.s6
-rw-r--r--test/MC/X86/x86-32-ms-inline-asm.s60
-rw-r--r--test/MC/X86/x86-64.s4
-rw-r--r--test/MC/X86/x86_64-rtm-encoding.s13
-rw-r--r--test/MC/X86/x86_nop.s13
-rw-r--r--test/Makefile18
-rw-r--r--test/Object/Inputs/dext-test.elf-mips64r2bin0 -> 802 bytes
-rw-r--r--test/Object/Inputs/relocations.elf-x86-64bin0 -> 1032 bytes
-rw-r--r--test/Object/Mips/feature.test11
-rw-r--r--test/Object/Mips/lit.local.cfg5
-rw-r--r--test/Object/nm-shared-object.test26
-rw-r--r--test/Object/objdump-relocations.test13
-rw-r--r--test/Object/objdump-symbol-table.test8
-rw-r--r--test/Other/FileCheck-space.txt9
-rw-r--r--test/Other/Inputs/llvm-cov.gcdabin0 -> 296 bytes
-rw-r--r--test/Other/Inputs/llvm-cov.gcnobin0 -> 984 bytes
-rw-r--r--test/Other/ResponseFile.ll9
-rw-r--r--test/Other/extract-alias.ll49
-rw-r--r--test/Other/extract-weak-odr.ll23
-rw-r--r--test/Other/extract.ll9
-rw-r--r--test/Other/link-opts.ll13
-rw-r--r--test/Other/lint.ll25
-rw-r--r--test/Other/lit.local.cfg2
-rw-r--r--test/Other/llvm-cov.test3
-rw-r--r--test/Other/llvm-nm-without-aliases.ll25
-rw-r--r--test/Other/spir_cc.ll13
-rw-r--r--test/TableGen/if.td46
-rw-r--r--test/TableGen/list-element-bitref.td15
-rw-r--r--test/TableGen/pr8330.td29
-rw-r--r--test/Transforms/BBVectorize/X86/cmp-types.ll16
-rw-r--r--test/Transforms/BBVectorize/X86/loop1.ll53
-rw-r--r--test/Transforms/BBVectorize/X86/sh-rec.ll54
-rw-r--r--test/Transforms/BBVectorize/X86/sh-rec2.ll85
-rw-r--r--test/Transforms/BBVectorize/X86/sh-rec3.ll170
-rw-r--r--test/Transforms/BBVectorize/X86/sh-types.ll25
-rw-r--r--test/Transforms/BBVectorize/X86/simple-ldstr.ll29
-rw-r--r--test/Transforms/BBVectorize/X86/simple.ll103
-rw-r--r--test/Transforms/BBVectorize/X86/vs-cast.ll12
-rw-r--r--test/Transforms/BBVectorize/cycle.ll2
-rw-r--r--test/Transforms/BBVectorize/lit.local.cfg5
-rw-r--r--test/Transforms/BBVectorize/loop1.ll2
-rw-r--r--test/Transforms/BBVectorize/search-limit.ll2
-rw-r--r--test/Transforms/BBVectorize/simple-int.ll6
-rw-r--r--test/Transforms/BBVectorize/simple-ldstr-ptrs.ll53
-rw-r--r--test/Transforms/BBVectorize/simple-ldstr.ll64
-rw-r--r--test/Transforms/BBVectorize/simple-sel.ll4
-rw-r--r--test/Transforms/BBVectorize/simple.ll66
-rw-r--r--test/Transforms/ConstProp/loads.ll132
-rw-r--r--test/Transforms/CorrelatedValuePropagation/crash.ll25
-rw-r--r--test/Transforms/DeadArgElim/dbginfo.ll64
-rw-r--r--test/Transforms/DeadStoreElimination/libcalls.ll70
-rw-r--r--test/Transforms/DeadStoreElimination/simple.ll14
-rw-r--r--test/Transforms/EarlyCSE/commute.ll66
-rw-r--r--test/Transforms/GVN/crash.ll36
-rw-r--r--test/Transforms/GVN/malloc-load-removal.ll31
-rw-r--r--test/Transforms/GVN/pr14166.ll27
-rw-r--r--test/Transforms/GVN/rle.ll8
-rw-r--r--test/Transforms/GlobalOpt/blockaddress.ll20
-rw-r--r--test/Transforms/GlobalOpt/load-store-global.ll25
-rw-r--r--test/Transforms/GlobalOpt/tls.ll53
-rw-r--r--test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll4
-rw-r--r--test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll27
-rw-r--r--test/Transforms/IndVarSimplify/crash.ll44
-rw-r--r--test/Transforms/IndVarSimplify/eliminate-comparison.ll103
-rw-r--r--test/Transforms/IndVarSimplify/no-iv-rewrite.ll1
-rw-r--r--test/Transforms/IndVarSimplify/verify-scev.ll421
-rw-r--r--test/Transforms/Inline/recursive.ll38
-rw-r--r--test/Transforms/InstCombine/2012-07-25-LoadPart.ll10
-rw-r--r--test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll57
-rw-r--r--test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll24
-rw-r--r--test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll19
-rw-r--r--test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll51
-rw-r--r--test/Transforms/InstCombine/align-addr.ll16
-rw-r--r--test/Transforms/InstCombine/alloca.ll16
-rw-r--r--test/Transforms/InstCombine/and-fcmp.ll2
-rw-r--r--test/Transforms/InstCombine/cast.ll206
-rw-r--r--test/Transforms/InstCombine/disable-simplify-libcalls.ll236
-rw-r--r--test/Transforms/InstCombine/div-shift.ll14
-rw-r--r--test/Transforms/InstCombine/fcmp.ll93
-rw-r--r--test/Transforms/InstCombine/fold-vector-select.ll153
-rw-r--r--test/Transforms/InstCombine/icmp.ll18
-rw-r--r--test/Transforms/InstCombine/memcmp-1.ll72
-rw-r--r--test/Transforms/InstCombine/memcmp-2.ll17
-rw-r--r--test/Transforms/InstCombine/memcpy-1.ll17
-rw-r--r--test/Transforms/InstCombine/memcpy-2.ll17
-rw-r--r--test/Transforms/InstCombine/memcpy-from-global.ll (renamed from test/Transforms/ScalarRepl/memcpy-from-global.ll)30
-rw-r--r--test/Transforms/InstCombine/memcpy_chk-1.ll60
-rw-r--r--test/Transforms/InstCombine/memcpy_chk-2.ll24
-rw-r--r--test/Transforms/InstCombine/memmove-1.ll17
-rw-r--r--test/Transforms/InstCombine/memmove-2.ll17
-rw-r--r--test/Transforms/InstCombine/memmove_chk-1.ll60
-rw-r--r--test/Transforms/InstCombine/memmove_chk-2.ll24
-rw-r--r--test/Transforms/InstCombine/memset-1.ll17
-rw-r--r--test/Transforms/InstCombine/memset-2.ll17
-rw-r--r--test/Transforms/InstCombine/memset_chk-1.ll61
-rw-r--r--test/Transforms/InstCombine/memset_chk-2.ll20
-rw-r--r--test/Transforms/InstCombine/memset_chk.ll18
-rw-r--r--test/Transforms/InstCombine/obfuscated_splat.ll11
-rw-r--r--test/Transforms/InstCombine/objsize.ll3
-rw-r--r--test/Transforms/InstCombine/select.ll34
-rw-r--r--test/Transforms/InstCombine/stpcpy-1.ll46
-rw-r--r--test/Transforms/InstCombine/stpcpy-2.ll22
-rw-r--r--test/Transforms/InstCombine/stpcpy_chk-1.ll96
-rw-r--r--test/Transforms/InstCombine/stpcpy_chk-2.ll21
-rw-r--r--test/Transforms/InstCombine/strcat-1.ll38
-rw-r--r--test/Transforms/InstCombine/strcat-2.ll32
-rw-r--r--test/Transforms/InstCombine/strcat-3.ll22
-rw-r--r--test/Transforms/InstCombine/strchr-1.ll54
-rw-r--r--test/Transforms/InstCombine/strchr-2.ll21
-rw-r--r--test/Transforms/InstCombine/strcmp-1.ll82
-rw-r--r--test/Transforms/InstCombine/strcmp-2.ll20
-rw-r--r--test/Transforms/InstCombine/strcpy-1.ll45
-rw-r--r--test/Transforms/InstCombine/strcpy-2.ll22
-rw-r--r--test/Transforms/InstCombine/strcpy_chk-1.ll94
-rw-r--r--test/Transforms/InstCombine/strcpy_chk-2.ll21
-rw-r--r--test/Transforms/InstCombine/strcpy_chk.ll13
-rw-r--r--test/Transforms/InstCombine/strcspn-1.ll57
-rw-r--r--test/Transforms/InstCombine/strcspn-2.ll21
-rw-r--r--test/Transforms/InstCombine/strlen-1.ll97
-rw-r--r--test/Transforms/InstCombine/strlen-2.ll18
-rw-r--r--test/Transforms/InstCombine/strncat-1.ll37
-rw-r--r--test/Transforms/InstCombine/strncat-2.ll53
-rw-r--r--test/Transforms/InstCombine/strncat-3.ll22
-rw-r--r--test/Transforms/InstCombine/strncmp-1.ll99
-rw-r--r--test/Transforms/InstCombine/strncmp-2.ll20
-rw-r--r--test/Transforms/InstCombine/strncpy-1.ll95
-rw-r--r--test/Transforms/InstCombine/strncpy-2.ll22
-rw-r--r--test/Transforms/InstCombine/strncpy_chk-1.ll66
-rw-r--r--test/Transforms/InstCombine/strncpy_chk-2.ll21
-rw-r--r--test/Transforms/InstCombine/strpbrk-1.ll68
-rw-r--r--test/Transforms/InstCombine/strpbrk-2.ll23
-rw-r--r--test/Transforms/InstCombine/strrchr-1.ll54
-rw-r--r--test/Transforms/InstCombine/strrchr-2.ll21
-rw-r--r--test/Transforms/InstCombine/strspn-1.ll56
-rw-r--r--test/Transforms/InstCombine/strstr-1.ll65
-rw-r--r--test/Transforms/InstCombine/strstr-2.ll18
-rw-r--r--test/Transforms/InstCombine/strto-1.ll82
-rw-r--r--test/Transforms/InstCombine/struct-assign-tbaa.ll44
-rw-r--r--test/Transforms/InstCombine/udiv-simplify-bug-1.ll4
-rw-r--r--test/Transforms/InstCombine/vec_demanded_elts.ll2
-rw-r--r--test/Transforms/InstCombine/vec_shuffle.ll43
-rw-r--r--test/Transforms/InstCombine/vector_gep2.ll11
-rw-r--r--test/Transforms/InstCombine/weak-symbols.ll33
-rw-r--r--test/Transforms/InstSimplify/compare.ll9
-rw-r--r--test/Transforms/Internalize/2008-05-09-AllButMain.ll58
-rw-r--r--test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll2
-rw-r--r--test/Transforms/JumpThreading/crash.ll53
-rw-r--r--test/Transforms/JumpThreading/select.ll36
-rw-r--r--test/Transforms/LICM/2003-12-11-SinkingToPHI.ll2
-rw-r--r--test/Transforms/LICM/hoisting.ll28
-rw-r--r--test/Transforms/LoopIdiom/basic.ll33
-rw-r--r--test/Transforms/LoopIdiom/crash.ll25
-rw-r--r--test/Transforms/LoopIdiom/non-canonical-loop.ll34
-rw-r--r--test/Transforms/LoopIdiom/scev-invalidation.ll74
-rw-r--r--test/Transforms/LoopRotate/multiple-exits.ll236
-rw-r--r--test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll44
-rw-r--r--test/Transforms/LoopUnroll/pr11361.ll2
-rw-r--r--test/Transforms/LoopUnroll/pr14167.ll44
-rw-r--r--test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll2
-rw-r--r--test/Transforms/LoopVectorize/2012-10-20-infloop.ll27
-rw-r--r--test/Transforms/LoopVectorize/2012-10-22-isconsec.ll57
-rw-r--r--test/Transforms/LoopVectorize/X86/avx1.ll49
-rw-r--r--test/Transforms/LoopVectorize/X86/conversion-cost.ll48
-rw-r--r--test/Transforms/LoopVectorize/X86/cost-model.ll38
-rw-r--r--test/Transforms/LoopVectorize/X86/gcc-examples.ll62
-rw-r--r--test/Transforms/LoopVectorize/X86/lit.local.cfg6
-rw-r--r--test/Transforms/LoopVectorize/cpp-new-array.ll46
-rw-r--r--test/Transforms/LoopVectorize/flags.ll53
-rw-r--r--test/Transforms/LoopVectorize/gcc-examples.ll650
-rw-r--r--test/Transforms/LoopVectorize/increment.ll66
-rw-r--r--test/Transforms/LoopVectorize/induction_plus.ll30
-rw-r--r--test/Transforms/LoopVectorize/lit.local.cfg1
-rw-r--r--test/Transforms/LoopVectorize/non-const-n.ll38
-rw-r--r--test/Transforms/LoopVectorize/read-only.ll32
-rw-r--r--test/Transforms/LoopVectorize/reduction.ll232
-rw-r--r--test/Transforms/LoopVectorize/runtime-check.ll36
-rw-r--r--test/Transforms/LoopVectorize/scalar-select.ll37
-rw-r--r--test/Transforms/LoopVectorize/small-loop.ll33
-rw-r--r--test/Transforms/LoopVectorize/start-non-zero.ll35
-rw-r--r--test/Transforms/LoopVectorize/write-only.ll26
-rw-r--r--test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll4
-rw-r--r--test/Transforms/MemCpyOpt/align.ll24
-rw-r--r--test/Transforms/MemCpyOpt/form-memset.ll24
-rw-r--r--test/Transforms/MetaRenamer/lit.local.cfg1
-rw-r--r--test/Transforms/MetaRenamer/metarenamer.ll96
-rw-r--r--test/Transforms/ObjCARC/basic.ll4
-rw-r--r--test/Transforms/ObjCARC/nested.ll85
-rw-r--r--test/Transforms/ObjCARC/path-overflow.ll329
-rw-r--r--test/Transforms/PhaseOrdering/gdce.ll106
-rw-r--r--test/Transforms/Reassociate/crash.ll28
-rw-r--r--test/Transforms/SCCP/loadtest.ll5
-rw-r--r--test/Transforms/SROA/alignment.ll171
-rw-r--r--test/Transforms/SROA/basictest.ll1136
-rw-r--r--test/Transforms/SROA/big-endian.ll119
-rw-r--r--test/Transforms/SROA/fca.ll49
-rw-r--r--test/Transforms/SROA/lit.local.cfg1
-rw-r--r--test/Transforms/SROA/phi-and-select.ll427
-rw-r--r--test/Transforms/SROA/vector-promotion.ll267
-rw-r--r--test/Transforms/SimplifyCFG/SPARC/lit.local.cfg6
-rw-r--r--test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll32
-rw-r--r--test/Transforms/SimplifyCFG/X86/lit.local.cfg6
-rw-r--r--test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll779
-rw-r--r--test/Transforms/SimplifyCFG/phi-undef-loadstore.ll28
-rw-r--r--test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll37
-rw-r--r--test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll140
-rw-r--r--test/Transforms/SimplifyCFG/preserve-branchweights.ll230
-rw-r--r--test/Transforms/SimplifyCFG/sink-common-code.ll53
-rw-r--r--test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll14
-rw-r--r--test/Transforms/SimplifyLibCalls/FFS.ll15
-rw-r--r--test/Transforms/SimplifyLibCalls/StpCpy.ll43
-rw-r--r--test/Transforms/SimplifyLibCalls/StrCat.ll33
-rw-r--r--test/Transforms/SimplifyLibCalls/StrChr.ll26
-rw-r--r--test/Transforms/SimplifyLibCalls/StrCmp.ll65
-rw-r--r--test/Transforms/SimplifyLibCalls/StrCpy.ll37
-rw-r--r--test/Transforms/SimplifyLibCalls/StrLen.ll62
-rw-r--r--test/Transforms/SimplifyLibCalls/StrNCat.ll31
-rw-r--r--test/Transforms/SimplifyLibCalls/StrNCmp.ll78
-rw-r--r--test/Transforms/SimplifyLibCalls/StrNCpy.ll29
-rw-r--r--test/Transforms/SimplifyLibCalls/StrPBrk.ll25
-rw-r--r--test/Transforms/SimplifyLibCalls/StrRChr.ll23
-rw-r--r--test/Transforms/SimplifyLibCalls/StrSpn.ll41
-rw-r--r--test/Transforms/SimplifyLibCalls/StrStr.ll60
-rw-r--r--test/Transforms/SimplifyLibCalls/double-float-shrink.ll333
-rw-r--r--test/Transforms/SimplifyLibCalls/float-shrink-compare.ll179
-rw-r--r--test/Transforms/SimplifyLibCalls/floor.ll23
-rw-r--r--test/Transforms/SimplifyLibCalls/memcmp.ll35
-rw-r--r--test/Transforms/SimplifyLibCalls/memmove.ll12
-rw-r--r--test/Transforms/SimplifyLibCalls/memset-64.ll12
-rw-r--r--test/Transforms/SimplifyLibCalls/memset.ll12
-rw-r--r--test/Transforms/SimplifyLibCalls/weak-symbols.ll26
-rw-r--r--test/Verifier/invoke.ll1
-rw-r--r--test/lit.cfg24
-rw-r--r--test/lit.site.cfg.in1
-rw-r--r--tools/CMakeLists.txt1
-rw-r--r--tools/LLVMBuild.txt2
-rw-r--r--tools/Makefile2
-rw-r--r--tools/bugpoint/ExtractFunction.cpp2
-rw-r--r--tools/bugpoint/OptimizerDriver.cpp2
-rw-r--r--tools/gold/Makefile4
-rw-r--r--tools/gold/gold-plugin.cpp10
-rw-r--r--tools/llc/llc.cpp219
-rw-r--r--tools/lli/CMakeLists.txt6
-rw-r--r--tools/lli/LLVMBuild.txt2
-rw-r--r--tools/lli/Makefile2
-rw-r--r--tools/lli/RecordingMemoryManager.cpp87
-rw-r--r--tools/lli/RecordingMemoryManager.h78
-rw-r--r--tools/lli/RemoteTarget.cpp61
-rw-r--r--tools/lli/RemoteTarget.h101
-rw-r--r--tools/lli/lli.cpp258
-rw-r--r--tools/llvm-ar/CMakeLists.txt1
-rw-r--r--tools/llvm-ar/Makefile1
-rw-r--r--tools/llvm-ar/llvm-ar.cpp203
-rw-r--r--tools/llvm-as/CMakeLists.txt1
-rw-r--r--tools/llvm-bcanalyzer/CMakeLists.txt1
-rw-r--r--tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp30
-rw-r--r--tools/llvm-config/Makefile2
-rw-r--r--tools/llvm-dis/CMakeLists.txt1
-rw-r--r--tools/llvm-dwarfdump/llvm-dwarfdump.cpp106
-rw-r--r--tools/llvm-extract/llvm-extract.cpp51
-rw-r--r--tools/llvm-mc/llvm-mc.cpp15
-rw-r--r--tools/llvm-mcmarkup/CMakeLists.txt5
-rw-r--r--tools/llvm-mcmarkup/LLVMBuild.txt22
-rw-r--r--tools/llvm-mcmarkup/Makefile17
-rw-r--r--tools/llvm-mcmarkup/llvm-mcmarkup.cpp225
-rw-r--r--tools/llvm-nm/llvm-nm.cpp9
-rw-r--r--tools/llvm-objdump/llvm-objdump.cpp17
-rw-r--r--tools/llvm-ranlib/CMakeLists.txt1
-rw-r--r--tools/llvm-ranlib/Makefile1
-rw-r--r--tools/llvm-ranlib/llvm-ranlib.cpp59
-rw-r--r--tools/llvm-rtdyld/llvm-rtdyld.cpp8
-rw-r--r--tools/llvm-stress/llvm-stress.cpp4
-rw-r--r--tools/lto/LTOCodeGenerator.cpp38
-rw-r--r--tools/lto/LTOModule.cpp37
-rw-r--r--tools/lto/Makefile7
-rw-r--r--tools/lto/lto.exports1
-rw-r--r--tools/opt/CMakeLists.txt2
-rw-r--r--tools/opt/LLVMBuild.txt2
-rw-r--r--tools/opt/Makefile2
-rw-r--r--tools/opt/opt.cpp98
-rw-r--r--unittests/ADT/APFloatTest.cpp59
-rw-r--r--unittests/ADT/BitVectorTest.cpp52
-rw-r--r--unittests/ADT/CMakeLists.txt16
-rw-r--r--unittests/ADT/DenseMapTest.cpp33
-rw-r--r--unittests/ADT/DenseSetTest.cpp2
-rw-r--r--unittests/ADT/ImmutableMapTest.cpp50
-rw-r--r--unittests/ADT/StringRefTest.cpp23
-rw-r--r--unittests/ADT/TripleTest.cpp12
-rw-r--r--unittests/Analysis/ScalarEvolutionTest.cpp16
-rw-r--r--unittests/ExecutionEngine/CMakeLists.txt1
-rw-r--r--unittests/ExecutionEngine/JIT/CMakeLists.txt2
-rw-r--r--unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp7
-rw-r--r--unittests/ExecutionEngine/JIT/JITTest.cpp37
-rw-r--r--unittests/ExecutionEngine/JIT/Makefile7
-rw-r--r--unittests/ExecutionEngine/JIT/MultiJITTest.cpp6
-rw-r--r--unittests/ExecutionEngine/MCJIT/CMakeLists.txt25
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITTest.cpp231
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITTestBase.h245
-rw-r--r--unittests/ExecutionEngine/MCJIT/MCJITTests.def1
-rw-r--r--unittests/ExecutionEngine/MCJIT/Makefile18
-rw-r--r--unittests/ExecutionEngine/MCJIT/SectionMemoryManager.cpp143
-rw-r--r--unittests/ExecutionEngine/MCJIT/SectionMemoryManager.h118
-rw-r--r--unittests/ExecutionEngine/Makefile2
-rw-r--r--unittests/Support/AlignOfTest.cpp129
-rw-r--r--unittests/Support/CMakeLists.txt5
-rw-r--r--unittests/Support/Casting.cpp59
-rw-r--r--unittests/Support/DataExtractorTest.cpp9
-rw-r--r--unittests/Support/MemoryBufferTest.cpp99
-rw-r--r--unittests/Support/MemoryTest.cpp356
-rw-r--r--unittests/Support/formatted_raw_ostream_test.cpp33
-rw-r--r--unittests/Transforms/Utils/CMakeLists.txt1
-rw-r--r--unittests/Transforms/Utils/IntegerDivision.cpp142
-rw-r--r--unittests/VMCore/IRBuilderTest.cpp12
-rw-r--r--unittests/VMCore/InstructionsTest.cpp41
-rw-r--r--unittests/VMCore/PassManagerTest.cpp24
-rw-r--r--utils/FileCheck/FileCheck.cpp6
-rw-r--r--utils/TableGen/AsmMatcherEmitter.cpp558
-rw-r--r--utils/TableGen/AsmWriterEmitter.cpp73
-rw-r--r--utils/TableGen/AsmWriterInst.cpp23
-rw-r--r--utils/TableGen/CMakeLists.txt3
-rw-r--r--utils/TableGen/CallingConvEmitter.cpp11
-rw-r--r--utils/TableGen/CodeEmitterGen.cpp13
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.cpp666
-rw-r--r--utils/TableGen/CodeGenDAGPatterns.h63
-rw-r--r--utils/TableGen/CodeGenInstruction.cpp148
-rw-r--r--utils/TableGen/CodeGenInstruction.h23
-rw-r--r--utils/TableGen/CodeGenMapTable.cpp606
-rw-r--r--utils/TableGen/CodeGenRegisters.cpp112
-rw-r--r--utils/TableGen/CodeGenRegisters.h14
-rw-r--r--utils/TableGen/CodeGenSchedule.cpp1664
-rw-r--r--utils/TableGen/CodeGenSchedule.h368
-rw-r--r--utils/TableGen/CodeGenTarget.cpp59
-rw-r--r--utils/TableGen/CodeGenTarget.h8
-rw-r--r--utils/TableGen/DAGISelMatcher.h2
-rw-r--r--utils/TableGen/DAGISelMatcherEmitter.cpp11
-rw-r--r--utils/TableGen/DAGISelMatcherGen.cpp29
-rw-r--r--utils/TableGen/DFAPacketizerEmitter.cpp168
-rw-r--r--utils/TableGen/DisassemblerEmitter.cpp6
-rw-r--r--utils/TableGen/EDEmitter.cpp44
-rw-r--r--utils/TableGen/FastISelEmitter.cpp13
-rw-r--r--utils/TableGen/FixedLenDecoderEmitter.cpp19
-rw-r--r--utils/TableGen/InstrInfoEmitter.cpp17
-rw-r--r--utils/TableGen/IntrinsicEmitter.cpp79
-rw-r--r--utils/TableGen/Makefile2
-rw-r--r--utils/TableGen/PseudoLoweringEmitter.cpp34
-rw-r--r--utils/TableGen/RegisterInfoEmitter.cpp166
-rw-r--r--utils/TableGen/SequenceToOffsetTable.h6
-rw-r--r--utils/TableGen/SetTheory.cpp140
-rw-r--r--utils/TableGen/SetTheory.h10
-rw-r--r--utils/TableGen/SubtargetEmitter.cpp688
-rw-r--r--utils/TableGen/TGValueTypes.cpp26
-rw-r--r--utils/TableGen/TableGen.cpp159
-rw-r--r--utils/TableGen/TableGenBackends.h1
-rw-r--r--utils/TableGen/X86DisassemblerTables.cpp21
-rw-r--r--utils/TableGen/X86ModRMFilters.h25
-rw-r--r--utils/TableGen/X86RecognizableInstr.cpp35
-rw-r--r--utils/TableGen/X86RecognizableInstr.h23
-rw-r--r--utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll3
-rw-r--r--utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg2
-rw-r--r--utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg2
-rw-r--r--utils/lit/lit/ExampleTests/lit.cfg2
-rw-r--r--utils/lit/lit/ExampleTests/vg-fail.c4
-rw-r--r--utils/lit/lit/ExampleTests/xfail-feature.c4
-rw-r--r--utils/lit/lit/LitConfig.py3
-rw-r--r--utils/lit/lit/TestRunner.py34
-rw-r--r--utils/lit/lit/TestingConfig.py10
-rw-r--r--utils/lit/lit/Util.py2
-rw-r--r--utils/lldbDataFormatters.py24
-rw-r--r--utils/llvm-lit/llvm-lit.in13
-rw-r--r--utils/llvm.grm1
-rw-r--r--utils/unittest/googletest/gtest-port.cc4
-rw-r--r--utils/unittest/googletest/include/gtest/internal/gtest-port.h2
-rw-r--r--utils/vim/llvm.vim3
-rw-r--r--utils/yaml2obj/yaml2obj.cpp2
1993 files changed, 112399 insertions, 29776 deletions
diff --git a/.gitignore b/.gitignore
index ecf2e3e422b8..24628309dca4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@
*.pyc
# vim swap files
.*.swp
+.sw?
#==============================================================================#
# Explicit files to ignore (only matches one).
@@ -27,6 +28,7 @@ cscope.files
cscope.out
autoconf/aclocal.m4
autoconf/autom4te.cache
+compile_commands.json
#==============================================================================#
# Directories to ignore (do not add trailing '/'s, they skip symlinks).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fbf8e2bb90e5..d3edc0219858 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -115,6 +115,11 @@ if(LLVM_ENABLE_TIMESTAMPS)
set(ENABLE_TIMESTAMPS 1)
endif()
+option(LLVM_ENABLE_BACKTRACES "Enable embedding backtraces on crash." ON)
+if(LLVM_ENABLE_BACKTRACES)
+ set(ENABLE_BACKTRACES 1)
+endif()
+
option(LLVM_ENABLE_FFI "Use libffi to call external functions from the interpreter" OFF)
set(FFI_LIBRARY_DIR "" CACHE PATH "Additional directory, where CMake should search for libffi.so")
set(FFI_INCLUDE_DIR "" CACHE PATH "Additional directory, where CMake should search for ffi.h or ffi/ffi.h")
@@ -172,23 +177,7 @@ option(LLVM_USE_INTEL_JITEVENTS
if( LLVM_USE_INTEL_JITEVENTS )
# Verify we are on a supported platform
- if( CMAKE_SYSTEM_NAME MATCHES "Windows" OR CMAKE_SYSTEM_NAME MATCHES "Linux" )
- # Directory where Intel Parallel Amplifier XE 2011 is installed.
- if ( WIN32 )
- set(LLVM_INTEL_JITEVENTS_DIR $ENV{VTUNE_AMPLIFIER_XE_2011_DIR})
- else ( WIN32 )
- set(LLVM_INTEL_JITEVENTS_DIR "/opt/intel/vtune_amplifier_xe_2011")
- endif ( WIN32 )
-
- # Set include and library search paths for Intel JIT Events API
- set(LLVM_INTEL_JITEVENTS_INCDIR "${LLVM_INTEL_JITEVENTS_DIR}/include")
-
- if ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
- set(LLVM_INTEL_JITEVENTS_LIBDIR "${LLVM_INTEL_JITEVENTS_DIR}/lib64")
- else ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
- set(LLVM_INTEL_JITEVENTS_LIBDIR "${LLVM_INTEL_JITEVENTS_DIR}/lib32")
- endif ( CMAKE_SIZEOF_VOID_P EQUAL 8 )
- else()
+ if( NOT CMAKE_SYSTEM_NAME MATCHES "Windows" AND NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
message(FATAL_ERROR
"Intel JIT API support is available on Linux and Windows only.")
endif()
@@ -249,6 +238,14 @@ option(LLVM_INCLUDE_TESTS "Generate build targets for the LLVM unit tests." ON)
# BEFORE this include, otherwise options will not be correctly set on
# first cmake run
include(config-ix)
+
+# By default, we target the host, but this can be overridden at CMake
+# invocation time.
+set(LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_HOST_TRIPLE}" CACHE STRING
+ "Default target for which LLVM will generate code." )
+set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}" CACHE STRING
+ "Default target for which LLVM will generate code." )
+
include(HandleLLVMOptions)
# Verify that we can find a Python interpreter,
diff --git a/CREDITS.TXT b/CREDITS.TXT
index f090ad734c47..02579182589a 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -5,8 +5,8 @@ done!
The list is sorted by surname and formatted to allow easy grepping and
beautification by scripts. The fields are: name (N), email (E), web-address
-(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
-(S).
+(W), PGP key ID and fingerprint (P), description (D), snail-mail address
+(S), and IRC handle (I).
N: Vikram Adve
@@ -17,7 +17,7 @@ D: The Sparc64 backend, provider of much wisdom, and motivator for LLVM
N: Owen Anderson
E: resistor@mac.com
D: LCSSA pass and related LoopUnswitch work
-D: GVNPRE pass, TargetData refactoring, random improvements
+D: GVNPRE pass, DataLayout refactoring, random improvements
N: Henrik Bach
D: MingW Win32 API portability layer
@@ -328,10 +328,6 @@ D: LTO tool, PassManager rewrite, Loop Pass Manager, Loop Rotate
D: GCC PCH Integration (llvm-gcc), llvm-gcc improvements
D: Optimizer improvements, Loop Index Split
-N: Sandeep Patel
-E: deeppatel1987@gmail.com
-D: ARM calling conventions rewrite, hard float support
-
N: Wesley Peck
E: peckw@wesleypeck.com
W: http://wesleypeck.com/
@@ -354,6 +350,11 @@ N: Xerxes Ranby
E: xerxes@zafena.se
D: Cmake dependency chain and various bug fixes
+N: Alex Rosenberg
+E: alexr@leftfield.org
+I: arosenberg
+D: ARM calling conventions rewrite, hard float support
+
N: Chad Rosier
E: mcrosier@apple.com
D: ARM fast-isel improvements
@@ -369,6 +370,7 @@ D: MSIL backend
N: Duncan Sands
E: baldrick@free.fr
+I: baldrick
D: Ada support in llvm-gcc
D: Dragonegg plugin
D: Exception handling improvements
diff --git a/Makefile b/Makefile
index 604696a1df43..1e5dae470d26 100644
--- a/Makefile
+++ b/Makefile
@@ -68,7 +68,8 @@ endif
ifeq ($(MAKECMDGOALS),install-clang)
DIRS := tools/clang/tools/driver tools/clang/lib/Headers \
- tools/clang/tools/libclang tools/clang/tools/c-index-test \
+ tools/clang/tools/libclang \
+ tools/clang/tools/c-index-test \
tools/clang/include/clang-c \
tools/clang/runtime tools/clang/docs \
tools/lto runtime
@@ -111,15 +112,18 @@ cross-compile-build-tools:
cd BuildTools ; \
unset CFLAGS ; \
unset CXXFLAGS ; \
+ unset SDKROOT ; \
+ unset UNIVERSAL_SDK_PATH ; \
$(PROJ_SRC_DIR)/configure --build=$(BUILD_TRIPLE) \
--host=$(BUILD_TRIPLE) --target=$(BUILD_TRIPLE) \
--disable-polly ; \
cd .. ; \
fi; \
- (unset SDKROOT; \
- $(MAKE) -C BuildTools \
+ ($(MAKE) -C BuildTools \
BUILD_DIRS_ONLY=1 \
UNIVERSAL= \
+ UNIVERSAL_SDK_PATH= \
+ SDKROOT= \
TARGET_NATIVE_ARCH="$(TARGET_NATIVE_ARCH)" \
TARGETS_TO_BUILD="$(TARGETS_TO_BUILD)" \
ENABLE_OPTIMIZED=$(ENABLE_OPTIMIZED) \
diff --git a/Makefile.config.in b/Makefile.config.in
index e3bd2a207a50..b4ecea631e3c 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -61,6 +61,7 @@ endif
prefix := @prefix@
PROJ_prefix := $(prefix)
+program_prefix := @program_prefix@
PROJ_VERSION := $(LLVMVersion)
else
ifndef PROJ_SRC_ROOT
diff --git a/Makefile.rules b/Makefile.rules
index 289adc2be429..b2b02c25d44b 100644
--- a/Makefile.rules
+++ b/Makefile.rules
@@ -571,7 +571,11 @@ endif
#--------------------------------------------------------------------
ifeq ($(HOST_OS),Darwin)
+ ifdef MACOSX_DEPLOYMENT_TARGET
+ DARWIN_VERSION := $(MACOSX_DEPLOYMENT_TARGET)
+ else
DARWIN_VERSION := `sw_vers -productVersion`
+ endif
# Strip a number like 10.4.7 to 10.4
DARWIN_VERSION := $(shell echo $(DARWIN_VERSION)| sed -E 's/(10.[0-9]).*/\1/')
# Get "4" out of 10.4 for later pieces in the makefile.
@@ -631,19 +635,23 @@ endif
# Adjust linker flags for building an executable
ifneq ($(HOST_OS), $(filter $(HOST_OS), Cygwin MingW))
-ifneq ($(HOST_OS), Darwin)
-ifdef TOOLNAME
- LD.Flags += $(RPATH) -Wl,'$$ORIGIN/../lib'
- ifdef EXAMPLE_TOOL
- LD.Flags += $(RPATH) -Wl,$(ExmplDir) $(DynamicFlag)
- else
- LD.Flags += $(RPATH) -Wl,$(ToolDir) $(DynamicFlag)
+ ifneq ($(HOST_OS), Darwin)
+ ifdef TOOLNAME
+ LD.Flags += $(RPATH) -Wl,'$$ORIGIN/../lib'
+ ifdef EXAMPLE_TOOL
+ LD.Flags += $(RPATH) -Wl,$(ExmplDir) $(DynamicFlag)
+ else
+ LD.Flags += $(RPATH) -Wl,$(ToolDir) $(DynamicFlag)
+ endif
endif
-endif
else
-ifneq ($(DARWIN_MAJVERS),4)
- LD.Flags += $(RPATH) -Wl,@executable_path/../lib
-endif
+ ifneq ($(DARWIN_MAJVERS),4)
+ LD.Flags += $(RPATH) -Wl,@executable_path/../lib
+ endif
+ ifeq ($(RC_BUILDIT),YES)
+ TempFile := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/llvm-lto.XXXXXX)
+ LD.Flags += -Wl,-object_path_lto -Wl,$(TempFile)
+ endif
endif
endif
@@ -1524,7 +1532,7 @@ ifneq ($(strip $(ToolAliasBuildPath)),)
$(ToolAliasBuildPath): $(ToolBuildPath)
$(Echo) Creating $(BuildMode) Alias $(TOOLALIAS) $(StripWarnMsg)
$(Verb) $(RM) -f $(ToolAliasBuildPath)
- $(Verb) $(AliasTool) $(TOOLEXENAME) $(ToolAliasBuildPath)
+ $(Verb) $(AliasTool) $(notdir $(ToolBuildPath)) $(ToolAliasBuildPath)
$(Echo) ======= Finished Creating $(BuildMode) Alias $(TOOLALIAS) \
$(StripWarnMsg)
endif
@@ -1541,7 +1549,7 @@ ToolBinDir = $(DESTDIR)$(PROJ_internal_prefix)/bin
else
ToolBinDir = $(DESTDIR)$(PROJ_bindir)
endif
-DestTool = $(ToolBinDir)/$(TOOLEXENAME)
+DestTool = $(ToolBinDir)/$(program_prefix)$(TOOLEXENAME)
install-local:: $(DestTool)
@@ -1556,14 +1564,14 @@ uninstall-local::
# TOOLALIAS install.
ifdef TOOLALIAS
-DestToolAlias = $(ToolBinDir)/$(TOOLALIAS)$(EXEEXT)
+DestToolAlias = $(ToolBinDir)/$(program_prefix)$(TOOLALIAS)$(EXEEXT)
install-local:: $(DestToolAlias)
$(DestToolAlias): $(DestTool)
$(Echo) Installing $(BuildMode) $(DestToolAlias)
$(Verb) $(RM) -f $(DestToolAlias)
- $(Verb) $(AliasTool) $(TOOLEXENAME) $(DestToolAlias)
+ $(Verb) $(AliasTool) $(notdir $(DestTool)) $(DestToolAlias)
uninstall-local::
$(Echo) Uninstalling $(BuildMode) $(DestToolAlias)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 7fa883e9cc50..7715531a338d 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -363,8 +363,8 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
sparc*-*) llvm_cv_target_arch="Sparc" ;;
powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
arm*-*) llvm_cv_target_arch="ARM" ;;
- mips-*) llvm_cv_target_arch="Mips" ;;
- mipsel-*) llvm_cv_target_arch="Mips" ;;
+ mips-* | mips64-*) llvm_cv_target_arch="Mips" ;;
+ mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;;
xcore-*) llvm_cv_target_arch="XCore" ;;
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
@@ -396,8 +396,8 @@ case $host in
sparc*-*) host_arch="Sparc" ;;
powerpc*-*) host_arch="PowerPC" ;;
arm*-*) host_arch="ARM" ;;
- mips-*) host_arch="Mips" ;;
- mipsel-*) host_arch="Mips" ;;
+ mips-* | mips64-*) host_arch="Mips" ;;
+ mipsel-* | mips64el-*) host_arch="Mips" ;;
xcore-*) host_arch="XCore" ;;
msp430-*) host_arch="MSP430" ;;
hexagon-*) host_arch="Hexagon" ;;
@@ -678,6 +678,21 @@ esac
AC_DEFINE_UNQUOTED([ENABLE_TIMESTAMPS],$ENABLE_TIMESTAMPS,
[Define if timestamp information (e.g., __DATE__) is allowed])
+dnl Enable embedding backtraces on crash.
+
+AC_ARG_ENABLE(backtraces,
+ AS_HELP_STRING([--enable-backtraces],
+ [Enable embedding backtraces on crash (default is YES)]),,
+ enableval=default)
+case "$enableval" in
+ yes) AC_SUBST(ENABLE_BACKTRACES,[1]) ;;
+ no) AC_SUBST(ENABLE_BACKTRACES,[0]) ;;
+ default) AC_SUBST(ENABLE_BACKTRACES,[1]) ;;
+ *) AC_MSG_ERROR([Invalid setting for --enable-backtraces. Use "yes" or "no"]) ;;
+esac
+AC_DEFINE_UNQUOTED([ENABLE_BACKTRACES],$ENABLE_BACKTRACES,
+ [Define if you want backtraces on crash])
+
dnl Allow specific targets to be specified for building (or not)
TARGETS_TO_BUILD=""
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
@@ -699,6 +714,8 @@ case "$enableval" in
arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;;
mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
+ mips64) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
+ mips64el) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
spu) TARGETS_TO_BUILD="CellSPU $TARGETS_TO_BUILD" ;;
xcore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;;
msp430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
@@ -1275,46 +1292,23 @@ AC_DEFINE_UNQUOTED([LLVM_USE_OPROFILE],$USE_OPROFILE,
dnl Enable support for Intel JIT Events API.
AC_ARG_WITH(intel-jitevents,
- AS_HELP_STRING([--with-intel-jitevents=<vtune-amplifier-dir>],
- [Specify location of run-time support library for Intel JIT API (default=/opt/intel/vtune_amplifier_xe_2011)]),
+ AS_HELP_STRING([--with-intel-jitevents], [Notify Intel JIT profiling API of generated code]),
[
+ case "$withval" in
+ yes) AC_SUBST(USE_INTEL_JITEVENTS,[1]);;
+ no) AC_SUBST(USE_INTEL_JITEVENTS,[0]);;
+ *) AC_MSG_ERROR([Invalid setting for --with-intel-jitevents. Use "yes" or "no"]);;
+ esac
+
case $llvm_cv_os_type in
Linux|Win32|Cygwin|MingW) ;;
- *)
- AC_MSG_ERROR([
- Intel JIT API support is available on Linux and Windows only."]) ;;
+ *) AC_MSG_ERROR([Intel JIT API support is available on Linux and Windows only.]);;
esac
- AC_SUBST(USE_INTEL_JITEVENTS, [1])
case "$llvm_cv_target_arch" in
- x86) llvm_intel_jitevents_archdir="lib32";;
- x86_64) llvm_intel_jitevents_archdir="lib64";;
- *) echo "Target architecture $llvm_cv_target_arch does not support Intel JIT Events API"
- exit -1;;
- esac
- INTEL_JITEVENTS_INCDIR="/opt/intel/vtune_amplifier_xe_2011/include"
- INTEL_JITEVENTS_LIBDIR="/opt/intel/vtune_amplifier_xe_2011/$llvm_intel_jitevents_archdir"
- case "$withval" in
- /* | [[A-Za-z]]:[[\\/]]*) INTEL_JITEVENTS_INCDIR=$withval/include
- INTEL_JITEVENTS_LIBDIR=$withval/$llvm_intel_jitevents_archdir ;;
- *) ;;
+ x86|x86_64) ;;
+ *) AC_MSG_ERROR([Target architecture $llvm_cv_target_arch does not support Intel JIT Events API.]);;
esac
-
- AC_SUBST(INTEL_JITEVENTS_INCDIR)
- AC_SUBST(INTEL_JITEVENTS_LIBDIR)
-
- LIBS="$LIBS -L${INTEL_JITEVENTS_LIBDIR}"
- CPPFLAGS="$CPPFLAGS -I$INTEL_JITEVENTS_INCDIR"
-
- AC_SEARCH_LIBS(iJIT_IsProfilingActive, jitprofiling, [], [
- echo "Error! Cannot find libjitprofiling.a. Please check path specified in flag --with-intel-jitevents"
- exit -1
- ])
- AC_CHECK_HEADER([jitprofiling.h], [], [
- echo "Error! Cannot find jitprofiling.h. Please check path specified in flag --with-intel-jitevents"
- exit -1
- ])
-
],
[
AC_SUBST(USE_INTEL_JITEVENTS, [0])
@@ -1717,6 +1711,11 @@ fi
dnl OCaml findlib META file
AC_CONFIG_FILES([bindings/ocaml/llvm/META.llvm])
+dnl Add --program-prefix value to Makefile.rules. Already an ARG variable.
+test "x$program_prefix" = "xNONE" && program_prefix=""
+AC_SUBST([program_prefix])
+
+
dnl Do special configuration of Makefiles
AC_CONFIG_COMMANDS([setup],,[llvm_src="${srcdir}"])
AC_CONFIG_MAKEFILE(Makefile)
diff --git a/bindings/ocaml/executionengine/executionengine_ocaml.c b/bindings/ocaml/executionengine/executionengine_ocaml.c
index 5b1e32efefcd..02e030605720 100644
--- a/bindings/ocaml/executionengine/executionengine_ocaml.c
+++ b/bindings/ocaml/executionengine/executionengine_ocaml.c
@@ -75,6 +75,9 @@ static struct custom_operations generic_value_ops = {
custom_hash_default,
custom_serialize_default,
custom_deserialize_default
+#ifdef custom_compare_ext_default
+ , custom_compare_ext_default
+#endif
};
static value alloc_generic_value(LLVMGenericValueRef Ref) {
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.ml b/bindings/ocaml/executionengine/llvm_executionengine.ml
index a8535b246404..ddb53bbb5afd 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.ml
+++ b/bindings/ocaml/executionengine/llvm_executionengine.ml
@@ -83,7 +83,7 @@ module ExecutionEngine = struct
external free_machine_code: Llvm.llvalue -> t -> unit
= "llvm_ee_free_machine_code"
- external target_data: t -> Llvm_target.TargetData.t
+ external target_data: t -> Llvm_target.DataLayout.t
= "LLVMGetExecutionEngineTargetData"
(* The following are not bound. Patches are welcome.
diff --git a/bindings/ocaml/executionengine/llvm_executionengine.mli b/bindings/ocaml/executionengine/llvm_executionengine.mli
index 166b7bcddca6..0b06078bad86 100644
--- a/bindings/ocaml/executionengine/llvm_executionengine.mli
+++ b/bindings/ocaml/executionengine/llvm_executionengine.mli
@@ -155,7 +155,7 @@ module ExecutionEngine: sig
(** [target_data ee] is the target data owned by the execution engine
[ee]. *)
- val target_data : t -> Llvm_target.TargetData.t
+ val target_data : t -> Llvm_target.DataLayout.t
end
diff --git a/bindings/ocaml/llvm/llvm.mli b/bindings/ocaml/llvm/llvm.mli
index 96448ccd960d..eb6c88355afb 100644
--- a/bindings/ocaml/llvm/llvm.mli
+++ b/bindings/ocaml/llvm/llvm.mli
@@ -375,7 +375,7 @@ val module_context : llmodule -> llcontext
val classify_type : lltype -> TypeKind.t
(** [type_is_sized ty] returns whether the type has a size or not.
- * If it doesn't then it is not safe to call the [TargetData::] methods on it.
+ * If it doesn't then it is not safe to call the [DataLayout::] methods on it.
* *)
val type_is_sized : lltype -> bool
diff --git a/bindings/ocaml/llvm/llvm_ocaml.c b/bindings/ocaml/llvm/llvm_ocaml.c
index a5985d9d2b04..c984bd154cc2 100644
--- a/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/bindings/ocaml/llvm/llvm_ocaml.c
@@ -1277,6 +1277,9 @@ static struct custom_operations builder_ops = {
custom_hash_default,
custom_serialize_default,
custom_deserialize_default
+#ifdef custom_compare_ext_default
+ , custom_compare_ext_default
+#endif
};
static value alloc_builder(LLVMBuilderRef B) {
diff --git a/bindings/ocaml/target/llvm_target.ml b/bindings/ocaml/target/llvm_target.ml
index 49940eec4800..f4891e2b83bd 100644
--- a/bindings/ocaml/target/llvm_target.ml
+++ b/bindings/ocaml/target/llvm_target.ml
@@ -13,7 +13,7 @@ module Endian = struct
| Little
end
-module TargetData = struct
+module DataLayout = struct
type t
external create : string -> t = "llvm_targetdata_create"
@@ -23,20 +23,20 @@ module TargetData = struct
external dispose : t -> unit = "llvm_targetdata_dispose"
end
-external byte_order : TargetData.t -> Endian.t = "llvm_byte_order"
-external pointer_size : TargetData.t -> int = "llvm_pointer_size"
-external intptr_type : TargetData.t -> Llvm.lltype = "LLVMIntPtrType"
-external size_in_bits : TargetData.t -> Llvm.lltype -> Int64.t
+external byte_order : DataLayout.t -> Endian.t = "llvm_byte_order"
+external pointer_size : DataLayout.t -> int = "llvm_pointer_size"
+external intptr_type : DataLayout.t -> Llvm.lltype = "LLVMIntPtrType"
+external size_in_bits : DataLayout.t -> Llvm.lltype -> Int64.t
= "llvm_size_in_bits"
-external store_size : TargetData.t -> Llvm.lltype -> Int64.t = "llvm_store_size"
-external abi_size : TargetData.t -> Llvm.lltype -> Int64.t = "llvm_abi_size"
-external abi_align : TargetData.t -> Llvm.lltype -> int = "llvm_abi_align"
-external stack_align : TargetData.t -> Llvm.lltype -> int = "llvm_stack_align"
-external preferred_align : TargetData.t -> Llvm.lltype -> int
+external store_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_store_size"
+external abi_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_abi_size"
+external abi_align : DataLayout.t -> Llvm.lltype -> int = "llvm_abi_align"
+external stack_align : DataLayout.t -> Llvm.lltype -> int = "llvm_stack_align"
+external preferred_align : DataLayout.t -> Llvm.lltype -> int
= "llvm_preferred_align"
-external preferred_align_of_global : TargetData.t -> Llvm.llvalue -> int
+external preferred_align_of_global : DataLayout.t -> Llvm.llvalue -> int
= "llvm_preferred_align_of_global"
-external element_at_offset : TargetData.t -> Llvm.lltype -> Int64.t -> int
+external element_at_offset : DataLayout.t -> Llvm.lltype -> Int64.t -> int
= "llvm_element_at_offset"
-external offset_of_element : TargetData.t -> Llvm.lltype -> int -> Int64.t
+external offset_of_element : DataLayout.t -> Llvm.lltype -> int -> Int64.t
= "llvm_offset_of_element"
diff --git a/bindings/ocaml/target/llvm_target.mli b/bindings/ocaml/target/llvm_target.mli
index c288b9ac2d9c..ab9c5e49eab8 100644
--- a/bindings/ocaml/target/llvm_target.mli
+++ b/bindings/ocaml/target/llvm_target.mli
@@ -18,11 +18,11 @@ module Endian : sig
| Little
end
-module TargetData : sig
+module DataLayout : sig
type t
- (** [TargetData.create rep] parses the target data string representation [rep].
- See the constructor llvm::TargetData::TargetData. *)
+ (** [DataLayout.create rep] parses the target data string representation [rep].
+ See the constructor llvm::DataLayout::DataLayout. *)
external create : string -> t = "llvm_targetdata_create"
(** [add_target_data td pm] adds the target data [td] to the pass manager [pm].
@@ -32,64 +32,64 @@ module TargetData : sig
= "llvm_targetdata_add"
(** [as_string td] is the string representation of the target data [td].
- See the constructor llvm::TargetData::TargetData. *)
+ See the constructor llvm::DataLayout::DataLayout. *)
external as_string : t -> string = "llvm_targetdata_as_string"
- (** Deallocates a TargetData.
- See the destructor llvm::TargetData::~TargetData. *)
+ (** Deallocates a DataLayout.
+ See the destructor llvm::DataLayout::~DataLayout. *)
external dispose : t -> unit = "llvm_targetdata_dispose"
end
(** Returns the byte order of a target, either LLVMBigEndian or
LLVMLittleEndian.
- See the method llvm::TargetData::isLittleEndian. *)
-external byte_order : TargetData.t -> Endian.t = "llvm_byte_order"
+ See the method llvm::DataLayout::isLittleEndian. *)
+external byte_order : DataLayout.t -> Endian.t = "llvm_byte_order"
(** Returns the pointer size in bytes for a target.
- See the method llvm::TargetData::getPointerSize. *)
-external pointer_size : TargetData.t -> int = "llvm_pointer_size"
+ See the method llvm::DataLayout::getPointerSize. *)
+external pointer_size : DataLayout.t -> int = "llvm_pointer_size"
(** Returns the integer type that is the same size as a pointer on a target.
- See the method llvm::TargetData::getIntPtrType. *)
-external intptr_type : TargetData.t -> Llvm.lltype = "LLVMIntPtrType"
+ See the method llvm::DataLayout::getIntPtrType. *)
+external intptr_type : DataLayout.t -> Llvm.lltype = "LLVMIntPtrType"
(** Computes the size of a type in bytes for a target.
- See the method llvm::TargetData::getTypeSizeInBits. *)
-external size_in_bits : TargetData.t -> Llvm.lltype -> Int64.t
+ See the method llvm::DataLayout::getTypeSizeInBits. *)
+external size_in_bits : DataLayout.t -> Llvm.lltype -> Int64.t
= "llvm_size_in_bits"
(** Computes the storage size of a type in bytes for a target.
- See the method llvm::TargetData::getTypeStoreSize. *)
-external store_size : TargetData.t -> Llvm.lltype -> Int64.t = "llvm_store_size"
+ See the method llvm::DataLayout::getTypeStoreSize. *)
+external store_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_store_size"
(** Computes the ABI size of a type in bytes for a target.
- See the method llvm::TargetData::getTypeAllocSize. *)
-external abi_size : TargetData.t -> Llvm.lltype -> Int64.t = "llvm_abi_size"
+ See the method llvm::DataLayout::getTypeAllocSize. *)
+external abi_size : DataLayout.t -> Llvm.lltype -> Int64.t = "llvm_abi_size"
(** Computes the ABI alignment of a type in bytes for a target.
- See the method llvm::TargetData::getTypeABISize. *)
-external abi_align : TargetData.t -> Llvm.lltype -> int = "llvm_abi_align"
+ See the method llvm::DataLayout::getTypeABISize. *)
+external abi_align : DataLayout.t -> Llvm.lltype -> int = "llvm_abi_align"
(** Computes the call frame alignment of a type in bytes for a target.
- See the method llvm::TargetData::getTypeABISize. *)
-external stack_align : TargetData.t -> Llvm.lltype -> int = "llvm_stack_align"
+ See the method llvm::DataLayout::getTypeABISize. *)
+external stack_align : DataLayout.t -> Llvm.lltype -> int = "llvm_stack_align"
(** Computes the preferred alignment of a type in bytes for a target.
- See the method llvm::TargetData::getTypeABISize. *)
-external preferred_align : TargetData.t -> Llvm.lltype -> int
+ See the method llvm::DataLayout::getTypeABISize. *)
+external preferred_align : DataLayout.t -> Llvm.lltype -> int
= "llvm_preferred_align"
(** Computes the preferred alignment of a global variable in bytes for a target.
- See the method llvm::TargetData::getPreferredAlignment. *)
-external preferred_align_of_global : TargetData.t -> Llvm.llvalue -> int
+ See the method llvm::DataLayout::getPreferredAlignment. *)
+external preferred_align_of_global : DataLayout.t -> Llvm.llvalue -> int
= "llvm_preferred_align_of_global"
(** Computes the structure element that contains the byte offset for a target.
See the method llvm::StructLayout::getElementContainingOffset. *)
-external element_at_offset : TargetData.t -> Llvm.lltype -> Int64.t -> int
+external element_at_offset : DataLayout.t -> Llvm.lltype -> Int64.t -> int
= "llvm_element_at_offset"
(** Computes the byte offset of the indexed struct element for a target.
See the method llvm::StructLayout::getElementContainingOffset. *)
-external offset_of_element : TargetData.t -> Llvm.lltype -> int -> Int64.t
+external offset_of_element : DataLayout.t -> Llvm.lltype -> int -> Int64.t
= "llvm_offset_of_element"
diff --git a/bindings/ocaml/target/target_ocaml.c b/bindings/ocaml/target/target_ocaml.c
index ca01e7786b68..62fe789a52e2 100644
--- a/bindings/ocaml/target/target_ocaml.c
+++ b/bindings/ocaml/target/target_ocaml.c
@@ -18,18 +18,18 @@
#include "llvm-c/Target.h"
#include "caml/alloc.h"
-/* string -> TargetData.t */
+/* string -> DataLayout.t */
CAMLprim LLVMTargetDataRef llvm_targetdata_create(value StringRep) {
return LLVMCreateTargetData(String_val(StringRep));
}
-/* TargetData.t -> [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
+/* DataLayout.t -> [<Llvm.PassManager.any] Llvm.PassManager.t -> unit */
CAMLprim value llvm_targetdata_add(LLVMTargetDataRef TD, LLVMPassManagerRef PM){
LLVMAddTargetData(TD, PM);
return Val_unit;
}
-/* TargetData.t -> string */
+/* DataLayout.t -> string */
CAMLprim value llvm_targetdata_as_string(LLVMTargetDataRef TD) {
char *StringRep = LLVMCopyStringRepOfTargetData(TD);
value Copy = copy_string(StringRep);
@@ -37,65 +37,65 @@ CAMLprim value llvm_targetdata_as_string(LLVMTargetDataRef TD) {
return Copy;
}
-/* TargetData.t -> unit */
+/* DataLayout.t -> unit */
CAMLprim value llvm_targetdata_dispose(LLVMTargetDataRef TD) {
LLVMDisposeTargetData(TD);
return Val_unit;
}
-/* TargetData.t -> Endian.t */
+/* DataLayout.t -> Endian.t */
CAMLprim value llvm_byte_order(LLVMTargetDataRef TD) {
return Val_int(LLVMByteOrder(TD));
}
-/* TargetData.t -> int */
+/* DataLayout.t -> int */
CAMLprim value llvm_pointer_size(LLVMTargetDataRef TD) {
return Val_int(LLVMPointerSize(TD));
}
-/* TargetData.t -> Llvm.lltype -> Int64.t */
+/* DataLayout.t -> Llvm.lltype -> Int64.t */
CAMLprim value llvm_size_in_bits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return caml_copy_int64(LLVMSizeOfTypeInBits(TD, Ty));
}
-/* TargetData.t -> Llvm.lltype -> Int64.t */
+/* DataLayout.t -> Llvm.lltype -> Int64.t */
CAMLprim value llvm_store_size(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return caml_copy_int64(LLVMStoreSizeOfType(TD, Ty));
}
-/* TargetData.t -> Llvm.lltype -> Int64.t */
+/* DataLayout.t -> Llvm.lltype -> Int64.t */
CAMLprim value llvm_abi_size(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return caml_copy_int64(LLVMABISizeOfType(TD, Ty));
}
-/* TargetData.t -> Llvm.lltype -> int */
+/* DataLayout.t -> Llvm.lltype -> int */
CAMLprim value llvm_abi_align(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return Val_int(LLVMABIAlignmentOfType(TD, Ty));
}
-/* TargetData.t -> Llvm.lltype -> int */
+/* DataLayout.t -> Llvm.lltype -> int */
CAMLprim value llvm_stack_align(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return Val_int(LLVMCallFrameAlignmentOfType(TD, Ty));
}
-/* TargetData.t -> Llvm.lltype -> int */
+/* DataLayout.t -> Llvm.lltype -> int */
CAMLprim value llvm_preferred_align(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return Val_int(LLVMPreferredAlignmentOfType(TD, Ty));
}
-/* TargetData.t -> Llvm.llvalue -> int */
+/* DataLayout.t -> Llvm.llvalue -> int */
CAMLprim value llvm_preferred_align_of_global(LLVMTargetDataRef TD,
LLVMValueRef GlobalVar) {
return Val_int(LLVMPreferredAlignmentOfGlobal(TD, GlobalVar));
}
-/* TargetData.t -> Llvm.lltype -> Int64.t -> int */
+/* DataLayout.t -> Llvm.lltype -> Int64.t -> int */
CAMLprim value llvm_element_at_offset(LLVMTargetDataRef TD, LLVMTypeRef Ty,
value Offset) {
return Val_int(LLVMElementAtOffset(TD, Ty, Int_val(Offset)));
}
-/* TargetData.t -> Llvm.lltype -> int -> Int64.t */
+/* DataLayout.t -> Llvm.lltype -> int -> Int64.t */
CAMLprim value llvm_offset_of_element(LLVMTargetDataRef TD, LLVMTypeRef Ty,
value Index) {
return caml_copy_int64(LLVMOffsetOfElement(TD, Ty, Int_val(Index)));
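All of these stubs are thin forwarders onto the LLVM-C target-data API, so the OCaml-level rename from TargetData to DataLayout changes no runtime behaviour. For orientation, a minimal sketch of driving the same C API directly (not part of the patch; the data-layout string is an arbitrary example):

    // Sketch only: the LLVM-C calls wrapped by the stubs above.
    #include "llvm-c/Target.h"
    #include <cstdio>

    int main() {
      LLVMTargetDataRef TD =
          LLVMCreateTargetData("e-p:64:64:64-i32:32:32-i64:64:64"); // made-up layout
      std::printf("pointer size: %u bytes\n", LLVMPointerSize(TD));
      LLVMDisposeTargetData(TD);
      return 0;
    }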
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index 25d6211ac6f2..fcd5dd556676 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -85,13 +85,25 @@ check_include_file(mach-o/dyld.h HAVE_MACH_O_DYLD_H)
# library checks
if( NOT PURE_WINDOWS )
check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
- check_library_exists(pthread pthread_getspecific "" HAVE_PTHREAD_GETSPECIFIC)
- check_library_exists(pthread pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
+ if (HAVE_LIBPTHREAD)
+ check_library_exists(pthread pthread_getspecific "" HAVE_PTHREAD_GETSPECIFIC)
+ check_library_exists(pthread pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
+ check_library_exists(pthread pthread_mutex_lock "" HAVE_PTHREAD_MUTEX_LOCK)
+ else()
+ # this could be Android
+ check_library_exists(c pthread_create "" PTHREAD_IN_LIBC)
+ if (PTHREAD_IN_LIBC)
+ check_library_exists(c pthread_getspecific "" HAVE_PTHREAD_GETSPECIFIC)
+ check_library_exists(c pthread_rwlock_init "" HAVE_PTHREAD_RWLOCK_INIT)
+ check_library_exists(c pthread_mutex_lock "" HAVE_PTHREAD_MUTEX_LOCK)
+ endif()
+ endif()
check_library_exists(dl dlopen "" HAVE_LIBDL)
endif()
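The fallback branch added above reflects Android's Bionic libc, where the pthread functions live in libc itself rather than in a separate libpthread. Roughly speaking, each check_library_exists call compiles and links a tiny probe program against the named library; a sketch of its shape, assuming the probed symbol is pthread_create (illustrative only, not CMake's literal generated source):

    // Sketch only: approximately what the probe links (against -lpthread or
    // -lc) to set HAVE_LIBPTHREAD / PTHREAD_IN_LIBC.  The deliberately untyped
    // declaration mirrors how such checks are written; the program only needs
    // to link, it is never meant to be a meaningful executable.
    extern "C" char pthread_create();

    int main() {
      return pthread_create() ? 0 : 1;
    }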
# function checks
check_symbol_exists(arc4random "stdlib.h" HAVE_ARC4RANDOM)
+check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE)
check_symbol_exists(getrusage sys/resource.h HAVE_GETRUSAGE)
check_symbol_exists(setrlimit sys/resource.h HAVE_SETRLIMIT)
@@ -134,9 +146,6 @@ check_symbol_exists(strchr string.h HAVE_STRCHR)
check_symbol_exists(strcmp string.h HAVE_STRCMP)
check_symbol_exists(strdup string.h HAVE_STRDUP)
check_symbol_exists(strrchr string.h HAVE_STRRCHR)
-if( NOT PURE_WINDOWS )
- check_symbol_exists(pthread_mutex_lock pthread.h HAVE_PTHREAD_MUTEX_LOCK)
-endif()
check_symbol_exists(sbrk unistd.h HAVE_SBRK)
check_symbol_exists(srand48 stdlib.h HAVE_RAND48_SRAND48)
if( HAVE_RAND48_SRAND48 )
@@ -294,9 +303,7 @@ get_host_triple(LLVM_HOST_TRIPLE)
# By default, we target the host, but this can be overridden at CMake
# invocation time.
-set(LLVM_DEFAULT_TARGET_TRIPLE "${LLVM_HOST_TRIPLE}")
set(LLVM_HOSTTRIPLE "${LLVM_HOST_TRIPLE}")
-set(TARGET_TRIPLE "${LLVM_DEFAULT_TARGET_TRIPLE}")
# Determine the native architecture.
string(TOLOWER "${LLVM_TARGET_ARCH}" LLVM_NATIVE_ARCH)
@@ -324,6 +331,8 @@ elseif (LLVM_NATIVE_ARCH MATCHES "xcore")
set(LLVM_NATIVE_ARCH XCore)
elseif (LLVM_NATIVE_ARCH MATCHES "msp430")
set(LLVM_NATIVE_ARCH MSP430)
+elseif (LLVM_NATIVE_ARCH MATCHES "hexagon")
+ set(LLVM_NATIVE_ARCH Hexagon)
else ()
message(FATAL_ERROR "Unknown architecture ${LLVM_NATIVE_ARCH}")
endif ()
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index f44a27cce83a..43ee9a08b27f 100755
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -135,16 +135,22 @@ endmacro(add_llvm_target)
# lld, and Polly. This adds two options. One for the source directory of the
# project, which defaults to ${CMAKE_CURRENT_SOURCE_DIR}/${name}. Another to
# enable or disable building it with everthing else.
+# An additional parameter can be specified as the name of the directory.
macro(add_llvm_external_project name)
- string(TOUPPER ${name} nameUPPER)
- set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${name}"
+ set(add_llvm_external_dir "${ARGN}")
+ if("${add_llvm_external_dir}" STREQUAL "")
+ set(add_llvm_external_dir ${name})
+ endif()
+ string(REPLACE "-" "_" nameUNDERSCORE ${name})
+ string(TOUPPER ${nameUNDERSCORE} nameUPPER)
+ set(LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/${add_llvm_external_dir}"
CACHE PATH "Path to ${name} source directory")
if (NOT ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} STREQUAL ""
AND EXISTS ${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR}/CMakeLists.txt)
option(LLVM_EXTERNAL_${nameUPPER}_BUILD
"Whether to build ${name} as part of LLVM" ON)
if (LLVM_EXTERNAL_${nameUPPER}_BUILD)
- add_subdirectory(${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} ${name})
+ add_subdirectory(${LLVM_EXTERNAL_${nameUPPER}_SOURCE_DIR} ${add_llvm_external_dir})
endif()
endif()
endmacro(add_llvm_external_project)
@@ -278,11 +284,14 @@ endfunction()
function(add_lit_testsuite target comment)
parse_arguments(ARG "PARAMS;DEPENDS;ARGS" "" ${ARGN})
- # Register the testsuites, params and depends for the global check rule.
- set_property(GLOBAL APPEND PROPERTY LLVM_LIT_TESTSUITES ${ARG_DEFAULT_ARGS})
- set_property(GLOBAL APPEND PROPERTY LLVM_LIT_PARAMS ${ARG_PARAMS})
- set_property(GLOBAL APPEND PROPERTY LLVM_LIT_DEPENDS ${ARG_DEPENDS})
- set_property(GLOBAL APPEND PROPERTY LLVM_LIT_EXTRA_ARGS ${ARG_ARGS})
+  # EXCLUDE_FROM_ALL excludes the test ${target} from check-all.
+ if(NOT EXCLUDE_FROM_ALL)
+ # Register the testsuites, params and depends for the global check rule.
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_TESTSUITES ${ARG_DEFAULT_ARGS})
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_PARAMS ${ARG_PARAMS})
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_DEPENDS ${ARG_DEPENDS})
+ set_property(GLOBAL APPEND PROPERTY LLVM_LIT_EXTRA_ARGS ${ARG_ARGS})
+ endif()
# Produce a specific suffixed check rule.
add_lit_target(${target} ${comment}
diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake
index 0e410edc15fe..2cef6cfc3a30 100644
--- a/cmake/modules/LLVMProcessSources.cmake
+++ b/cmake/modules/LLVMProcessSources.cmake
@@ -48,7 +48,7 @@ function(llvm_process_sources OUT_VAR)
set( f ${CMAKE_CURRENT_SOURCE_DIR}/${s} )
add_file_dependencies( ${f} ${TABLEGEN_OUTPUT} )
endforeach(s)
- if( MSVC_IDE )
+ if( MSVC_IDE OR XCODE )
# This adds .td and .h files to the Visual Studio solution:
# FIXME: Shall we handle *.def here?
add_td_sources(sources)
diff --git a/cmake/platforms/Android.cmake b/cmake/platforms/Android.cmake
new file mode 100644
index 000000000000..72849b16c767
--- /dev/null
+++ b/cmake/platforms/Android.cmake
@@ -0,0 +1,21 @@
+# Toolchain config for Android NDK.
+# This is expected to be used with a standalone Android toolchain (see
+# docs/STANDALONE-TOOLCHAIN.html in the NDK on how to get one).
+#
+# Usage:
+# mkdir build; cd build
+# cmake ..; make
+# mkdir android; cd android
+# cmake -DLLVM_ANDROID_TOOLCHAIN_DIR=/path/to/android/ndk \
+# -DCMAKE_TOOLCHAIN_FILE=../../cmake/platforms/Android.cmake ../..
+# make <target>
+
+SET(CMAKE_SYSTEM_NAME Linux)
+SET(CMAKE_C_COMPILER ${CMAKE_BINARY_DIR}/../bin/clang)
+SET(CMAKE_CXX_COMPILER ${CMAKE_BINARY_DIR}/../bin/clang++)
+SET(ANDROID "1" CACHE STRING "ANDROID" FORCE)
+
+SET(ANDROID_COMMON_FLAGS "-target arm-linux-androideabi --sysroot=${LLVM_ANDROID_TOOLCHAIN_DIR}/sysroot -B${LLVM_ANDROID_TOOLCHAIN_DIR} -mllvm -arm-enable-ehabi")
+SET(CMAKE_C_FLAGS "${ANDROID_COMMON_FLAGS}" CACHE STRING "toolchain_cflags" FORCE)
+SET(CMAKE_CXX_FLAGS "${ANDROID_COMMON_FLAGS}" CACHE STRING "toolchain_cxxflags" FORCE)
+SET(CMAKE_LINK_FLAGS "${ANDROID_COMMON_FLAGS}" CACHE STRING "toolchain_linkflags" FORCE)
diff --git a/configure b/configure
index 6fbc47c72a7e..4fa070549196 100755
--- a/configure
+++ b/configure
@@ -704,6 +704,7 @@ ENABLE_PIC
ENABLE_SHARED
ENABLE_EMBED_STDCXX
ENABLE_TIMESTAMPS
+ENABLE_BACKTRACES
TARGETS_TO_BUILD
LLVM_ENUM_TARGETS
LLVM_ENUM_ASM_PRINTERS
@@ -766,8 +767,6 @@ COVERED_SWITCH_DEFAULT
USE_UDIS86
USE_OPROFILE
USE_INTEL_JITEVENTS
-INTEL_JITEVENTS_INCDIR
-INTEL_JITEVENTS_LIBDIR
XML2CONFIG
LIBXML2_LIBS
LIBXML2_INC
@@ -792,6 +791,7 @@ OCAML_LIBDIR
ENABLE_VISIBILITY_INLINES_HIDDEN
RPATH
RDYNAMIC
+program_prefix
LIBOBJS
LTLIBOBJS'
ac_subst_files=''
@@ -1423,6 +1423,8 @@ Optional Features:
Win32 DLL (default is NO)
--enable-timestamps Enable embedding timestamp information in build
(default is YES)
+ --enable-backtraces Enable embedding backtraces on crash (default is
+ YES)
--enable-targets Build specific host targets: all or
target1,target2,... Valid targets are: host, x86,
x86_64, sparc, powerpc, arm, mips, spu, hexagon,
@@ -1460,10 +1462,8 @@ Optional Packages:
--with-udis86=<path> Use udis86 external x86 disassembler library
--with-oprofile=<prefix>
Tell OProfile >= 0.9.4 how to symbolize JIT output
- --with-intel-jitevents=<vtune-amplifier-dir>
- Specify location of run-time support library for
- Intel JIT API
- (default=/opt/intel/vtune_amplifier_xe_2011)
+ --with-intel-jitevents Notify Intel JIT profiling API of generated code
+
Some influential environment variables:
CC C compiler command
@@ -3904,8 +3904,8 @@ else
sparc*-*) llvm_cv_target_arch="Sparc" ;;
powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
arm*-*) llvm_cv_target_arch="ARM" ;;
- mips-*) llvm_cv_target_arch="Mips" ;;
- mipsel-*) llvm_cv_target_arch="Mips" ;;
+ mips-* | mips64-*) llvm_cv_target_arch="Mips" ;;
+ mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;;
xcore-*) llvm_cv_target_arch="XCore" ;;
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
@@ -3937,8 +3937,8 @@ case $host in
sparc*-*) host_arch="Sparc" ;;
powerpc*-*) host_arch="PowerPC" ;;
arm*-*) host_arch="ARM" ;;
- mips-*) host_arch="Mips" ;;
- mipsel-*) host_arch="Mips" ;;
+ mips-* | mips64-*) host_arch="Mips" ;;
+ mipsel-* | mips64el-*) host_arch="Mips" ;;
xcore-*) host_arch="XCore" ;;
msp430-*) host_arch="MSP430" ;;
hexagon-*) host_arch="Hexagon" ;;
@@ -5382,6 +5382,31 @@ cat >>confdefs.h <<_ACEOF
_ACEOF
+
+# Check whether --enable-backtraces was given.
+if test "${enable_backtraces+set}" = set; then
+ enableval=$enable_backtraces;
+else
+ enableval=default
+fi
+
+case "$enableval" in
+ yes) ENABLE_BACKTRACES=1
+ ;;
+ no) ENABLE_BACKTRACES=0
+ ;;
+ default) ENABLE_BACKTRACES=1
+ ;;
+ *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&2;}
+ { (exit 1); exit 1; }; } ;;
+esac
+
+cat >>confdefs.h <<_ACEOF
+#define ENABLE_BACKTRACES $ENABLE_BACKTRACES
+_ACEOF
+
+
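The --enable-backtraces block added above (together with the new HAVE_BACKTRACE probe in cmake/config-ix.cmake) gates LLVM's use of the backtrace facility from execinfo.h when printing a stack trace on crashes. A sketch of the facility being probed, not of LLVM's actual signal-handler code:

    // Sketch only: dumping the current call stack with the probed API.
    #include <execinfo.h>
    #include <unistd.h>

    static void dumpStack() {
      void *Frames[64];
      int Depth = backtrace(Frames, 64);                    // capture return addresses
      backtrace_symbols_fd(Frames, Depth, STDERR_FILENO);   // symbolize to stderr
    }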
TARGETS_TO_BUILD=""
# Check whether --enable-targets was given.
if test "${enable_targets+set}" = set; then
@@ -5404,6 +5429,8 @@ case "$enableval" in
arm) TARGETS_TO_BUILD="ARM $TARGETS_TO_BUILD" ;;
mips) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
mipsel) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
+ mips64) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
+ mips64el) TARGETS_TO_BUILD="Mips $TARGETS_TO_BUILD" ;;
spu) TARGETS_TO_BUILD="CellSPU $TARGETS_TO_BUILD" ;;
xcore) TARGETS_TO_BUILD="XCore $TARGETS_TO_BUILD" ;;
msp430) TARGETS_TO_BUILD="MSP430 $TARGETS_TO_BUILD" ;;
@@ -10289,7 +10316,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 10292 "configure"
+#line 10317 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
@@ -13549,308 +13576,30 @@ _ACEOF
# Check whether --with-intel-jitevents was given.
if test "${with_intel_jitevents+set}" = set; then
withval=$with_intel_jitevents;
+ case "$withval" in
+ yes) USE_INTEL_JITEVENTS=1
+;;
+ no) USE_INTEL_JITEVENTS=0
+;;
+ *) { { echo "$as_me:$LINENO: error: Invalid setting for --with-intel-jitevents. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --with-intel-jitevents. Use \"yes\" or \"no\"" >&2;}
+ { (exit 1); exit 1; }; };;
+ esac
+
case $llvm_cv_os_type in
Linux|Win32|Cygwin|MingW) ;;
- *)
- { { echo "$as_me:$LINENO: error:
- Intel JIT API support is available on Linux and Windows only.\"" >&5
-echo "$as_me: error:
- Intel JIT API support is available on Linux and Windows only.\"" >&2;}
- { (exit 1); exit 1; }; } ;;
+ *) { { echo "$as_me:$LINENO: error: Intel JIT API support is available on Linux and Windows only." >&5
+echo "$as_me: error: Intel JIT API support is available on Linux and Windows only." >&2;}
+ { (exit 1); exit 1; }; };;
esac
- USE_INTEL_JITEVENTS=1
-
case "$llvm_cv_target_arch" in
- x86) llvm_intel_jitevents_archdir="lib32";;
- x86_64) llvm_intel_jitevents_archdir="lib64";;
- *) echo "Target architecture $llvm_cv_target_arch does not support Intel JIT Events API"
- exit -1;;
- esac
- INTEL_JITEVENTS_INCDIR="/opt/intel/vtune_amplifier_xe_2011/include"
- INTEL_JITEVENTS_LIBDIR="/opt/intel/vtune_amplifier_xe_2011/$llvm_intel_jitevents_archdir"
- case "$withval" in
- /* | [A-Za-z]:[\\/]*) INTEL_JITEVENTS_INCDIR=$withval/include
- INTEL_JITEVENTS_LIBDIR=$withval/$llvm_intel_jitevents_archdir ;;
- *) ;;
+ x86|x86_64) ;;
+ *) { { echo "$as_me:$LINENO: error: Target architecture $llvm_cv_target_arch does not support Intel JIT Events API." >&5
+echo "$as_me: error: Target architecture $llvm_cv_target_arch does not support Intel JIT Events API." >&2;}
+ { (exit 1); exit 1; }; };;
esac
-
-
-
- LIBS="$LIBS -L${INTEL_JITEVENTS_LIBDIR}"
- CPPFLAGS="$CPPFLAGS -I$INTEL_JITEVENTS_INCDIR"
-
- { echo "$as_me:$LINENO: checking for library containing iJIT_IsProfilingActive" >&5
-echo $ECHO_N "checking for library containing iJIT_IsProfilingActive... $ECHO_C" >&6; }
-if test "${ac_cv_search_iJIT_IsProfilingActive+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- ac_func_search_save_LIBS=$LIBS
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h. */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h. */
-
-/* Override any GCC internal prototype to avoid an error.
- Use char because int might match the return type of a GCC
- builtin and then its argument prototype would still apply. */
-#ifdef __cplusplus
-extern "C"
-#endif
-char iJIT_IsProfilingActive ();
-int
-main ()
-{
-return iJIT_IsProfilingActive ();
- ;
- return 0;
-}
-_ACEOF
-for ac_lib in '' jitprofiling; do
- if test -z "$ac_lib"; then
- ac_res="none required"
- else
- ac_res=-l$ac_lib
- LIBS="-l$ac_lib $ac_func_search_save_LIBS"
- fi
- rm -f conftest.$ac_objext conftest$ac_exeext
-if { (ac_try="$ac_link"
-case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_link") 2>conftest.er1
- ac_status=$?
- grep -v '^ *+' conftest.er1 >conftest.err
- rm -f conftest.er1
- cat conftest.err >&5
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); } &&
- { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
- { (case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_try") 2>&5
- ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; } &&
- { ac_try='test -s conftest$ac_exeext'
- { (case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_try") 2>&5
- ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; }; then
- ac_cv_search_iJIT_IsProfilingActive=$ac_res
-else
- echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-
-fi
-
-rm -f core conftest.err conftest.$ac_objext \
- conftest$ac_exeext
- if test "${ac_cv_search_iJIT_IsProfilingActive+set}" = set; then
- break
-fi
-done
-if test "${ac_cv_search_iJIT_IsProfilingActive+set}" = set; then
- :
-else
- ac_cv_search_iJIT_IsProfilingActive=no
-fi
-rm conftest.$ac_ext
-LIBS=$ac_func_search_save_LIBS
-fi
-{ echo "$as_me:$LINENO: result: $ac_cv_search_iJIT_IsProfilingActive" >&5
-echo "${ECHO_T}$ac_cv_search_iJIT_IsProfilingActive" >&6; }
-ac_res=$ac_cv_search_iJIT_IsProfilingActive
-if test "$ac_res" != no; then
- test "$ac_res" = "none required" || LIBS="$ac_res $LIBS"
-
-else
-
- echo "Error! Cannot find libjitprofiling.a. Please check path specified in flag --with-intel-jitevents"
- exit -1
-
-fi
-
- if test "${ac_cv_header_jitprofiling_h+set}" = set; then
- { echo "$as_me:$LINENO: checking for jitprofiling.h" >&5
-echo $ECHO_N "checking for jitprofiling.h... $ECHO_C" >&6; }
-if test "${ac_cv_header_jitprofiling_h+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-fi
-{ echo "$as_me:$LINENO: result: $ac_cv_header_jitprofiling_h" >&5
-echo "${ECHO_T}$ac_cv_header_jitprofiling_h" >&6; }
-else
- # Is the header compilable?
-{ echo "$as_me:$LINENO: checking jitprofiling.h usability" >&5
-echo $ECHO_N "checking jitprofiling.h usability... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h. */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h. */
-$ac_includes_default
-#include <jitprofiling.h>
-_ACEOF
-rm -f conftest.$ac_objext
-if { (ac_try="$ac_compile"
-case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_compile") 2>conftest.er1
- ac_status=$?
- grep -v '^ *+' conftest.er1 >conftest.err
- rm -f conftest.er1
- cat conftest.err >&5
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); } &&
- { ac_try='test -z "$ac_c_werror_flag" || test ! -s conftest.err'
- { (case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_try") 2>&5
- ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; } &&
- { ac_try='test -s conftest.$ac_objext'
- { (case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_try") 2>&5
- ac_status=$?
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); }; }; then
- ac_header_compiler=yes
-else
- echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
- ac_header_compiler=no
-fi
-
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-{ echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
-echo "${ECHO_T}$ac_header_compiler" >&6; }
-
-# Is the header present?
-{ echo "$as_me:$LINENO: checking jitprofiling.h presence" >&5
-echo $ECHO_N "checking jitprofiling.h presence... $ECHO_C" >&6; }
-cat >conftest.$ac_ext <<_ACEOF
-/* confdefs.h. */
-_ACEOF
-cat confdefs.h >>conftest.$ac_ext
-cat >>conftest.$ac_ext <<_ACEOF
-/* end confdefs.h. */
-#include <jitprofiling.h>
-_ACEOF
-if { (ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
- *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
- *) ac_try_echo=$ac_try;;
-esac
-eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
- (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
- ac_status=$?
- grep -v '^ *+' conftest.er1 >conftest.err
- rm -f conftest.er1
- cat conftest.err >&5
- echo "$as_me:$LINENO: \$? = $ac_status" >&5
- (exit $ac_status); } >/dev/null; then
- if test -s conftest.err; then
- ac_cpp_err=$ac_c_preproc_warn_flag
- ac_cpp_err=$ac_cpp_err$ac_c_werror_flag
- else
- ac_cpp_err=
- fi
-else
- ac_cpp_err=yes
-fi
-if test -z "$ac_cpp_err"; then
- ac_header_preproc=yes
-else
- echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
- ac_header_preproc=no
-fi
-
-rm -f conftest.err conftest.$ac_ext
-{ echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
-echo "${ECHO_T}$ac_header_preproc" >&6; }
-
-# So? What about this header?
-case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
- yes:no: )
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: accepted by the compiler, rejected by the preprocessor!" >&5
-echo "$as_me: WARNING: jitprofiling.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: proceeding with the compiler's result" >&5
-echo "$as_me: WARNING: jitprofiling.h: proceeding with the compiler's result" >&2;}
- ac_header_preproc=yes
- ;;
- no:yes:* )
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: present but cannot be compiled" >&5
-echo "$as_me: WARNING: jitprofiling.h: present but cannot be compiled" >&2;}
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: check for missing prerequisite headers?" >&5
-echo "$as_me: WARNING: jitprofiling.h: check for missing prerequisite headers?" >&2;}
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: see the Autoconf documentation" >&5
-echo "$as_me: WARNING: jitprofiling.h: see the Autoconf documentation" >&2;}
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: section \"Present But Cannot Be Compiled\"" >&5
-echo "$as_me: WARNING: jitprofiling.h: section \"Present But Cannot Be Compiled\"" >&2;}
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: proceeding with the preprocessor's result" >&5
-echo "$as_me: WARNING: jitprofiling.h: proceeding with the preprocessor's result" >&2;}
- { echo "$as_me:$LINENO: WARNING: jitprofiling.h: in the future, the compiler will take precedence" >&5
-echo "$as_me: WARNING: jitprofiling.h: in the future, the compiler will take precedence" >&2;}
- ( cat <<\_ASBOX
-## ------------------------------------ ##
-## Report this to http://llvm.org/bugs/ ##
-## ------------------------------------ ##
-_ASBOX
- ) | sed "s/^/$as_me: WARNING: /" >&2
- ;;
-esac
-{ echo "$as_me:$LINENO: checking for jitprofiling.h" >&5
-echo $ECHO_N "checking for jitprofiling.h... $ECHO_C" >&6; }
-if test "${ac_cv_header_jitprofiling_h+set}" = set; then
- echo $ECHO_N "(cached) $ECHO_C" >&6
-else
- ac_cv_header_jitprofiling_h=$ac_header_preproc
-fi
-{ echo "$as_me:$LINENO: result: $ac_cv_header_jitprofiling_h" >&5
-echo "${ECHO_T}$ac_cv_header_jitprofiling_h" >&6; }
-
-fi
-if test $ac_cv_header_jitprofiling_h = yes; then
- :
-else
-
- echo "Error! Cannot find jitprofiling.h. Please check path specified in flag --with-intel-jitevents"
- exit -1
-
-fi
-
-
-
-
else
USE_INTEL_JITEVENTS=0
@@ -21382,6 +21131,10 @@ fi
ac_config_files="$ac_config_files bindings/ocaml/llvm/META.llvm"
+test "x$program_prefix" = "xNONE" && program_prefix=""
+
+
+
ac_config_commands="$ac_config_commands setup"
ac_config_commands="$ac_config_commands Makefile"
@@ -22219,6 +21972,7 @@ ENABLE_PIC!$ENABLE_PIC$ac_delim
ENABLE_SHARED!$ENABLE_SHARED$ac_delim
ENABLE_EMBED_STDCXX!$ENABLE_EMBED_STDCXX$ac_delim
ENABLE_TIMESTAMPS!$ENABLE_TIMESTAMPS$ac_delim
+ENABLE_BACKTRACES!$ENABLE_BACKTRACES$ac_delim
TARGETS_TO_BUILD!$TARGETS_TO_BUILD$ac_delim
LLVM_ENUM_TARGETS!$LLVM_ENUM_TARGETS$ac_delim
LLVM_ENUM_ASM_PRINTERS!$LLVM_ENUM_ASM_PRINTERS$ac_delim
@@ -22281,8 +22035,6 @@ COVERED_SWITCH_DEFAULT!$COVERED_SWITCH_DEFAULT$ac_delim
USE_UDIS86!$USE_UDIS86$ac_delim
USE_OPROFILE!$USE_OPROFILE$ac_delim
USE_INTEL_JITEVENTS!$USE_INTEL_JITEVENTS$ac_delim
-INTEL_JITEVENTS_INCDIR!$INTEL_JITEVENTS_INCDIR$ac_delim
-INTEL_JITEVENTS_LIBDIR!$INTEL_JITEVENTS_LIBDIR$ac_delim
XML2CONFIG!$XML2CONFIG$ac_delim
LIBXML2_LIBS!$LIBXML2_LIBS$ac_delim
LIBXML2_INC!$LIBXML2_INC$ac_delim
@@ -22307,6 +22059,7 @@ OCAML_LIBDIR!$OCAML_LIBDIR$ac_delim
ENABLE_VISIBILITY_INLINES_HIDDEN!$ENABLE_VISIBILITY_INLINES_HIDDEN$ac_delim
RPATH!$RPATH$ac_delim
RDYNAMIC!$RDYNAMIC$ac_delim
+program_prefix!$program_prefix$ac_delim
LIBOBJS!$LIBOBJS$ac_delim
LTLIBOBJS!$LTLIBOBJS$ac_delim
_ACEOF
diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst
index 2d4f2914ee00..fdaec89cdf6b 100644
--- a/docs/AliasAnalysis.rst
+++ b/docs/AliasAnalysis.rst
@@ -230,7 +230,7 @@ any pass dependencies your pass has. Thus you should have something like this:
.. code-block:: c++
- void getAnalysisUsage(AnalysisUsage &amp;AU) const {
+ void getAnalysisUsage(AnalysisUsage &AU) const {
AliasAnalysis::getAnalysisUsage(AU);
// declare your dependencies here.
}
diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
index d3995e7036bd..bd26f7b1502e 100644
--- a/docs/BitCodeFormat.rst
+++ b/docs/BitCodeFormat.rst
@@ -489,6 +489,8 @@ The magic number for LLVM IR files is:
When combined with the bitcode magic number and viewed as bytes, this is
``"BC 0xC0DE"``.
+.. _Signed VBRs:
+
Signed VBRs
^^^^^^^^^^^
@@ -507,6 +509,7 @@ As such, signed VBR values of a specific width are emitted as follows:
With this encoding, small positive and small negative values can both be emitted
efficiently. Signed VBR encoding is used in ``CST_CODE_INTEGER`` and
``CST_CODE_WIDE_INTEGER`` records within ``CONSTANTS_BLOCK`` blocks.
+It is also used for phi instruction operands in `MODULE_CODE_VERSION`_ 1.
LLVM IR Blocks
^^^^^^^^^^^^^^
@@ -553,13 +556,57 @@ block may contain the following sub-blocks:
* `FUNCTION_BLOCK`_
* `METADATA_BLOCK`_
+.. _MODULE_CODE_VERSION:
+
MODULE_CODE_VERSION Record
^^^^^^^^^^^^^^^^^^^^^^^^^^
``[VERSION, version#]``
The ``VERSION`` record (code 1) contains a single value indicating the format
-version. Only version 0 is supported at this time.
+version. Versions 0 and 1 are supported at this time. The difference between
+version 0 and 1 is in the encoding of instruction operands in
+each `FUNCTION_BLOCK`_.
+
+In version 0, each value defined by an instruction is assigned an ID
+unique to the function. Function-level value IDs are assigned starting from
+``NumModuleValues`` since they share the same namespace as module-level
+values. The value enumerator resets after each function. When a value is
+an operand of an instruction, the value ID is used to represent the operand.
+For large functions or large modules, these operand values can be large.
+
+The encoding in version 1 attempts to avoid large operand values
+in common cases. Instead of using the value ID directly, operands are
+encoded as relative to the current instruction. Thus, if an operand
+is the value defined by the previous instruction, the operand
+will be encoded as 1.
+
+For example, instead of
+
+.. code-block:: llvm
+
+ #n = load #n-1
+ #n+1 = icmp eq #n, #const0
+ br #n+1, label #(bb1), label #(bb2)
+
+version 1 will encode the instructions as
+
+.. code-block:: llvm
+
+ #n = load #1
+ #n+1 = icmp eq #1, (#n+1)-#const0
+ br #1, label #(bb1), label #(bb2)
+
+Note in the example that operands which are constants also use
+the relative encoding, while operands like basic block labels
+do not use the relative encoding.
+
+Forward references will result in a negative value.
+This can be inefficient, as operands are normally encoded
+as unsigned VBRs. However, forward references are rare, except in the
+case of phi instructions. For phi instructions, operands are encoded as
+`Signed VBRs`_ to deal with forward references.
+
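The version-1 scheme described above boils down to "emit how far back the operand was defined", with phi operands using the signed encoding because they may legitimately point forward. A small sketch of the arithmetic (helper names are invented; this is not the bitcode writer's actual code):

    // Sketch only: relative operands and the signed-VBR value mapping.
    #include <cstdint>

    // InstID: value number of the instruction being written.
    // ValID:  value number of one of its operands.
    // Most operands refer backwards, so the delta stays small
    // (1 == "defined by the previous instruction"); forward references
    // come out negative.
    int64_t relativeOperand(uint64_t InstID, uint64_t ValID) {
      return (int64_t)InstID - (int64_t)ValID;
    }

    // Signed VBR keeps the sign in the low bit so small magnitudes stay small:
    // non-negative V -> 2*V, negative V -> 2*(-V) + 1.
    uint64_t signedVBRPayload(int64_t V) {
      return V >= 0 ? (uint64_t)V << 1 : (((uint64_t)-V) << 1) | 1;
    }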
MODULE_CODE_TRIPLE Record
^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/docs/CMake.rst b/docs/CMake.rst
index e1761c5b1d45..7f0420c4469f 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -273,11 +273,6 @@ LLVM-specific variables
**LLVM_USE_INTEL_JITEVENTS**:BOOL
Enable building support for Intel JIT Events API. Defaults to OFF
-**LLVM_INTEL_JITEVENTS_DIR**:PATH
- Path to installation of Intel(R) VTune(TM) Amplifier XE 2011, used to locate
- the ``jitprofiling`` library. Default = ``%VTUNE_AMPLIFIER_XE_2011_DIR%``
- (Windows) | ``/opt/intel/vtune_amplifier_xe_2011`` (Linux)
-
Executing the test suite
========================
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index d1d0231105b6..5fab76ec1a44 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst
@@ -81,7 +81,7 @@ Required components in the code generator
The two pieces of the LLVM code generator are the high-level interface to the
code generator and the set of reusable components that can be used to build
target-specific backends. The two most important interfaces (:raw-html:`<tt>`
-`TargetMachine`_ :raw-html:`</tt>` and :raw-html:`<tt>` `TargetData`_
+`TargetMachine`_ :raw-html:`</tt>` and :raw-html:`<tt>` `DataLayout`_
:raw-html:`</tt>`) are the only ones that are required to be defined for a
backend to fit into the LLVM system, but the others must be defined if the
reusable code generator components are going to be used.
@@ -197,7 +197,7 @@ any particular client. These classes are designed to capture the *abstract*
properties of the target (such as the instructions and registers it has), and do
not incorporate any particular pieces of code generation algorithms.
-All of the target description classes (except the :raw-html:`<tt>` `TargetData`_
+All of the target description classes (except the :raw-html:`<tt>` `DataLayout`_
:raw-html:`</tt>` class) are designed to be subclassed by the concrete target
implementation, and have virtual methods implemented. To get to these
implementations, the :raw-html:`<tt>` `TargetMachine`_ :raw-html:`</tt>` class
@@ -214,18 +214,18 @@ the ``get*Info`` methods (``getInstrInfo``, ``getRegisterInfo``,
``getFrameInfo``, etc.). This class is designed to be specialized by a concrete
target implementation (e.g., ``X86TargetMachine``) which implements the various
virtual methods. The only required target description class is the
-:raw-html:`<tt>` `TargetData`_ :raw-html:`</tt>` class, but if the code
+:raw-html:`<tt>` `DataLayout`_ :raw-html:`</tt>` class, but if the code
generator components are to be used, the other interfaces should be implemented
as well.
-.. _TargetData:
+.. _DataLayout:
-The ``TargetData`` class
+The ``DataLayout`` class
------------------------
-The ``TargetData`` class is the only required target description class, and it
-is the only class that is not extensible (you cannot derived a new class from
-it). ``TargetData`` specifies information about how the target lays out memory
+The ``DataLayout`` class is the only required target description class, and it
+is the only class that is not extensible (you cannot derive a new class from
+it). ``DataLayout`` specifies information about how the target lays out memory
for structures, the alignment requirements for various data types, the size of
pointers in the target, and whether the target is little-endian or
big-endian.
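Since this section enumerates what DataLayout knows about a target, a minimal usage sketch may help; the include paths and queries below are illustrative (header locations have moved between LLVM releases), not a prescribed API surface:

    // Sketch only: querying a DataLayout.  In older trees the headers live
    // directly under "llvm/"; newer ones use "llvm/IR/".
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/DerivedTypes.h"

    using namespace llvm;

    unsigned pointerBytes(const DataLayout &DL) {
      return DL.getPointerSize();        // pointer size in bytes
    }

    uint64_t allocBytes(const DataLayout &DL, Type *Ty) {
      return DL.getTypeAllocSize(Ty);    // ABI size, e.g. what an alloca reserves
    }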
@@ -248,7 +248,7 @@ operations. Among other things, this class indicates:
* the type to use for shift amounts, and
* various high-level characteristics, like whether it is profitable to turn
- division by a constant into a multiplication sequence
+ division by a constant into a multiplication sequence.
The ``TargetRegisterInfo`` class
--------------------------------
@@ -256,10 +256,10 @@ The ``TargetRegisterInfo`` class
The ``TargetRegisterInfo`` class is used to describe the register file of the
target and any interactions between the registers.
-Registers in the code generator are represented in the code generator by
-unsigned integers. Physical registers (those that actually exist in the target
-description) are unique small numbers, and virtual registers are generally
-large. Note that register ``#0`` is reserved as a flag value.
+Registers are represented in the code generator by unsigned integers. Physical
+registers (those that actually exist in the target description) are unique
+small numbers, and virtual registers are generally large. Note that
+register ``#0`` is reserved as a flag value.
Each register in the processor description has an associated
``TargetRegisterDesc`` entry, which provides a textual name for the register
@@ -390,7 +390,7 @@ functions make it easy to build arbitrary machine instructions. Usage of the
MachineInstr *MI = BuildMI(X86::MOV32ri, 1, DestReg).addImm(42);
// Create the same instr, but insert it at the end of a basic block.
- MachineBasicBlock &amp;MBB = ...
+ MachineBasicBlock &MBB = ...
BuildMI(MBB, X86::MOV32ri, 1, DestReg).addImm(42);
// Create the same instr, but insert it before a specified iterator point.
@@ -404,7 +404,7 @@ functions make it easy to build arbitrary machine instructions. Usage of the
MI = BuildMI(X86::SAHF, 0);
// Create a self looping branch instruction.
- BuildMI(MBB, X86::JNE, 1).addMBB(&amp;MBB);
+ BuildMI(MBB, X86::JNE, 1).addMBB(&MBB);
The key thing to remember with the ``BuildMI`` functions is that you have to
specify the number of operands that the machine instruction will take. This
@@ -838,8 +838,7 @@ Initial SelectionDAG Construction
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The initial SelectionDAG is na\ :raw-html:`&iuml;`\ vely peephole expanded from
-the LLVM input by the ``SelectionDAGLowering`` class in the
-``lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp`` file. The intent of this pass
+the LLVM input by the ``SelectionDAGBuilder`` class. The intent of this pass
is to expose as much low-level, target-specific details to the SelectionDAG as
possible. This pass is mostly hard-coded (e.g. an LLVM ``add`` turns into an
``SDNode add`` while a ``getelementptr`` is expanded into the obvious
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index a416a1e856fa..90835307b15c 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -79,10 +79,11 @@ tree. The standard header looks like this:
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
- //
- // This file contains the declaration of the Instruction class, which is the
- // base class for all of the VM instructions.
- //
+ ///
+ /// \file
+ /// \brief This file contains the declaration of the Instruction class, which is
+ /// the base class for all of the VM instructions.
+ ///
//===----------------------------------------------------------------------===//
A few things to note about this particular format: The "``-*- C++ -*-``" string
@@ -100,10 +101,12 @@ The next section in the file is a concise note that defines the license that the
file is released under. This makes it perfectly clear what terms the source
code can be distributed under and should not be modified in any way.
-The main body of the description does not have to be very long in most cases.
-Here it's only two lines. If an algorithm is being implemented or something
-tricky is going on, a reference to the paper where it is published should be
-included, as well as any notes or *gotchas* in the code to watch out for.
+The main body is a ``doxygen`` comment describing the purpose of the file. It
+should have a ``\brief`` command that describes the file in one or two
+sentences. Any additional information should be separated by a blank line. If
+an algorithm is being implemented or something tricky is going on, a reference
+to the paper where it is published should be included, as well as any notes or
+*gotchas* in the code to watch out for.
Class overviews
"""""""""""""""
@@ -143,6 +146,132 @@ useful to use C style (``/* */``) comments however:
To comment out a large block of code, use ``#if 0`` and ``#endif``. These nest
properly and are better behaved in general than C style comments.
+Doxygen Use in Documentation Comments
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use the ``\file`` command to turn the standard file header into a file-level
+comment.
+
+Include descriptive ``\brief`` paragraphs for all public interfaces (public
+classes, member and non-member functions). Explain API use and purpose in
+``\brief`` paragraphs, don't just restate the information that can be inferred
+from the API name. Put detailed discussion into separate paragraphs.
+
+To refer to parameter names inside a paragraph, use the ``\p name`` command.
+Don't use the ``\arg name`` command since it starts a new paragraph that
+contains documentation for the parameter.
+
+Wrap non-inline code examples in ``\code ... \endcode``.
+
+To document a function parameter, start a new paragraph with the
+``\param name`` command. If the parameter is used as an out or an in/out
+parameter, use the ``\param [out] name`` or ``\param [in,out] name`` command,
+respectively.
+
+To describe function return value, start a new paragraph with the ``\returns``
+command.
+
+A minimal documentation comment:
+
+.. code-block:: c++
+
+ /// \brief Does foo and bar.
+ void fooBar(bool Baz);
+
+A documentation comment that uses all Doxygen features in a preferred way:
+
+.. code-block:: c++
+
+ /// \brief Does foo and bar.
+ ///
+ /// Does not do foo the usual way if \p Baz is true.
+ ///
+ /// Typical usage:
+ /// \code
+ /// fooBar(false, "quux", Res);
+ /// \endcode
+ ///
+ /// \param Quux kind of foo to do.
+ /// \param [out] Result filled with bar sequence on foo success.
+ ///
+ /// \returns true on success.
+ bool fooBar(bool Baz, StringRef Quux, std::vector<int> &Result);
+
+Don't duplicate the documentation comment in the header file and in the
+implementation file. Put the documentation comments for public APIs into the
+header file. Documentation comments for private APIs can go to the
+implementation file. In any case, implementation files can include additional
+comments (not necessarily in Doxygen markup) to explain implementation details
+as needed.
+
+Don't duplicate function or class name at the beginning of the comment.
+For humans it is obvious which function or class is being documented;
+automatic documentation processing tools are smart enough to bind the comment
+to the correct declaration.
+
+Wrong:
+
+.. code-block:: c++
+
+ // In Something.h:
+
+ /// Something - An abstraction for some complicated thing.
+ class Something {
+ public:
+ /// fooBar - Does foo and bar.
+ void fooBar();
+ };
+
+ // In Something.cpp:
+
+ /// fooBar - Does foo and bar.
+ void Something::fooBar() { ... }
+
+Correct:
+
+.. code-block:: c++
+
+ // In Something.h:
+
+ /// \brief An abstraction for some complicated thing.
+ class Something {
+ public:
+ /// \brief Does foo and bar.
+ void fooBar();
+ };
+
+ // In Something.cpp:
+
+ // Builds a B-tree in order to do foo. See paper by...
+ void Something::fooBar() { ... }
+
+It is not required to use additional Doxygen features, but sometimes it might
+be a good idea to do so.
+
+Consider:
+
+* adding comments to any narrow namespace containing a collection of
+ related functions or types;
+
+* using top-level groups to organize a collection of related functions at
+ namespace scope where the grouping is smaller than the namespace;
+
+* using member groups and additional comments attached to member
+ groups to organize within a class.
+
+For example:
+
+.. code-block:: c++
+
+ class Something {
+ /// \name Functions that do Foo.
+ /// @{
+ void fooBar();
+ void fooBaz();
+ /// @}
+ ...
+ };
+
``#include`` Style
^^^^^^^^^^^^^^^^^^
@@ -421,9 +550,9 @@ exit from a function, consider this "bad" code:
.. code-block:: c++
- Value *DoSomething(Instruction *I) {
+ Value *doSomething(Instruction *I) {
if (!isa<TerminatorInst>(I) &&
- I->hasOneUse() && SomeOtherThing(I)) {
+ I->hasOneUse() && doOtherThing(I)) {
... some long code ....
}
@@ -445,7 +574,7 @@ It is much preferred to format the code like this:
.. code-block:: c++
- Value *DoSomething(Instruction *I) {
+ Value *doSomething(Instruction *I) {
// Terminators never need 'something' done to them because ...
if (isa<TerminatorInst>(I))
return 0;
@@ -456,7 +585,7 @@ It is much preferred to format the code like this:
return 0;
// This is really just here for example.
- if (!SomeOtherThing(I))
+ if (!doOtherThing(I))
return 0;
... some long code ....
@@ -601,9 +730,8 @@ code to be structured like this:
.. code-block:: c++
- /// ListContainsFoo - Return true if the specified list has an element that is
- /// a foo.
- static bool ListContainsFoo(const std::vector<Bar*> &List) {
+ /// \returns true if the specified list has an element that is a foo.
+ static bool containsFoo(const std::vector<Bar*> &List) {
for (unsigned i = 0, e = List.size(); i != e; ++i)
if (List[i]->isFoo())
return true;
@@ -611,7 +739,7 @@ code to be structured like this:
}
...
- if (ListContainsFoo(BarList)) {
+ if (containsFoo(BarList)) {
...
}
@@ -714,7 +842,7 @@ enforced, and hopefully what to do about it. Here is one complete example:
.. code-block:: c++
inline Value *getOperand(unsigned i) {
- assert(i < Operands.size() &amp;&amp; "getOperand() out of range!");
+ assert(i < Operands.size() && "getOperand() out of range!");
return Operands[i];
}
@@ -734,23 +862,28 @@ Here are more examples:
You get the idea.
-Please be aware that, when adding assert statements, not all compilers are aware
-of the semantics of the assert. In some places, asserts are used to indicate a
-piece of code that should not be reached. These are typically of the form:
+In the past, asserts were used to indicate a piece of code that should not be
+reached. These were typically of the form:
.. code-block:: c++
- assert(0 && "Some helpful error message");
+ assert(0 && "Invalid radix for integer literal");
-When used in a function that returns a value, they should be followed with a
-return statement and a comment indicating that this line is never reached. This
-will prevent a compiler which is unable to deduce that the assert statement
-never returns from generating a warning.
+This has a few issues, the main one being that some compilers might not
+understand the assertion, or warn about a missing return in builds where
+assertions are compiled out.
+
+Today, we have something much better: ``llvm_unreachable``:
.. code-block:: c++
- assert(0 && "Some helpful error message");
- return 0;
+ llvm_unreachable("Invalid radix for integer literal");
+
+When assertions are enabled, this will print the message if it's ever reached
+and then exit the program. When assertions are disabled (i.e. in release
+builds), ``llvm_unreachable`` becomes a hint to compilers to skip generating
+code for this branch. If the compiler does not support this, it will fall back
+to the "abort" implementation.
Another issue is that values used only by assertions will produce an "unused
value" warning when assertions are disabled. For example, this code will warn:
@@ -818,6 +951,52 @@ least one out-of-line virtual method in the class. Without this, the compiler
will copy the vtable and RTTI into every ``.o`` file that ``#include``\s the
header, bloating ``.o`` file sizes and increasing link times.
+Don't use default labels in fully covered switches over enumerations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``-Wswitch`` warns if a switch, without a default label, over an enumeration
+does not cover every enumeration value. If you write a default label on a fully
+covered switch over an enumeration then the ``-Wswitch`` warning won't fire
+when new elements are added to that enumeration. To help avoid adding these
+kinds of defaults, Clang has the warning ``-Wcovered-switch-default`` which is
+off by default but turned on when building LLVM with a version of Clang that
+supports the warning.
+
+A knock-on effect of this stylistic requirement is that when building LLVM with
+GCC you may get warnings related to "control may reach end of non-void function"
+if you return from each case of a covered switch-over-enum because GCC assumes
+that the enum expression may take any representable value, not just those of
+individual enumerators. To suppress this warning, use ``llvm_unreachable`` after
+the switch.
+
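As a sketch of the pattern described above (the enum and function names are invented for illustration), a fully covered switch carries no default label, and llvm_unreachable after the switch silences GCC's end-of-non-void-function warning:

    // Sketch only: covered switch over an enumeration, no default label.
    #include "llvm/Support/ErrorHandling.h"

    enum Fruit { Apple, Pear };

    static const char *getName(Fruit F) {
      switch (F) {
      case Apple: return "apple";
      case Pear:  return "pear";
      }
      llvm_unreachable("Unknown Fruit enumerator");
    }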
+Use ``LLVM_DELETED_FUNCTION`` to mark uncallable methods
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Prior to C++11, a common pattern to make a class uncopyable was to declare an
+unimplemented copy constructor and copy assignment operator and make them
+private. This would give a compiler error for accessing a private method or a
+linker error because it wasn't implemented.
+
+With C++11, we can mark methods that won't be implemented with ``= delete``.
+This will trigger a much better error message and tell the compiler that the
+method will never be implemented. This enables other checks like
+``-Wunused-private-field`` to run correctly on classes that contain these
+methods.
+
+To maintain compatibility with C++03, ``LLVM_DELETED_FUNCTION`` should be used
+which will expand to ``= delete`` if the compiler supports it. These methods
+should still be declared private. Example of the uncopyable pattern:
+
+.. code-block:: c++
+
+ class DontCopy {
+ private:
+ DontCopy(const DontCopy&) LLVM_DELETED_FUNCTION;
+ DontCopy &operator =(const DontCopy&) LLVM_DELETED_FUNCTION;
+ public:
+ ...
+ };
+
Don't evaluate ``end()`` every time through a loop
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -1002,21 +1181,21 @@ If a namespace definition is small and *easily* fits on a screen (say, less than
namespace llvm {
namespace X86 {
- /// RelocationType - An enum for the x86 relocation codes. Note that
+ /// \brief An enum for the x86 relocation codes. Note that
/// the terminology here doesn't follow x86 convention - word means
/// 32-bit and dword means 64-bit.
enum RelocationType {
- /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+ /// \brief PC relative relocation, add the relocated value to
/// the value already in memory, after we adjust it for where the PC is.
reloc_pcrel_word = 0,
- /// reloc_picrel_word - PIC base relative relocation, add the relocated
- /// value to the value already in memory, after we adjust it for where the
+ /// \brief PIC base relative relocation, add the relocated value to
+ /// the value already in memory, after we adjust it for where the
/// PIC base is.
reloc_picrel_word = 1,
- /// reloc_absolute_word, reloc_absolute_dword - Absolute relocation, just
- /// add the relocated value to the value already in memory.
+ /// \brief Absolute relocation, just add the relocated value to the
+ /// value already in memory.
reloc_absolute_word = 2,
reloc_absolute_dword = 3
};
@@ -1035,7 +1214,7 @@ closed. For example:
namespace llvm {
namespace knowledge {
- /// Grokable - This class represents things that Smith can have an intimate
+ /// This class represents things that Smith can have an intimate
/// understanding of and contains the data associated with it.
class Grokable {
...
@@ -1092,7 +1271,7 @@ good:
};
} // end anonymous namespace
- static void Helper() {
+ static void runHelper() {
...
}
@@ -1112,7 +1291,7 @@ This is bad:
bool operator<(const char *RHS) const;
};
- void Helper() {
+ void runHelper() {
...
}
@@ -1122,7 +1301,7 @@ This is bad:
} // end anonymous namespace
-This is bad specifically because if you're looking at "``Helper``" in the middle
+This is bad specifically because if you're looking at "``runHelper``" in the middle
of a large C++ file, that you have no immediate way to tell if it is local to
the file. When it is marked static explicitly, this is immediately obvious.
Also, there is no reason to enclose the definition of "``operator<``" in the
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 51a9bf6293b3..1d7a462bd71f 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -45,6 +45,11 @@ OPTIONS
+**--input-file** *filename*
+
+ File to check (defaults to stdin).
+
+
**--strict-whitespace**
By default, FileCheck canonicalizes input horizontal whitespace (spaces and
@@ -271,8 +276,9 @@ simple example:
The first check line matches a regex (**%[a-z]+**) and captures it into
the variable "REGISTER". The second line verifies that whatever is in REGISTER
occurs later in the file after an "andw". FileCheck variable references are
-always contained in **[[ ]]** pairs, are named, and their names can be
-name, then it is a definition of the variable, if not, it is a use.
+always contained in **[[ ]]** pairs, and their names can be formed with the
+regex **[a-zA-Z][a-zA-Z0-9]***. If a colon follows the name, then it is a
+definition of the variable; otherwise, it is a use.
FileCheck variables can be defined multiple times, and uses always get the
latest value. Note that variables are all read at the start of a "CHECK" line
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index 3eb0be91f137..9e96cd2a4bfd 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -125,6 +125,10 @@ EXECUTION OPTIONS
*--error-exitcode* argument for valgrind is used so that valgrind failures will
cause the program to exit with a non-zero status.
+ When this option is enabled, **lit** will also automatically provide a
+ "valgrind" feature that can be used to conditionally disable (or expect failure
+ in) certain tests.
+
**--vg-arg**\ =\ *ARG*
@@ -133,6 +137,15 @@ EXECUTION OPTIONS
+**--vg-leak**
+
+ When *--vg* is used, enable memory leak checks. When this option is enabled,
+ **lit** will also automatically provide a "vg_leak" feature that can be
+ used to conditionally disable (or expect failure in) certain tests.
+
+
+
+
**--time-tests**
Track the wall time individual tests take to execute and includes the results in
diff --git a/docs/CompilerWriterInfo.html b/docs/CompilerWriterInfo.html
deleted file mode 100644
index 67da783b1679..000000000000
--- a/docs/CompilerWriterInfo.html
+++ /dev/null
@@ -1,267 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Architecture/platform information for compiler writers</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-
-<body>
-
-<h1>
- Architecture/platform information for compiler writers
-</h1>
-
-<div class="doc_warning">
- <p>Note: This document is a work-in-progress. Additions and clarifications
- are welcome.</p>
-</div>
-
-<ol>
- <li><a href="#hw">Hardware</a>
- <ol>
- <li><a href="#arm">ARM</a></li>
- <li><a href="#ia64">Itanium</a></li>
- <li><a href="#mips">MIPS</a></li>
- <li><a href="#ppc">PowerPC</a></li>
- <li><a href="#sparc">SPARC</a></li>
- <li><a href="#x86">X86</a></li>
- <li><a href="#other">Other lists</a></li>
- </ol></li>
- <li><a href="#abi">Application Binary Interface (ABI)</a>
- <ol>
- <li><a href="#linux">Linux</a></li>
- <li><a href="#osx">OS X</a></li>
- </ol></li>
- <li><a href="#misc">Miscellaneous resources</a></li>
-</ol>
-
-<div class="doc_author">
- <p>Compiled by <a href="http://misha.brukman.net">Misha Brukman</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="hw">Hardware</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<!-- ======================================================================= -->
-<h3><a name="arm">ARM</a></h3>
-
-<div>
-<ul>
-<li><a href="http://www.arm.com/documentation/">ARM documentation</a>
-(<a href="http://www.arm.com/documentation/ARMProcessor_Cores/">Processor
-Cores</a>)</li>
-<li><a href="http://www.arm.com/products/DevTools/ABI.html">ABI</a></li>
-</ul>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="ia64">Itanium (ia64)</a></h3>
-
-<div>
-<ul>
-<li><a
-href="http://developer.intel.com/design/itanium2/documentation.htm">Itanium documentation</a>
-</li>
-</ul>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="mips">MIPS</a></h3>
-
-<div>
-<ul>
-<li><a
-href="http://mips.com/content/Documentation/MIPSDocumentation/ProcessorArchitecture/doclibrary">MIPS
-Processor Architecture</a></li>
-</ul>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="ppc">PowerPC</a></h3>
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>IBM - Official manuals and docs</h4>
-
-<div>
-
-<ul>
-<li><a
-href="http://www-106.ibm.com/developerworks/eserver/articles/archguide.html">PowerPC
-Architecture Book</a>
-<ul>
- <li>Book I: <a
- href="http://www-106.ibm.com/developerworks/eserver/pdfs/archpub1.pdf">PowerPC
- User Instruction Set Architecture</a></li>
- <li>Book II: <a
- href="http://www-106.ibm.com/developerworks/eserver/pdfs/archpub2.pdf">PowerPC
- Virtual Environment Architecture</a></li>
- <li>Book III: <a
- href="http://www-106.ibm.com/developerworks/eserver/pdfs/archpub3.pdf">PowerPC
- Operating Environment Architecture</a></li>
-</ul></li>
-<li><a
-href="http://www-3.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF7785256996007558C6">PowerPC
-Compiler Writer's Guide</a></li>
-<li><A
-href="http://www-3.ibm.com/chips/techlib/techlib.nsf/products/PowerPC">PowerPC
-Processor Manuals</a></li>
-<li><a
-href="http://www-106.ibm.com/developerworks/linux/library/l-powarch/">Intro to
-PowerPC architecture</a></li>
-<li><a href="http://publibn.boulder.ibm.com/doc_link/en_US/a_doc_lib/aixassem/alangref/alangreftfrm.htm">IBM AIX/5L for POWER Assembly reference</a></li>
-</ul>
-
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>Other documents, collections, notes</h4>
-
-<div>
-
-<ul>
-<li><a href="http://penguinppc.org/dev/#library">PowerPC ABI documents</a></li>
-<li><a href="http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00997.html">PowerPC64
-alignment of long doubles (from GCC)</a></li>
-<li><a href="http://sources.redhat.com/ml/binutils/2002-04/msg00573.html">Long
-branch stubs for powerpc64-linux (from binutils)</a></li>
-</ul>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="sparc">SPARC</a></h3>
-
-<div>
-
-<ul>
-<li><a href="http://www.sparc.org/resource.htm">SPARC resources</a></li>
-<li><a href="http://www.sparc.org/standards.html">SPARC standards</a></li>
-</ul>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="x86">X86</a></h3>
-
-<div>
-
-<!-- _______________________________________________________________________ -->
-<h4>AMD - Official manuals and docs</h4>
-
-<div>
-<ul>
-<li><a
-href="http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739,00.html">AMD processor manuals</a></li>
-<li><a href="http://www.x86-64.org/documentation">X86-64 ABI</a></li>
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>Intel - Official manuals and docs</h4>
-
-<div>
-<ul>
-<li><a
-href="http://developer.intel.com/design/pentium4/manuals/index_new.htm">IA-32
-manuals</a></li>
-<li><a
-href="http://www.intel.com/design/itanium/documentation.htm?iid=ipp_srvr_proc_itanium2+techdocs">Intel
-Itanium documentation</a></li>
-</ul>
-</div>
-
-<!-- _______________________________________________________________________ -->
-<h4>Other x86-specific information</h4>
-
-<div>
-<ul>
-<li><a href="http://www.agner.org/assem/calling_conventions.pdf">Calling
-conventions for different C++ compilers and operating systems</a></li>
-</ul>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="other">Other relevant lists</a></h3>
-
-<div>
-
-<ul>
-<li><a href="http://gcc.gnu.org/readings.html">GCC reading list</a></li>
-</ul>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="abi">ABI</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<!-- ======================================================================= -->
-<h3><a name="linux">Linux</a></h3>
-
-<div>
-<ol>
-<li><a href="http://www.linuxbase.org/spec/ELF/ppc64/">PowerPC 64-bit ELF ABI
-Supplement</a></li>
-</ol>
-</div>
-
-<!-- ======================================================================= -->
-<h3><a name="osx">OS X</a></h3>
-
-<div>
-<ol>
-<li><a
-href="http://developer.apple.com/documentation/Darwin/RuntimeArchitecture-date.html">Mach-O
-Runtime Architecture</a></li>
-<li><a href="http://www.unsanity.org/archives/000044.php">Notes on Mach-O
-ABI</a></li>
-</ol>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="misc">Miscellaneous resources</a></h2>
-<!-- *********************************************************************** -->
-
-<ul>
-<li><a
-href="http://www.nondot.org/sabre/os/articles/ExecutableFileFormats/">Executable
-File Format library</a></li>
-<li><a href="http://gcc.gnu.org/projects/prefetch.html">GCC prefetch project</a>
-page has a good survey of the prefetching capabilities of a variety of modern
-processors.</li>
-</ul>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://misha.brukman.net">Misha Brukman</a><br>
- <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst
new file mode 100644
index 000000000000..e41f5f9eecea
--- /dev/null
+++ b/docs/CompilerWriterInfo.rst
@@ -0,0 +1,118 @@
+.. _compiler_writer_info:
+
+========================================================
+Architecture & Platform Information for Compiler Writers
+========================================================
+
+.. contents::
+ :local:
+
+.. note::
+
+ This document is a work-in-progress. Additions and clarifications are
+ welcome.
+
+ Compiled by `Misha Brukman <http://misha.brukman.net>`_.
+
+Hardware
+========
+
+ARM
+---
+
+* `ARM documentation <http://www.arm.com/documentation/>`_ (`Processor Cores <http://www.arm.com/documentation/ARMProcessor_Cores/>`_)
+
+* `ABI <http://www.arm.com/products/DevTools/ABI.html>`_
+
+Itanium (ia64)
+--------------
+
+* `Itanium documentation <http://developer.intel.com/design/itanium2/documentation.htm>`_
+
+MIPS
+----
+
+* `MIPS Processor Architecture <http://mips.com/content/Documentation/MIPSDocumentation/ProcessorArchitecture/doclibrary>`_
+
+PowerPC
+-------
+
+IBM - Official manuals and docs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* `PowerPC Architecture Book <http://www-106.ibm.com/developerworks/eserver/articles/archguide.html>`_
+
+ * Book I: `PowerPC User Instruction Set Architecture <http://www-106.ibm.com/developerworks/eserver/pdfs/archpub1.pdf>`_
+
+ * Book II: `PowerPC Virtual Environment Architecture <http://www-106.ibm.com/developerworks/eserver/pdfs/archpub2.pdf>`_
+
+ * Book III: `PowerPC Operating Environment Architecture <http://www-106.ibm.com/developerworks/eserver/pdfs/archpub3.pdf>`_
+
+* `PowerPC Compiler Writer's Guide <http://www-3.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF7785256996007558C6>`_
+
+* `PowerPC Processor Manuals <http://www-3.ibm.com/chips/techlib/techlib.nsf/products/PowerPC>`_
+
+* `Intro to PowerPC Architecture <http://www-106.ibm.com/developerworks/linux/library/l-powarch/>`_
+
+* `IBM AIX/5L for POWER Assembly Reference <http://publibn.boulder.ibm.com/doc_link/en_US/a_doc_lib/aixassem/alangref/alangreftfrm.htm>`_
+
+Other documents, collections, notes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* `PowerPC ABI documents <http://penguinppc.org/dev/#library>`_
+* `PowerPC64 alignment of long doubles (from GCC) <http://gcc.gnu.org/ml/gcc-patches/2003-09/msg00997.html>`_
+* `Long branch stubs for powerpc64-linux (from binutils) <http://sources.redhat.com/ml/binutils/2002-04/msg00573.html>`_
+
+SPARC
+-----
+
+* `SPARC resources <http://www.sparc.org/resource.htm>`_
+* `SPARC standards <http://www.sparc.org/standards.html>`_
+
+X86
+---
+
+AMD - Official manuals and docs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* `AMD processor manuals <http://www.amd.com/us-en/Processors/TechnicalResources/0,,30_182_739,00.html>`_
+* `X86-64 ABI <http://www.x86-64.org/documentation>`_
+
+Intel - Official manuals and docs
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* `IA-32 manuals <http://developer.intel.com/design/pentium4/manuals/index_new.htm>`_
+* `Intel Itanium documentation <http://www.intel.com/design/itanium/documentation.htm?iid=ipp_srvr_proc_itanium2+techdocs>`_
+
+Other x86-specific information
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+* `Calling conventions for different C++ compilers and operating systems <http://www.agner.org/assem/calling_conventions.pdf>`_
+
+Other relevant lists
+--------------------
+
+* `GCC reading list <http://gcc.gnu.org/readings.html>`_
+
+ABI
+===
+
+Linux
+-----
+
+* `PowerPC 64-bit ELF ABI Supplement <http://www.linuxbase.org/spec/ELF/ppc64/>`_
+
+OS X
+----
+
+* `Mach-O Runtime Architecture <http://developer.apple.com/documentation/Darwin/RuntimeArchitecture-date.html>`_
+* `Notes on Mach-O ABI <http://www.unsanity.org/archives/000044.php>`_
+
+Miscellaneous Resources
+=======================
+
+* `Executable File Format library <http://www.nondot.org/sabre/os/articles/ExecutableFileFormats/>`_
+
+* `GCC prefetch project <http://gcc.gnu.org/projects/prefetch.html>`_ page has a
+ good survey of the prefetching capabilities of a variety of modern
+ processors.
diff --git a/docs/DebuggingJITedCode.html b/docs/DebuggingJITedCode.html
deleted file mode 100644
index 652572cc8441..000000000000
--- a/docs/DebuggingJITedCode.html
+++ /dev/null
@@ -1,184 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Debugging JITed Code With GDB</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>Debugging JIT-ed Code With GDB</h1>
-<ol>
- <li><a href="#background">Background</a></li>
- <li><a href="#gdbversion">GDB Version</a></li>
- <li><a href="#mcjitdebug">Debugging MCJIT-ed code</a></li>
- <ul>
- <li><a href="#mcjitdebug_example">Example</a></li>
- </ul>
-</ol>
-<div class="doc_author">Written by Reid Kleckner and Eli Bendersky</div>
-
-<!--=========================================================================-->
-<h2><a name="background">Background</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>Without special runtime support, debugging dynamically generated code with
-GDB (as well as most debuggers) can be quite painful. Debuggers generally read
-debug information from the object file of the code, but for JITed code, there is
-no such file to look for.
-</p>
-
-<p>In order to communicate the necessary debug info to GDB, an interface for
-registering JITed code with debuggers has been designed and implemented for
-GDB and LLVM MCJIT. At a high level, whenever MCJIT generates new machine code,
-it does so in an in-memory object file that contains the debug information in
-DWARF format. MCJIT then adds this in-memory object file to a global list of
-dynamically generated object files and calls a special function
-(<tt>__jit_debug_register_code</tt>) marked noinline that GDB knows about. When
-GDB attaches to a process, it puts a breakpoint in this function and loads all
-of the object files in the global list. When MCJIT calls the registration
-function, GDB catches the breakpoint signal, loads the new object file from
-the inferior's memory, and resumes the execution. In this way, GDB can get the
-necessary debug information.
-</p>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="gdbversion">GDB Version</a></h2>
-<!--=========================================================================-->
-
-<p>In order to debug code JIT-ed by LLVM, you need GDB 7.0 or newer, which is
-available on most modern distributions of Linux. The version of GDB that Apple
-ships with Xcode has been frozen at 6.3 for a while. LLDB may be a better
-option for debugging JIT-ed code on Mac OS X.
-</p>
-
-
-<!--=========================================================================-->
-<h2><a name="mcjitdebug">Debugging MCJIT-ed code</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>The emerging MCJIT component of LLVM allows full debugging of JIT-ed code with
-GDB. This is due to MCJIT's ability to use the MC emitter to provide full
-DWARF debugging information to GDB.</p>
-
-<p>Note that lli has to be passed the <tt>-use-mcjit</tt> flag to JIT the code
-with MCJIT instead of the old JIT.</p>
-
-<h3><a name="mcjitdebug_example">Example</a></h3>
-
-<div>
-
-<p>Consider the following C code (with line numbers added to make the example
-easier to follow):</p>
-
-<pre class="doc_code">
-1 int compute_factorial(int n)
-2 {
-3 if (n <= 1)
-4 return 1;
-5
-6 int f = n;
-7 while (--n > 1)
-8 f *= n;
-9 return f;
-10 }
-11
-12
-13 int main(int argc, char** argv)
-14 {
-15 if (argc < 2)
-16 return -1;
-17 char firstletter = argv[1][0];
-18 int result = compute_factorial(firstletter - '0');
-19
-20 // Returned result is clipped at 255...
-21 return result;
-22 }
-</pre>
-
-<p>Here is a sample command line session that shows how to build and run this
-code via lli inside GDB:
-</p>
-
-<pre class="doc_code">
-$ $BINPATH/clang -cc1 -O0 -g -emit-llvm showdebug.c
-$ gdb --quiet --args $BINPATH/lli -use-mcjit showdebug.ll 5
-Reading symbols from $BINPATH/lli...done.
-(gdb) b showdebug.c:6
-No source file named showdebug.c.
-Make breakpoint pending on future shared library load? (y or [n]) y
-Breakpoint 1 (showdebug.c:6) pending.
-(gdb) r
-Starting program: $BINPATH/lli -use-mcjit showdebug.ll 5
-[Thread debugging using libthread_db enabled]
-
-Breakpoint 1, compute_factorial (n=5) at showdebug.c:6
-6 int f = n;
-(gdb) p n
-$1 = 5
-(gdb) p f
-$2 = 0
-(gdb) n
-7 while (--n > 1)
-(gdb) p f
-$3 = 5
-(gdb) b showdebug.c:9
-Breakpoint 2 at 0x7ffff7ed404c: file showdebug.c, line 9.
-(gdb) c
-Continuing.
-
-Breakpoint 2, compute_factorial (n=1) at showdebug.c:9
-9 return f;
-(gdb) p f
-$4 = 120
-(gdb) bt
-#0 compute_factorial (n=1) at showdebug.c:9
-#1 0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
-#2 0x3500000001652748 in ?? ()
-#3 0x00000000016677e0 in ?? ()
-#4 0x0000000000000002 in ?? ()
-#5 0x0000000000d953b3 in llvm::MCJIT::runFunction (this=0x16151f0, F=0x1603020, ArgValues=...) at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/MCJIT/MCJIT.cpp:161
-#6 0x0000000000dc8872 in llvm::ExecutionEngine::runFunctionAsMain (this=0x16151f0, Fn=0x1603020, argv=..., envp=0x7fffffffe040)
- at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/ExecutionEngine.cpp:397
-#7 0x000000000059c583 in main (argc=4, argv=0x7fffffffe018, envp=0x7fffffffe040) at /home/ebenders_test/llvm_svn_rw/tools/lli/lli.cpp:324
-(gdb) finish
-Run till exit from #0 compute_factorial (n=1) at showdebug.c:9
-0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
-18 int result = compute_factorial(firstletter - '0');
-Value returned is $5 = 120
-(gdb) p result
-$6 = 23406408
-(gdb) n
-21 return result;
-(gdb) p result
-$7 = 120
-(gdb) c
-Continuing.
-
-Program exited with code 0170.
-(gdb)
-
-</pre>
-
-</div>
-</div>
-
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="mailto:reid.kleckner@gmail.com">Reid Kleckner</a>,
- <a href="mailto:eliben@gmail.com">Eli Bendersky</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-13 16:36:15 +0200 (Sun, 13 May 2012) $
-</address>
-</body>
-</html>
diff --git a/docs/DebuggingJITedCode.rst b/docs/DebuggingJITedCode.rst
new file mode 100644
index 000000000000..eeb2f7787dae
--- /dev/null
+++ b/docs/DebuggingJITedCode.rst
@@ -0,0 +1,147 @@
+.. _debugging-jited-code:
+
+==============================
+Debugging JIT-ed Code With GDB
+==============================
+
+.. sectionauthor:: Reid Kleckner and Eli Bendersky
+
+Background
+==========
+
+Without special runtime support, debugging dynamically generated code with
+GDB (as well as most debuggers) can be quite painful. Debuggers generally
+read debug information from the object file of the code, but for JITed
+code, there is no such file to look for.
+
+In order to communicate the necessary debug info to GDB, an interface for
+registering JITed code with debuggers has been designed and implemented for
+GDB and LLVM MCJIT. At a high level, whenever MCJIT generates new machine code,
+it does so in an in-memory object file that contains the debug information in
+DWARF format. MCJIT then adds this in-memory object file to a global list of
+dynamically generated object files and calls a special function
+(``__jit_debug_register_code``) marked noinline that GDB knows about. When
+GDB attaches to a process, it puts a breakpoint in this function and loads all
+of the object files in the global list. When MCJIT calls the registration
+function, GDB catches the breakpoint signal, loads the new object file from
+the inferior's memory, and resumes the execution. In this way, GDB can get the
+necessary debug information.
+
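+For reference, the registration interface that GDB looks for is small. The
+sketch below shows its rough shape; the authoritative declarations are in
+GDB's JIT interface documentation, and this rendering is only illustrative:
+
+.. code-block:: c++
+
+  #include <stdint.h>
+
+  // One entry per in-memory object file produced by the JIT.
+  struct jit_code_entry {
+    jit_code_entry *next_entry;
+    jit_code_entry *prev_entry;
+    const char *symfile_addr;  // start of the in-memory object image
+    uint64_t symfile_size;     // size of that image in bytes
+  };
+
+  // Global list head that the debugger walks when it attaches.
+  struct jit_descriptor {
+    uint32_t version;
+    uint32_t action_flag;      // register or unregister relevant_entry
+    jit_code_entry *relevant_entry;
+    jit_code_entry *first_entry;
+  };
+
+  extern "C" jit_descriptor __jit_debug_descriptor;
+
+  // GDB sets a breakpoint here; the JIT calls it after updating the list.
+  extern "C" void __attribute__((noinline)) __jit_debug_register_code() {}
+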
+GDB Version
+===========
+
+In order to debug code JIT-ed by LLVM, you need GDB 7.0 or newer, which is
+available on most modern distributions of Linux. The version of GDB that
+Apple ships with Xcode has been frozen at 6.3 for a while. LLDB may be a
+better option for debugging JIT-ed code on Mac OS X.
+
+
+Debugging MCJIT-ed code
+=======================
+
+The emerging MCJIT component of LLVM allows full debugging of JIT-ed code with
+GDB. This is due to MCJIT's ability to use the MC emitter to provide full
+DWARF debugging information to GDB.
+
+Note that lli has to be passed the ``-use-mcjit`` flag to JIT the code with
+MCJIT instead of the old JIT.
+
+Example
+-------
+
+Consider the following C code (with line numbers added to make the example
+easier to follow):
+
+..
+ FIXME:
+ Sphinx has the ability to automatically number these lines by adding
+ :linenos: on the line immediately following the `.. code-block:: c`, but
+ it looks like garbage; the line numbers don't even line up with the
+ lines. Is this a Sphinx bug, or is it a CSS problem?
+
+.. code-block:: c
+
+ 1 int compute_factorial(int n)
+ 2 {
+ 3 if (n <= 1)
+ 4 return 1;
+ 5
+ 6 int f = n;
+ 7 while (--n > 1)
+ 8 f *= n;
+ 9 return f;
+ 10 }
+ 11
+ 12
+ 13 int main(int argc, char** argv)
+ 14 {
+ 15 if (argc < 2)
+ 16 return -1;
+ 17 char firstletter = argv[1][0];
+ 18 int result = compute_factorial(firstletter - '0');
+ 19
+ 20 // Returned result is clipped at 255...
+ 21 return result;
+ 22 }
+
+Here is a sample command line session that shows how to build and run this
+code via ``lli`` inside GDB:
+
+.. code-block:: bash
+
+ $ $BINPATH/clang -cc1 -O0 -g -emit-llvm showdebug.c
+ $ gdb --quiet --args $BINPATH/lli -use-mcjit showdebug.ll 5
+ Reading symbols from $BINPATH/lli...done.
+ (gdb) b showdebug.c:6
+ No source file named showdebug.c.
+ Make breakpoint pending on future shared library load? (y or [n]) y
+ Breakpoint 1 (showdebug.c:6) pending.
+ (gdb) r
+ Starting program: $BINPATH/lli -use-mcjit showdebug.ll 5
+ [Thread debugging using libthread_db enabled]
+
+ Breakpoint 1, compute_factorial (n=5) at showdebug.c:6
+ 6 int f = n;
+ (gdb) p n
+ $1 = 5
+ (gdb) p f
+ $2 = 0
+ (gdb) n
+ 7 while (--n > 1)
+ (gdb) p f
+ $3 = 5
+ (gdb) b showdebug.c:9
+ Breakpoint 2 at 0x7ffff7ed404c: file showdebug.c, line 9.
+ (gdb) c
+ Continuing.
+
+ Breakpoint 2, compute_factorial (n=1) at showdebug.c:9
+ 9 return f;
+ (gdb) p f
+ $4 = 120
+ (gdb) bt
+ #0 compute_factorial (n=1) at showdebug.c:9
+ #1 0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
+ #2 0x3500000001652748 in ?? ()
+ #3 0x00000000016677e0 in ?? ()
+ #4 0x0000000000000002 in ?? ()
+ #5 0x0000000000d953b3 in llvm::MCJIT::runFunction (this=0x16151f0, F=0x1603020, ArgValues=...) at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/MCJIT/MCJIT.cpp:161
+ #6 0x0000000000dc8872 in llvm::ExecutionEngine::runFunctionAsMain (this=0x16151f0, Fn=0x1603020, argv=..., envp=0x7fffffffe040)
+ at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/ExecutionEngine.cpp:397
+ #7 0x000000000059c583 in main (argc=4, argv=0x7fffffffe018, envp=0x7fffffffe040) at /home/ebenders_test/llvm_svn_rw/tools/lli/lli.cpp:324
+ (gdb) finish
+ Run till exit from #0 compute_factorial (n=1) at showdebug.c:9
+ 0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
+ 18 int result = compute_factorial(firstletter - '0');
+ Value returned is $5 = 120
+ (gdb) p result
+ $6 = 23406408
+ (gdb) n
+ 21 return result;
+ (gdb) p result
+ $7 = 120
+ (gdb) c
+ Continuing.
+
+ Program exited with code 0170.
+ (gdb)
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index cda281a25c12..e35e72955640 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -137,6 +137,9 @@ reviewees. If someone is kind enough to review your code, you should return the
favor for someone else. Note that anyone is welcome to review and give feedback
on a patch, but only people with Subversion write access can approve it.
+There is a web based code review tool that can optionally be used
+for code reviews. See :doc:`Phabricator`.
+
Code Owners
-----------
@@ -279,7 +282,7 @@ If you have recently been granted commit access, these policies apply:
#. You are granted *commit-after-approval* to all parts of LLVM. To get
approval, submit a `patch`_ to `llvm-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_. When approved
- you may commit it yourself.</li>
+ you may commit it yourself.
#. You are allowed to commit patches without approval which you think are
obvious. This is clearly a subjective decision --- we simply expect you to
diff --git a/docs/ExtendingLLVM.html b/docs/ExtendingLLVM.html
deleted file mode 100644
index 6782787d521d..000000000000
--- a/docs/ExtendingLLVM.html
+++ /dev/null
@@ -1,379 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Extending LLVM: Adding instructions, intrinsics, types, etc.</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-
-<body>
-
-<h1>
- Extending LLVM: Adding instructions, intrinsics, types, etc.
-</h1>
-
-<ol>
- <li><a href="#introduction">Introduction and Warning</a></li>
- <li><a href="#intrinsic">Adding a new intrinsic function</a></li>
- <li><a href="#instruction">Adding a new instruction</a></li>
- <li><a href="#sdnode">Adding a new SelectionDAG node</a></li>
- <li><a href="#type">Adding a new type</a>
- <ol>
- <li><a href="#fund_type">Adding a new fundamental type</a></li>
- <li><a href="#derived_type">Adding a new derived type</a></li>
- </ol></li>
-</ol>
-
-<div class="doc_author">
- <p>Written by <a href="http://misha.brukman.net">Misha Brukman</a>,
- Brad Jones, Nate Begeman,
- and <a href="http://nondot.org/sabre">Chris Lattner</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="introduction">Introduction and Warning</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>During the course of using LLVM, you may wish to customize it for your
-research project or for experimentation. At this point, you may realize that
-you need to add something to LLVM, whether it be a new fundamental type, a new
-intrinsic function, or a whole new instruction.</p>
-
-<p>When you come to this realization, stop and think. Do you really need to
-extend LLVM? Is it a new fundamental capability that LLVM does not support at
-its current incarnation or can it be synthesized from already pre-existing LLVM
-elements? If you are not sure, ask on the <a
-href="http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev">LLVM-dev</a> list. The
-reason is that extending LLVM will get involved as you need to update all the
-different passes that you intend to use with your extension, and there are
-<em>many</em> LLVM analyses and transformations, so it may be quite a bit of
-work.</p>
-
-<p>Adding an <a href="#intrinsic">intrinsic function</a> is far easier than
-adding an instruction, and is transparent to optimization passes. If your added
-functionality can be expressed as a
-function call, an intrinsic function is the method of choice for LLVM
-extension.</p>
-
-<p>Before you invest a significant amount of effort into a non-trivial
-extension, <span class="doc_warning">ask on the list</span> if what you are
-looking to do can be done with already-existing infrastructure, or if maybe
-someone else is already working on it. You will save yourself a lot of time and
-effort by doing so.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="intrinsic">Adding a new intrinsic function</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Adding a new intrinsic function to LLVM is much easier than adding a new
-instruction. Almost all extensions to LLVM should start as an intrinsic
-function and then be turned into an instruction if warranted.</p>
-
-<ol>
-<li><tt>llvm/docs/LangRef.html</tt>:
- Document the intrinsic. Decide whether it is code generator specific and
- what the restrictions are. Talk to other people about it so that you are
- sure it's a good idea.</li>
-
-<li><tt>llvm/include/llvm/Intrinsics*.td</tt>:
- Add an entry for your intrinsic. Describe its memory access characteristics
- for optimization (this controls whether it will be DCE'd, CSE'd, etc). Note
- that any intrinsic using the <tt>llvm_int_ty</tt> type for an argument will
- be deemed by <tt>tblgen</tt> as overloaded and the corresponding suffix
- will be required on the intrinsic's name.</li>
-
-<li><tt>llvm/lib/Analysis/ConstantFolding.cpp</tt>: If it is possible to
- constant fold your intrinsic, add support to it in the
- <tt>canConstantFoldCallTo</tt> and <tt>ConstantFoldCall</tt> functions.</li>
-
-<li><tt>llvm/test/Regression/*</tt>: Add test cases for your test cases to the
- test suite</li>
-</ol>
-
-<p>Once the intrinsic has been added to the system, you must add code generator
-support for it. Generally you must do the following steps:</p>
-
-<dl>
-
-<dt>Add support to the .td file for the target(s) of your choice in
- <tt>lib/Target/*/*.td</tt>.</dt>
-
-<dd>This is usually a matter of adding a pattern to the .td file that matches
- the intrinsic, though it may obviously require adding the instructions you
- want to generate as well. There are lots of examples in the PowerPC and X86
- backend to follow.</dd>
-</dl>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="sdnode">Adding a new SelectionDAG node</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>As with intrinsics, adding a new SelectionDAG node to LLVM is much easier
-than adding a new instruction. New nodes are often added to help represent
-instructions common to many targets. These nodes often map to an LLVM
-instruction (add, sub) or intrinsic (byteswap, population count). In other
-cases, new nodes have been added to allow many targets to perform a common task
-(converting between floating point and integer representation) or capture more
-complicated behavior in a single node (rotate).</p>
-
-<ol>
-<li><tt>include/llvm/CodeGen/ISDOpcodes.h</tt>:
- Add an enum value for the new SelectionDAG node.</li>
-<li><tt>lib/CodeGen/SelectionDAG/SelectionDAG.cpp</tt>:
- Add code to print the node to <tt>getOperationName</tt>. If your new node
- can be evaluated at compile time when given constant arguments (such as an
- add of a constant with another constant), find the <tt>getNode</tt> method
- that takes the appropriate number of arguments, and add a case for your node
- to the switch statement that performs constant folding for nodes that take
- the same number of arguments as your new node.</li>
-<li><tt>lib/CodeGen/SelectionDAG/LegalizeDAG.cpp</tt>:
- Add code to <a href="CodeGenerator.html#selectiondag_legalize">legalize,
- promote, and expand</a> the node as necessary. At a minimum, you will need
- to add a case statement for your node in <tt>LegalizeOp</tt> which calls
- LegalizeOp on the node's operands, and returns a new node if any of the
- operands changed as a result of being legalized. It is likely that not all
- targets supported by the SelectionDAG framework will natively support the
- new node. In this case, you must also add code in your node's case
- statement in <tt>LegalizeOp</tt> to Expand your node into simpler, legal
- operations. The case for <tt>ISD::UREM</tt> for expanding a remainder into
- a divide, multiply, and a subtract is a good example.</li>
-<li><tt>lib/CodeGen/SelectionDAG/LegalizeDAG.cpp</tt>:
- If targets may support the new node being added only at certain sizes, you
- will also need to add code to your node's case statement in
- <tt>LegalizeOp</tt> to Promote your node's operands to a larger size, and
- perform the correct operation. You will also need to add code to
- <tt>PromoteOp</tt> to do this as well. For a good example, see
- <tt>ISD::BSWAP</tt>,
- which promotes its operand to a wider size, performs the byteswap, and then
- shifts the correct bytes right to emulate the narrower byteswap in the
- wider type.</li>
-<li><tt>lib/CodeGen/SelectionDAG/LegalizeDAG.cpp</tt>:
- Add a case for your node in <tt>ExpandOp</tt> to teach the legalizer how to
- perform the action represented by the new node on a value that has been
- split into high and low halves. This case will be used to support your
- node with a 64 bit operand on a 32 bit target.</li>
-<li><tt>lib/CodeGen/SelectionDAG/DAGCombiner.cpp</tt>:
- If your node can be combined with itself, or other existing nodes in a
- peephole-like fashion, add a visit function for it, and call that function
- from <tt></tt>. There are several good examples for simple combines you
- can do; <tt>visitFABS</tt> and <tt>visitSRL</tt> are good starting places.
- </li>
-<li><tt>lib/Target/PowerPC/PPCISelLowering.cpp</tt>:
- Each target has an implementation of the <tt>TargetLowering</tt> class,
- usually in its own file (although some targets include it in the same
- file as the DAGToDAGISel). The default behavior for a target is to
- assume that your new node is legal for all types that are legal for
- that target. If this target does not natively support your node, then
- tell the target to either Promote it (if it is supported at a larger
- type) or Expand it. This will cause the code you wrote in
- <tt>LegalizeOp</tt> above to decompose your new node into other legal
- nodes for this target.</li>
-<li><tt>lib/Target/TargetSelectionDAG.td</tt>:
- Most current targets supported by LLVM generate code using the DAGToDAG
- method, where SelectionDAG nodes are pattern matched to target-specific
- nodes, which represent individual instructions. In order for the targets
- to match an instruction to your new node, you must add a def for that node
- to the list in this file, with the appropriate type constraints. Look at
- <tt>add</tt>, <tt>bswap</tt>, and <tt>fadd</tt> for examples.</li>
-<li><tt>lib/Target/PowerPC/PPCInstrInfo.td</tt>:
- Each target has a tablegen file that describes the target's instruction
- set. For targets that use the DAGToDAG instruction selection framework,
- add a pattern for your new node that uses one or more target nodes.
- Documentation for this is a bit sparse right now, but there are several
- decent examples. See the patterns for <tt>rotl</tt> in
- <tt>PPCInstrInfo.td</tt>.</li>
-<li>TODO: document complex patterns.</li>
-<li><tt>llvm/test/Regression/CodeGen/*</tt>: Add test cases for your new node
- to the test suite. <tt>llvm/test/Regression/CodeGen/X86/bswap.ll</tt> is
- a good example.</li>
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="instruction">Adding a new instruction</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p><span class="doc_warning">WARNING: adding instructions changes the bitcode
-format, and it will take some effort to maintain compatibility with
-the previous version.</span> Only add an instruction if it is absolutely
-necessary.</p>
-
-<ol>
-
-<li><tt>llvm/include/llvm/Instruction.def</tt>:
- add a number for your instruction and an enum name</li>
-
-<li><tt>llvm/include/llvm/Instructions.h</tt>:
- add a definition for the class that will represent your instruction</li>
-
-<li><tt>llvm/include/llvm/Support/InstVisitor.h</tt>:
- add a prototype for a visitor to your new instruction type</li>
-
-<li><tt>llvm/lib/AsmParser/Lexer.l</tt>:
- add a new token to parse your instruction from assembly text file</li>
-
-<li><tt>llvm/lib/AsmParser/llvmAsmParser.y</tt>:
- add the grammar on how your instruction can be read and what it will
- construct as a result</li>
-
-<li><tt>llvm/lib/Bitcode/Reader/Reader.cpp</tt>:
- add a case for your instruction and how it will be parsed from bitcode</li>
-
-<li><tt>llvm/lib/VMCore/Instruction.cpp</tt>:
- add a case for how your instruction will be printed out to assembly</li>
-
-<li><tt>llvm/lib/VMCore/Instructions.cpp</tt>:
- implement the class you defined in
- <tt>llvm/include/llvm/Instructions.h</tt></li>
-
-<li>Test your instruction</li>
-
-<li><tt>llvm/lib/Target/*</tt>:
- Add support for your instruction to code generators, or add a lowering
- pass.</li>
-
-<li><tt>llvm/test/Regression/*</tt>: add your test cases to the test suite.</li>
-
-</ol>
-
-<p>Also, you need to implement (or modify) any analyses or passes that you want
-to understand this new instruction.</p>
-
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="type">Adding a new type</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p><span class="doc_warning">WARNING: adding new types changes the bitcode
-format, and will break compatibility with currently-existing LLVM
-installations.</span> Only add new types if it is absolutely necessary.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="fund_type">Adding a fundamental type</a>
-</h3>
-
-<div>
-
-<ol>
-
-<li><tt>llvm/include/llvm/Type.h</tt>:
- add enum for the new type; add static <tt>Type*</tt> for this type</li>
-
-<li><tt>llvm/lib/VMCore/Type.cpp</tt>:
- add mapping from <tt>TypeID</tt> =&gt; <tt>Type*</tt>;
- initialize the static <tt>Type*</tt></li>
-
-<li><tt>llvm/lib/AsmReader/Lexer.l</tt>:
- add ability to parse in the type from text assembly</li>
-
-<li><tt>llvm/lib/AsmReader/llvmAsmParser.y</tt>:
- add a token for that type</li>
-
-</ol>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="derived_type">Adding a derived type</a>
-</h3>
-
-<div>
-
-<ol>
-<li><tt>llvm/include/llvm/Type.h</tt>:
- add enum for the new type; add a forward declaration of the type
- also</li>
-
-<li><tt>llvm/include/llvm/DerivedTypes.h</tt>:
- add new class to represent new class in the hierarchy; add forward
- declaration to the TypeMap value type</li>
-
-<li><tt>llvm/lib/VMCore/Type.cpp</tt>:
- add support for derived type to:
-<div class="doc_code">
-<pre>
-std::string getTypeDescription(const Type &amp;Ty,
- std::vector&lt;const Type*&gt; &amp;TypeStack)
-bool TypesEqual(const Type *Ty, const Type *Ty2,
- std::map&lt;const Type*, const Type*&gt; &amp; EqTypes)
-</pre>
-</div>
- add necessary member functions for type, and factory methods</li>
-
-<li><tt>llvm/lib/AsmReader/Lexer.l</tt>:
- add ability to parse in the type from text assembly</li>
-
-<li><tt>llvm/lib/BitCode/Writer/Writer.cpp</tt>:
- modify <tt>void BitcodeWriter::outputType(const Type *T)</tt> to serialize
- your type</li>
-
-<li><tt>llvm/lib/BitCode/Reader/Reader.cpp</tt>:
- modify <tt>const Type *BitcodeReader::ParseType()</tt> to read your data
- type</li>
-
-<li><tt>llvm/lib/VMCore/AsmWriter.cpp</tt>:
- modify
-<div class="doc_code">
-<pre>
-void calcTypeName(const Type *Ty,
- std::vector&lt;const Type*&gt; &amp;TypeStack,
- std::map&lt;const Type*,std::string&gt; &amp;TypeNames,
- std::string &amp; Result)
-</pre>
-</div>
- to output the new derived type
-</li>
-
-
-</ol>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
- <br>
- Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/ExtendingLLVM.rst b/docs/ExtendingLLVM.rst
new file mode 100644
index 000000000000..6df08eee985a
--- /dev/null
+++ b/docs/ExtendingLLVM.rst
@@ -0,0 +1,306 @@
+.. _extending_llvm:
+
+============================================================
+Extending LLVM: Adding instructions, intrinsics, types, etc.
+============================================================
+
+Introduction and Warning
+========================
+
+
+During the course of using LLVM, you may wish to customize it for your research
+project or for experimentation. At this point, you may realize that you need to
+add something to LLVM, whether it be a new fundamental type, a new intrinsic
+function, or a whole new instruction.
+
+When you come to this realization, stop and think. Do you really need to extend
+LLVM? Is it a new fundamental capability that LLVM does not support in its
+current incarnation, or can it be synthesized from pre-existing LLVM elements?
+If you are not sure, ask on the `LLVM-dev
+<http://mail.cs.uiuc.edu/mailman/listinfo/llvmdev>`_ list. The reason is that
+extending LLVM can become involved, as you need to update all the different
+passes that you intend to use with your extension, and there are *many* LLVM
+analyses and transformations, so it may be quite a bit of work.
+
+Adding an `intrinsic function`_ is far easier than adding an
+instruction, and is transparent to optimization passes. If your added
+functionality can be expressed as a function call, an intrinsic function is the
+method of choice for LLVM extension.
+
+Before you invest a significant amount of effort into a non-trivial extension,
+**ask on the list** if what you are looking to do can be done with
+already-existing infrastructure, or if maybe someone else is already working on
+it. You will save yourself a lot of time and effort by doing so.
+
+.. _intrinsic function:
+
+Adding a new intrinsic function
+===============================
+
+Adding a new intrinsic function to LLVM is much easier than adding a new
+instruction. Almost all extensions to LLVM should start as an intrinsic
+function and then be turned into an instruction if warranted.
+
+#. ``llvm/docs/LangRef.html``:
+
+ Document the intrinsic. Decide whether it is code generator specific and
+ what the restrictions are. Talk to other people about it so that you are
+ sure it's a good idea.
+
+#. ``llvm/include/llvm/Intrinsics*.td``:
+
+ Add an entry for your intrinsic. Describe its memory access characteristics
+ for optimization (this controls whether it will be DCE'd, CSE'd, etc). Note
+ that any intrinsic using the ``llvm_int_ty`` type for an argument will
+ be deemed by ``tblgen`` as overloaded and the corresponding suffix will
+ be required on the intrinsic's name.
+
+#. ``llvm/lib/Analysis/ConstantFolding.cpp``:
+
+   If it is possible to constant fold your intrinsic, add support for it in the
+   ``canConstantFoldCallTo`` and ``ConstantFoldCall`` functions (see the sketch
+   after this list).
+
+#. ``llvm/test/Regression/*``:
+
+   Add test cases for your intrinsic to the test suite.
+
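+For the ``ConstantFolding.cpp`` step above, the change is usually a pair of
+small additions. The following sketch uses a hypothetical
+``Intrinsic::my_intrinsic`` purely for illustration:
+
+.. code-block:: c++
+
+  // In canConstantFoldCallTo, report that the intrinsic is foldable.
+  case Intrinsic::my_intrinsic:
+    return true;
+
+  // In ConstantFoldCall, produce the folded constant when the operand is
+  // known at compile time (here: a byte swap, as an arbitrary example).
+  if (ConstantInt *Op = dyn_cast<ConstantInt>(Operands[0]))
+    return ConstantInt::get(F->getContext(), Op->getValue().byteSwap());
+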
+Once the intrinsic has been added to the system, you must add code generator
+support for it. Generally you must do the following steps:
+
+Add support to the .td file for the target(s) of your choice in
+``lib/Target/*/*.td``.
+
+ This is usually a matter of adding a pattern to the .td file that matches the
+ intrinsic, though it may obviously require adding the instructions you want to
+   generate as well. There are lots of examples in the PowerPC and X86 backends
+   to follow.
+
+Adding a new SelectionDAG node
+==============================
+
+As with intrinsics, adding a new SelectionDAG node to LLVM is much easier than
+adding a new instruction. New nodes are often added to help represent
+instructions common to many targets. These nodes often map to an LLVM
+instruction (add, sub) or intrinsic (byteswap, population count). In other
+cases, new nodes have been added to allow many targets to perform a common task
+(converting between floating point and integer representation) or capture more
+complicated behavior in a single node (rotate).
+
+#. ``include/llvm/CodeGen/ISDOpcodes.h``:
+
+   Add an enum value for the new SelectionDAG node (a short sketch follows
+   this list).
+
+#. ``lib/CodeGen/SelectionDAG/SelectionDAG.cpp``:
+
+ Add code to print the node to ``getOperationName``. If your new node can be
+ evaluated at compile time when given constant arguments (such as an add of a
+ constant with another constant), find the ``getNode`` method that takes the
+ appropriate number of arguments, and add a case for your node to the switch
+ statement that performs constant folding for nodes that take the same number
+ of arguments as your new node.
+
+#. ``lib/CodeGen/SelectionDAG/LegalizeDAG.cpp``:
+
+ Add code to `legalize, promote, and expand
+ <CodeGenerator.html#selectiondag_legalize>`_ the node as necessary. At a
+ minimum, you will need to add a case statement for your node in
+ ``LegalizeOp`` which calls LegalizeOp on the node's operands, and returns a
+ new node if any of the operands changed as a result of being legalized. It
+ is likely that not all targets supported by the SelectionDAG framework will
+ natively support the new node. In this case, you must also add code in your
+ node's case statement in ``LegalizeOp`` to Expand your node into simpler,
+ legal operations. The case for ``ISD::UREM`` for expanding a remainder into
+ a divide, multiply, and a subtract is a good example.
+
+#. ``lib/CodeGen/SelectionDAG/LegalizeDAG.cpp``:
+
+ If targets may support the new node being added only at certain sizes, you
+ will also need to add code to your node's case statement in ``LegalizeOp``
+ to Promote your node's operands to a larger size, and perform the correct
+ operation. You will also need to add code to ``PromoteOp`` to do this as
+ well. For a good example, see ``ISD::BSWAP``, which promotes its operand to
+ a wider size, performs the byteswap, and then shifts the correct bytes right
+ to emulate the narrower byteswap in the wider type.
+
+#. ``lib/CodeGen/SelectionDAG/LegalizeDAG.cpp``:
+
+ Add a case for your node in ``ExpandOp`` to teach the legalizer how to
+ perform the action represented by the new node on a value that has been split
+ into high and low halves. This case will be used to support your node with a
+ 64 bit operand on a 32 bit target.
+
+#. ``lib/CodeGen/SelectionDAG/DAGCombiner.cpp``:
+
+ If your node can be combined with itself, or other existing nodes in a
+ peephole-like fashion, add a visit function for it, and call that function
+   from the combiner's dispatch routine. There are several good examples for
+   simple combines you can do; ``visitFABS`` and ``visitSRL`` are good starting
+   places.
+
+#. ``lib/Target/PowerPC/PPCISelLowering.cpp``:
+
+ Each target has an implementation of the ``TargetLowering`` class, usually in
+ its own file (although some targets include it in the same file as the
+ DAGToDAGISel). The default behavior for a target is to assume that your new
+ node is legal for all types that are legal for that target. If this target
+ does not natively support your node, then tell the target to either Promote
+ it (if it is supported at a larger type) or Expand it. This will cause the
+ code you wrote in ``LegalizeOp`` above to decompose your new node into other
+ legal nodes for this target.
+
+#. ``lib/Target/TargetSelectionDAG.td``:
+
+ Most current targets supported by LLVM generate code using the DAGToDAG
+ method, where SelectionDAG nodes are pattern matched to target-specific
+ nodes, which represent individual instructions. In order for the targets to
+ match an instruction to your new node, you must add a def for that node to
+ the list in this file, with the appropriate type constraints. Look at
+ ``add``, ``bswap``, and ``fadd`` for examples.
+
+#. ``lib/Target/PowerPC/PPCInstrInfo.td``:
+
+ Each target has a tablegen file that describes the target's instruction set.
+ For targets that use the DAGToDAG instruction selection framework, add a
+ pattern for your new node that uses one or more target nodes. Documentation
+ for this is a bit sparse right now, but there are several decent examples.
+ See the patterns for ``rotl`` in ``PPCInstrInfo.td``.
+
+#. TODO: document complex patterns.
+
+#. ``llvm/test/Regression/CodeGen/*``:
+
+ Add test cases for your new node to the test suite.
+ ``llvm/test/Regression/CodeGen/X86/bswap.ll`` is a good example.
+
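+To give a feel for the first two steps above, a minimal sketch might look like
+this (``MYNODE`` is a placeholder opcode, not part of LLVM):
+
+.. code-block:: c++
+
+  // include/llvm/CodeGen/ISDOpcodes.h -- add an opcode for the node.
+  MYNODE,
+
+  // lib/CodeGen/SelectionDAG/SelectionDAG.cpp, in getOperationName():
+  case ISD::MYNODE: return "mynode";
+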
+Adding a new instruction
+========================
+
+.. warning::
+
+ Adding instructions changes the bitcode format, and it will take some effort
+ to maintain compatibility with the previous version. Only add an instruction
+ if it is absolutely necessary.
+
+#. ``llvm/include/llvm/Instruction.def``:
+
+   add a number for your instruction and an enum name (see the sketch after
+   this list)
+
+#. ``llvm/include/llvm/Instructions.h``:
+
+ add a definition for the class that will represent your instruction
+
+#. ``llvm/include/llvm/Support/InstVisitor.h``:
+
+ add a prototype for a visitor to your new instruction type
+
+#. ``llvm/lib/AsmParser/Lexer.l``:
+
+ add a new token to parse your instruction from assembly text file
+
+#. ``llvm/lib/AsmParser/llvmAsmParser.y``:
+
+ add the grammar on how your instruction can be read and what it will
+ construct as a result
+
+#. ``llvm/lib/Bitcode/Reader/Reader.cpp``:
+
+ add a case for your instruction and how it will be parsed from bitcode
+
+#. ``llvm/lib/VMCore/Instruction.cpp``:
+
+ add a case for how your instruction will be printed out to assembly
+
+#. ``llvm/lib/VMCore/Instructions.cpp``:
+
+ implement the class you defined in ``llvm/include/llvm/Instructions.h``
+
+#. Test your instruction
+
+#. ``llvm/lib/Target/*``:
+
+ add support for your instruction to code generators, or add a lowering pass.
+
+#. ``llvm/test/Regression/*``:
+
+ add your test cases to the test suite.
+
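+The first two entries above might look like the following sketch (the opcode
+number and the ``MyOp``/``MyOpInst`` names are placeholders, not real LLVM
+instructions):
+
+.. code-block:: c++
+
+  // llvm/include/llvm/Instruction.def -- reserve a number and an enum name.
+  HANDLE_OTHER_INST(67, MyOp, MyOpInst)
+
+  // llvm/include/llvm/Instructions.h -- skeleton of the class that will
+  // represent the new instruction.
+  class MyOpInst : public Instruction {
+    // constructors, operand accessors, and classof() support go here
+  };
+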
+Also, you need to implement (or modify) any analyses or passes that you want to
+understand this new instruction.
+
+Adding a new type
+=================
+
+.. warning::
+
+ Adding new types changes the bitcode format, and will break compatibility with
+ currently-existing LLVM installations. Only add new types if it is absolutely
+ necessary.
+
+Adding a fundamental type
+-------------------------
+
+#. ``llvm/include/llvm/Type.h``:
+
+ add enum for the new type; add static ``Type*`` for this type
+
+#. ``llvm/lib/VMCore/Type.cpp``:
+
+ add mapping from ``TypeID`` => ``Type*``; initialize the static ``Type*``
+
+#. ``llvm/lib/AsmReader/Lexer.l``:
+
+ add ability to parse in the type from text assembly
+
+#. ``llvm/lib/AsmReader/llvmAsmParser.y``:
+
+ add a token for that type
+
+Adding a derived type
+---------------------
+
+#. ``llvm/include/llvm/Type.h``:
+
+ add enum for the new type; add a forward declaration of the type also
+
+#. ``llvm/include/llvm/DerivedTypes.h``:
+
+   add a new class to represent the new type in the hierarchy; add a forward
+   declaration to the TypeMap value type
+
+#. ``llvm/lib/VMCore/Type.cpp``:
+
+ add support for derived type to:
+
+ .. code-block:: c++
+
+ std::string getTypeDescription(const Type &Ty,
+ std::vector<const Type*> &TypeStack)
+ bool TypesEqual(const Type *Ty, const Type *Ty2,
+ std::map<const Type*, const Type*> &EqTypes)
+
+ add necessary member functions for type, and factory methods
+
+#. ``llvm/lib/AsmReader/Lexer.l``:
+
+ add ability to parse in the type from text assembly
+
+#. ``llvm/lib/BitCode/Writer/Writer.cpp``:
+
+ modify ``void BitcodeWriter::outputType(const Type *T)`` to serialize your
+ type
+
+#. ``llvm/lib/BitCode/Reader/Reader.cpp``:
+
+ modify ``const Type *BitcodeReader::ParseType()`` to read your data type
+
+#. ``llvm/lib/VMCore/AsmWriter.cpp``:
+
+ modify
+
+ .. code-block:: c++
+
+ void calcTypeName(const Type *Ty,
+ std::vector<const Type*> &TypeStack,
+ std::map<const Type*,std::string> &TypeNames,
+ std::string &Result)
+
+ to output the new derived type
diff --git a/docs/GarbageCollection.html b/docs/GarbageCollection.html
index 0b8f5889ac65..e12485167a72 100644
--- a/docs/GarbageCollection.html
+++ b/docs/GarbageCollection.html
@@ -1253,7 +1253,7 @@ methods. Here's a realistic example:</p>
>#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/Function.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetAsmInfo.h"
void MyGCPrinter::beginAssembly(std::ostream &amp;OS, AsmPrinter &amp;AP,
@@ -1266,7 +1266,7 @@ void MyGCPrinter::finishAssembly(std::ostream &amp;OS, AsmPrinter &amp;AP,
// Set up for emitting addresses.
const char *AddressDirective;
int AddressAlignLog;
- if (AP.TM.getTargetData()->getPointerSize() == sizeof(int32_t)) {
+ if (AP.TM.getDataLayout()->getPointerSize() == sizeof(int32_t)) {
AddressDirective = TAI.getData32bitsDirective();
AddressAlignLog = 2;
} else {
@@ -1382,7 +1382,7 @@ Fergus Henderson. International Symposium on Memory Management 2002.</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-03 17:25:19 +0200 (Thu, 03 May 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
diff --git a/docs/GettingStarted.html b/docs/GettingStarted.html
deleted file mode 100644
index 61335afd8329..000000000000
--- a/docs/GettingStarted.html
+++ /dev/null
@@ -1,1760 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Getting Started with LLVM System</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- Getting Started with the LLVM System
-</h1>
-
-<ul>
- <li><a href="#overview">Overview</a>
- <li><a href="#quickstart">Getting Started Quickly (A Summary)</a>
- <li><a href="#requirements">Requirements</a>
- <ol>
- <li><a href="#hardware">Hardware</a></li>
- <li><a href="#software">Software</a></li>
- <li><a href="#brokengcc">Broken versions of GCC and other tools</a></li>
- </ol></li>
-
- <li><a href="#starting">Getting Started with LLVM</a>
- <ol>
- <li><a href="#terminology">Terminology and Notation</a></li>
- <li><a href="#environment">Setting Up Your Environment</a></li>
- <li><a href="#unpack">Unpacking the LLVM Archives</a></li>
- <li><a href="#checkout">Checkout LLVM from Subversion</a></li>
- <li><a href="#git_mirror">LLVM GIT mirror</a></li>
- <li><a href="#config">Local LLVM Configuration</a></li>
- <li><a href="#compile">Compiling the LLVM Suite Source Code</a></li>
- <li><a href="#cross-compile">Cross-Compiling LLVM</a></li>
- <li><a href="#objfiles">The Location of LLVM Object Files</a></li>
- <li><a href="#optionalconfig">Optional Configuration Items</a></li>
- </ol></li>
-
- <li><a href="#layout">Program layout</a>
- <ol>
- <li><a href="#examples"><tt>llvm/examples</tt></a></li>
- <li><a href="#include"><tt>llvm/include</tt></a></li>
- <li><a href="#lib"><tt>llvm/lib</tt></a></li>
- <li><a href="#projects"><tt>llvm/projects</tt></a></li>
- <li><a href="#runtime"><tt>llvm/runtime</tt></a></li>
- <li><a href="#test"><tt>llvm/test</tt></a></li>
- <li><a href="#test-suite"><tt>test-suite</tt></a></li>
- <li><a href="#tools"><tt>llvm/tools</tt></a></li>
- <li><a href="#utils"><tt>llvm/utils</tt></a></li>
- </ol></li>
-
- <li><a href="#tutorial">An Example Using the LLVM Tool Chain</a>
- <ol>
- <li><a href="#tutorial4">Example with Clang</a></li>
- </ol>
- <li><a href="#problems">Common Problems</a>
- <li><a href="#links">Links</a>
-</ul>
-
-<div class="doc_author">
- <p>Written by:
- <a href="mailto:criswell@uiuc.edu">John Criswell</a>,
- <a href="mailto:sabre@nondot.org">Chris Lattner</a>,
- <a href="http://misha.brukman.net/">Misha Brukman</a>,
- <a href="http://www.cs.uiuc.edu/~vadve">Vikram Adve</a>, and
- <a href="mailto:gshi1@uiuc.edu">Guochun Shi</a>.
- </p>
-</div>
-
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="overview">Overview</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Welcome to LLVM! In order to get started, you first need to know some
-basic information.</p>
-
-<p>First, LLVM comes in three pieces. The first piece is the LLVM
-suite. This contains all of the tools, libraries, and header files
-needed to use LLVM. It contains an assembler, disassembler, bitcode
-analyzer and bitcode optimizer. It also contains basic regression tests that
-can be used to test the LLVM tools and the Clang front end.</p>
-
-<p>The second piece is the <a href="http://clang.llvm.org/">Clang</a> front end.
-This component compiles C, C++, Objective C, and Objective C++ code into LLVM
-bitcode. Once compiled into LLVM bitcode, a program can be manipulated with the
-LLVM tools from the LLVM suite.
-</p>
-
-<p>
-There is a third, optional piece called Test Suite. It is a suite of programs
-with a testing harness that can be used to further test LLVM's functionality
-and performance.
-</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="quickstart">Getting Started Quickly (A Summary)</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The LLVM Getting Started documentation may be out of date. So, the Clang
-<a href="http://clang.llvm.org/get_started.html">Getting Started</a> page might
-also be a good place to start.</p>
-
-<p>Here's the short story for getting up and running quickly with LLVM:</p>
-
-<ol>
- <li>Read the documentation.</li>
- <li>Read the documentation.</li>
- <li>Remember that you were warned twice about reading the documentation.</li>
-
- <li>Checkout LLVM:
- <ul>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt>
- <li><tt>svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm</tt></li>
- </ul>
- </li>
-
- <li>Checkout Clang:
- <ul>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt>
- <li><tt>cd llvm/tools</tt>
- <li><tt>svn co http://llvm.org/svn/llvm-project/cfe/trunk clang</tt></li>
- </ul>
- </li>
-
- <li>Checkout Compiler-RT:
- <ul>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt>
- <li><tt>cd llvm/projects</tt>
- <li><tt>svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk
- compiler-rt</tt></li>
- </ul>
- </li>
-
- <li>Get the Test Suite Source Code <b>[Optional]</b>
- <ul>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt>
- <li><tt>cd llvm/projects</tt>
- <li><tt>svn co http://llvm.org/svn/llvm-project/test-suite/trunk test-suite</tt></li>
- </ul>
- </li>
-
- <li>Configure and build LLVM and Clang:
- <ul>
- <li><tt>cd <i>where-you-want-to-build-llvm</i></tt></li>
- <li><tt>mkdir build</tt> (for building without polluting the source dir)</li>
- <li><tt>cd build</tt></li>
- <li><tt>../llvm/configure [options]</tt>
- <br>Some common options:
-
- <ul>
- <li><tt>--prefix=<i>directory</i></tt> -
- Specify for <i>directory</i> the full pathname of where you
- want the LLVM tools and libraries to be installed (default
- <tt>/usr/local</tt>).</li>
- </ul>
-
- <ul>
- <li><tt>--enable-optimized</tt> -
- Compile with optimizations enabled (default is NO).</li>
- </ul>
-
- <ul>
- <li><tt>--enable-assertions</tt> -
- Compile with assertion checks enabled (default is YES).</li>
- </ul>
- </li>
- <li><tt>make [-j]</tt> - The -j specifies the number of jobs (commands) to
- run simultaneously. This builds both LLVM and Clang for Debug+Asserts mode.
-  The --enable-optimized configure option is used to specify a Release build.</li>
- <li><tt>make check-all</tt> -
-  This runs the regression tests to ensure everything is in working order.</li>
- <li><tt>make update</tt> -
-  This command is used to update all the svn repositories at once, rather than
-  having to <tt>cd</tt> into the individual repositories and run
-  <tt>svn update</tt>.</li>
- <li>It is also possible to use CMake instead of the makefiles. With CMake
- it is also possible to generate project files for several IDEs: Eclipse
- CDT4, CodeBlocks, Qt-Creator (use the CodeBlocks generator), KDevelop3.</li>
- <li>If you get an "internal compiler error (ICE)" or test failures, see
- <a href="#brokengcc">below</a>.</li>
-
- </ul>
- </li>
-
-</ol>
-
-<p>Consult the <a href="#starting">Getting Started with LLVM</a> section for
-detailed information on configuring and compiling LLVM. See <a
-href="#environment">Setting Up Your Environment</a> for tips that simplify
-working with the Clang front end and LLVM tools. Go to <a href="#layout">Program
-Layout</a> to learn about the layout of the source code tree.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="requirements">Requirements</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Before you begin to use the LLVM system, review the requirements given below.
-Knowing ahead of time what hardware and software you will need may save you
-some trouble.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="hardware">Hardware</a>
-</h3>
-
-<div>
-
-<p>LLVM is known to work on the following platforms:</p>
-
-<table cellpadding="3" summary="Known LLVM platforms">
-<tr>
- <th>OS</th>
- <th>Arch</th>
- <th>Compilers</th>
-</tr>
-<tr>
- <td>AuroraUX</td>
- <td>x86<sup><a href="#pf_1">1</a></sup></td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>Linux</td>
- <td>x86<sup><a href="#pf_1">1</a></sup></td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>Linux</td>
- <td>amd64</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>Solaris</td>
- <td>V9 (Ultrasparc)</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>FreeBSD</td>
- <td>x86<sup><a href="#pf_1">1</a></sup></td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>FreeBSD</td>
- <td>amd64</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>MacOS X<sup><a href="#pf_2">2</a></sup></td>
- <td>PowerPC</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>MacOS X<sup><a href="#pf_2">2</a>,<a href="#pf_9">9</a></sup></td>
- <td>x86</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>Cygwin/Win32</td>
- <td>x86<sup><a href="#pf_1">1</a>,<a href="#pf_8">8</a>,
- <a href="#pf_11">11</a></sup></td>
- <td>GCC 3.4.X, binutils 2.20</td>
-</tr>
-<tr>
- <td>MinGW/Win32</td>
- <td>x86<sup><a href="#pf_1">1</a>,<a href="#pf_6">6</a>,
- <a href="#pf_8">8</a>, <a href="#pf_10">10</a>,
- <a href="#pf_11">11</a></sup></td>
- <td>GCC 3.4.X, binutils 2.20</td>
-</tr>
-</table>
-
-<p>LLVM has partial support for the following platforms:</p>
-
-<table summary="LLVM partial platform support">
-<tr>
- <th>OS</th>
- <th>Arch</th>
- <th>Compilers</th>
-</tr>
-<tr>
- <td>Windows</td>
- <td>x86<sup><a href="#pf_1">1</a></sup></td>
- <td>Visual Studio 2008 or higher<sup><a href="#pf_4">4</a>,<a href="#pf_5">5</a></sup></td>
-<tr>
- <td>AIX<sup><a href="#pf_3">3</a>,<a href="#pf_4">4</a></sup></td>
- <td>PowerPC</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>Linux<sup><a href="#pf_3">3</a>,<a href="#pf_5">5</a></sup></td>
- <td>PowerPC</td>
- <td>GCC</td>
-</tr>
-
-<tr>
- <td>Linux<sup><a href="#pf_7">7</a></sup></td>
- <td>Alpha</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>Linux<sup><a href="#pf_7">7</a></sup></td>
- <td>Itanium (IA-64)</td>
- <td>GCC</td>
-</tr>
-<tr>
- <td>HP-UX<sup><a href="#pf_7">7</a></sup></td>
- <td>Itanium (IA-64)</td>
- <td>HP aCC</td>
-</tr>
-<tr>
- <td>Windows x64</td>
- <td>x86-64</td>
- <td>mingw-w64's GCC-4.5.x<sup><a href="#pf_12">12</a></sup></td>
-</tr>
-</table>
-
-<p><b>Notes:</b></p>
-
-<div class="doc_notes">
-<ol>
-<li><a name="pf_1">Code generation supported for Pentium processors and
-up</a></li>
-<li><a name="pf_2">Code generation supported for 32-bit ABI only</a></li>
-<li><a name="pf_3">No native code generation</a></li>
-<li><a name="pf_4">Build is not complete: one or more tools do not link or function</a></li>
-<li><a name="pf_5">The GCC-based C/C++ frontend does not build</a></li>
-<li><a name="pf_6">The port is done using the MSYS shell.</a></li>
-<li><a name="pf_7">Native code generation exists but is not complete.</a></li>
-<li><a name="pf_8">Binutils 2.20 or later is required to build the assembler
- generated by LLVM properly.</a></li>
-<li><a name="pf_9">Xcode 2.5 and gcc 4.0.1</a> (Apple Build 5370) will trip
- internal LLVM assert messages when compiled for Release at optimization
- levels greater than 0 (i.e., <i>"-O1"</i> and higher).
- Add <i>OPTIMIZE_OPTION="-O0"</i> to the build command line
- if compiling for LLVM Release or bootstrapping the LLVM toolchain.</li>
-<li><a name="pf_10">For MSYS/MinGW on Windows, be sure to install the MSYS
- version of the perl package, and be sure it appears in your path
- before any Windows-based versions such as Strawberry Perl and
- ActivePerl, as these have Windows-specifics that will cause the
- build to fail.</a></li>
-<li><a name="pf_11">To use LLVM modules on a Win32-based system,
-    you may configure LLVM with <i>&quot;--enable-shared&quot;</i>.</a></li>
-<li><a name="pf_12">To compile the SPU backend, you need to add
-    <tt>&quot;LDFLAGS=-Wl,--stack,16777216&quot;</tt> to configure.</a></li>
-</ol>
-</div>
-
-<p>Note that you will need about 1-3 GB of space for a full LLVM build in Debug
-mode, depending on the system (it is so large because of all the debugging
-information and the fact that the libraries are statically linked into multiple
-tools). If you do not need many of the tools and you are space-conscious, you
-can pass <tt>ONLY_TOOLS="tools you need"</tt> to make. The Release build
-requires considerably less space.</p>
-
-<p>The LLVM suite <i>may</i> compile on other platforms, but it is not
-guaranteed to do so. If compilation is successful, the LLVM utilities should be
-able to assemble, disassemble, analyze, and optimize LLVM bitcode. Code
-generation should work as well, although the generated native code may not work
-on your platform.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="software">Software</a>
-</h3>
-<div>
- <p>Compiling LLVM requires that you have several software packages
- installed. The table below lists those required packages. The Package column
- is the usual name for the software package that LLVM depends on. The Version
- column provides "known to work" versions of the package. The Notes column
- describes how LLVM uses the package and provides other details.</p>
- <table summary="Packages required to compile LLVM">
- <tr><th>Package</th><th>Version</th><th>Notes</th></tr>
-
- <tr>
- <td><a href="http://savannah.gnu.org/projects/make">GNU Make</a></td>
- <td>3.79, 3.79.1</td>
- <td>Makefile/build processor</td>
- </tr>
-
- <tr>
- <td><a href="http://gcc.gnu.org/">GCC</a></td>
- <td>3.4.2</td>
- <td>C/C++ compiler<sup><a href="#sf1">1</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://www.gnu.org/software/texinfo/">TeXinfo</a></td>
- <td>4.5</td>
- <td>For building the CFE</td>
- </tr>
-
- <tr>
- <td><a href="http://subversion.tigris.org/project_packages.html">SVN</a></td>
- <td>&ge;1.3</td>
- <td>Subversion access to LLVM<sup><a href="#sf2">2</a></sup></td>
- </tr>
-
- <!-- FIXME:
- Do we support dg?
- Are DejaGnu and expect obsolete?
- Shall we mention Python? -->
-
- <tr>
- <td><a href="http://savannah.gnu.org/projects/dejagnu">DejaGnu</a></td>
- <td>1.4.2</td>
- <td>Automated test suite<sup><a href="#sf3">3</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://www.tcl.tk/software/tcltk/">tcl</a></td>
- <td>8.3, 8.4</td>
- <td>Automated test suite<sup><a href="#sf3">3</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://expect.nist.gov/">expect</a></td>
- <td>5.38.0</td>
- <td>Automated test suite<sup><a href="#sf3">3</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://www.perl.com/download.csp">perl</a></td>
- <td>&ge;5.6.0</td>
- <td>Utilities</td>
- </tr>
-
- <tr>
- <td><a href="http://savannah.gnu.org/projects/m4">GNU M4</a>
- <td>1.4</td>
- <td>Macro processor for configuration<sup><a href="#sf4">4</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://www.gnu.org/software/autoconf/">GNU Autoconf</a></td>
- <td>2.60</td>
- <td>Configuration script builder<sup><a href="#sf4">4</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://www.gnu.org/software/automake/">GNU Automake</a></td>
- <td>1.9.6</td>
- <td>aclocal macro generator<sup><a href="#sf4">4</a></sup></td>
- </tr>
-
- <tr>
- <td><a href="http://savannah.gnu.org/projects/libtool">libtool</a></td>
- <td>1.5.22</td>
- <td>Shared library manager<sup><a href="#sf4">4</a></sup></td>
- </tr>
-
- </table>
-
- <p><b>Notes:</b></p>
- <div class="doc_notes">
- <ol>
- <li><a name="sf1">Only the C and C++ languages are needed so there's no
- need to build the other languages for LLVM's purposes.</a> See
- <a href="#brokengcc">below</a> for specific version info.</li>
- <li><a name="sf2">You only need Subversion if you intend to build from the
- latest LLVM sources. If you're working from a release distribution, you
- don't need Subversion.</a></li>
- <li><a name="sf3">Only needed if you want to run the automated test
- suite in the <tt>llvm/test</tt> directory.</a></li>
- <li><a name="sf4">If you want to make changes to the configure scripts,
- you will need GNU autoconf (2.60), and consequently, GNU M4 (version 1.4
- or higher). You will also need automake (1.9.6). We only use aclocal
- from that package.</a></li>
- </ol>
- </div>
-
- <p>Additionally, your compilation host is expected to have the usual
- plethora of Unix utilities. Specifically:</p>
- <ul>
- <li><b>ar</b> - archive library builder</li>
- <li><b>bzip2*</b> - bzip2 command for distribution generation</li>
- <li><b>bunzip2*</b> - bunzip2 command for distribution checking</li>
- <li><b>chmod</b> - change permissions on a file</li>
- <li><b>cat</b> - output concatenation utility</li>
- <li><b>cp</b> - copy files</li>
- <li><b>date</b> - print the current date/time </li>
- <li><b>echo</b> - print to standard output</li>
- <li><b>egrep</b> - extended regular expression search utility</li>
- <li><b>find</b> - find files/dirs in a file system</li>
- <li><b>grep</b> - regular expression search utility</li>
- <li><b>gzip*</b> - gzip command for distribution generation</li>
- <li><b>gunzip*</b> - gunzip command for distribution checking</li>
- <li><b>install</b> - install directories/files </li>
- <li><b>mkdir</b> - create a directory</li>
- <li><b>mv</b> - move (rename) files</li>
- <li><b>ranlib</b> - symbol table builder for archive libraries</li>
- <li><b>rm</b> - remove (delete) files and directories</li>
- <li><b>sed</b> - stream editor for transforming output</li>
- <li><b>sh</b> - Bourne shell for make build scripts</li>
- <li><b>tar</b> - tape archive for distribution generation</li>
- <li><b>test</b> - test things in file system</li>
- <li><b>unzip*</b> - unzip command for distribution checking</li>
- <li><b>zip*</b> - zip command for distribution generation</li>
- </ul>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="brokengcc">Broken versions of GCC and other tools</a>
-</h3>
-
-<div>
-
-<p>LLVM is very demanding of the host C++ compiler, and as such tends to expose
-bugs in the compiler. In particular, several versions of GCC crash when trying
-to compile LLVM. We routinely use GCC 4.2 (and higher) or Clang.
-Other versions of GCC will probably work as well. GCC versions listed
-here are known to not work. If you are using one of these versions, please try
-to upgrade your GCC to something more recent. If you run into a problem with a
-version of GCC not listed here, please <a href="mailto:llvmdev@cs.uiuc.edu">let
-us know</a>. Please use the "<tt>gcc -v</tt>" command to find out which version
-of GCC you are using.
-</p>
-
-<p><b>GCC versions prior to 3.0</b>: GCC 2.96.x and before had several
-problems in the STL that effectively prevent it from compiling LLVM.
-</p>
-
-<p><b>GCC 3.2.2 and 3.2.3</b>: These versions of GCC fail to compile LLVM with
-a bogus template error. This was fixed in later GCCs.</p>
-
-<p><b>GCC 3.3.2</b>: This version of GCC suffered from a <a
-href="http://gcc.gnu.org/PR13392">serious bug</a> which caused it to crash in
-the "<tt>convert_from_eh_region_ranges_1</tt>" GCC function.</p>
-
-<p><b>Cygwin GCC 3.3.3</b>: The version of GCC 3.3.3 commonly shipped with
- Cygwin does not work.</p>
-<p><b>SuSE GCC 3.3.3</b>: The version of GCC 3.3.3 shipped with SuSE 9.1 (and
- possibly others) does not compile LLVM correctly (it appears that exception
- handling is broken in some cases). Please download the FSF 3.3.3 or upgrade
- to a newer version of GCC.</p>
-<p><b>GCC 3.4.0 on linux/x86 (32-bit)</b>: GCC miscompiles portions of the
- code generator, causing an infinite loop in the llvm-gcc build when built
- with optimizations enabled (i.e. a release build).</p>
-<p><b>GCC 3.4.2 on linux/x86 (32-bit)</b>: GCC miscompiles portions of the
- code generator at -O3, as with 3.4.0. However gcc 3.4.2 (unlike 3.4.0)
- correctly compiles LLVM at -O2. A workaround is to build release LLVM
- with "make ENABLE_OPTIMIZED=1 OPTIMIZE_OPTION=-O2 ..."</p>
-<p><b>GCC 3.4.x on X86-64/amd64</b>: GCC <a href="http://llvm.org/PR1056">
- miscompiles portions of LLVM</a>.</p>
-<p><b>GCC 3.4.4 (CodeSourcery ARM 2005q3-2)</b>: this compiler miscompiles LLVM
- when building with optimizations enabled. It appears to work with
- "<tt>make ENABLE_OPTIMIZED=1 OPTIMIZE_OPTION=-O1</tt>" or build a debug
- build.</p>
-<p><b>IA-64 GCC 4.0.0</b>: The IA-64 version of GCC 4.0.0 is known to
- miscompile LLVM.</p>
-<p><b>Apple Xcode 2.3</b>: GCC crashes when compiling LLVM at -O3 (which is the
- default with ENABLE_OPTIMIZED=1). To work around this, build with
- "ENABLE_OPTIMIZED=1 OPTIMIZE_OPTION=-O2".</p>
-<p><b>GCC 4.1.1</b>: GCC fails to build LLVM with template concept check errors
- compiling some files. At the time of this writing, GCC mainline (4.2)
- did not share the problem.</p>
-<p><b>GCC 4.1.1 on X86-64/amd64</b>: GCC <a href="http://llvm.org/PR1063">
- miscompiles portions of LLVM</a> when compiling llvm itself into 64-bit
- code. LLVM will appear to mostly work but will be buggy, e.g. failing
- portions of its testsuite.</p>
-<p><b>GCC 4.1.2 on OpenSUSE</b>: Seg faults during the libstdc++ build; on
-x86_64 platforms, compiling md5.c produces a mangled constant.</p>
-<p><b>GCC 4.1.2 (20061115 (prerelease) (Debian 4.1.1-21)) on Debian</b>: Appears
-to miscompile parts of LLVM 2.4. One symptom is ValueSymbolTable complaining
-about symbols remaining in the table on destruction.</p>
-<p><b>GCC 4.1.2 20071124 (Red Hat 4.1.2-42)</b>: Suffers from the same symptoms
-as the previous one. It appears to work with ENABLE_OPTIMIZED=0 (the default).</p>
-<p><b>Cygwin GCC 4.3.2 20080827 (beta) 2</b>:
- Users <a href="http://llvm.org/PR4145">reported</a> various problems related
- to link errors when using this GCC version.</p>
-<p><b>Debian GCC 4.3.2 on X86</b>: Crashes building some files in LLVM 2.6.</p>
-<p><b>GCC 4.3.3 (Debian 4.3.3-10) on ARM</b>: Miscompiles parts of LLVM 2.6
-when optimizations are turned on. The symptom is an infinite loop in
-FoldingSetImpl::RemoveNode while running the code generator.</p>
-<p><b>SUSE 11 GCC 4.3.4</b>: Miscompiles LLVM, causing crashes in ValueHandle logic.</p>
-<p><b>GCC 4.3.5 and GCC 4.4.5 on ARM</b>: These can miscompile <tt>value >>
-1</tt> even at -O0. A test failure in <tt>test/Assembler/alignstack.ll</tt> is
-one symptom of the problem.</p>
-<p><b>GNU ld 2.16.X</b>. Some 2.16.X versions of the ld linker will produce very
-long warning messages complaining that some ".gnu.linkonce.t.*" symbol was
-defined in a discarded section. You can safely ignore these messages as they are
-erroneous and the linkage is correct. These messages disappear using ld
-2.17.</p>
-
-<p><b>GNU binutils 2.17</b>: Binutils 2.17 contains <a
-href="http://sourceware.org/bugzilla/show_bug.cgi?id=3111">a bug</a> which
-causes huge link times (minutes instead of seconds) when building LLVM. We
-recommend upgrading to a newer version (2.17.50.0.4 or later).</p>
-
-<p><b>GNU Binutils 2.19.1 Gold</b>: This version of Gold contained
-<a href="http://sourceware.org/bugzilla/show_bug.cgi?id=9836">a bug</a>
-which causes intermittent failures when building LLVM with position independent
-code. The symptom is an error about cyclic dependencies. We recommend
-upgrading to a newer version of Gold.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="starting">Getting Started with LLVM</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>The remainder of this guide is meant to get you up and running with
-LLVM and to give you some basic information about the LLVM environment.</p>
-
-<p>The later sections of this guide describe the <a
-href="#layout">general layout</a> of the LLVM source tree, a <a
-href="#tutorial">simple example</a> using the LLVM tool chain, and <a
-href="#links">links</a> to find more information about LLVM or to get
-help via e-mail.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="terminology">Terminology and Notation</a>
-</h3>
-
-<div>
-
-<p>Throughout this manual, the following names are used to denote paths
-specific to the local system and working environment. <i>These are not
-environment variables you need to set but just strings used in the rest
-of this document below</i>. In any of the examples below, simply replace
-each of these names with the appropriate pathname on your local system.
-All these paths are absolute:</p>
-
-<dl>
- <dt>SRC_ROOT
- <dd>
- This is the top level directory of the LLVM source tree.
- <br><br>
-
- <dt>OBJ_ROOT
- <dd>
- This is the top level directory of the LLVM object tree (i.e. the
- tree where object files and compiled programs will be placed; it
- can be the same as SRC_ROOT).
- <br><br>
-
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="environment">Setting Up Your Environment</a>
-</h3>
-
-<div>
-
-<p>
-In order to compile and use LLVM, you may need to set some environment
-variables.
-
-<dl>
- <dt><tt>LLVM_LIB_SEARCH_PATH</tt>=<tt>/path/to/your/bitcode/libs</tt></dt>
- <dd>[Optional] This environment variable helps LLVM linking tools find the
- locations of your bitcode libraries. It is provided only as a
- convenience since you can specify the paths using the -L options of the
- tools and the C/C++ front-end will automatically use the bitcode files
- installed in its
- <tt>lib</tt> directory.</dd>
-</dl>
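-
-<p>For example, a user who keeps bitcode libraries under a hypothetical
-<tt>~/llvm/bclibs</tt> directory (the path is only illustrative) might set:</p>
-
-<div class="doc_code">
-<pre>
-% export LLVM_LIB_SEARCH_PATH=$HOME/llvm/bclibs
-</pre>
-</div>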
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="unpack">Unpacking the LLVM Archives</a>
-</h3>
-
-<div>
-
-<p>
-If you have the LLVM distribution, you will need to unpack it before you
-can begin to compile it. LLVM is distributed as a set of two files: the LLVM
-suite and the LLVM GCC front end compiled for your platform. There is an
-additional test suite that is optional. Each file is a TAR archive that is
-compressed with the gzip program.
-</p>
-
-<p>The files are as follows, with <em>x.y</em> marking the version number:
-<dl>
- <dt><tt>llvm-x.y.tar.gz</tt></dt>
- <dd>Source release for the LLVM libraries and tools.<br></dd>
-
- <dt><tt>llvm-test-x.y.tar.gz</tt></dt>
- <dd>Source release for the LLVM test-suite.</dd>
-
- <dt><tt>llvm-gcc-4.2-x.y.source.tar.gz</tt></dt>
- <dd>Source release of the llvm-gcc-4.2 front end. See README.LLVM in the root
- directory for build instructions.<br></dd>
-
- <dt><tt>llvm-gcc-4.2-x.y-platform.tar.gz</tt></dt>
- <dd>Binary release of the llvm-gcc-4.2 front end for a specific platform.<br></dd>
-
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="checkout">Checkout LLVM from Subversion</a>
-</h3>
-
-<div>
-
-<p>If you have access to our Subversion repository, you can get a fresh copy of
-the entire source code. All you need to do is check it out from Subversion as
-follows:</p>
-
-<ul>
- <li><tt>cd <i>where-you-want-llvm-to-live</i></tt></li>
- <li>Read-Only: <tt>svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm</tt></li>
- <li>Read-Write:<tt>svn co https://user@llvm.org/svn/llvm-project/llvm/trunk
- llvm</tt></li>
-</ul>
-
-
-<p>This will create an '<tt>llvm</tt>' directory in the current
-directory and fully populate it with the LLVM source code, Makefiles,
-test directories, and local copies of documentation files.</p>
-
-<p>If you want to get a specific release (as opposed to the most recent
-revision), you can check it out from the '<tt>tags</tt>' directory (instead of
-'<tt>trunk</tt>'). The following releases are located in the following
-subdirectories of the '<tt>tags</tt>' directory:</p>
-
-<ul>
-<li>Release 3.1: <b>RELEASE_31/final</b></li>
-<li>Release 3.0: <b>RELEASE_30/final</b></li>
-<li>Release 2.9: <b>RELEASE_29/final</b></li>
-<li>Release 2.8: <b>RELEASE_28</b></li>
-<li>Release 2.7: <b>RELEASE_27</b></li>
-<li>Release 2.6: <b>RELEASE_26</b></li>
-<li>Release 2.5: <b>RELEASE_25</b></li>
-<li>Release 2.4: <b>RELEASE_24</b></li>
-<li>Release 2.3: <b>RELEASE_23</b></li>
-<li>Release 2.2: <b>RELEASE_22</b></li>
-<li>Release 2.1: <b>RELEASE_21</b></li>
-<li>Release 2.0: <b>RELEASE_20</b></li>
-<li>Release 1.9: <b>RELEASE_19</b></li>
-<li>Release 1.8: <b>RELEASE_18</b></li>
-<li>Release 1.7: <b>RELEASE_17</b></li>
-<li>Release 1.6: <b>RELEASE_16</b></li>
-<li>Release 1.5: <b>RELEASE_15</b></li>
-<li>Release 1.4: <b>RELEASE_14</b></li>
-<li>Release 1.3: <b>RELEASE_13</b></li>
-<li>Release 1.2: <b>RELEASE_12</b></li>
-<li>Release 1.1: <b>RELEASE_11</b></li>
-<li>Release 1.0: <b>RELEASE_1</b></li>
-</ul>
-
-<p>If you would like to get the LLVM test suite (a separate package as of 1.4),
-you can get it from the Subversion repository:</p>
-
-<div class="doc_code">
-<pre>
-% cd llvm/projects
-% svn co http://llvm.org/svn/llvm-project/test-suite/trunk test-suite
-</pre>
-</div>
-
-<p>By placing it in <tt>llvm/projects</tt>, it will be automatically
-configured by the LLVM configure script as well as automatically updated when
-you run <tt>svn update</tt>.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="git_mirror">GIT mirror</a>
-</h3>
-
-<div>
-
-<p>GIT mirrors are available for a number of LLVM subprojects. These mirrors
- sync automatically with each Subversion commit and contain all necessary
- git-svn marks (so, you can recreate git-svn metadata locally). Note that right
- now mirrors reflect only <tt>trunk</tt> for each project. You can do the
- read-only GIT clone of LLVM via:</p>
-
-<pre class="doc_code">
-git clone http://llvm.org/git/llvm.git
-</pre>
-
-<p>If you want to check out clang too, run:</p>
-
-<pre class="doc_code">
-git clone http://llvm.org/git/llvm.git
-cd llvm/tools
-git clone http://llvm.org/git/clang.git
-</pre>
-
-<p>
-Since the upstream repository is in Subversion, you should use
-<tt>&quot;git pull --rebase&quot;</tt>
-instead of <tt>&quot;git pull&quot;</tt> to avoid generating a non-linear
-history in your clone.
-To configure <tt>&quot;git pull&quot;</tt> to pass <tt>--rebase</tt> by default
-on the master branch, run the following command:
-</p>
-
-<pre class="doc_code">
-git config branch.master.rebase true
-</pre>
-
-<h4>Sending patches with Git</h4>
-<div>
-<p>
-Please read <a href="DeveloperPolicy.html#patches">Developer Policy</a>, too.
-</p>
-
-<p>
-Assume <tt>master</tt> points to the upstream and <tt>mybranch</tt> points to
-your working branch, and that <tt>mybranch</tt> is rebased onto <tt>master</tt>.
-First, you may want to check for whitespace errors:
-</p>
-
-<pre class="doc_code">
-git diff --check master..mybranch
-</pre>
-
-<p>
-The easiest way to generate a patch is as below:
-</p>
-
-<pre class="doc_code">
-git diff master..mybranch &gt; /path/to/mybranch.diff
-</pre>
-
-<p>
-It is a little different from an svn-generated diff: a git-generated diff has
-prefixes like <tt>a/</tt> and <tt>b/</tt>. Don't worry; most developers know
-that such a patch can be applied with <tt>patch -p1 -N</tt>.
-</p>
-
-<p>
-You may also generate a patch set with git-format-patch, which produces one
-patch per commit. To generate patch files to attach to your email:
-</p>
-
-<pre class="doc_code">
-git format-patch --no-attach master..mybranch -o /path/to/your/patchset
-</pre>
-
-<p>
-If you would like to send patches directly, you may use git-send-email or
-git-imap-send. Here is an example that generates the patch set in Gmail's [Drafts].
-</p>
-
-<pre class="doc_code">
-git format-patch --attach master..mybranch --stdout | git imap-send
-</pre>
-
-<p>
-Then, your .git/config should have an [imap] section.
-</p>
-
-<pre class="doc_code">
-[imap]
- host = imaps://imap.gmail.com
- user = <em>your.gmail.account</em>@gmail.com
- pass = <em>himitsu!</em>
- port = 993
- sslverify = false
-; in English
- folder = "[Gmail]/Drafts"
-; example for Japanese, "Modified UTF-7" encoded.
- folder = "[Gmail]/&amp;Tgtm+DBN-"
-; example for Traditional Chinese
- folder = "[Gmail]/&amp;g0l6Pw-"
-</pre>
-
-</div>
-
-<h4>For developers to work with git-svn</h4>
-<div>
-
-<p>To set up a clone from which you can submit code using
- <tt>git-svn</tt>, run:</p>
-
-<pre class="doc_code">
-git clone http://llvm.org/git/llvm.git
-cd llvm
-git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username=&lt;username>
-git config svn-remote.svn.fetch :refs/remotes/origin/master
-git svn rebase -l # -l avoids fetching ahead of the git mirror.
-
-# If you have clang too:
-cd tools
-git clone http://llvm.org/git/clang.git
-cd clang
-git svn init https://llvm.org/svn/llvm-project/cfe/trunk --username=&lt;username>
-git config svn-remote.svn.fetch :refs/remotes/origin/master
-git svn rebase -l
-</pre>
-
-<p>To update this clone without generating git-svn tags that conflict
-with the upstream git repo, run:</p>
-
-<pre class="doc_code">
-git fetch && (cd tools/clang && git fetch) # Get matching revisions of both trees.
-git checkout master
-git svn rebase -l
-(cd tools/clang &&
- git checkout master &&
- git svn rebase -l)
-</pre>
-
-<p>This leaves your working directories on their master branches, so
-you'll need to <tt>checkout</tt> each working branch individually and
-<tt>rebase</tt> it on top of its parent branch. (Note: This script is
-intended for relative newbies to git. If you have more experience,
-you can likely improve on it.)</p>
-
-<p>The git-svn metadata can get out of sync after you mess around with
-branches and <code>dcommit</code>. When that happens, <code>git svn
-dcommit</code> stops working, complaining about files with uncommitted
-changes. The fix is to rebuild the metadata:</p>
-
-<pre class="doc_code">
-rm -rf .git/svn
-git svn rebase -l
-</pre>
-
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="config">Local LLVM Configuration</a>
-</h3>
-
-<div>
-
- <p>Once checked out from the Subversion repository, the LLVM suite source
- code must be
-configured via the <tt>configure</tt> script. This script sets variables in the
-various <tt>*.in</tt> files, most notably <tt>llvm/Makefile.config</tt> and
-<tt>llvm/include/Config/config.h</tt>. It also populates <i>OBJ_ROOT</i> with
-the Makefiles needed to begin building LLVM.</p>
-
-<p>The following environment variables are used by the <tt>configure</tt>
-script to configure the build system:</p>
-
-<table summary="LLVM configure script environment variables">
- <tr><th>Variable</th><th>Purpose</th></tr>
- <tr>
- <td>CC</td>
- <td>Tells <tt>configure</tt> which C compiler to use. By default,
- <tt>configure</tt> will look for the first GCC C compiler in
- <tt>PATH</tt>. Use this variable to override
- <tt>configure</tt>'s default behavior.</td>
- </tr>
- <tr>
- <td>CXX</td>
- <td>Tells <tt>configure</tt> which C++ compiler to use. By default,
- <tt>configure</tt> will look for the first GCC C++ compiler in
- <tt>PATH</tt>. Use this variable to override
- <tt>configure</tt>'s default behavior.</td>
- </tr>
-</table>
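-
-<p>For example (the compiler names here are illustrative; use whatever is
-installed on your system), you can select a specific compiler pair when
-running <tt>configure</tt>:</p>
-
-<div class="doc_code">
-<pre>
-% CC=gcc CXX=g++ <i>SRC_ROOT</i>/configure
-</pre>
-</div>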
-
-<p>The following options can be used to set or enable LLVM specific options:</p>
-
-<dl>
- <dt><i>--enable-optimized</i></dt>
- <dd>
- Enables optimized compilation (debugging symbols are removed
- and GCC optimization flags are enabled). Note that this is the default
- setting if you are using the LLVM distribution. The default behavior
-    of a Subversion checkout is to use an unoptimized build (also known as a
- debug build).
- <br><br>
- </dd>
- <dt><i>--enable-debug-runtime</i></dt>
- <dd>
- Enables debug symbols in the runtime libraries. The default is to strip
- debug symbols from the runtime libraries.
- </dd>
- <dt><i>--enable-jit</i></dt>
- <dd>
- Compile the Just In Time (JIT) compiler functionality. This is not
- available
- on all platforms. The default is dependent on platform, so it is best
- to explicitly enable it if you want it.
- <br><br>
- </dd>
- <dt><i>--enable-targets=</i><tt>target-option</tt></dt>
- <dd>Controls which targets will be built and linked into llc. The default
-  value for <tt>target-option</tt> is "all", which builds and links all
- available targets. The value "host-only" can be specified to build only a
- native compiler (no cross-compiler targets available). The "native" target is
- selected as the target of the build host. You can also specify a comma
- separated list of target names that you want available in llc. The target
- names use all lower case. The current set of targets is: <br>
- <tt>arm, cpp, hexagon, mblaze, mips, mipsel, msp430, powerpc, ptx, sparc, spu, x86, x86_64, xcore</tt>.
- <br><br></dd>
- <dt><i>--enable-doxygen</i></dt>
- <dd>Look for the doxygen program and enable construction of doxygen based
- documentation from the source code. This is disabled by default because
- generating the documentation can take a long time and producess 100s of
-  generating the documentation can take a long time and produces hundreds of
- <dt><i>--with-udis86</i></dt>
-  <dd>LLVM can use an external disassembler library for various purposes
-  (currently it is used only for examining code produced by the JIT). This
-  option enables use of the <a href="http://udis86.sourceforge.net/">udis86</a>
-  x86 (both 32- and 64-bit) disassembler library.</dd>
-</dl>
-
-<p>To configure LLVM, follow these steps:</p>
-
-<ol>
- <li><p>Change directory into the object root directory:</p>
-
- <div class="doc_code"><pre>% cd <i>OBJ_ROOT</i></pre></div></li>
-
- <li><p>Run the <tt>configure</tt> script located in the LLVM source
- tree:</p>
-
- <div class="doc_code">
- <pre>% <i>SRC_ROOT</i>/configure --prefix=/install/path [other options]</pre>
- </div></li>
-</ol>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="compile">Compiling the LLVM Suite Source Code</a>
-</h3>
-
-<div>
-
-<p>Once you have configured LLVM, you can build it. There are three types of
-builds:</p>
-
-<dl>
- <dt>Debug Builds
- <dd>
-    These builds are the default when one is using a Subversion checkout and
- types <tt>gmake</tt> (unless the <tt>--enable-optimized</tt> option was
- used during configuration). The build system will compile the tools and
- libraries with debugging information. To get a Debug Build using the
- LLVM distribution the <tt>--disable-optimized</tt> option must be passed
- to <tt>configure</tt>.
- <br><br>
-
- <dt>Release (Optimized) Builds
- <dd>
- These builds are enabled with the <tt>--enable-optimized</tt> option to
- <tt>configure</tt> or by specifying <tt>ENABLE_OPTIMIZED=1</tt> on the
- <tt>gmake</tt> command line. For these builds, the build system will
- compile the tools and libraries with GCC optimizations enabled and strip
- debugging information from the libraries and executables it generates.
- Note that Release Builds are default when using an LLVM distribution.
- <br><br>
-
- <dt>Profile Builds
- <dd>
- These builds are for use with profiling. They compile profiling
- information into the code for use with programs like <tt>gprof</tt>.
- Profile builds must be started by specifying <tt>ENABLE_PROFILING=1</tt>
- on the <tt>gmake</tt> command line.
-</dl>
-
-<p>Once you have LLVM configured, you can build it by entering the
-<i>OBJ_ROOT</i> directory and issuing the following command:</p>
-
-<div class="doc_code"><pre>% gmake</pre></div>
-
-<p>If the build fails, please <a href="#brokengcc">check here</a> to see if you
-are using a version of GCC that is known not to compile LLVM.</p>
-
-<p>
-If you have multiple processors in your machine, you may wish to use some of
-the parallel build options provided by GNU Make. For example, you could use the
-command:</p>
-
-<div class="doc_code"><pre>% gmake -j2</pre></div>
-
-<p>There are several special targets which are useful when working with the LLVM
-source code:</p>
-
-<dl>
- <dt><tt>gmake clean</tt>
- <dd>
- Removes all files generated by the build. This includes object files,
- generated C/C++ files, libraries, and executables.
- <br><br>
-
- <dt><tt>gmake dist-clean</tt>
- <dd>
- Removes everything that <tt>gmake clean</tt> does, but also removes files
- generated by <tt>configure</tt>. It attempts to return the source tree to the
- original state in which it was shipped.
- <br><br>
-
- <dt><tt>gmake install</tt>
- <dd>
- Installs LLVM header files, libraries, tools, and documentation in a
- hierarchy
- under $PREFIX, specified with <tt>./configure --prefix=[dir]</tt>, which
- defaults to <tt>/usr/local</tt>.
- <br><br>
-
- <dt><tt>gmake -C runtime install-bytecode</tt>
- <dd>
- Assuming you built LLVM into $OBJDIR, when this command is run, it will
- install bitcode libraries into the GCC front end's bitcode library
- directory. If you need to update your bitcode libraries,
- this is the target to use once you've built them.
- <br><br>
-</dl>
-
-<p>Please see the <a href="MakefileGuide.html">Makefile Guide</a> for further
-details on these <tt>make</tt> targets and descriptions of other targets
-available.</p>
-
-<p>It is also possible to override default values from <tt>configure</tt> by
-declaring variables on the command line. The following are some examples:</p>
-
-<dl>
- <dt><tt>gmake ENABLE_OPTIMIZED=1</tt>
- <dd>
- Perform a Release (Optimized) build.
- <br><br>
-
- <dt><tt>gmake ENABLE_OPTIMIZED=1 DISABLE_ASSERTIONS=1</tt>
- <dd>
- Perform a Release (Optimized) build without assertions enabled.
- <br><br>
-
- <dt><tt>gmake ENABLE_OPTIMIZED=0</tt>
- <dd>
- Perform a Debug build.
- <br><br>
-
- <dt><tt>gmake ENABLE_PROFILING=1</tt>
- <dd>
- Perform a Profiling build.
- <br><br>
-
- <dt><tt>gmake VERBOSE=1</tt>
- <dd>
- Print what <tt>gmake</tt> is doing on standard output.
- <br><br>
-
- <dt><tt>gmake TOOL_VERBOSE=1</tt></dt>
- <dd>Ask each tool invoked by the makefiles to print out what it is doing on
- the standard output. This also implies <tt>VERBOSE=1</tt>.
- <br><br></dd>
-</dl>
-
-<p>Every directory in the LLVM object tree includes a <tt>Makefile</tt> to build
-it and any subdirectories that it contains. Entering any directory inside the
-LLVM object tree and typing <tt>gmake</tt> should rebuild anything in or below
-that directory that is out of date.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="cross-compile">Cross-Compiling LLVM</a>
-</h3>
-
-<div>
- <p>It is possible to cross-compile LLVM itself. That is, you can create LLVM
- executables and libraries to be hosted on a platform different from the
-  platform where they are built (a Canadian Cross build). To configure a
- cross-compile, supply the configure script with <tt>--build</tt> and
- <tt>--host</tt> options that are different. The values of these options must
- be legal target triples that your GCC compiler supports.</p>
-
- <p>The result of such a build is executables that are not runnable on
-  the build host (--build option) but can be executed on the compile host
- (--host option).</p>
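-
-  <p>For example, to build on an x86-64 Linux machine executables that run on
-  an ARM Linux system, you might configure as follows (the triples are only
-  illustrative; use ones your cross toolchain actually supports):</p>
-
-<div class="doc_code">
-<pre>
-% <i>SRC_ROOT</i>/configure --build=x86_64-unknown-linux-gnu --host=arm-linux-gnueabi
-</pre>
-</div>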
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="objfiles">The Location of LLVM Object Files</a>
-</h3>
-
-<div>
-
-<p>The LLVM build system is capable of sharing a single LLVM source tree among
-several LLVM builds. Hence, it is possible to build LLVM for several different
-platforms or configurations using the same source tree.</p>
-
-<p>This is accomplished in the typical autoconf manner:</p>
-
-<ul>
- <li><p>Change directory to where the LLVM object files should live:</p>
-
- <div class="doc_code"><pre>% cd <i>OBJ_ROOT</i></pre></div></li>
-
- <li><p>Run the <tt>configure</tt> script found in the LLVM source
- directory:</p>
-
- <div class="doc_code"><pre>% <i>SRC_ROOT</i>/configure</pre></div></li>
-</ul>
-
-<p>The LLVM build will place files underneath <i>OBJ_ROOT</i> in directories
-named after the build type:</p>
-
-<dl>
- <dt>Debug Builds with assertions enabled (the default)
- <dd>
- <dl>
- <dt>Tools
- <dd><tt><i>OBJ_ROOT</i>/Debug+Asserts/bin</tt>
- <dt>Libraries
- <dd><tt><i>OBJ_ROOT</i>/Debug+Asserts/lib</tt>
- </dl>
- <br><br>
-
- <dt>Release Builds
- <dd>
- <dl>
- <dt>Tools
- <dd><tt><i>OBJ_ROOT</i>/Release/bin</tt>
- <dt>Libraries
- <dd><tt><i>OBJ_ROOT</i>/Release/lib</tt>
- </dl>
- <br><br>
-
- <dt>Profile Builds
- <dd>
- <dl>
- <dt>Tools
- <dd><tt><i>OBJ_ROOT</i>/Profile/bin</tt>
- <dt>Libraries
- <dd><tt><i>OBJ_ROOT</i>/Profile/lib</tt>
- </dl>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="optionalconfig">Optional Configuration Items</a>
-</h3>
-
-<div>
-
-<p>
-If you're running on a Linux system that supports the "<a
-href="http://www.tat.physik.uni-tuebingen.de/~rguenth/linux/binfmt_misc.html">binfmt_misc</a>"
-module, and you have root access on the system, you can set your system up to
-execute LLVM bitcode files directly. To do this, use commands like this (the
-first command may not be required if you are already using the module):</p>
-
-<div class="doc_code">
-<pre>
-$ mount -t binfmt_misc none /proc/sys/fs/binfmt_misc
-$ echo ':llvm:M::BC::/path/to/lli:' &gt; /proc/sys/fs/binfmt_misc/register
-$ chmod u+x hello.bc (if needed)
-$ ./hello.bc
-</pre>
-</div>
-
-<p>
-This allows you to execute LLVM bitcode files directly. On Debian, you
-can also use this command instead of the 'echo' command above:
-</p>
-
-<div class="doc_code">
-<pre>
-$ sudo update-binfmts --install llvm /path/to/lli --magic 'BC'
-</pre>
-</div>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="layout">Program Layout</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>One useful source of information about the LLVM source base is the LLVM <a
-href="http://www.doxygen.org/">doxygen</a> documentation available at <tt><a
-href="http://llvm.org/doxygen/">http://llvm.org/doxygen/</a></tt>.
-The following is a brief introduction to code layout:</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="examples"><tt>llvm/examples</tt></a>
-</h3>
-
-<div>
- <p>This directory contains some simple examples of how to use the LLVM IR and
- JIT.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="include"><tt>llvm/include</tt></a>
-</h3>
-
-<div>
-
-<p>This directory contains public header files exported from the LLVM
-library. The three main subdirectories of this directory are:</p>
-
-<dl>
- <dt><tt><b>llvm/include/llvm</b></tt></dt>
- <dd>This directory contains all of the LLVM specific header files. This
- directory also has subdirectories for different portions of LLVM:
- <tt>Analysis</tt>, <tt>CodeGen</tt>, <tt>Target</tt>, <tt>Transforms</tt>,
- etc...</dd>
-
- <dt><tt><b>llvm/include/llvm/Support</b></tt></dt>
- <dd>This directory contains generic support libraries that are provided with
- LLVM but not necessarily specific to LLVM. For example, some C++ STL utilities
- and a Command Line option processing library store their header files here.
- </dd>
-
- <dt><tt><b>llvm/include/llvm/Config</b></tt></dt>
- <dd>This directory contains header files configured by the <tt>configure</tt>
- script. They wrap "standard" UNIX and C header files. Source code can
- include these header files which automatically take care of the conditional
- #includes that the <tt>configure</tt> script generates.</dd>
-</dl>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="lib"><tt>llvm/lib</tt></a>
-</h3>
-
-<div>
-
-<p>This directory contains most of the source files of the LLVM system. In LLVM,
-almost all code exists in libraries, making it very easy to share code among the
-different <a href="#tools">tools</a>.</p>
-
-<dl>
- <dt><tt><b>llvm/lib/VMCore/</b></tt></dt>
- <dd> This directory holds the core LLVM source files that implement core
- classes like Instruction and BasicBlock.</dd>
-
- <dt><tt><b>llvm/lib/AsmParser/</b></tt></dt>
- <dd>This directory holds the source code for the LLVM assembly language parser
- library.</dd>
-
- <dt><tt><b>llvm/lib/BitCode/</b></tt></dt>
-  <dd>This directory holds code for reading and writing LLVM bitcode.</dd>
-
- <dt><tt><b>llvm/lib/Analysis/</b></tt><dd>This directory contains a variety of
- different program analyses, such as Dominator Information, Call Graphs,
- Induction Variables, Interval Identification, Natural Loop Identification,
- etc.</dd>
-
- <dt><tt><b>llvm/lib/Transforms/</b></tt></dt>
- <dd> This directory contains the source code for the LLVM to LLVM program
- transformations, such as Aggressive Dead Code Elimination, Sparse Conditional
- Constant Propagation, Inlining, Loop Invariant Code Motion, Dead Global
- Elimination, and many others.</dd>
-
- <dt><tt><b>llvm/lib/Target/</b></tt></dt>
- <dd> This directory contains files that describe various target architectures
- for code generation. For example, the <tt>llvm/lib/Target/X86</tt>
- directory holds the X86 machine description while
- <tt>llvm/lib/Target/ARM</tt> implements the ARM backend.</dd>
-
- <dt><tt><b>llvm/lib/CodeGen/</b></tt></dt>
- <dd> This directory contains the major parts of the code generator: Instruction
- Selector, Instruction Scheduling, and Register Allocation.</dd>
-
- <dt><tt><b>llvm/lib/MC/</b></tt></dt>
- <dd>(FIXME: T.B.D.)</dd>
-
- <!--FIXME: obsoleted -->
- <dt><tt><b>llvm/lib/Debugger/</b></tt></dt>
- <dd> This directory contains the source level debugger library that makes
- it possible to instrument LLVM programs so that a debugger could identify
- source code locations at which the program is executing.</dd>
-
- <dt><tt><b>llvm/lib/ExecutionEngine/</b></tt></dt>
- <dd> This directory contains libraries for executing LLVM bitcode directly
- at runtime in both interpreted and JIT compiled fashions.</dd>
-
- <dt><tt><b>llvm/lib/Support/</b></tt></dt>
- <dd> This directory contains the source code that corresponds to the header
- files located in <tt>llvm/include/ADT/</tt>
- and <tt>llvm/include/Support/</tt>.</dd>
-</dl>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="projects"><tt>llvm/projects</tt></a>
-</h3>
-
-<div>
- <p>This directory contains projects that are not strictly part of LLVM but are
- shipped with LLVM. This is also the directory where you should create your own
- LLVM-based projects. See <tt>llvm/projects/sample</tt> for an example of how
- to set up your own project.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="runtime"><tt>llvm/runtime</tt></a>
-</h3>
-
-<div>
-
-<p>This directory contains libraries which are compiled into LLVM bitcode and
-used when linking programs with the Clang front end. Most of these libraries are
-skeleton versions of real libraries; for example, libc is a stripped down
-version of glibc.</p>
-
-<p>Unlike the rest of the LLVM suite, this directory needs the LLVM GCC front
-end to compile.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="test"><tt>llvm/test</tt></a>
-</h3>
-
-<div>
- <p>This directory contains feature and regression tests and other basic sanity
- checks on the LLVM infrastructure. These are intended to run quickly and cover
- a lot of territory without being exhaustive.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="test-suite"><tt>test-suite</tt></a>
-</h3>
-
-<div>
- <p>This is not a directory in the normal llvm module; it is a separate
- Subversion
- module that must be checked out (usually to <tt>projects/test-suite</tt>).
- This
- module contains a comprehensive correctness, performance, and benchmarking
- test
- suite for LLVM. It is a separate Subversion module because not every LLVM
- user is
- interested in downloading or building such a comprehensive test suite. For
- further details on this test suite, please see the
- <a href="TestingGuide.html">Testing Guide</a> document.</p>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="tools"><tt>llvm/tools</tt></a>
-</h3>
-
-<div>
-
-<p>The <b>tools</b> directory contains the executables built out of the
-libraries above, which form the main part of the user interface. You can
-always get help for a tool by typing <tt>tool_name -help</tt>. The
-following is a brief introduction to the most important tools. More detailed
-information is in the <a href="CommandGuide/index.html">Command Guide</a>.</p>
-
-<dl>
-
- <dt><tt><b>bugpoint</b></tt></dt>
- <dd><tt>bugpoint</tt> is used to debug
- optimization passes or code generation backends by narrowing down the
- given test case to the minimum number of passes and/or instructions that
- still cause a problem, whether it is a crash or miscompilation. See <a
- href="HowToSubmitABug.html">HowToSubmitABug.html</a> for more information
- on using <tt>bugpoint</tt>.</dd>
-
- <dt><tt><b>llvm-ar</b></tt></dt>
- <dd>The archiver produces an archive containing
- the given LLVM bitcode files, optionally with an index for faster
- lookup.</dd>
-
- <dt><tt><b>llvm-as</b></tt></dt>
- <dd>The assembler transforms the human readable LLVM assembly to LLVM
- bitcode.</dd>
-
- <dt><tt><b>llvm-dis</b></tt></dt>
- <dd>The disassembler transforms the LLVM bitcode to human readable
- LLVM assembly.</dd>
-
- <dt><tt><b>llvm-link</b></tt></dt>
- <dd><tt>llvm-link</tt>, not surprisingly, links multiple LLVM modules into
- a single program.</dd>
-
- <dt><tt><b>lli</b></tt></dt>
- <dd><tt>lli</tt> is the LLVM interpreter, which
- can directly execute LLVM bitcode (although very slowly...). For architectures
- that support it (currently x86, Sparc, and PowerPC), by default, <tt>lli</tt>
- will function as a Just-In-Time compiler (if the functionality was compiled
- in), and will execute the code <i>much</i> faster than the interpreter.</dd>
-
- <dt><tt><b>llc</b></tt></dt>
- <dd> <tt>llc</tt> is the LLVM backend compiler, which
- translates LLVM bitcode to a native code assembly file or to C code (with
- the -march=c option).</dd>
-
- <dt><tt><b>llvm-gcc</b></tt></dt>
- <dd><tt>llvm-gcc</tt> is a GCC-based C frontend that has been retargeted to
- use LLVM as its backend instead of GCC's RTL backend. It can also emit LLVM
- bitcode or assembly (with the <tt>-emit-llvm</tt> option) instead of the
- usual machine code output. It works just like any other GCC compiler,
- taking the typical <tt>-c, -S, -E, -o</tt> options that are typically used.
- Additionally, the source code for <tt>llvm-gcc</tt> is available as a
- separate Subversion module.</dd>
-
- <dt><tt><b>opt</b></tt></dt>
- <dd><tt>opt</tt> reads LLVM bitcode, applies a series of LLVM to LLVM
- transformations (which are specified on the command line), and then outputs
- the resultant bitcode. The '<tt>opt -help</tt>' command is a good way to
- get a list of the program transformations available in LLVM.<br>
- <dd><tt>opt</tt> can also be used to run a specific analysis on an input
- LLVM bitcode file and print out the results. It is primarily useful for
- debugging analyses, or familiarizing yourself with what an analysis does.</dd>
-</dl>
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="utils"><tt>llvm/utils</tt></a>
-</h3>
-
-<div>
-
-<p>This directory contains utilities for working with LLVM source code, and some
-of the utilities are actually required as part of the build process because they
-are code generators for parts of LLVM infrastructure.</p>
-
-<dl>
- <dt><tt><b>codegen-diff</b></tt> <dd><tt>codegen-diff</tt> is a script
- that finds differences between code that LLC generates and code that LLI
- generates. This is a useful tool if you are debugging one of them,
- assuming that the other generates correct output. For the full user
- manual, run <tt>`perldoc codegen-diff'</tt>.<br><br>
-
- <dt><tt><b>emacs/</b></tt> <dd>The <tt>emacs</tt> directory contains
- syntax-highlighting files which will work with Emacs and XEmacs editors,
- providing syntax highlighting support for LLVM assembly files and TableGen
- description files. For information on how to use the syntax files, consult
- the <tt>README</tt> file in that directory.<br><br>
-
- <dt><tt><b>getsrcs.sh</b></tt> <dd>The <tt>getsrcs.sh</tt> script finds
- and outputs all non-generated source files, which is useful if one wishes
- to do a lot of development across directories and does not want to
- individually find each file. One way to use it is to run, for example:
-  <tt>xemacs `utils/getsrcs.sh`</tt> from the top of your LLVM source
- tree.<br><br>
-
- <dt><tt><b>llvmgrep</b></tt></dt>
- <dd>This little tool performs an "egrep -H -n" on each source file in LLVM and
- passes to it a regular expression provided on <tt>llvmgrep</tt>'s command
- line. This is a very efficient way of searching the source base for a
- particular regular expression.</dd>
-
- <dt><tt><b>makellvm</b></tt> <dd>The <tt>makellvm</tt> script compiles all
- files in the current directory and then compiles and links the tool that
- is the first argument. For example, assuming you are in the directory
- <tt>llvm/lib/Target/Sparc</tt>, if <tt>makellvm</tt> is in your path,
- simply running <tt>makellvm llc</tt> will make a build of the current
- directory, switch to directory <tt>llvm/tools/llc</tt> and build it,
- causing a re-linking of LLC.<br><br>
-
- <dt><tt><b>TableGen/</b></tt> <dd>The <tt>TableGen</tt> directory contains
- the tool used to generate register descriptions, instruction set
- descriptions, and even assemblers from common TableGen description
- files.<br><br>
-
- <dt><tt><b>vim/</b></tt> <dd>The <tt>vim</tt> directory contains
- syntax-highlighting files which will work with the VIM editor, providing
- syntax highlighting support for LLVM assembly files and TableGen
- description files. For information on how to use the syntax files, consult
- the <tt>README</tt> file in that directory.<br><br>
-
-</dl>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="tutorial">An Example Using the LLVM Tool Chain</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-<p>This section gives an example of using LLVM with the Clang front end.</p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="tutorial4">Example with clang</a>
-</h3>
-
-<div>
-
-<ol>
- <li><p>First, create a simple C file, name it 'hello.c':</p>
-
-<div class="doc_code">
-<pre>
-#include &lt;stdio.h&gt;
-
-int main() {
- printf("hello world\n");
- return 0;
-}
-</pre></div></li>
-
- <li><p>Next, compile the C file into a native executable:</p>
-
- <div class="doc_code"><pre>% clang hello.c -o hello</pre></div>
-
- <p>Note that clang works just like GCC by default. The standard -S and
- -c arguments work as usual (producing a native .s or .o file,
- respectively).</p></li>
-
- <li><p>Next, compile the C file into a LLVM bitcode file:</p>
-
- <div class="doc_code">
- <pre>% clang -O3 -emit-llvm hello.c -c -o hello.bc</pre></div>
-
- <p>The -emit-llvm option can be used with the -S or -c options to emit an
- LLVM ".ll" or ".bc" file (respectively) for the code. This allows you
- to use the <a href="CommandGuide/index.html">standard LLVM tools</a> on
- the bitcode file.</p></li>
-
- <li><p>Run the program in both forms. To run the program, use:</p>
-
- <div class="doc_code"><pre>% ./hello</pre></div>
-
- <p>and</p>
-
- <div class="doc_code"><pre>% lli hello.bc</pre></div>
-
- <p>The second examples shows how to invoke the LLVM JIT, <a
- href="CommandGuide/html/lli.html">lli</a>.</p></li>
-
- <li><p>Use the <tt>llvm-dis</tt> utility to take a look at the LLVM assembly
- code:</p>
-
-<div class="doc_code">
-<pre>llvm-dis &lt; hello.bc | less</pre>
-</div></li>
-
- <li><p>Compile the program to native assembly using the LLC code
- generator:</p>
-
- <div class="doc_code"><pre>% llc hello.bc -o hello.s</pre></div></li>
-
- <li><p>Assemble the native assembly language file into a program:</p>
-
-<div class="doc_code">
-<pre>
-<b>Solaris:</b> % /opt/SUNWspro/bin/cc -xarch=v9 hello.s -o hello.native
-
-<b>Others:</b> % gcc hello.s -o hello.native
-</pre>
-</div></li>
-
- <li><p>Execute the native code program:</p>
-
- <div class="doc_code"><pre>% ./hello.native</pre></div>
-
- <p>Note that using clang to compile directly to native code (i.e. when
- the -emit-llvm option is not present) does steps 6/7/8 for you.</p>
- </li>
-
-</ol>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="problems">Common Problems</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If you are having problems building or using LLVM, or if you have any other
-general questions about LLVM, please consult the <a href="FAQ.html">Frequently
-Asked Questions</a> page.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="links">Links</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document is just an <b>introduction</b> on how to use LLVM to do
-some simple things... there are many more interesting and complicated things
-that you can do that aren't documented here (but we'll gladly accept a patch
-if you want to write something up!). For more information about LLVM, check
-out:</p>
-
-<ul>
- <li><a href="http://llvm.org/">LLVM homepage</a></li>
- <li><a href="http://llvm.org/doxygen/">LLVM doxygen tree</a></li>
- <li><a href="http://llvm.org/docs/Projects.html">Starting a Project
- that Uses LLVM</a></li>
-</ul>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.x10sys.com/rspencer/">Reid Spencer</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-07-23 10:51:15 +0200 (Mon, 23 Jul 2012) $
-</address>
-</body>
-</html>
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
new file mode 100644
index 000000000000..68768921f6ae
--- /dev/null
+++ b/docs/GettingStarted.rst
@@ -0,0 +1,1304 @@
+.. _getting_started:
+
+====================================
+Getting Started with the LLVM System
+====================================
+
+Overview
+========
+
+Welcome to LLVM! In order to get started, you first need to know some basic
+information.
+
+First, LLVM comes in three pieces. The first piece is the LLVM suite. This
+contains all of the tools, libraries, and header files needed to use LLVM. It
+contains an assembler, disassembler, bitcode analyzer and bitcode optimizer. It
+also contains basic regression tests that can be used to test the LLVM tools and
+the Clang front end.
+
+The second piece is the `Clang <http://clang.llvm.org/>`_ front end. This
+component compiles C, C++, Objective C, and Objective C++ code into LLVM
+bitcode. Once compiled into LLVM bitcode, a program can be manipulated with the
+LLVM tools from the LLVM suite.
+
+There is a third, optional piece called Test Suite. It is a suite of programs
+with a testing harness that can be used to further test LLVM's functionality
+and performance.
+
+Getting Started Quickly (A Summary)
+===================================
+
+The LLVM Getting Started documentation may be out of date, so the `Clang
+Getting Started <http://clang.llvm.org/get_started.html>`_ page might also be a
+good place to start.
+
+Here's the short story for getting up and running quickly with LLVM:
+
+#. Read the documentation.
+#. Read the documentation.
+#. Remember that you were warned twice about reading the documentation.
+#. Checkout LLVM:
+
+ * ``cd where-you-want-llvm-to-live``
+ * ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm``
+
+#. Checkout Clang:
+
+ * ``cd where-you-want-llvm-to-live``
+ * ``cd llvm/tools``
+ * ``svn co http://llvm.org/svn/llvm-project/cfe/trunk clang``
+
+#. Checkout Compiler-RT:
+
+ * ``cd where-you-want-llvm-to-live``
+ * ``cd llvm/projects``
+ * ``svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk compiler-rt``
+
+#. Get the Test Suite Source Code **[Optional]**
+
+ * ``cd where-you-want-llvm-to-live``
+ * ``cd llvm/projects``
+ * ``svn co http://llvm.org/svn/llvm-project/test-suite/trunk test-suite``
+
+#. Configure and build LLVM and Clang:
+
+ * ``cd where-you-want-to-build-llvm``
+ * ``mkdir build`` (for building without polluting the source dir)
+ * ``cd build``
+ * ``../llvm/configure [options]``
+ Some common options:
+
+ * ``--prefix=directory`` ---
+
+ Specify for *directory* the full pathname of where you want the LLVM
+ tools and libraries to be installed (default ``/usr/local``).
+
+ * ``--enable-optimized`` ---
+
+ Compile with optimizations enabled (default is NO).
+
+ * ``--enable-assertions`` ---
+
+ Compile with assertion checks enabled (default is YES).
+
+ * ``make [-j]`` --- The ``-j`` specifies the number of jobs (commands) to run
+ simultaneously. This builds both LLVM and Clang for Debug+Asserts mode.
+     The ``--enable-optimized`` configure option is used to specify a Release
+     build.
+
+   * ``make check-all`` --- This runs the regression tests to ensure everything
+ is in working order.
+
+ * ``make update`` --- This command is used to update all the svn repositories
+     at once, rather than having to ``cd`` into the individual repositories and
+     run ``svn update``.
+
+ * It is also possible to use CMake instead of the makefiles. With CMake it is
+ also possible to generate project files for several IDEs: Eclipse CDT4,
+ CodeBlocks, Qt-Creator (use the CodeBlocks generator), KDevelop3.
+
+ * If you get an "internal compiler error (ICE)" or test failures, see
+ `below`.
+
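+Putting the quick-start steps together, a minimal end-to-end sketch might look
+like the following, assuming the checkout and the build directory live side by
+side (the install prefix and the job count are only illustrative):
+
+.. code-block:: bash
+
+  % svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm
+  % mkdir build && cd build
+  % ../llvm/configure --prefix=/usr/local
+  % make -j4
+  % make check-all
+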
+Consult the `Getting Started with LLVM`_ section for detailed information on
+configuring and compiling LLVM. See `Setting Up Your Environment`_ for tips
+that simplify working with the Clang front end and LLVM tools. Go to `Program
+Layout`_ to learn about the layout of the source code tree.
+
+Requirements
+============
+
+Before you begin to use the LLVM system, review the requirements given below.
+This may save you some trouble by knowing ahead of time what hardware and
+software you will need.
+
+Hardware
+--------
+
+LLVM is known to work on the following platforms:
+
++-----------------+----------------------+-------------------------+
+|OS | Arch | Compilers |
++=================+======================+=========================+
+|AuroraUX | x86\ :sup:`1` | GCC |
++-----------------+----------------------+-------------------------+
+|Linux | x86\ :sup:`1` | GCC |
++-----------------+----------------------+-------------------------+
+|Linux | amd64 | GCC |
++-----------------+----------------------+-------------------------+
+|Solaris | V9 (Ultrasparc) | GCC |
++-----------------+----------------------+-------------------------+
+|FreeBSD | x86\ :sup:`1` | GCC |
++-----------------+----------------------+-------------------------+
+|FreeBSD | amd64 | GCC |
++-----------------+----------------------+-------------------------+
+|MacOS X\ :sup:`2`| PowerPC | GCC |
++-----------------+----------------------+-------------------------+
+|MacOS X\ :sup:`9`| x86 | GCC |
++-----------------+----------------------+-------------------------+
+|Cygwin/Win32 | x86\ :sup:`1, 8, 11` | GCC 3.4.X, binutils 2.20|
++-----------------+----------------------+-------------------------+
+
+LLVM has partial support for the following platforms:
+
++-------------------+----------------------+-------------------------------------------+
+|OS | Arch | Compilers |
++===================+======================+===========================================+
+| Windows | x86\ :sup:`1` | Visual Studio 2000 or higher\ :sup:`4,5` |
++-------------------+----------------------+-------------------------------------------+
+| AIX\ :sup:`3,4` | PowerPC | GCC |
++-------------------+----------------------+-------------------------------------------+
+| Linux\ :sup:`3,5` | PowerPC | GCC |
++-------------------+----------------------+-------------------------------------------+
+| Linux\ :sup:`7` | Alpha | GCC |
++-------------------+----------------------+-------------------------------------------+
+| Linux\ :sup:`7` | Itanium (IA-64) | GCC |
++-------------------+----------------------+-------------------------------------------+
+| HP-UX\ :sup:`7` | Itanium (IA-64) | HP aCC |
++-------------------+----------------------+-------------------------------------------+
+| Windows x64 | x86-64 | mingw-w64's GCC-4.5.x\ :sup:`12` |
++-------------------+----------------------+-------------------------------------------+
+
+.. note::
+
+ #. Code generation supported for Pentium processors and up
+ #. Code generation supported for 32-bit ABI only
+ #. No native code generation
+ #. Build is not complete: one or more tools do not link or function
+ #. The GCC-based C/C++ frontend does not build
+ #. The port is done using the MSYS shell.
+ #. Native code generation exists but is not complete.
+ #. Binutils 2.20 or later is required to build the assembler generated by LLVM properly.
+ #. Xcode 2.5 and gcc 4.0.1 (Apple Build 5370) will trip internal LLVM assert
+ messages when compiled for Release at optimization levels greater than 0
+ (i.e., ``-O1`` and higher). Add ``OPTIMIZE_OPTION="-O0"`` to the build
+ command line if compiling for LLVM Release or bootstrapping the LLVM
+ toolchain.
+ #. For MSYS/MinGW on Windows, be sure to install the MSYS version of the perl
+ package, and be sure it appears in your path before any Windows-based
+ versions such as Strawberry Perl and ActivePerl, as these have
+ Windows-specifics that will cause the build to fail.
+ #. To use LLVM modules on Win32-based system, you may configure LLVM
+ with ``--enable-shared``.
+
+ #. To compile SPU backend, you need to add ``LDFLAGS=-Wl,--stack,16777216`` to
+ configure.
+
+Note that you will need about 1-3 GB of space for a full LLVM build in Debug
+mode, depending on the system (it is so large because of all the debugging
+information and the fact that the libraries are statically linked into multiple
+tools). If you do not need many of the tools and you are space-conscious, you
+can pass ``ONLY_TOOLS="tools you need"`` to make. The Release build requires
+considerably less space.
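+
+For example, a space-conscious build of just a handful of tools might look like
+this (the particular tool names are only an illustration):
+
+.. code-block:: bash
+
+  % make ONLY_TOOLS="llvm-as llvm-dis llc opt"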
+
+The LLVM suite *may* compile on other platforms, but it is not guaranteed to do
+so. If compilation is successful, the LLVM utilities should be able to
+assemble, disassemble, analyze, and optimize LLVM bitcode. Code generation
+should work as well, although the generated native code may not work on your
+platform.
+
+Software
+--------
+
+Compiling LLVM requires that you have several software packages installed. The
+table below lists those required packages. The Package column is the usual name
+for the software package that LLVM depends on. The Version column provides
+"known to work" versions of the package. The Notes column describes how LLVM
+uses the package and provides other details.
+
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| Package | Version | Notes |
++==============================================================+=================+=============================================+
+| `GNU Make <http://savannah.gnu.org/projects/make>`_ | 3.79, 3.79.1 | Makefile/build processor |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `GCC <http://gcc.gnu.org/>`_ | 3.4.2 | C/C++ compiler\ :sup:`1` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `TeXinfo <http://www.gnu.org/software/texinfo/>`_ | 4.5 | For building the CFE |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `SVN <http://subversion.tigris.org/project_packages.html>`_ | >=1.3 | Subversion access to LLVM\ :sup:`2` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `DejaGnu <http://savannah.gnu.org/projects/dejagnu>`_ | 1.4.2 | Automated test suite\ :sup:`3` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `tcl <http://www.tcl.tk/software/tcltk/>`_ | 8.3, 8.4 | Automated test suite\ :sup:`3` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `expect <http://expect.nist.gov/>`_ | 5.38.0 | Automated test suite\ :sup:`3` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `perl <http://www.perl.com/download.csp>`_ | >=5.6.0 | Utilities |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `GNU M4 <http://savannah.gnu.org/projects/m4>`_ | 1.4 | Macro processor for configuration\ :sup:`4` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `GNU Autoconf <http://www.gnu.org/software/autoconf/>`_ | 2.60 | Configuration script builder\ :sup:`4` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `GNU Automake <http://www.gnu.org/software/automake/>`_ | 1.9.6 | aclocal macro generator\ :sup:`4` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+| `libtool <http://savannah.gnu.org/projects/libtool>`_ | 1.5.22 | Shared library manager\ :sup:`4` |
++--------------------------------------------------------------+-----------------+---------------------------------------------+
+
+.. note::
+
+ #. Only the C and C++ languages are needed so there's no need to build the
+ other languages for LLVM's purposes. See `below` for specific version
+ info.
+ #. You only need Subversion if you intend to build from the latest LLVM
+ sources. If you're working from a release distribution, you don't need
+ Subversion.
+ #. Only needed if you want to run the automated test suite in the
+ ``llvm/test`` directory.
+ #. If you want to make changes to the configure scripts, you will need GNU
+ autoconf (2.60), and consequently, GNU M4 (version 1.4 or higher). You
+ will also need automake (1.9.6). We only use aclocal from that package.
+
+Additionally, your compilation host is expected to have the usual plethora of
+Unix utilities. Specifically:
+
+* **ar** --- archive library builder
+* **bzip2** --- bzip2 command for distribution generation
+* **bunzip2** --- bunzip2 command for distribution checking
+* **chmod** --- change permissions on a file
+* **cat** --- output concatenation utility
+* **cp** --- copy files
+* **date** --- print the current date/time
+* **echo** --- print to standard output
+* **egrep** --- extended regular expression search utility
+* **find** --- find files/dirs in a file system
+* **grep** --- regular expression search utility
+* **gzip** --- gzip command for distribution generation
+* **gunzip** --- gunzip command for distribution checking
+* **install** --- install directories/files
+* **mkdir** --- create a directory
+* **mv** --- move (rename) files
+* **ranlib** --- symbol table builder for archive libraries
+* **rm** --- remove (delete) files and directories
+* **sed** --- stream editor for transforming output
+* **sh** --- Bourne shell for make build scripts
+* **tar** --- tape archive for distribution generation
+* **test** --- test things in file system
+* **unzip** --- unzip command for distribution checking
+* **zip** --- zip command for distribution generation
+
+.. _below:
+.. _check here:
+
+Broken versions of GCC and other tools
+--------------------------------------
+
+LLVM is very demanding of the host C++ compiler, and as such tends to expose
+bugs in the compiler. In particular, several versions of GCC crash when trying
+to compile LLVM. We routinely use GCC 4.2 (and higher) or Clang. Other
+versions of GCC will probably work as well. GCC versions listed here are known
+to not work. If you are using one of these versions, please try to upgrade your
+GCC to something more recent. If you run into a problem with a version of GCC
+not listed here, please `let us know <mailto:llvmdev@cs.uiuc.edu>`_. Please use
+the "``gcc -v``" command to find out which version of GCC you are using.
+
+**GCC versions prior to 3.0**: GCC 2.96.x and before had several problems in the
+STL that effectively prevented it from compiling LLVM.
+
+**GCC 3.2.2 and 3.2.3**: These versions of GCC fail to compile LLVM with a
+bogus template error. This was fixed in later GCCs.
+
+**GCC 3.3.2**: This version of GCC suffered from a `serious bug
+<http://gcc.gnu.org/PR13392>`_ which causes it to crash in the
+"``convert_from_eh_region_ranges_1``" GCC function.
+
+**Cygwin GCC 3.3.3**: The version of GCC 3.3.3 commonly shipped with Cygwin does
+not work.
+
+**SuSE GCC 3.3.3**: The version of GCC 3.3.3 shipped with SuSE 9.1 (and possibly
+others) does not compile LLVM correctly (it appears that exception handling is
+broken in some cases). Please download the FSF 3.3.3 or upgrade to a newer
+version of GCC.
+
+**GCC 3.4.0 on linux/x86 (32-bit)**: GCC miscompiles portions of the code
+generator, causing an infinite loop in the llvm-gcc build when built with
+optimizations enabled (i.e. a release build).
+
+**GCC 3.4.2 on linux/x86 (32-bit)**: GCC miscompiles portions of the code
+generator at -O3, as with 3.4.0. However gcc 3.4.2 (unlike 3.4.0) correctly
+compiles LLVM at -O2. A workaround is to build release LLVM builds with
+"``make ENABLE_OPTIMIZED=1 OPTIMIZE_OPTION=-O2 ...``"
+
+**GCC 3.4.x on X86-64/amd64**: GCC `miscompiles portions of LLVM
+<http://llvm.org/PR1056>`__.
+
+**GCC 3.4.4 (CodeSourcery ARM 2005q3-2)**: this compiler miscompiles LLVM when
+building with optimizations enabled. It appears to work with "``make
+ENABLE_OPTIMIZED=1 OPTIMIZE_OPTION=-O1``" or with a debug build.
+
+**IA-64 GCC 4.0.0**: The IA-64 version of GCC 4.0.0 is known to miscompile LLVM.
+
+**Apple Xcode 2.3**: GCC crashes when compiling LLVM at -O3 (which is the
+default with ENABLE_OPTIMIZED=1). To work around this, build with
+"``ENABLE_OPTIMIZED=1 OPTIMIZE_OPTION=-O2``".
+
+**GCC 4.1.1**: GCC fails to build LLVM with template concept check errors
+compiling some files. At the time of this writing, GCC mainline (4.2) did not
+share the problem.
+
+**GCC 4.1.1 on X86-64/amd64**: GCC `miscompiles portions of LLVM
+<http://llvm.org/PR1063>`__ when compiling llvm itself into 64-bit code. LLVM
+will appear to mostly work but will be buggy, e.g. failing portions of its
+testsuite.
+
+**GCC 4.1.2 on OpenSUSE**: Seg faults during the libstdc++ build; on x86_64
+platforms, compiling md5.c produces a mangled constant.
+
+**GCC 4.1.2 (20061115 (prerelease) (Debian 4.1.1-21)) on Debian**: Appears to
+miscompile parts of LLVM 2.4. One symptom is ValueSymbolTable complaining about
+symbols remaining in the table on destruction.
+
+**GCC 4.1.2 20071124 (Red Hat 4.1.2-42)**: Suffers from the same symptoms as the
+previous one. It appears to work with ENABLE_OPTIMIZED=0 (the default).
+
+**Cygwin GCC 4.3.2 20080827 (beta) 2**: Users `reported
+<http://llvm.org/PR4145>`_ various problems related with link errors when using
+this GCC version.
+
+**Debian GCC 4.3.2 on X86**: Crashes building some files in LLVM 2.6.
+
+**GCC 4.3.3 (Debian 4.3.3-10) on ARM**: Miscompiles parts of LLVM 2.6 when
+optimizations are turned on. The symptom is an infinite loop in
+``FoldingSetImpl::RemoveNode`` while running the code generator.
+
+**SUSE 11 GCC 4.3.4**: Miscompiles LLVM, causing crashes in ValueHandle logic.
+
+**GCC 4.3.5 and GCC 4.4.5 on ARM**: These can miscompile ``value >> 1`` even at
+``-O0``. A test failure in ``test/Assembler/alignstack.ll`` is one symptom of
+the problem.
+
+**GNU ld 2.16.X**: Some 2.16.X versions of the ld linker will produce very long
+warning messages complaining that some "``.gnu.linkonce.t.*``" symbol was
+defined in a discarded section. You can safely ignore these messages as they are
+erroneous and the linkage is correct. These messages disappear using ld 2.17.
+
+**GNU binutils 2.17**: Binutils 2.17 contains `a bug
+<http://sourceware.org/bugzilla/show_bug.cgi?id=3111>`__ which causes huge link
+times (minutes instead of seconds) when building LLVM. We recommend upgrading
+to a newer version (2.17.50.0.4 or later).
+
+**GNU Binutils 2.19.1 Gold**: This version of Gold contained `a bug
+<http://sourceware.org/bugzilla/show_bug.cgi?id=9836>`__ which causes
+intermittent failures when building LLVM with position independent code. The
+symptom is an error about cyclic dependencies. We recommend upgrading to a
+newer version of Gold.
+
+.. _Getting Started with LLVM:
+
+Getting Started with LLVM
+=========================
+
+The remainder of this guide is meant to get you up and running with LLVM and to
+give you some basic information about the LLVM environment.
+
+The later sections of this guide describe the `general layout`_ of the LLVM
+source tree, a `simple example`_ using the LLVM tool chain, and `links`_ to find
+more information about LLVM or to get help via e-mail.
+
+Terminology and Notation
+------------------------
+
+Throughout this manual, the following names are used to denote paths specific to
+the local system and working environment. *These are not environment variables
+you need to set but just strings used in the rest of this document below*. In
+any of the examples below, simply replace each of these names with the
+appropriate pathname on your local system. All these paths are absolute:
+
+``SRC_ROOT``
+
+ This is the top level directory of the LLVM source tree.
+
+``OBJ_ROOT``
+
+ This is the top level directory of the LLVM object tree (i.e. the tree where
+  object files and compiled programs will be placed; it can be the same as
+ SRC_ROOT).
+
+.. _Setting Up Your Environment:
+
+Setting Up Your Environment
+---------------------------
+
+In order to compile and use LLVM, you may need to set some environment
+variables.
+
+``LLVM_LIB_SEARCH_PATH=/path/to/your/bitcode/libs``
+
+ [Optional] This environment variable helps LLVM linking tools find the
+ locations of your bitcode libraries. It is provided only as a convenience
+ since you can specify the paths using the -L options of the tools and the
+ C/C++ front-end will automatically use the bitcode files installed in its
+ ``lib`` directory.
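+
+  For example, in a Bourne-style shell you might set it like this (the path is
+  only a placeholder):
+
+  .. code-block:: bash
+
+    % export LLVM_LIB_SEARCH_PATH=/path/to/your/bitcode/libs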
+
+Unpacking the LLVM Archives
+---------------------------
+
+If you have the LLVM distribution, you will need to unpack it before you can
+begin to compile it. LLVM is distributed as a set of two files: the LLVM suite
+and the LLVM GCC front end compiled for your platform. There is an additional
+test suite that is optional. Each file is a TAR archive that is compressed with
+the gzip program.
+
+The files are as follows, with *x.y* marking the version number:
+
+``llvm-x.y.tar.gz``
+
+ Source release for the LLVM libraries and tools.
+
+``llvm-test-x.y.tar.gz``
+
+ Source release for the LLVM test-suite.
+
+``llvm-gcc-4.2-x.y.source.tar.gz``
+
+ Source release of the llvm-gcc-4.2 front end. See README.LLVM in the root
+ directory for build instructions.
+
+``llvm-gcc-4.2-x.y-platform.tar.gz``
+
+ Binary release of the llvm-gcc-4.2 front end for a specific platform.
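+
+Each archive can be unpacked with ``tar``; for example, with *x.y* replaced by
+the actual version number:
+
+.. code-block:: bash
+
+  % tar -zxf llvm-x.y.tar.gz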
+
+Checkout LLVM from Subversion
+-----------------------------
+
+If you have access to our Subversion repository, you can get a fresh copy of the
+entire source code. All you need to do is check it out from Subversion as
+follows:
+
+* ``cd where-you-want-llvm-to-live``
+* Read-Only: ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm``
+* Read-Write: ``svn co https://user@llvm.org/svn/llvm-project/llvm/trunk llvm``
+
+This will create an '``llvm``' directory in the current directory and fully
+populate it with the LLVM source code, Makefiles, test directories, and local
+copies of documentation files.
+
+If you want to get a specific release (as opposed to the most recent revision),
+you can check it out from the '``tags``' directory (instead of '``trunk``'). The
+following releases are located in the following subdirectories of the '``tags``'
+directory:
+
+* Release 3.1: **RELEASE_31/final**
+* Release 3.0: **RELEASE_30/final**
+* Release 2.9: **RELEASE_29/final**
+* Release 2.8: **RELEASE_28**
+* Release 2.7: **RELEASE_27**
+* Release 2.6: **RELEASE_26**
+* Release 2.5: **RELEASE_25**
+* Release 2.4: **RELEASE_24**
+* Release 2.3: **RELEASE_23**
+* Release 2.2: **RELEASE_22**
+* Release 2.1: **RELEASE_21**
+* Release 2.0: **RELEASE_20**
+* Release 1.9: **RELEASE_19**
+* Release 1.8: **RELEASE_18**
+* Release 1.7: **RELEASE_17**
+* Release 1.6: **RELEASE_16**
+* Release 1.5: **RELEASE_15**
+* Release 1.4: **RELEASE_14**
+* Release 1.3: **RELEASE_13**
+* Release 1.2: **RELEASE_12**
+* Release 1.1: **RELEASE_11**
+* Release 1.0: **RELEASE_1**
+
+If you would like to get the LLVM test suite (a separate package as of 1.4), you
+can get it from the Subversion repository:
+
+.. code-block:: bash
+
+ % cd llvm/projects
+ % svn co http://llvm.org/svn/llvm-project/test-suite/trunk test-suite
+
+By placing it in ``llvm/projects``, it will be automatically configured by
+the LLVM configure script as well as automatically updated when you run ``svn
+update``.
+
+GIT mirror
+----------
+
+GIT mirrors are available for a number of LLVM subprojects. These mirrors sync
+automatically with each Subversion commit and contain all necessary git-svn
+marks (so, you can recreate git-svn metadata locally). Note that right now
+mirrors reflect only ``trunk`` for each project. You can do the read-only GIT
+clone of LLVM via:
+
+.. code-block:: bash
+
+ % git clone http://llvm.org/git/llvm.git
+
+If you want to check out clang too, run:
+
+.. code-block:: bash
+
+ % git clone http://llvm.org/git/llvm.git
+ % cd llvm/tools
+ % git clone http://llvm.org/git/clang.git
+
+Since the upstream repository is in Subversion, you should use ``git
+pull --rebase`` instead of ``git pull`` to avoid generating a non-linear history
+in your clone. To configure ``git pull`` to pass ``--rebase`` by default on the
+master branch, run the following command:
+
+.. code-block:: bash
+
+ % git config branch.master.rebase true
+
+Sending patches with Git
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+Please read `Developer Policy <DeveloperPolicy.html#one-off-patches>`_, too.
+
+Assume ``master`` points to the upstream and ``mybranch`` points to your
+working branch, and that ``mybranch`` is rebased onto ``master``. First, you
+may want to check the sanity of your whitespace:
+
+.. code-block:: bash
+
+ % git diff --check master..mybranch
+
+The easiest way to generate a patch is as below:
+
+.. code-block:: bash
+
+ % git diff master..mybranch > /path/to/mybranch.diff
+
+This is a little different from an svn-generated diff: a git-generated diff has
+prefixes like ``a/`` and ``b/``. Don't worry, such a patch can still be applied
+with ``patch -p1 -N``.
+
+Alternatively, you can generate a patch set with git-format-patch, which
+produces one patch per commit. To generate patch files to attach to your
+message:
+
+.. code-block:: bash
+
+ % git format-patch --no-attach master..mybranch -o /path/to/your/patchset
+
+If you would like to send patches directly, you may use git-send-email or
+git-imap-send. Here is an example that places the patch set in Gmail's [Drafts].
+
+.. code-block:: bash
+
+ % git format-patch --attach master..mybranch --stdout | git imap-send
+
+For this to work, your ``.git/config`` needs an ``[imap]`` section, for example:
+
+.. code-block:: bash
+
+ [imap]
+ host = imaps://imap.gmail.com
+ user = your.gmail.account@gmail.com
+ pass = himitsu!
+ port = 993
+ sslverify = false
+ ; in English
+ folder = "[Gmail]/Drafts"
+ ; example for Japanese, "Modified UTF-7" encoded.
+ folder = "[Gmail]/&Tgtm+DBN-"
+ ; example for Traditional Chinese
+ folder = "[Gmail]/&g0l6Pw-"
+
+For developers to work with git-svn
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To set up a clone from which you can submit code using ``git-svn``, run:
+
+.. code-block:: bash
+
+ % git clone http://llvm.org/git/llvm.git
+ % cd llvm
+ % git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username=<username>
+ % git config svn-remote.svn.fetch :refs/remotes/origin/master
+ % git svn rebase -l # -l avoids fetching ahead of the git mirror.
+
+ # If you have clang too:
+ % cd tools
+ % git clone http://llvm.org/git/clang.git
+ % cd clang
+ % git svn init https://llvm.org/svn/llvm-project/cfe/trunk --username=<username>
+ % git config svn-remote.svn.fetch :refs/remotes/origin/master
+ % git svn rebase -l
+
+To update this clone without generating git-svn tags that conflict with the
+upstream git repo, run:
+
+.. code-block:: bash
+
+ % git fetch && (cd tools/clang && git fetch) # Get matching revisions of both trees.
+ % git checkout master
+ % git svn rebase -l
+ % (cd tools/clang &&
+ git checkout master &&
+ git svn rebase -l)
+
+This leaves your working directories on their master branches, so you'll need to
+``checkout`` each working branch individually and ``rebase`` it on top of its
+parent branch. (Note: This script is intended for relative newbies to git. If
+you have more experience, you can likely improve on it.)
+
+The git-svn metadata can get out of sync after you mess around with branches and
+``dcommit``. When that happens, ``git svn dcommit`` stops working, complaining
+about files with uncommitted changes. The fix is to rebuild the metadata:
+
+.. code-block:: bash
+
+ % rm -rf .git/svn
+ % git svn rebase -l
+
+Local LLVM Configuration
+------------------------
+
+Once checked out from the Subversion repository, the LLVM suite source code must
+be configured via the ``configure`` script. This script sets variables in the
+various ``*.in`` files, most notably ``llvm/Makefile.config`` and
+``llvm/include/Config/config.h``. It also populates *OBJ_ROOT* with the
+Makefiles needed to begin building LLVM.
+
+The following environment variables are used by the ``configure`` script to
+configure the build system:
+
++------------+-----------------------------------------------------------+
+| Variable | Purpose |
++============+===========================================================+
+| CC | Tells ``configure`` which C compiler to use. By default, |
+| | ``configure`` will look for the first GCC C compiler in |
+| | ``PATH``. Use this variable to override ``configure``\'s |
+| | default behavior. |
++------------+-----------------------------------------------------------+
+| CXX | Tells ``configure`` which C++ compiler to use. By |
+| | default, ``configure`` will look for the first GCC C++ |
+| | compiler in ``PATH``. Use this variable to override |
+| | ``configure``'s default behavior. |
++------------+-----------------------------------------------------------+
+
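+For example, to make ``configure`` use a specific compiler you can set these
+variables on the command line (the compiler names are only illustrative):
+
+.. code-block:: bash
+
+  % CC=clang CXX=clang++ SRC_ROOT/configure
+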
+The following options can be used to set or enable LLVM specific options:
+
+``--enable-optimized``
+
+ Enables optimized compilation (debugging symbols are removed and GCC
+ optimization flags are enabled). Note that this is the default setting if you
+  are using the LLVM distribution. The default behavior of a Subversion
+ checkout is to use an unoptimized build (also known as a debug build).
+
+``--enable-debug-runtime``
+
+ Enables debug symbols in the runtime libraries. The default is to strip debug
+ symbols from the runtime libraries.
+
+``--enable-jit``
+
+ Compile the Just In Time (JIT) compiler functionality. This is not available
+ on all platforms. The default is dependent on platform, so it is best to
+ explicitly enable it if you want it.
+
+``--enable-targets=target-option``
+
+ Controls which targets will be built and linked into llc. The default value
+ for ``target_options`` is "all" which builds and links all available targets.
+ The value "host-only" can be specified to build only a native compiler (no
+ cross-compiler targets available). The "native" target is selected as the
+ target of the build host. You can also specify a comma separated list of
+ target names that you want available in llc. The target names use all lower
+ case. The current set of targets is:
+
+ ``arm, cpp, hexagon, mblaze, mips, mipsel, msp430, powerpc, ptx, sparc, spu,
+ x86, x86_64, xcore``.
+
+``--enable-doxygen``
+
+ Look for the doxygen program and enable construction of doxygen based
+ documentation from the source code. This is disabled by default because
+  generating the documentation can take a long time and produces hundreds of
+ megabytes of output.
+
+``--with-udis86``
+
+  LLVM can use an external disassembler library for various purposes (currently
+  it is used only for examining code produced by the JIT). This option enables
+  usage of the
+ `udis86 <http://udis86.sourceforge.net/>`_ x86 (both 32 and 64 bits)
+ disassembler library.
+
+To configure LLVM, follow these steps:
+
+#. Change directory into the object root directory:
+
+ .. code-block:: bash
+
+ % cd OBJ_ROOT
+
+#. Run the ``configure`` script located in the LLVM source tree:
+
+ .. code-block:: bash
+
+ % SRC_ROOT/configure --prefix=/install/path [other options]
+
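+As a concrete illustration, an optimized build installed under
+``/usr/local/llvm`` that only targets x86 might be configured like this (the
+prefix and target list are only examples):
+
+.. code-block:: bash
+
+  % cd OBJ_ROOT
+  % SRC_ROOT/configure --prefix=/usr/local/llvm --enable-optimized \
+      --enable-targets=x86,x86_64
+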
+Compiling the LLVM Suite Source Code
+------------------------------------
+
+Once you have configured LLVM, you can build it. There are three types of
+builds:
+
+Debug Builds
+
+  These builds are the default when one is using a Subversion checkout and
+ types ``gmake`` (unless the ``--enable-optimized`` option was used during
+ configuration). The build system will compile the tools and libraries with
+ debugging information. To get a Debug Build using the LLVM distribution the
+ ``--disable-optimized`` option must be passed to ``configure``.
+
+Release (Optimized) Builds
+
+ These builds are enabled with the ``--enable-optimized`` option to
+ ``configure`` or by specifying ``ENABLE_OPTIMIZED=1`` on the ``gmake`` command
+ line. For these builds, the build system will compile the tools and libraries
+ with GCC optimizations enabled and strip debugging information from the
+ libraries and executables it generates. Note that Release Builds are default
+ when using an LLVM distribution.
+
+Profile Builds
+
+ These builds are for use with profiling. They compile profiling information
+ into the code for use with programs like ``gprof``. Profile builds must be
+ started by specifying ``ENABLE_PROFILING=1`` on the ``gmake`` command line.
+
+Once you have LLVM configured, you can build it by entering the *OBJ_ROOT*
+directory and issuing the following command:
+
+.. code-block:: bash
+
+ % gmake
+
+If the build fails, please `check here`_ to see if you are using a version of
+GCC that is known not to compile LLVM.
+
+If you have multiple processors in your machine, you may wish to use some of the
+parallel build options provided by GNU Make. For example, you could use the
+command:
+
+.. code-block:: bash
+
+ % gmake -j2
+
+There are several special targets which are useful when working with the LLVM
+source code:
+
+``gmake clean``
+
+ Removes all files generated by the build. This includes object files,
+ generated C/C++ files, libraries, and executables.
+
+``gmake dist-clean``
+
+ Removes everything that ``gmake clean`` does, but also removes files generated
+ by ``configure``. It attempts to return the source tree to the original state
+ in which it was shipped.
+
+``gmake install``
+
+ Installs LLVM header files, libraries, tools, and documentation in a hierarchy
+ under ``$PREFIX``, specified with ``./configure --prefix=[dir]``, which
+ defaults to ``/usr/local``.
+
+``gmake -C runtime install-bytecode``
+
+ Assuming you built LLVM into $OBJDIR, when this command is run, it will
+ install bitcode libraries into the GCC front end's bitcode library directory.
+ If you need to update your bitcode libraries, this is the target to use once
+ you've built them.
+
+Please see the `Makefile Guide <MakefileGuide.html>`_ for further details on
+these ``make`` targets and descriptions of other targets available.
+
+It is also possible to override default values from ``configure`` by declaring
+variables on the command line. The following are some examples:
+
+``gmake ENABLE_OPTIMIZED=1``
+
+ Perform a Release (Optimized) build.
+
+``gmake ENABLE_OPTIMIZED=1 DISABLE_ASSERTIONS=1``
+
+ Perform a Release (Optimized) build without assertions enabled.
+
+``gmake ENABLE_OPTIMIZED=0``
+
+ Perform a Debug build.
+
+``gmake ENABLE_PROFILING=1``
+
+ Perform a Profiling build.
+
+``gmake VERBOSE=1``
+
+ Print what ``gmake`` is doing on standard output.
+
+``gmake TOOL_VERBOSE=1``
+
+ Ask each tool invoked by the makefiles to print out what it is doing on
+ the standard output. This also implies ``VERBOSE=1``.
+
+Every directory in the LLVM object tree includes a ``Makefile`` to build it and
+any subdirectories that it contains. Entering any directory inside the LLVM
+object tree and typing ``gmake`` should rebuild anything in or below that
+directory that is out of date.
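+
+For example, after editing a file in the scalar transforms library, you could
+rebuild just that corner of the tree (assuming the default object tree layout):
+
+.. code-block:: bash
+
+  % cd OBJ_ROOT/lib/Transforms/Scalar
+  % gmake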
+
+Cross-Compiling LLVM
+--------------------
+
+It is possible to cross-compile LLVM itself. That is, you can create LLVM
+executables and libraries to be hosted on a platform different from the platform
+where they are built (a Canadian Cross build). To configure a cross-compile,
+supply the configure script with ``--build`` and ``--host`` options that are
+different. The values of these options must be legal target triples that your
+GCC compiler supports.
+
+The result of such a build is executables that are not runnable on the build
+host (--build option) but can be executed on the compile host (--host option).
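+
+For example, building on an x86-64 Linux machine for an ARM Linux host might be
+configured as follows (the triples are only illustrative; use triples your GCC
+cross toolchain actually supports):
+
+.. code-block:: bash
+
+  % SRC_ROOT/configure --build=x86_64-unknown-linux-gnu \
+      --host=arm-unknown-linux-gnueabi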
+
+The Location of LLVM Object Files
+---------------------------------
+
+The LLVM build system is capable of sharing a single LLVM source tree among
+several LLVM builds. Hence, it is possible to build LLVM for several different
+platforms or configurations using the same source tree.
+
+This is accomplished in the typical autoconf manner:
+
+* Change directory to where the LLVM object files should live:
+
+ .. code-block:: bash
+
+ % cd OBJ_ROOT
+
+* Run the ``configure`` script found in the LLVM source directory:
+
+ .. code-block:: bash
+
+ % SRC_ROOT/configure
+
+The LLVM build will place files underneath *OBJ_ROOT* in directories named after
+the build type:
+
+Debug Builds with assertions enabled (the default)
+
+ Tools
+
+ ``OBJ_ROOT/Debug+Asserts/bin``
+
+ Libraries
+
+ ``OBJ_ROOT/Debug+Asserts/lib``
+
+Release Builds
+
+ Tools
+
+ ``OBJ_ROOT/Release/bin``
+
+ Libraries
+
+ ``OBJ_ROOT/Release/lib``
+
+Profile Builds
+
+ Tools
+
+ ``OBJ_ROOT/Profile/bin``
+
+ Libraries
+
+ ``OBJ_ROOT/Profile/lib``
+
+Optional Configuration Items
+----------------------------
+
+If you're running on a Linux system that supports the `binfmt_misc
+<http://www.tat.physik.uni-tuebingen.de/~rguenth/linux/binfmt_misc.html>`_
+module, and you have root access on the system, you can set your system up to
+execute LLVM bitcode files directly. To do this, use commands like this (the
+first command may not be required if you are already using the module):
+
+.. code-block:: bash
+
+ % mount -t binfmt_misc none /proc/sys/fs/binfmt_misc
+ % echo ':llvm:M::BC::/path/to/lli:' > /proc/sys/fs/binfmt_misc/register
+ % chmod u+x hello.bc (if needed)
+ % ./hello.bc
+
+This allows you to execute LLVM bitcode files directly. On Debian, you can also
+use this command instead of the 'echo' command above:
+
+.. code-block:: bash
+
+ % sudo update-binfmts --install llvm /path/to/lli --magic 'BC'
+
+.. _Program Layout:
+.. _general layout:
+
+Program Layout
+==============
+
+One useful source of information about the LLVM source base is the LLVM `doxygen
+<http://www.doxygen.org/>`_ documentation available at
+`<http://llvm.org/doxygen/>`_. The following is a brief introduction to code
+layout:
+
+``llvm/examples``
+-----------------
+
+This directory contains some simple examples of how to use the LLVM IR and JIT.
+
+``llvm/include``
+----------------
+
+This directory contains public header files exported from the LLVM library. The
+three main subdirectories of this directory are:
+
+``llvm/include/llvm``
+
+ This directory contains all of the LLVM specific header files. This directory
+ also has subdirectories for different portions of LLVM: ``Analysis``,
+ ``CodeGen``, ``Target``, ``Transforms``, etc...
+
+``llvm/include/llvm/Support``
+
+ This directory contains generic support libraries that are provided with LLVM
+ but not necessarily specific to LLVM. For example, some C++ STL utilities and
+ a Command Line option processing library store their header files here.
+
+``llvm/include/llvm/Config``
+
+ This directory contains header files configured by the ``configure`` script.
+ They wrap "standard" UNIX and C header files. Source code can include these
+ header files which automatically take care of the conditional #includes that
+ the ``configure`` script generates.
+
+``llvm/lib``
+------------
+
+This directory contains most of the source files of the LLVM system. In LLVM,
+almost all code exists in libraries, making it very easy to share code among the
+different `tools`_.
+
+``llvm/lib/VMCore/``
+
+ This directory holds the core LLVM source files that implement core classes
+ like Instruction and BasicBlock.
+
+``llvm/lib/AsmParser/``
+
+ This directory holds the source code for the LLVM assembly language parser
+ library.
+
+``llvm/lib/BitCode/``
+
+  This directory holds code for reading and writing LLVM bitcode.
+
+``llvm/lib/Analysis/``
+
+ This directory contains a variety of different program analyses, such as
+ Dominator Information, Call Graphs, Induction Variables, Interval
+ Identification, Natural Loop Identification, etc.
+
+``llvm/lib/Transforms/``
+
+ This directory contains the source code for the LLVM to LLVM program
+ transformations, such as Aggressive Dead Code Elimination, Sparse Conditional
+ Constant Propagation, Inlining, Loop Invariant Code Motion, Dead Global
+ Elimination, and many others.
+
+``llvm/lib/Target/``
+
+ This directory contains files that describe various target architectures for
+ code generation. For example, the ``llvm/lib/Target/X86`` directory holds the
+ X86 machine description while ``llvm/lib/Target/ARM`` implements the ARM
+ backend.
+
+``llvm/lib/CodeGen/``
+
+ This directory contains the major parts of the code generator: Instruction
+ Selector, Instruction Scheduling, and Register Allocation.
+
+``llvm/lib/MC/``
+
+ (FIXME: T.B.D.)
+
+``llvm/lib/Debugger/``
+
+ This directory contains the source level debugger library that makes it
+ possible to instrument LLVM programs so that a debugger could identify source
+ code locations at which the program is executing.
+
+``llvm/lib/ExecutionEngine/``
+
+ This directory contains libraries for executing LLVM bitcode directly at
+ runtime in both interpreted and JIT compiled fashions.
+
+``llvm/lib/Support/``
+
+ This directory contains the source code that corresponds to the header files
+ located in ``llvm/include/ADT/`` and ``llvm/include/Support/``.
+
+``llvm/projects``
+-----------------
+
+This directory contains projects that are not strictly part of LLVM but are
+shipped with LLVM. This is also the directory where you should create your own
+LLVM-based projects. See ``llvm/projects/sample`` for an example of how to set
+up your own project.
+
+``llvm/runtime``
+----------------
+
+This directory contains libraries which are compiled into LLVM bitcode and used
+when linking programs with the Clang front end. Most of these libraries are
+skeleton versions of real libraries; for example, libc is a stripped down
+version of glibc.
+
+Unlike the rest of the LLVM suite, this directory needs the LLVM GCC front end
+to compile.
+
+``llvm/test``
+-------------
+
+This directory contains feature and regression tests and other basic sanity
+checks on the LLVM infrastructure. These are intended to run quickly and cover a
+lot of territory without being exhaustive.
+
+``test-suite``
+--------------
+
+This is not a directory in the normal llvm module; it is a separate Subversion
+module that must be checked out (usually to ``projects/test-suite``). This
+module contains a comprehensive correctness, performance, and benchmarking test
+suite for LLVM. It is a separate Subversion module because not every LLVM user
+is interested in downloading or building such a comprehensive test suite. For
+further details on this test suite, please see the `Testing
+Guide <TestingGuide.html>`_ document.
+
+.. _tools:
+
+``llvm/tools``
+--------------
+
+The **tools** directory contains the executables built out of the libraries
+above, which form the main part of the user interface. You can always get help
+for a tool by typing ``tool_name -help``. The following is a brief introduction
+to the most important tools. More detailed information is in
+the `Command Guide <CommandGuide/index.html>`_.
+
+``bugpoint``
+
+ ``bugpoint`` is used to debug optimization passes or code generation backends
+ by narrowing down the given test case to the minimum number of passes and/or
+ instructions that still cause a problem, whether it is a crash or
+ miscompilation. See `<HowToSubmitABug.html>`_ for more information on using
+ ``bugpoint``.
+
+``llvm-ar``
+
+ The archiver produces an archive containing the given LLVM bitcode files,
+ optionally with an index for faster lookup.
+
+``llvm-as``
+
+ The assembler transforms the human readable LLVM assembly to LLVM bitcode.
+
+``llvm-dis``
+
+ The disassembler transforms the LLVM bitcode to human readable LLVM assembly.
+
+``llvm-link``
+
+ ``llvm-link``, not surprisingly, links multiple LLVM modules into a single
+ program.
+
+``lli``
+
+ ``lli`` is the LLVM interpreter, which can directly execute LLVM bitcode
+ (although very slowly...). For architectures that support it (currently x86,
+ Sparc, and PowerPC), by default, ``lli`` will function as a Just-In-Time
+ compiler (if the functionality was compiled in), and will execute the code
+ *much* faster than the interpreter.
+
+``llc``
+
+ ``llc`` is the LLVM backend compiler, which translates LLVM bitcode to a
+ native code assembly file or to C code (with the ``-march=c`` option).
+
+``opt``
+
+ ``opt`` reads LLVM bitcode, applies a series of LLVM to LLVM transformations
+ (which are specified on the command line), and then outputs the resultant
+ bitcode. The '``opt -help``' command is a good way to get a list of the
+ program transformations available in LLVM.
+
+ ``opt`` can also be used to run a specific analysis on an input LLVM bitcode
+ file and print out the results. It is primarily useful for debugging
+ analyses, or familiarizing yourself with what an analysis does.
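+
+  For example, a single transformation pass can be run over a bitcode file like
+  this (the pass and file names are only illustrative):
+
+  .. code-block:: bash
+
+    % opt -mem2reg hello.bc -o hello.opt.bc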
+
+``llvm/utils``
+--------------
+
+This directory contains utilities for working with LLVM source code, and some of
+the utilities are actually required as part of the build process because they
+are code generators for parts of LLVM infrastructure.
+
+
+``codegen-diff``
+
+ ``codegen-diff`` is a script that finds differences between code that LLC
+ generates and code that LLI generates. This is a useful tool if you are
+ debugging one of them, assuming that the other generates correct output. For
+ the full user manual, run ```perldoc codegen-diff'``.
+
+``emacs/``
+
+ The ``emacs`` directory contains syntax-highlighting files which will work
+ with Emacs and XEmacs editors, providing syntax highlighting support for LLVM
+ assembly files and TableGen description files. For information on how to use
+ the syntax files, consult the ``README`` file in that directory.
+
+``getsrcs.sh``
+
+ The ``getsrcs.sh`` script finds and outputs all non-generated source files,
+ which is useful if one wishes to do a lot of development across directories
+ and does not want to individually find each file. One way to use it is to run,
+  for example: ``xemacs `utils/getsrcs.sh``` from the top of your LLVM source
+ tree.
+
+``llvmgrep``
+
+ This little tool performs an ``egrep -H -n`` on each source file in LLVM and
+ passes to it a regular expression provided on ``llvmgrep``'s command
+ line. This is a very efficient way of searching the source base for a
+ particular regular expression.
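+
+  A hypothetical invocation, searching the whole tree for a symbol name, might
+  look like:
+
+  .. code-block:: bash
+
+    % llvmgrep 'getAnalysisUsage'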
+
+``makellvm``
+
+ The ``makellvm`` script compiles all files in the current directory and then
+ compiles and links the tool that is the first argument. For example, assuming
+ you are in the directory ``llvm/lib/Target/Sparc``, if ``makellvm`` is in your
+ path, simply running ``makellvm llc`` will make a build of the current
+ directory, switch to directory ``llvm/tools/llc`` and build it, causing a
+ re-linking of LLC.
+
+``TableGen/``
+
+ The ``TableGen`` directory contains the tool used to generate register
+ descriptions, instruction set descriptions, and even assemblers from common
+ TableGen description files.
+
+``vim/``
+
+ The ``vim`` directory contains syntax-highlighting files which will work with
+ the VIM editor, providing syntax highlighting support for LLVM assembly files
+ and TableGen description files. For information on how to use the syntax
+ files, consult the ``README`` file in that directory.
+
+.. _simple example:
+
+An Example Using the LLVM Tool Chain
+====================================
+
+This section gives an example of using LLVM with the Clang front end.
+
+Example with clang
+------------------
+
+#. First, create a simple C file, name it 'hello.c':
+
+ .. code-block:: c
+
+ #include <stdio.h>
+
+ int main() {
+ printf("hello world\n");
+ return 0;
+ }
+
+#. Next, compile the C file into a native executable:
+
+ .. code-block:: bash
+
+ % clang hello.c -o hello
+
+ .. note::
+
+ Clang works just like GCC by default. The standard -S and -c arguments
+ work as usual (producing a native .s or .o file, respectively).
+
+#. Next, compile the C file into an LLVM bitcode file:
+
+ .. code-block:: bash
+
+ % clang -O3 -emit-llvm hello.c -c -o hello.bc
+
+ The -emit-llvm option can be used with the -S or -c options to emit an LLVM
+ ``.ll`` or ``.bc`` file (respectively) for the code. This allows you to use
+ the `standard LLVM tools <CommandGuide/index.html>`_ on the bitcode file.
+
+#. Run the program in both forms. To run the program, use:
+
+ .. code-block:: bash
+
+ % ./hello
+
+ and
+
+ .. code-block:: bash
+
+ % lli hello.bc
+
+   The second example shows how to invoke the LLVM JIT, `lli
+ <CommandGuide/html/lli.html>`_.
+
+#. Use the ``llvm-dis`` utility to take a look at the LLVM assembly code:
+
+ .. code-block:: bash
+
+ % llvm-dis < hello.bc | less
+
+#. Compile the program to native assembly using the LLC code generator:
+
+ .. code-block:: bash
+
+ % llc hello.bc -o hello.s
+
+#. Assemble the native assembly language file into a program:
+
+ .. code-block:: bash
+
+      # Solaris:
+      % /opt/SUNWspro/bin/cc -xarch=v9 hello.s -o hello.native
+
+      # Others:
+      % gcc hello.s -o hello.native
+
+#. Execute the native code program:
+
+ .. code-block:: bash
+
+ % ./hello.native
+
+ Note that using clang to compile directly to native code (i.e. when the
+ ``-emit-llvm`` option is not present) does steps 6/7/8 for you.
+
+Common Problems
+===============
+
+If you are having problems building or using LLVM, or if you have any other
+general questions about LLVM, please consult the `Frequently Asked
+Questions <FAQ.html>`_ page.
+
+.. _links:
+
+Links
+=====
+
+This document is just an **introduction** on how to use LLVM to do some simple
+things... there are many more interesting and complicated things that you can do
+that aren't documented here (but we'll gladly accept a patch if you want to
+write something up!). For more information about LLVM, check out:
+
+* `LLVM Homepage <http://llvm.org/>`_
+* `LLVM Doxygen Tree <http://llvm.org/doxygen/>`_
+* `Starting a Project that Uses LLVM <http://llvm.org/docs/Projects.html>`_
diff --git a/docs/GoldPlugin.html b/docs/GoldPlugin.html
deleted file mode 100644
index 1e99a5a3d6a1..000000000000
--- a/docs/GoldPlugin.html
+++ /dev/null
@@ -1,227 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>LLVM gold plugin</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>LLVM gold plugin</h1>
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#build">How to build it</a></li>
- <li><a href="#usage">Usage</a>
- <ul>
- <li><a href="#example1">Example of link time optimization</a></li>
- <li><a href="#lto_autotools">Quickstart for using LTO with autotooled projects</a></li>
- </ul></li>
- <li><a href="#licensing">Licensing</a></li>
-</ol>
-<div class="doc_author">Written by Nick Lewycky</div>
-
-<!--=========================================================================-->
-<h2><a name="introduction">Introduction</a></h2>
-<!--=========================================================================-->
-<div>
- <p>Building with link time optimization requires cooperation from the
-system linker. LTO support on Linux systems requires that you use
-the <a href="http://sourceware.org/binutils">gold linker</a> which supports
-LTO via plugins. This is the same mechanism used by the
-<a href="http://gcc.gnu.org/wiki/LinkTimeOptimization">GCC LTO</a>
-project.</p>
- <p>The LLVM gold plugin implements the
-<a href="http://gcc.gnu.org/wiki/whopr/driver">gold plugin interface</a>
-on top of
-<a href="LinkTimeOptimization.html#lto">libLTO</a>.
-The same plugin can also be used by other tools such as <tt>ar</tt> and
-<tt>nm</tt>.
-</div>
-<!--=========================================================================-->
-<h2><a name="build">How to build it</a></h2>
-<!--=========================================================================-->
-<div>
- <p>You need to have gold with plugin support and build the LLVMgold
-plugin. Check whether you have gold running <tt>/usr/bin/ld -v</tt>. It will
-report &#8220;GNU gold&#8221; or else &#8220GNU ld&#8221; if not. If you have
-gold, check for plugin support by running <tt>/usr/bin/ld -plugin</tt>. If it
-complains &#8220missing argument&#8221 then you have plugin support. If not,
-such as an &#8220;unknown option&#8221; error then you will either need to
-build gold or install a version with plugin support.</p>
-<ul>
- <li>To build gold with plugin support:
- <pre class="doc_code">
-mkdir binutils
-cd binutils
-cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src login
-<em>{enter "anoncvs" as the password}</em>
-cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src co binutils
-mkdir build
-cd build
-../src/configure --enable-gold --enable-plugins
-make all-gold
-</pre>
- That should leave you with <tt>binutils/build/gold/ld-new</tt> which supports the <tt>-plugin</tt> option. It also built would have
-<tt>binutils/build/binutils/ar</tt> and <tt>nm-new</tt> which support plugins
-but don't have a visible -plugin option, instead relying on the gold plugin
-being present in <tt>../lib/bfd-plugins</tt> relative to where the binaries are
-placed.
- <li>Build the LLVMgold plugin: Configure LLVM with
- <tt>--with-binutils-include=/path/to/binutils/src/include</tt> and run
- <tt>make</tt>.
-</ul>
-</div>
-<!--=========================================================================-->
-<h2><a name="usage">Usage</a></h2>
-<!--=========================================================================-->
-<div>
-
- <p>The linker takes a <tt>-plugin</tt> option that points to the path of
- the plugin <tt>.so</tt> file. To find out what link command <tt>gcc</tt>
- would run in a given situation, run <tt>gcc -v <em>[...]</em></tt> and look
- for the line where it runs <tt>collect2</tt>. Replace that with
- <tt>ld-new -plugin /path/to/LLVMgold.so</tt> to test it out. Once you're
- ready to switch to using gold, backup your existing <tt>/usr/bin/ld</tt>
- then replace it with <tt>ld-new</tt>.</p>
-
- <p>You can produce bitcode files from <tt>clang</tt> using
- <tt>-emit-llvm</tt> or <tt>-flto</tt>, or the <tt>-O4</tt> flag which is
- synonymous with <tt>-O3 -flto</tt>.</p>
-
- <p>Any of these flags will also cause <tt>clang</tt> to look for the
- gold plugin in the <tt>lib</tt> directory under its prefix and pass the
- <tt>-plugin</tt> option to <tt>ld</tt>. It will not look for an alternate
- linker, which is why you need gold to be the installed system linker in
- your path.</p>
-
- <p>If you want <tt>ar</tt> and <tt>nm</tt> to work seamlessly as well, install
- <tt>LLVMgold.so</tt> to <tt>/usr/lib/bfd-plugins</tt>. If you built your
- own gold, be sure to install the <tt>ar</tt> and <tt>nm-new</tt> you built to
- <tt>/usr/bin</tt>.<p>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="example1">Example of link time optimization</a>
-</h3>
-
-<div>
- <p>The following example shows a worked example of the gold plugin mixing
- LLVM bitcode and native code.
-<pre class="doc_code">
---- a.c ---
-#include &lt;stdio.h&gt;
-
-extern void foo1(void);
-extern void foo4(void);
-
-void foo2(void) {
- printf("Foo2\n");
-}
-
-void foo3(void) {
- foo4();
-}
-
-int main(void) {
- foo1();
-}
-
---- b.c ---
-#include &lt;stdio.h&gt;
-
-extern void foo2(void);
-
-void foo1(void) {
- foo2();
-}
-
-void foo4(void) {
- printf("Foo4");
-}
-
---- command lines ---
-$ clang -flto a.c -c -o a.o # &lt;-- a.o is LLVM bitcode file
-$ ar q a.a a.o # &lt;-- a.a is an archive with LLVM bitcode
-$ clang b.c -c -o b.o # &lt;-- b.o is native object file
-$ clang -flto a.a b.o -o main # &lt;-- link with LLVMgold plugin
-</pre>
-
- <p>Gold informs the plugin that foo3 is never referenced outside the IR,
- leading LLVM to delete that function. However, unlike in the
- <a href="LinkTimeOptimization.html#example1">libLTO
- example</a> gold does not currently eliminate foo4.</p>
-</div>
-
-</div>
-
-<!--=========================================================================-->
-<h2>
- <a name="lto_autotools">
- Quickstart for using LTO with autotooled projects
- </a>
-</h2>
-<!--=========================================================================-->
-<div>
- <p>Once your system <tt>ld</tt>, <tt>ar</tt>, and <tt>nm</tt> all support LLVM
- bitcode, everything is in place for an easy to use LTO build of autotooled
- projects:</p>
-
- <ul>
- <li>Follow the instructions <a href="#build">on how to build LLVMgold.so</a>.</li>
- <li>Install the newly built binutils to <tt>$PREFIX</tt></li>
- <li>Copy <tt>Release/lib/LLVMgold.so</tt> to
- <tt>$PREFIX/lib/bfd-plugins/</tt></li>
- <li>Set environment variables (<tt>$PREFIX</tt> is where you installed clang and
- binutils):
-<pre class="doc_code">
-export CC="$PREFIX/bin/clang -flto"
-export CXX="$PREFIX/bin/clang++ -flto"
-export AR="$PREFIX/bin/ar"
-export NM="$PREFIX/bin/nm"
-export RANLIB=/bin/true #ranlib is not needed, and doesn't support .bc files in .a
-export CFLAGS="-O4"
-</pre>
- </li>
- <li>Or you can just set your path:
-<pre class="doc_code">
-export PATH="$PREFIX/bin:$PATH"
-export CC="clang -flto"
-export CXX="clang++ -flto"
-export RANLIB=/bin/true
-export CFLAGS="-O4"
-</pre></li>
- <li>Configure &amp; build the project as usual:
-<pre class="doc_code">
-% ./configure &amp;&amp; make &amp;&amp; make check
-</pre></li>
- </ul>
-
- <p>The environment variable settings may work for non-autotooled projects
- too, but you may need to set the <tt>LD</tt> environment variable as
- well.</p>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="licensing">Licensing</a></h2>
-<!--=========================================================================-->
-<div>
- <p>Gold is licensed under the GPLv3. LLVMgold uses the interface file
-<tt>plugin-api.h</tt> from gold which means that the resulting LLVMgold.so
-binary is also GPLv3. This can still be used to link non-GPLv3 programs just
-as much as gold could without the plugin.</p>
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="mailto:nicholas@metrix.on.ca">Nick Lewycky</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2010-04-16 23:58:21 -0800 (Fri, 16 Apr 2010) $
-</address>
-</body>
-</html>
diff --git a/docs/GoldPlugin.rst b/docs/GoldPlugin.rst
new file mode 100644
index 000000000000..300aea9f9a49
--- /dev/null
+++ b/docs/GoldPlugin.rst
@@ -0,0 +1,186 @@
+.. _gold-plugin:
+
+====================
+The LLVM gold plugin
+====================
+
+.. sectionauthor:: Nick Lewycky
+
+Introduction
+============
+
+Building with link time optimization requires cooperation from
+the system linker. LTO support on Linux systems requires that you use the
+`gold linker`_ which supports LTO via plugins. This is the same mechanism
+used by the `GCC LTO`_ project.
+
+The LLVM gold plugin implements the `gold plugin interface`_ on top of
+:ref:`libLTO`. The same plugin can also be used by other tools such as
+``ar`` and ``nm``.
+
+.. _`gold linker`: http://sourceware.org/binutils
+.. _`GCC LTO`: http://gcc.gnu.org/wiki/LinkTimeOptimization
+.. _`gold plugin interface`: http://gcc.gnu.org/wiki/whopr/driver
+
+.. _lto-how-to-build:
+
+How to build it
+===============
+
+You need gold with plugin support, and you need to build the LLVMgold plugin.
+Check whether you have gold by running ``/usr/bin/ld -v``. It will report
+"GNU gold" if you do, or "GNU ld" if not. If you have gold, check for plugin
+support by running ``/usr/bin/ld -plugin``. If it complains about a "missing
+argument", then you have plugin support. If it instead reports an "unknown
+option" error, you will need to either build gold or install a version with
+plugin support.
+
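+For example, a quick sanity check might look like this (a sketch only; the
+exact messages vary between binutils versions):
+
+.. code-block:: bash
+
+   $ /usr/bin/ld -v        # should report "GNU gold", not "GNU ld"
+   $ /usr/bin/ld -plugin   # a "missing argument" complaint means plugin support
+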
+* To build gold with plugin support:
+
+ .. code-block:: bash
+
+ $ mkdir binutils
+ $ cd binutils
+ $ cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src login
+ {enter "anoncvs" as the password}
+ $ cvs -z 9 -d :pserver:anoncvs@sourceware.org:/cvs/src co binutils
+ $ mkdir build
+ $ cd build
+ $ ../src/configure --enable-gold --enable-plugins
+ $ make all-gold
+
+ That should leave you with ``binutils/build/gold/ld-new``, which supports
+ the ``-plugin`` option. The build will also have produced
+ ``binutils/build/binutils/ar`` and ``nm-new``, which support plugins but
+ don't have a visible ``-plugin`` option; instead they rely on the gold
+ plugin being present in ``../lib/bfd-plugins`` relative to where the
+ binaries are placed.
+
+* Build the LLVMgold plugin: Configure LLVM with
+ ``--with-binutils-include=/path/to/binutils/src/include`` and run
+ ``make``.
+
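+As a rough sketch of that last step (the paths are placeholders; adjust them
+to your own LLVM and binutils checkouts):
+
+.. code-block:: bash
+
+   $ cd /path/to/llvm
+   $ ./configure --with-binutils-include=/path/to/binutils/src/include
+   $ make
+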
+Usage
+=====
+
+The linker takes a ``-plugin`` option that points to the path of
+the plugin ``.so`` file. To find out what link command ``gcc``
+would run in a given situation, run ``gcc -v [...]`` and
+look for the line where it runs ``collect2``. Replace that with
+``ld-new -plugin /path/to/LLVMgold.so`` to test it out. Once you're
+ready to switch to using gold, back up your existing ``/usr/bin/ld``
+then replace it with ``ld-new``.
+
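+For example, a hypothetical session might look like the following (the file
+names and plugin path are illustrative only):
+
+.. code-block:: bash
+
+   # Show the link step gcc would perform; look for the collect2 line.
+   $ gcc -v hello.o -o hello 2>&1 | grep collect2
+   # Re-run that printed command, substituting
+   #   ld-new -plugin /path/to/LLVMgold.so
+   # for the collect2 invocation.
+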
+You can produce bitcode files from ``clang`` using ``-emit-llvm`` or
+``-flto``, or the ``-O4`` flag which is synonymous with ``-O3 -flto``.
+
+Any of these flags will also cause ``clang`` to look for the gold plugin in
+the ``lib`` directory under its prefix and pass the ``-plugin`` option to
+``ld``. It will not look for an alternate linker, which is why you need
+gold to be the installed system linker in your path.
+
+If you want ``ar`` and ``nm`` to work seamlessly as well, install
+``LLVMgold.so`` to ``/usr/lib/bfd-plugins``. If you built your own gold, be
+sure to install the ``ar`` and ``nm-new`` you built to ``/usr/bin``.
+
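+For instance, something like the following (illustrative; the source path
+depends on where your plugin was built, and you may need root privileges):
+
+.. code-block:: bash
+
+   $ mkdir -p /usr/lib/bfd-plugins
+   $ cp /path/to/LLVMgold.so /usr/lib/bfd-plugins/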
+
+Example of link time optimization
+---------------------------------
+
+The following is a worked example of using the gold plugin to mix LLVM
+bitcode and native code.
+
+.. code-block:: c
+
+ --- a.c ---
+ #include <stdio.h>
+
+ extern void foo1(void);
+ extern void foo4(void);
+
+ void foo2(void) {
+ printf("Foo2\n");
+ }
+
+ void foo3(void) {
+ foo4();
+ }
+
+ int main(void) {
+ foo1();
+ }
+
+ --- b.c ---
+ #include <stdio.h>
+
+ extern void foo2(void);
+
+ void foo1(void) {
+ foo2();
+ }
+
+ void foo4(void) {
+ printf("Foo4");
+ }
+
+.. code-block:: bash
+
+ --- command lines ---
+ $ clang -flto a.c -c -o a.o # <-- a.o is LLVM bitcode file
+ $ ar q a.a a.o # <-- a.a is an archive with LLVM bitcode
+ $ clang b.c -c -o b.o # <-- b.o is native object file
+ $ clang -flto a.a b.o -o main # <-- link with LLVMgold plugin
+
+Gold informs the plugin that ``foo3`` is never referenced outside the IR,
+leading LLVM to delete that function. However, unlike in the :ref:`libLTO
+example <libLTO-example>`, gold does not currently eliminate ``foo4``.
+
+Quickstart for using LTO with autotooled projects
+=================================================
+
+Once your system ``ld``, ``ar``, and ``nm`` all support LLVM bitcode,
+everything is in place for an easy-to-use LTO build of autotooled projects:
+
+* Follow the instructions :ref:`on how to build LLVMgold.so
+ <lto-how-to-build>`.
+
+* Install the newly built binutils to ``$PREFIX``
+
+* Copy ``Release/lib/LLVMgold.so`` to ``$PREFIX/lib/bfd-plugins/``
+
+* Set environment variables (``$PREFIX`` is where you installed clang and
+ binutils):
+
+ .. code-block:: bash
+
+ export CC="$PREFIX/bin/clang -flto"
+ export CXX="$PREFIX/bin/clang++ -flto"
+ export AR="$PREFIX/bin/ar"
+ export NM="$PREFIX/bin/nm"
+ export RANLIB=/bin/true #ranlib is not needed, and doesn't support .bc files in .a
+ export CFLAGS="-O4"
+
+* Or you can just set your path:
+
+ .. code-block:: bash
+
+ export PATH="$PREFIX/bin:$PATH"
+ export CC="clang -flto"
+ export CXX="clang++ -flto"
+ export RANLIB=/bin/true
+ export CFLAGS="-O4"
+
+* Configure and build the project as usual:
+
+ .. code-block:: bash
+
+ % ./configure && make && make check
+
+The environment variable settings may work for non-autotooled projects too,
+but you may need to set the ``LD`` environment variable as well.
+
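+For instance, if your build system invokes the linker directly, a setting
+along these lines might be needed (illustrative only; the right value depends
+on your setup, assuming the gold you installed lives in ``$PREFIX/bin``):
+
+.. code-block:: bash
+
+   export LD="$PREFIX/bin/ld"
+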
+Licensing
+=========
+
+Gold is licensed under the GPLv3. LLVMgold uses the interface file
+``plugin-api.h`` from gold which means that the resulting ``LLVMgold.so``
+binary is also GPLv3. This can still be used to link non-GPLv3 programs
+just as much as gold could without the plugin.
diff --git a/docs/HowToAddABuilder.html b/docs/HowToAddABuilder.html
deleted file mode 100644
index 985b30e4f7ba..000000000000
--- a/docs/HowToAddABuilder.html
+++ /dev/null
@@ -1,142 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>
- How To Add Your Build Configuration To LLVM Buildbot Infrastructure
- </title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>How To Add Your Build Configuration To LLVM Buildbot Infrastructure</h1>
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#steps">Steps To Add Builder To LLVM Buildbot</a></li>
-</ol>
-<div class="doc_author">
- <p>Written by <a href="mailto:gkistanova@gmail.com">Galina Kistanova</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="introduction">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document contains information about adding a build configuration and
- buildslave to private slave builder to LLVM Buildbot Infrastructure
- <a href="http://lab.llvm.org:8011">http://lab.llvm.org:8011</a></p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="steps">Steps To Add Builder To LLVM Buildbot</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Volunteers can provide their build machines to work as build slaves to
- public LLVM Buildbot.</p>
-
-<p>Here are the steps you can follow to do so:</p>
-
-<ol>
- <li><p>Check the existing build configurations to make sure the one you are
- interested in is not covered yet or gets built on your computer much
- faster than on the existing one. We prefer faster builds so developers
- will get feedback sooner after changes get committed.</p></li>
-
- <li><p>The computer you will be registering with the LLVM buildbot
- infrastructure should have all dependencies installed and you can
- actually build your configuration successfully. Please check what degree
- of parallelism (-j param) would give the fastest build.
- You can build multiple configurations on one computer.</p></li>
-
- <li><p>Install buildslave (currently we are using buildbot version 0.8.5).
- Depending on the platform, buildslave could be available to download and
- install with your packet manager, or you can download it directly from
- <a href="http://trac.buildbot.net">http://trac.buildbot.net</a> and
- install it manually.</p></li>
-
- <li><p>Create a designated user account, your buildslave will be running
- under, and set appropriate permissions.</p></li>
-
- <li><p>Choose the buildslave root directory (all builds will be placed under
- it), buildslave access name and password the build master will be using
- to authenticate your buildslave.</p></li>
-
- <li><p>Create a buildslave in context of that buildslave account.
- Point it to the <b>lab.llvm.org</b> port <b>9990</b> (see
- <a href="http://buildbot.net/buildbot/docs/current/full.html#creating-a-slave">
- Buildbot documentation, Creating a slave</a>
- for more details) by running the following command:</p>
-
-<div class="doc_code">
-<pre>
-$ buildslave create-slave <i>buildslave-root-directory</i> \
- lab.llvm.org:9990 \
- <i>buildslave-access-name buildslave-access-password</i>
-</pre>
-</div></li>
-
- <li><p>Fill the buildslave description and admin name/e-mail.
- Here is an example of the buildslave description:</p>
-
-<div class="doc_code">
-<pre>
-Windows 7 x64
-Core i7 (2.66GHz), 16GB of RAM
-
-g++.exe (TDM-1 mingw32) 4.4.0
-GNU Binutils 2.19.1
-cmake version 2.8.4
-Microsoft(R) 32-bit C/C++ Optimizing Compiler Version 16.00.40219.01 for 80x86
-</pre>
-</div></li>
-
- <li><p>Make sure you can actually start the buildslave successfully. Then set
- up your buildslave to start automatically at the start up time.
- See the buildbot documentation for help.
- You may want to restart your computer to see if it works.</p></li>
-
- <li><p>Send a patch which adds your build slave and your builder to zorg.</p>
- <ul>
- <li>slaves are added to
- <tt>buildbot/osuosl/master/config/slaves.py</tt></li>
- <li>builders are added to
- <tt>buildbot/osuosl/master/config/builders.py</tt></li>
- </ul></li>
-
- <li><p>Send the buildslave access name and the access password directly
- to <a href="mailto:gkistanova@gmail.com">Galina Kistanova</a>, and wait
- till she will let you know that your changes are applied and buildmaster
- is reconfigured.</p>
-
- <li><p>Check the status of your buildslave on the
- <a href="http://lab.llvm.org:8011/waterfall">Waterfall Display</a>
- to make sure it is connected, and
- <a href="http://lab.llvm.org:8011/buildslaves/your-buildslave-name">
- http://lab.llvm.org:8011/buildslaves/&lt;your-buildslave-name&gt;</a>
- to see if administrator contact and slave information are correct.</p>
- </li>
-
- <li><p>Wait for the first build to succeed and enjoy.</p></li>
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
- <br>
- Last modified: $Date: 2011-10-31 12:50:0 -0700 (Mon, 31 Oct 2011) $
-</address>
-</body>
-</html>
diff --git a/docs/HowToAddABuilder.rst b/docs/HowToAddABuilder.rst
new file mode 100644
index 000000000000..b0cd2907f975
--- /dev/null
+++ b/docs/HowToAddABuilder.rst
@@ -0,0 +1,90 @@
+.. _how_to_add_a_builder:
+
+===================================================================
+How To Add Your Build Configuration To LLVM Buildbot Infrastructure
+===================================================================
+
+.. sectionauthor:: Galina Kistanova <gkistanova@gmail.com>
+
+Introduction
+============
+
+This document contains information about adding a build configuration and
+buildslave, running on your own (private) builder machine, to the LLVM
+Buildbot Infrastructure at `<http://lab.llvm.org:8011>`_.
+
+
+Steps To Add Builder To LLVM Buildbot
+=====================================
+
+Volunteers can provide their build machines to work as build slaves for
+the public LLVM Buildbot.
+
+Here are the steps you can follow to do so:
+
+#. Check the existing build configurations to make sure the one you are
+ interested in is not covered yet or gets built on your computer much
+ faster than on the existing one. We prefer faster builds so developers
+ will get feedback sooner after changes get committed.
+
+#. The computer you will be registering with the LLVM buildbot
+ infrastructure should have all dependencies installed and be able to
+ actually build your configuration successfully. Please check what degree
+ of parallelism (``-j`` param) gives the fastest build. You can build
+ multiple configurations on one computer.
+
+#. Install buildslave (currently we are using buildbot version 0.8.5).
+ Depending on the platform, buildslave may be available to download and
+ install with your package manager, or you can download it directly from
+ `<http://trac.buildbot.net>`_ and install it manually.
+
+#. Create a designated user account that your buildslave will run under,
+ and set appropriate permissions.
+
+#. Choose the buildslave root directory (all builds will be placed under
+ it), a buildslave access name, and the password the build master will use
+ to authenticate your buildslave.
+
+#. Create a buildslave in the context of that buildslave account. Point it
+ at **lab.llvm.org**, port **9990** (see `Buildbot documentation,
+ Creating a slave
+ <http://buildbot.net/buildbot/docs/current/full.html#creating-a-slave>`_
+ for more details) by running the following command:
+
+ .. code-block:: bash
+
+ $ buildslave create-slave <buildslave-root-directory> \
+ lab.llvm.org:9990 \
+ <buildslave-access-name> <buildslave-access-password>
+
+#. Fill in the buildslave description and admin name/e-mail. Here is an
+ example of the buildslave description::
+
+ Windows 7 x64
+ Core i7 (2.66GHz), 16GB of RAM
+
+ g++.exe (TDM-1 mingw32) 4.4.0
+ GNU Binutils 2.19.1
+ cmake version 2.8.4
+ Microsoft(R) 32-bit C/C++ Optimizing Compiler Version 16.00.40219.01 for 80x86
+
+#. Make sure you can actually start the buildslave successfully. Then set
+ up your buildslave to start automatically at startup. See the
+ buildbot documentation for help. You may want to restart your computer
+ to see if it works.
+
+#. Send a patch which adds your build slave and your builder to zorg.
+
+ * slaves are added to ``buildbot/osuosl/master/config/slaves.py``
+ * builders are added to ``buildbot/osuosl/master/config/builders.py``
+
+#. Send the buildslave access name and the access password directly to
+ `Galina Kistanova <mailto:gkistanova@gmail.com>`_, and wait until she
+ lets you know that your changes have been applied and the buildmaster
+ has been reconfigured.
+
+#. Check the status of your buildslave on the `Waterfall Display
+ <http://lab.llvm.org:8011/waterfall>`_ to make sure it is connected, and
+ visit ``http://lab.llvm.org:8011/buildslaves/<your-buildslave-name>`` to see
+ if administrator contact and slave information are correct.
+
+#. Wait for the first build to succeed and enjoy.
diff --git a/docs/HowToBuildOnARM.rst b/docs/HowToBuildOnARM.rst
new file mode 100644
index 000000000000..d786a7dedaf4
--- /dev/null
+++ b/docs/HowToBuildOnARM.rst
@@ -0,0 +1,47 @@
+.. _how_to_build_on_arm:
+
+===================================================================
+How To Build On ARM
+===================================================================
+
+.. sectionauthor:: Wei-Ren Chen (陳韋任) <chenwj@iis.sinica.edu.tw>
+
+Introduction
+============
+
+This document contains information about building/testing LLVM and
+Clang on ARM.
+
+Notes On Building LLVM/Clang on ARM
+=====================================
+
+Here are some notes on building/testing LLVM/Clang on ARM. Note that
+ARM encompasses a wide variety of CPUs; this advice is primarily based
+on the ARMv6 and ARMv7 architectures and may be inapplicable to older chips.
+
+#. If you are building LLVM/Clang on an ARM board with 1G of memory or less,
+ please use ``gold`` rather than GNU ``ld``.
+ Building LLVM/Clang with ``--enable-optimized``
+ is preferred since it consumes less memory. Otherwise, the build
+ process will very likely fail due to insufficient memory. In any
+ case, it is probably a good idea to set up a swap partition.
+
+#. If you want to run ``make check-all`` after building LLVM/Clang, please
+ use at least the following configuration to avoid false alarms (e.g., an
+ ARCMT failure):
+
+ .. code-block:: bash
+
+ $ ../$LLVM_SRC_DIR/configure --with-abi=aapcs-vfp
+
+#. The most popular Linaro/Ubuntu OS's for ARM boards, e.g., the
+ Pandaboard, have become hard-float platforms. The following set
+ of configuration options appears to be a good choice for this
+ platform:
+
+ .. code-block:: bash
+
+ ./configure --build=armv7l-unknown-linux-gnueabihf \
+ --host=armv7l-unknown-linux-gnueabihf \
+ --target=armv7l-unknown-linux-gnueabihf --with-cpu=cortex-a9 \
+ --with-float=hard --with-abi=aapcs-vfp --with-fpu=neon \
+ --enable-targets=arm --disable-optimized --enable-assertions
diff --git a/docs/HowToSetUpLLVMStyleRTTI.rst b/docs/HowToSetUpLLVMStyleRTTI.rst
new file mode 100644
index 000000000000..aa1ad84afee3
--- /dev/null
+++ b/docs/HowToSetUpLLVMStyleRTTI.rst
@@ -0,0 +1,332 @@
+.. _how-to-set-up-llvm-style-rtti:
+
+======================================================
+How to set up LLVM-style RTTI for your class hierarchy
+======================================================
+
+.. sectionauthor:: Sean Silva <silvas@purdue.edu>
+
+.. contents::
+
+Background
+==========
+
+LLVM avoids using C++'s built-in RTTI. Instead, it pervasively uses its
+own hand-rolled form of RTTI, which is much more efficient and flexible,
+although it requires a bit more work from you as a class author.
+
+A description of how to use LLVM-style RTTI from a client's perspective is
+given in the `Programmer's Manual <ProgrammersManual.html#isa>`_. This
+document, in contrast, discusses the steps you need to take as a class
+hierarchy author to make LLVM-style RTTI available to your clients.
+
+Before diving in, make sure that you are familiar with the Object Oriented
+Programming concept of "`is-a`_".
+
+.. _is-a: http://en.wikipedia.org/wiki/Is-a
+
+Basic Setup
+===========
+
+This section describes how to set up the most basic form of LLVM-style RTTI
+(which is sufficient for 99.9% of the cases). We will set up LLVM-style
+RTTI for this class hierarchy:
+
+.. code-block:: c++
+
+ class Shape {
+ public:
+ Shape() {}
+ virtual double computeArea() = 0;
+ };
+
+ class Square : public Shape {
+ double SideLength;
+ public:
+ Square(double S) : SideLength(S) {}
+ double computeArea() /* override */;
+ };
+
+ class Circle : public Shape {
+ double Radius;
+ public:
+ Circle(double R) : Radius(R) {}
+ double computeArea() /* override */;
+ };
+
+The most basic working setup for LLVM-style RTTI requires the following
+steps:
+
+#. In the header where you declare ``Shape``, you will want to ``#include
+ "llvm/Support/Casting.h"``, which declares LLVM's RTTI templates. That
+ way your clients don't even have to think about it.
+
+ .. code-block:: c++
+
+ #include "llvm/Support/Casting.h"
+
+#. In the base class, introduce an enum which discriminates all of the
+ different concrete classes in the hierarchy, and stash the enum value
+ somewhere in the base class.
+
+ Here is the code after introducing this change:
+
+ .. code-block:: c++
+
+ class Shape {
+ public:
+ + /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.)
+ + enum ShapeKind {
+ + SK_Square,
+ + SK_Circle
+ + };
+ +private:
+ + const ShapeKind Kind;
+ +public:
+ + ShapeKind getKind() const { return Kind; }
+ +
+ Shape() {}
+ virtual double computeArea() = 0;
+ };
+
+ You will usually want to keep the ``Kind`` member encapsulated and
+ private, but let the enum ``ShapeKind`` be public along with providing a
+ ``getKind()`` method. This is convenient for clients so that they can do
+ a ``switch`` over the enum.
+
+ A common naming convention is that these enums are "kind"s, to avoid
+ ambiguity with the words "type" or "class" which have overloaded meanings
+ in many contexts within LLVM. Sometimes there will be a natural name for
+ it, like "opcode". Don't bikeshed over this; when in doubt use ``Kind``.
+
+ You might wonder why the ``Kind`` enum doesn't have an entry for
+ ``Shape``. The reason for this is that since ``Shape`` is abstract
+ (``computeArea() = 0;``), you will never actually have non-derived
+ instances of exactly that class (only subclasses). See `Concrete Bases
+ and Deeper Hierarchies`_ for information on how to deal with
+ non-abstract bases. It's worth mentioning here that unlike
+ ``dynamic_cast<>``, LLVM-style RTTI can be used (and is often used) for
+ classes that don't have v-tables.
+
+#. Next, you need to make sure that the ``Kind`` gets initialized to the
+ value corresponding to the dynamic type of the class. Typically, you will
+ want to have it be an argument to the constructor of the base class, and
+ then pass in the respective ``XXXKind`` from subclass constructors.
+
+ Here is the code after that change:
+
+ .. code-block:: c++
+
+ class Shape {
+ public:
+ /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.)
+ enum ShapeKind {
+ SK_Square,
+ SK_Circle
+ };
+ private:
+ const ShapeKind Kind;
+ public:
+ ShapeKind getKind() const { return Kind; }
+
+ - Shape() {}
+ + Shape(ShapeKind K) : Kind(K) {}
+ virtual double computeArea() = 0;
+ };
+
+ class Square : public Shape {
+ double SideLength;
+ public:
+ - Square(double S) : SideLength(S) {}
+ + Square(double S) : Shape(SK_Square), SideLength(S) {}
+ double computeArea() /* override */;
+ };
+
+ class Circle : public Shape {
+ double Radius;
+ public:
+ - Circle(double R) : Radius(R) {}
+ + Circle(double R) : Shape(SK_Circle), Radius(R) {}
+ double computeArea() /* override */;
+ };
+
+#. Finally, you need to inform LLVM's RTTI templates how to dynamically
+ determine the type of a class (i.e. whether the ``isa<>``/``dyn_cast<>``
+ should succeed). The default "99.9% of use cases" way to accomplish this
+ is through a small static member function ``classof``. In order to have
+ proper context for an explanation, we will display this code first, and
+ then below describe each part:
+
+ .. code-block:: c++
+
+ class Shape {
+ public:
+ /// Discriminator for LLVM-style RTTI (dyn_cast<> et al.)
+ enum ShapeKind {
+ SK_Square,
+ SK_Circle
+ };
+ private:
+ const ShapeKind Kind;
+ public:
+ ShapeKind getKind() const { return Kind; }
+
+ Shape(ShapeKind K) : Kind(K) {}
+ virtual double computeArea() = 0;
+ };
+
+ class Square : public Shape {
+ double SideLength;
+ public:
+ Square(double S) : Shape(SK_Square), SideLength(S) {}
+ double computeArea() /* override */;
+ +
+ + static bool classof(const Shape *S) {
+ + return S->getKind() == SK_Square;
+ + }
+ };
+
+ class Circle : public Shape {
+ double Radius;
+ public:
+ Circle(double R) : Shape(SK_Circle), Radius(R) {}
+ double computeArea() /* override */;
+ +
+ + static bool classof(const Shape *S) {
+ + return S->getKind() == SK_Circle;
+ + }
+ };
+
+ The job of ``classof`` is to dynamically determine whether an object of
+ a base class is in fact of a particular derived class. In order to
+ downcast a type ``Base`` to a type ``Derived``, there needs to be a
+ ``classof`` in ``Derived`` which will accept an object of type ``Base``.
+
+ To be concrete, consider the following code:
+
+ .. code-block:: c++
+
+ Shape *S = ...;
+ if (isa<Circle>(S)) {
+ /* do something ... */
+ }
+
+ The ``isa<>`` test in this code will eventually boil
+ down---after template instantiation and some other machinery---to a
+ check roughly like ``Circle::classof(S)``. For more information, see
+ :ref:`classof-contract`.
+
+ The argument to ``classof`` should always be an *ancestor* class because
+ the implementation has logic to allow and optimize away
+ upcasts/up-``isa<>``'s automatically. It is as though every class
+ ``Foo`` automatically has a ``classof`` like:
+
+ .. code-block:: c++
+
+ class Foo {
+ [...]
+ template <class T>
+ static bool classof(const T *,
+ ::llvm::enable_if_c<
+ ::llvm::is_base_of<Foo, T>::value
+ >::type* = 0) { return true; }
+ [...]
+ };
+
+ Note that this is the reason that we did not need to introduce a
+ ``classof`` into ``Shape``: all relevant classes derive from ``Shape``,
+ and ``Shape`` itself is abstract (has no entry in the ``Kind`` enum),
+ so this notional inferred ``classof`` is all we need. See `Concrete
+ Bases and Deeper Hierarchies`_ for more information about how to extend
+ this example to more general hierarchies.
+
+Although for this small example setting up LLVM-style RTTI seems like a lot
+of "boilerplate", if your classes are doing anything interesting then this
+will end up being a tiny fraction of the code.
+
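+With this setup in place, clients can use LLVM's RTTI templates on the
+hierarchy in the usual way. A minimal usage sketch (assuming the ``Shape``,
+``Square``, and ``Circle`` definitions above; the function is hypothetical):
+
+.. code-block:: c++
+
+   #include "llvm/Support/Casting.h"
+
+   double areaIfCircle(Shape *S) {
+     // dyn_cast<> returns a null pointer unless the dynamic type of *S
+     // is-a Circle, which it decides by calling Circle::classof(S).
+     if (Circle *C = llvm::dyn_cast<Circle>(S))
+       return C->computeArea();
+     return 0.0;
+   }
+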
+Concrete Bases and Deeper Hierarchies
+=====================================
+
+For concrete bases (i.e. non-abstract interior nodes of the inheritance
+tree), the ``Kind`` check inside ``classof`` needs to be a bit more
+complicated. The situation differs from the example above in that
+
+* Since the class is concrete, it must itself have an entry in the ``Kind``
+ enum because it is possible to have objects with this class as a dynamic
+ type.
+
+* Since the class has children, the check inside ``classof`` must take them
+ into account.
+
+Say that ``SpecialSquare`` and ``OtherSpecialSquare`` derive
+from ``Square``, and so ``ShapeKind`` becomes:
+
+.. code-block:: c++
+
+ enum ShapeKind {
+ SK_Square,
+ + SK_SpecialSquare,
+ + SK_OtherSpecialSquare,
+ SK_Circle
+ };
+
+Then in ``Square``, we would need to modify the ``classof`` like so:
+
+.. code-block:: c++
+
+ - static bool classof(const Shape *S) {
+ - return S->getKind() == SK_Square;
+ - }
+ + static bool classof(const Shape *S) {
+ + return S->getKind() >= SK_Square &&
+ + S->getKind() <= SK_OtherSpecialSquare;
+ + }
+
+The reason that we need to test a range like this instead of just equality
+is that both ``SpecialSquare`` and ``OtherSpecialSquare`` "is-a"
+``Square``, and so ``classof`` needs to return ``true`` for them.
+
+This approach can be made to scale to arbitrarily deep hierarchies. The
+trick is that you arrange the enum values so that they correspond to a
+preorder traversal of the class hierarchy tree. With that arrangement, all
+subclass tests can be done with two comparisons as shown above. If you just
+list the class hierarchy like a list of bullet points, you'll get the
+ordering right::
+
+ | Shape
+ | Square
+ | SpecialSquare
+ | OtherSpecialSquare
+ | Circle
+
+.. _classof-contract:
+
+The Contract of ``classof``
+---------------------------
+
+To be more precise, let ``classof`` be inside a class ``C``. Then the
+contract for ``classof`` is "return ``true`` if the dynamic type of the
+argument is-a ``C``". As long as your implementation fulfills this
+contract, you can tweak and optimize it as much as you want.
+
+.. TODO::
+
+ Touch on some of the more advanced features, like ``isa_impl`` and
+ ``simplify_type``. However, those two need reference documentation in
+ the form of doxygen comments as well. We need the doxygen so that we can
+ say "for full details, see http://llvm.org/doxygen/..."
+
+Rules of Thumb
+==============
+
+#. The ``Kind`` enum should have one entry per concrete class, ordered
+ according to a preorder traversal of the inheritance tree.
+#. The argument to ``classof`` should be a ``const Base *``, where ``Base``
+ is some ancestor in the inheritance hierarchy. The argument should
+ *never* be a derived class or the class itself: the template machinery
+ for ``isa<>`` already handles this case and optimizes it.
+#. For each class in the hierarchy that has no children, implement a
+ ``classof`` that checks only against its ``Kind``.
+#. For each class in the hierarchy that has children, implement a
+ ``classof`` that checks a range of the first child's ``Kind`` and the
+ last child's ``Kind``.
diff --git a/docs/HowToSubmitABug.html b/docs/HowToSubmitABug.html
deleted file mode 100644
index ef7cf9e48707..000000000000
--- a/docs/HowToSubmitABug.html
+++ /dev/null
@@ -1,345 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>How to submit an LLVM bug report</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>
- How to submit an LLVM bug report
-</h1>
-
-<table class="layout" style="width: 90%" >
-<tr class="layout">
- <td class="left">
-<ol>
- <li><a href="#introduction">Introduction - Got bugs?</a></li>
- <li><a href="#crashers">Crashing Bugs</a>
- <ul>
- <li><a href="#front-end">Front-end bugs</a>
- <li><a href="#ct_optimizer">Compile-time optimization bugs</a>
- <li><a href="#ct_codegen">Code generator bugs</a>
- </ul></li>
- <li><a href="#miscompilations">Miscompilations</a></li>
- <li><a href="#codegen">Incorrect code generation (JIT and LLC)</a></li>
-</ol>
-<div class="doc_author">
- <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a> and
- <a href="http://misha.brukman.net">Misha Brukman</a></p>
-</div>
-</td>
-</tr>
-</table>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="introduction">Introduction - Got bugs?</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If you're working with LLVM and run into a bug, we definitely want to know
-about it. This document describes what you can do to increase the odds of
-getting it fixed quickly.</p>
-
-<p>Basically you have to do two things at a minimum. First, decide whether the
-bug <a href="#crashers">crashes the compiler</a> (or an LLVM pass), or if the
-compiler is <a href="#miscompilations">miscompiling</a> the program (i.e., the
-compiler successfully produces an executable, but it doesn't run right). Based
-on
-what type of bug it is, follow the instructions in the linked section to narrow
-down the bug so that the person who fixes it will be able to find the problem
-more easily.</p>
-
-<p>Once you have a reduced test-case, go to <a
-href="http://llvm.org/bugs/enter_bug.cgi">the LLVM Bug Tracking
-System</a> and fill out the form with the necessary details (note that you don't
-need to pick a category, just use the "new-bugs" category if you're not sure).
-The bug description should contain the following
-information:</p>
-
-<ul>
- <li>All information necessary to reproduce the problem.</li>
- <li>The reduced test-case that triggers the bug.</li>
- <li>The location where you obtained LLVM (if not from our Subversion
- repository).</li>
-</ul>
-
-<p>Thanks for helping us make LLVM better!</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="crashers">Crashing Bugs</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>More often than not, bugs in the compiler cause it to crash&mdash;often due
-to an assertion failure of some sort. The most important
-piece of the puzzle is to figure out if it is crashing in the GCC front-end
-or if it is one of the LLVM libraries (e.g. the optimizer or code generator)
-that has problems.</p>
-
-<p>To figure out which component is crashing (the front-end,
-optimizer or code generator), run the
-<tt><b>llvm-gcc</b></tt> command line as you were when the crash occurred, but
-with the following extra command line options:</p>
-
-<ul>
- <li><tt><b>-O0 -emit-llvm</b></tt>: If <tt>llvm-gcc</tt> still crashes when
- passed these options (which disable the optimizer and code generator), then
- the crash is in the front-end. Jump ahead to the section on <a
- href="#front-end">front-end bugs</a>.</li>
-
- <li><tt><b>-emit-llvm</b></tt>: If <tt>llvm-gcc</tt> crashes with this option
- (which disables the code generator), you found an optimizer bug. Jump ahead
- to <a href="#ct_optimizer"> compile-time optimization bugs</a>.</li>
-
- <li>Otherwise, you have a code generator crash. Jump ahead to <a
- href="#ct_codegen">code generator bugs</a>.</li>
-
-</ul>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="front-end">Front-end bugs</a>
-</h3>
-
-<div>
-
-<p>If the problem is in the front-end, you should re-run the same
-<tt>llvm-gcc</tt> command that resulted in the crash, but add the
-<tt>-save-temps</tt> option. The compiler will crash again, but it will leave
-behind a <tt><i>foo</i>.i</tt> file (containing preprocessed C source code) and
-possibly <tt><i>foo</i>.s</tt> for each
-compiled <tt><i>foo</i>.c</tt> file. Send us the <tt><i>foo</i>.i</tt> file,
-along with the options you passed to llvm-gcc, and a brief description of the
-error it caused.</p>
-
-<p>The <a href="http://delta.tigris.org/">delta</a> tool helps to reduce the
-preprocessed file down to the smallest amount of code that still replicates the
-problem. You're encouraged to use delta to reduce the code to make the
-developers' lives easier. <a
-href="http://gcc.gnu.org/wiki/A_guide_to_testcase_reduction">This website</a>
-has instructions on the best way to use delta.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="ct_optimizer">Compile-time optimization bugs</a>
-</h3>
-
-<div>
-
-<p>If you find that a bug crashes in the optimizer, compile your test-case to a
-<tt>.bc</tt> file by passing "<tt><b>-emit-llvm -O0 -c -o foo.bc</b></tt>".
-Then run:</p>
-
-<div class="doc_code">
-<p><tt><b>opt</b> -std-compile-opts -debug-pass=Arguments foo.bc
- -disable-output</tt></p>
-</div>
-
-<p>This command should do two things: it should print out a list of passes, and
-then it should crash in the same way as llvm-gcc. If it doesn't crash, please
-follow the instructions for a <a href="#front-end">front-end bug</a>.</p>
-
-<p>If this does crash, then you should be able to debug this with the following
-bugpoint command:</p>
-
-<div class="doc_code">
-<p><tt><b>bugpoint</b> foo.bc &lt;list of passes printed by
-<b>opt</b>&gt;</tt></p>
-</div>
-
-<p>Please run this, then file a bug with the instructions and reduced .bc files
-that bugpoint emits. If something goes wrong with bugpoint, please submit the
-"foo.bc" file and the list of passes printed by <b>opt</b>.</p>
-
-</div>
-
-<!-- ======================================================================= -->
-<h3>
- <a name="ct_codegen">Code generator bugs</a>
-</h3>
-
-<div>
-
-<p>If you find a bug that crashes llvm-gcc in the code generator, compile your
-source file to a .bc file by passing "<tt><b>-emit-llvm -c -o foo.bc</b></tt>"
-to llvm-gcc (in addition to the options you already pass). Once your have
-foo.bc, one of the following commands should fail:</p>
-
-<ol>
-<li><tt><b>llc</b> foo.bc</tt></li>
-<li><tt><b>llc</b> foo.bc -relocation-model=pic</tt></li>
-<li><tt><b>llc</b> foo.bc -relocation-model=static</tt></li>
-</ol>
-
-<p>If none of these crash, please follow the instructions for a
-<a href="#front-end">front-end bug</a>. If one of these do crash, you should
-be able to reduce this with one of the following bugpoint command lines (use
-the one corresponding to the command above that failed):</p>
-
-<ol>
-<li><tt><b>bugpoint</b> -run-llc foo.bc</tt></li>
-<li><tt><b>bugpoint</b> -run-llc foo.bc --tool-args
- -relocation-model=pic</tt></li>
-<li><tt><b>bugpoint</b> -run-llc foo.bc --tool-args
- -relocation-model=static</tt></li>
-</ol>
-
-<p>Please run this, then file a bug with the instructions and reduced .bc file
-that bugpoint emits. If something goes wrong with bugpoint, please submit the
-"foo.bc" file and the option that llc crashes with.</p>
-
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="miscompilations">Miscompilations</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>If llvm-gcc successfully produces an executable, but that executable doesn't
-run right, this is either a bug in the code or a bug in the
-compiler. The first thing to check is to make sure it is not using undefined
-behavior (e.g. reading a variable before it is defined). In particular, check
-to see if the program <a href="http://valgrind.org/">valgrind</a>s clean,
-passes purify, or some other memory checker tool. Many of the "LLVM bugs" that
-we have chased down ended up being bugs in the program being compiled, not
- LLVM.</p>
-
-<p>Once you determine that the program itself is not buggy, you should choose
-which code generator you wish to compile the program with (e.g. LLC or the JIT)
-and optionally a series of LLVM passes to run. For example:</p>
-
-<div class="doc_code">
-<p><tt>
-<b>bugpoint</b> -run-llc [... optzn passes ...] file-to-test.bc --args -- [program arguments]</tt></p>
-</div>
-
-<p><tt>bugpoint</tt> will try to narrow down your list of passes to the one pass
-that causes an error, and simplify the bitcode file as much as it can to assist
-you. It will print a message letting you know how to reproduce the resulting
-error.</p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2>
- <a name="codegen">Incorrect code generation</a>
-</h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Similarly to debugging incorrect compilation by mis-behaving passes, you can
-debug incorrect code generation by either LLC or the JIT, using
-<tt>bugpoint</tt>. The process <tt>bugpoint</tt> follows in this case is to try
-to narrow the code down to a function that is miscompiled by one or the other
-method, but since for correctness, the entire program must be run,
-<tt>bugpoint</tt> will compile the code it deems to not be affected with the C
-Backend, and then link in the shared object it generates.</p>
-
-<p>To debug the JIT:</p>
-
-<div class="doc_code">
-<pre>
-bugpoint -run-jit -output=[correct output file] [bitcode file] \
- --tool-args -- [arguments to pass to lli] \
- --args -- [program arguments]
-</pre>
-</div>
-
-<p>Similarly, to debug the LLC, one would run:</p>
-
-<div class="doc_code">
-<pre>
-bugpoint -run-llc -output=[correct output file] [bitcode file] \
- --tool-args -- [arguments to pass to llc] \
- --args -- [program arguments]
-</pre>
-</div>
-
-<p><b>Special note:</b> if you are debugging MultiSource or SPEC tests that
-already exist in the <tt>llvm/test</tt> hierarchy, there is an easier way to
-debug the JIT, LLC, and CBE, using the pre-written Makefile targets, which
-will pass the program options specified in the Makefiles:</p>
-
-<div class="doc_code">
-<p><tt>
-cd llvm/test/../../program<br>
-make bugpoint-jit
-</tt></p>
-</div>
-
-<p>At the end of a successful <tt>bugpoint</tt> run, you will be presented
-with two bitcode files: a <em>safe</em> file which can be compiled with the C
-backend and the <em>test</em> file which either LLC or the JIT
-mis-codegenerates, and thus causes the error.</p>
-
-<p>To reproduce the error that <tt>bugpoint</tt> found, it is sufficient to do
-the following:</p>
-
-<ol>
-
-<li><p>Regenerate the shared object from the safe bitcode file:</p>
-
-<div class="doc_code">
-<p><tt>
-<b>llc</b> -march=c safe.bc -o safe.c<br>
-<b>gcc</b> -shared safe.c -o safe.so
-</tt></p>
-</div></li>
-
-<li><p>If debugging LLC, compile test bitcode native and link with the shared
- object:</p>
-
-<div class="doc_code">
-<p><tt>
-<b>llc</b> test.bc -o test.s<br>
-<b>gcc</b> test.s safe.so -o test.llc<br>
-./test.llc [program options]
-</tt></p>
-</div></li>
-
-<li><p>If debugging the JIT, load the shared object and supply the test
- bitcode:</p>
-
-<div class="doc_code">
-<p><tt><b>lli</b> -load=safe.so test.bc [program options]</tt></p>
-</div></li>
-
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
- <a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
- <br>
- Last modified: $Date: 2012-06-14 18:52:55 +0200 (Thu, 14 Jun 2012) $
-</address>
-
-</body>
-</html>
diff --git a/docs/HowToSubmitABug.rst b/docs/HowToSubmitABug.rst
new file mode 100644
index 000000000000..ff2d649ce33c
--- /dev/null
+++ b/docs/HowToSubmitABug.rst
@@ -0,0 +1,233 @@
+.. _how-to-submit-a-bug-report:
+
+================================
+How to submit an LLVM bug report
+================================
+
+.. sectionauthor:: Chris Lattner <sabre@nondot.org> and Misha Brukman <http://misha.brukman.net>
+
+Introduction - Got bugs?
+========================
+
+
+If you're working with LLVM and run into a bug, we definitely want to know
+about it. This document describes what you can do to increase the odds of
+getting it fixed quickly.
+
+Basically you have to do two things at a minimum. First, decide whether
+the bug `crashes the compiler`_ (or an LLVM pass), or if the
+compiler is `miscompiling`_ the program (i.e., the
+compiler successfully produces an executable, but it doesn't run right).
+Based on what type of bug it is, follow the instructions in the linked
+section to narrow down the bug so that the person who fixes it will be able
+to find the problem more easily.
+
+Once you have a reduced test-case, go to `the LLVM Bug Tracking System
+<http://llvm.org/bugs/enter_bug.cgi>`_ and fill out the form with the
+necessary details (note that you don't need to pick a category, just use
+the "new-bugs" category if you're not sure). The bug description should
+contain the following information:
+
+* All information necessary to reproduce the problem.
+* The reduced test-case that triggers the bug.
+* The location where you obtained LLVM (if not from our Subversion
+ repository).
+
+Thanks for helping us make LLVM better!
+
+.. _crashes the compiler:
+
+Crashing Bugs
+=============
+
+More often than not, bugs in the compiler cause it to crash---often due to
+an assertion failure of some sort. The most important piece of the puzzle
+is to figure out if it is crashing in the GCC front-end or if it is one of
+the LLVM libraries (e.g. the optimizer or code generator) that has
+problems.
+
+To figure out which component is crashing (the front-end, optimizer or code
+generator), run the ``llvm-gcc`` command line as you were when the crash
+occurred, but with the following extra command line options:
+
+* ``-O0 -emit-llvm``: If ``llvm-gcc`` still crashes when passed these
+ options (which disable the optimizer and code generator), then the crash
+ is in the front-end. Jump ahead to the section on :ref:`front-end bugs
+ <front-end>`.
+
+* ``-emit-llvm``: If ``llvm-gcc`` crashes with this option (which disables
+ the code generator), you found an optimizer bug. Jump ahead to
+ `compile-time optimization bugs`_.
+
+* Otherwise, you have a code generator crash. Jump ahead to `code
+ generator bugs`_.
+
+.. _front-end bug:
+.. _front-end:
+
+Front-end bugs
+--------------
+
+If the problem is in the front-end, you should re-run the same ``llvm-gcc``
+command that resulted in the crash, but add the ``-save-temps`` option.
+The compiler will crash again, but it will leave behind a ``foo.i`` file
+(containing preprocessed C source code) and possibly ``foo.s`` for each
+compiled ``foo.c`` file. Send us the ``foo.i`` file, along with the options
+you passed to ``llvm-gcc``, and a brief description of the error it caused.
+
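+For example, a re-run might look like this (the flags here are illustrative;
+use whatever command line actually triggered the crash):
+
+.. code-block:: bash
+
+   $ llvm-gcc -O2 -c foo.c -save-temps   # crashes again, but leaves foo.i behind
+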
+The `delta <http://delta.tigris.org/>`_ tool helps to reduce the
+preprocessed file down to the smallest amount of code that still replicates
+the problem. You're encouraged to use delta to reduce the code to make the
+developers' lives easier. `This website
+<http://gcc.gnu.org/wiki/A_guide_to_testcase_reduction>`_ has instructions
+on the best way to use delta.
+
+.. _compile-time optimization bugs:
+
+Compile-time optimization bugs
+------------------------------
+
+If you find that a bug crashes in the optimizer, compile your test-case to a
+``.bc`` file by passing "``-emit-llvm -O0 -c -o foo.bc``".
+Then run:
+
+.. code-block:: bash
+
+ opt -std-compile-opts -debug-pass=Arguments foo.bc -disable-output
+
+This command should do two things: it should print out a list of passes, and
+then it should crash in the same way as llvm-gcc. If it doesn't crash, please
+follow the instructions for a `front-end bug`_.
+
+If this does crash, then you should be able to debug this with the following
+bugpoint command:
+
+.. code-block:: bash
+
+ bugpoint foo.bc <list of passes printed by opt>
+
+Please run this, then file a bug with the instructions and reduced .bc
+files that bugpoint emits. If something goes wrong with bugpoint, please
+submit the "foo.bc" file and the list of passes printed by ``opt``.
+
+.. _code generator bugs:
+
+Code generator bugs
+-------------------
+
+If you find a bug that crashes llvm-gcc in the code generator, compile your
+source file to a .bc file by passing "``-emit-llvm -c -o foo.bc``" to
+llvm-gcc (in addition to the options you already pass). Once you have
+foo.bc, one of the following commands should fail:
+
+#. ``llc foo.bc``
+#. ``llc foo.bc -relocation-model=pic``
+#. ``llc foo.bc -relocation-model=static``
+
+If none of these crash, please follow the instructions for a `front-end
+bug`_. If one of these does crash, you should be able to reduce this with
+one of the following bugpoint command lines (use the one corresponding to
+the command above that failed):
+
+#. ``bugpoint -run-llc foo.bc``
+#. ``bugpoint -run-llc foo.bc --tool-args -relocation-model=pic``
+#. ``bugpoint -run-llc foo.bc --tool-args -relocation-model=static``
+
+Please run this, then file a bug with the instructions and reduced .bc file
+that bugpoint emits. If something goes wrong with bugpoint, please submit
+the "foo.bc" file and the option that llc crashes with.
+
+.. _miscompiling:
+
+Miscompilations
+===============
+
+If llvm-gcc successfully produces an executable, but that executable
+doesn't run right, this is either a bug in the code or a bug in the
+compiler. The first thing to check is to make sure it is not using
+undefined behavior (e.g. reading a variable before it is defined). In
+particular, check to see if the program runs cleanly under `valgrind
+<http://valgrind.org/>`_, passes purify, or passes some other memory
+checker tool. Many of the "LLVM bugs" that we have chased down ended up
+being bugs in the program being compiled, not LLVM.
+
+Once you determine that the program itself is not buggy, you should choose
+which code generator you wish to compile the program with (e.g. LLC or the JIT)
+and optionally a series of LLVM passes to run. For example:
+
+.. code-block:: bash
+
+ bugpoint -run-llc [... optzn passes ...] file-to-test.bc --args -- [program arguments]
+
+bugpoint will try to narrow down your list of passes to the one pass that
+causes an error, and simplify the bitcode file as much as it can to assist
+you. It will print a message letting you know how to reproduce the
+resulting error.
+
+Incorrect code generation
+=========================
+
+Similarly to debugging incorrect compilation by mis-behaving passes, you
+can debug incorrect code generation by either LLC or the JIT, using
+``bugpoint``. The process ``bugpoint`` follows in this case is to try to
+narrow the code down to a function that is miscompiled by one or the other
+method, but since, for correctness, the entire program must be run,
+``bugpoint`` will compile the code it deems not to be affected with the C
+Backend, and then link in the shared object it generates.
+
+To debug the JIT:
+
+.. code-block:: bash
+
+ bugpoint -run-jit -output=[correct output file] [bitcode file] \
+ --tool-args -- [arguments to pass to lli] \
+ --args -- [program arguments]
+
+Similarly, to debug the LLC, one would run:
+
+.. code-block:: bash
+
+ bugpoint -run-llc -output=[correct output file] [bitcode file] \
+ --tool-args -- [arguments to pass to llc] \
+ --args -- [program arguments]
+
+**Special note:** if you are debugging MultiSource or SPEC tests that
+already exist in the ``llvm/test`` hierarchy, there is an easier way to
+debug the JIT, LLC, and CBE, using the pre-written Makefile targets, which
+will pass the program options specified in the Makefiles:
+
+.. code-block:: bash
+
+ cd llvm/test/../../program
+ make bugpoint-jit
+
+At the end of a successful ``bugpoint`` run, you will be presented
+with two bitcode files: a *safe* file which can be compiled with the C
+backend and the *test* file which either LLC or the JIT
+mis-codegenerates, and thus causes the error.
+
+To reproduce the error that ``bugpoint`` found, it is sufficient to do
+the following:
+
+#. Regenerate the shared object from the safe bitcode file:
+
+ .. code-block:: bash
+
+ llc -march=c safe.bc -o safe.c
+ gcc -shared safe.c -o safe.so
+
+#. If debugging LLC, compile test bitcode native and link with the shared
+ object:
+
+ .. code-block:: bash
+
+ llc test.bc -o test.s
+ gcc test.s safe.so -o test.llc
+ ./test.llc [program options]
+
+#. If debugging the JIT, load the shared object and supply the test
+ bitcode:
+
+ .. code-block:: bash
+
+ lli -load=safe.so test.bc [program options]
diff --git a/docs/HowToUseInstrMappings.rst b/docs/HowToUseInstrMappings.rst
new file mode 100755
index 000000000000..b51e74e23c29
--- /dev/null
+++ b/docs/HowToUseInstrMappings.rst
@@ -0,0 +1,179 @@
+.. _how_to_use_instruction_mappings:
+
+===============================
+How To Use Instruction Mappings
+===============================
+
+.. sectionauthor:: Jyotsna Verma <jverma@codeaurora.org>
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+This document contains information about adding instruction mapping support
+for a target. The motivation behind this feature comes from the need to switch
+between different instruction formats during various optimizations. One approach
+could be to use switch cases which list all the instructions along with formats
+they can transition to. However, that approach has a large maintenance overhead
+because of the hardcoded instruction names. Also, whenever a new instruction is
+added to the .td files, all the relevant switch cases have to be modified
+accordingly. Instead, the same functionality can be achieved with TableGen and
+some support from the .td files for a fraction of the maintenance cost.
+
+``InstrMapping`` Class Overview
+===============================
+
+TableGen uses relationship models to map instructions with each other. These
+models are described using the ``InstrMapping`` class as a base. Each model sets
+various fields of the ``InstrMapping`` class such that they can uniquely
+describe all the instructions using that model. TableGen parses all the relation
+models and uses the information to construct relation tables which relate
+instructions with each other. These tables are emitted in the
+``XXXInstrInfo.inc`` file along with the functions to query them. The following
+is the definition of the ``InstrMapping`` class defined in the Target.td file:
+
+.. code-block:: llvm
+
+ class InstrMapping {
+ // Used to reduce search space only to the instructions using this
+ // relation model.
+ string FilterClass;
+
+ // List of fields/attributes that should be same for all the instructions in
+ // a row of the relation table. Think of this as a set of properties shared
+ // by all the instructions related by this relationship.
+ list<string> RowFields = [];
+
+ // List of fields/attributes that are same for all the instructions
+ // in a column of the relation table.
+ list<string> ColFields = [];
+
+ // Values for the fields/attributes listed in 'ColFields' corresponding to
+ // the key instruction. This is the instruction that will be transformed
+ // using this relation model.
+ list<string> KeyCol = [];
+
+ // List of values for the fields/attributes listed in 'ColFields', one for
+ // each column in the relation table. These are the instructions a key
+ // instruction will be transformed into.
+ list<list<string> > ValueCols = [];
+ }
+
+Sample Example
+--------------
+
+Let's say that we want to have a function
+``int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense)`` which
+takes a non-predicated instruction and returns its predicated true or false form
+depending on some input flag, ``inPredSense``. The first step in the process is
+to define a relationship model that relates predicated instructions to their
+non-predicated form by assigning appropriate values to the ``InstrMapping``
+fields. For this relationship, non-predicated instructions are treated as the key
+instructions since they are the ones used to query the interface function.
+
+.. code-block:: llvm
+
+ def getPredOpcode : InstrMapping {
+ // Choose a FilterClass that is used as a base class for all the
+ // instructions modeling this relationship. This is done to reduce the
+ // search space only to these set of instructions.
+ let FilterClass = "PredRel";
+
+ // Instructions with same values for all the fields in RowFields form a
+ // row in the resulting relation table.
+ // For example, if we want to relate 'ADD' (non-predicated) with 'Add_pt'
+ // (predicated true) and 'Add_pf' (predicated false), then all 3
+ // instructions need to have same value for BaseOpcode field. It can be any
+ // unique value (Ex: XYZ) and should not be shared with any other
+ // instruction not related to 'add'.
+ let RowFields = ["BaseOpcode"];
+
+ // List of attributes that can be used to define key and column instructions
+ // for a relation. Key instruction is passed as an argument
+ // to the function used for querying relation tables. Column instructions
+ // are the instructions they (key) can transform into.
+ //
+ // Here, we choose 'PredSense' as ColFields since this is the unique
+ // attribute of the key (non-predicated) and column (true/false)
+ // instructions involved in this relationship model.
+ let ColFields = ["PredSense"];
+
+ // The key column contains non-predicated instructions.
+ let KeyCol = ["none"];
+
+ // Two value columns - first column contains instructions with
+ // PredSense=true while second column has instructions with PredSense=false.
+ let ValueCols = [["true"], ["false"]];
+ }
+
+TableGen uses the above relationship model to emit a relation table that maps
+non-predicated instructions to their predicated forms. It also outputs the
+interface function
+``int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense)`` to query
+the table. Here, the function ``getPredOpcode`` takes two arguments, the opcode
+of the current instruction and the PredSense of the desired instruction, and
+returns the predicated form of the instruction if it is found in the relation
+table. In order for an instruction to be added to the relation table, it needs
+to include the relevant information in its definition. For example, consider
+the following to be the current definitions of the ADD, ADD_Pt (predicated true)
+and ADD_Pf (predicated false) instructions:
+
+.. code-block:: llvm
+
+ def ADD : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b),
+ "$dst = add($a, $b)",
+ [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$a),
+ (i32 IntRegs:$b)))]>;
+
+ def ADD_Pt : ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$p, IntRegs:$a, IntRegs:$b),
+ "if ($p) $dst = add($a, $b)",
+ []>;
+
+ def ADD_Pf : ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$p, IntRegs:$a, IntRegs:$b),
+ "if (!$p) $dst = add($a, $b)",
+ []>;
+
+In this step, we modify these instructions to include the information
+required by the relationship model, ``getPredOpcode``, so that they can
+be related.
+
+.. code-block:: llvm
+
+ def ADD : PredRel, ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b),
+ "$dst = add($a, $b)",
+ [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$a),
+ (i32 IntRegs:$b)))]> {
+ let BaseOpcode = "ADD";
+ let PredSense = "none";
+ }
+
+ def ADD_Pt : PredRel, ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$p, IntRegs:$a, IntRegs:$b),
+ "if ($p) $dst = add($a, $b)",
+ []> {
+ let BaseOpcode = "ADD";
+ let PredSense = "true";
+ }
+
+ def ADD_Pf : PredRel, ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$p, IntRegs:$a, IntRegs:$b),
+ "if (!$p) $dst = add($a, $b)",
+ []> {
+ let BaseOpcode = "ADD";
+ let PredSense = "false";
+ }
+
+Please note that all the above instructions use ``PredRel`` as a base class.
+This is extremely important since TableGen uses it as a filter for selecting
+instructions for ``getPredOpcode`` model. Any instruction not derived from
+``PredRel`` is excluded from the analysis. ``BaseOpcode`` is another important
+field. Since it's selected as a ``RowFields`` of the model, it is required
+to have the same value for all 3 instructions in order to be related. Next,
+``PredSense`` is used to determine their column positions by comparing its value
+with ``KeyCol`` and ``ValueCols``. If an instruction sets its ``PredSense``
+value to something not used in the relation model, it will not be assigned
+a column in the relation table.
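+
+Once the relation table has been generated, target code can call the emitted
+query function directly. Below is a minimal sketch of such a use; the
+``XYZ::ADD`` opcode, the ``PredSense_true`` enumerator spelling, and the
+assumption that the function returns a negative value when no mapping exists
+are illustrative only and will vary per target:
+
+.. code-block:: c++
+
+  // Sketch: rewrite a non-predicated MachineInstr into its predicated-true
+  // form using the TableGen-generated query function described above.
+  // MI is a MachineInstr* and TII a const TargetInstrInfo*.
+  int NewOpc = getPredOpcode(XYZ::ADD, PredSense_true);
+  if (NewOpc >= 0)
+    MI->setDesc(TII->get(NewOpc));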
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 946380e24dfb..13daa65ca358 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -25,7 +25,6 @@
<li><a href="#linkage_private">'<tt>private</tt>' Linkage</a></li>
<li><a href="#linkage_linker_private">'<tt>linker_private</tt>' Linkage</a></li>
<li><a href="#linkage_linker_private_weak">'<tt>linker_private_weak</tt>' Linkage</a></li>
- <li><a href="#linkage_linker_private_weak_def_auto">'<tt>linker_private_weak_def_auto</tt>' Linkage</a></li>
<li><a href="#linkage_internal">'<tt>internal</tt>' Linkage</a></li>
<li><a href="#linkage_available_externally">'<tt>available_externally</tt>' Linkage</a></li>
<li><a href="#linkage_linkonce">'<tt>linkonce</tt>' Linkage</a></li>
@@ -34,6 +33,7 @@
<li><a href="#linkage_appending">'<tt>appending</tt>' Linkage</a></li>
<li><a href="#linkage_externweak">'<tt>extern_weak</tt>' Linkage</a></li>
<li><a href="#linkage_linkonce_odr">'<tt>linkonce_odr</tt>' Linkage</a></li>
+ <li><a href="#linkage_linkonce_odr_auto_hide">'<tt>linkonce_odr_auto_hide</tt>' Linkage</a></li>
<li><a href="#linkage_weak">'<tt>weak_odr</tt>' Linkage</a></li>
<li><a href="#linkage_external">'<tt>external</tt>' Linkage</a></li>
<li><a href="#linkage_dllimport">'<tt>dllimport</tt>' Linkage</a></li>
@@ -103,6 +103,7 @@
<li><a href="#metadata">Metadata Nodes and Metadata Strings</a>
<ol>
<li><a href="#tbaa">'<tt>tbaa</tt>' Metadata</a></li>
+ <li><a href="#tbaa.struct">'<tt>tbaa.struct</tt>' Metadata</a></li>
<li><a href="#fpmath">'<tt>fpmath</tt>' Metadata</a></li>
<li><a href="#range">'<tt>range</tt>' Metadata</a></li>
</ol>
@@ -576,15 +577,6 @@ define i32 @main() { <i>; i32()* </i>&nbsp;
linker. The symbols are removed by the linker from the final linked image
(executable or dynamic library).</dd>
- <dt><tt><b><a name="linkage_linker_private_weak_def_auto">linker_private_weak_def_auto</a></b></tt></dt>
- <dd>Similar to "<tt>linker_private_weak</tt>", but it's known that the address
- of the object is not taken. For instance, functions that had an inline
- definition, but the compiler decided not to inline it. Note,
- unlike <tt>linker_private</tt> and <tt>linker_private_weak</tt>,
- <tt>linker_private_weak_def_auto</tt> may have only <tt>default</tt>
- visibility. The symbols are removed by the linker from the final linked
- image (executable or dynamic library).</dd>
-
<dt><tt><b><a name="linkage_internal">internal</a></b></tt></dt>
<dd>Similar to private, but the value shows as a local symbol
(<tt>STB_LOCAL</tt> in the case of ELF) in the object file. This
@@ -653,6 +645,14 @@ define i32 @main() { <i>; i32()* </i>&nbsp;
be merged with equivalent globals. These linkage types are otherwise the
same as their non-<tt>odr</tt> versions.</dd>
+ <dt><tt><b><a name="linkage_linkonce_odr_auto_hide">linkonce_odr_auto_hide</a></b></tt></dt>
+ <dd>Similar to "<tt>linkonce_odr</tt>", but nothing in the translation unit
+ takes the address of this definition. For instance, functions that had an
+ inline definition, but the compiler decided not to inline it.
+ <tt>linkonce_odr_auto_hide</tt> may have only <tt>default</tt> visibility.
+ The symbols are removed by the linker from the final linked image
+ (executable or dynamic library).</dd>
+
<dt><tt><b><a name="linkage_external">external</a></b></tt></dt>
<dd>If none of the above identifiers are used, the global is externally
visible, meaning that it participates in linkage and can be used to
@@ -1107,9 +1107,9 @@ declare signext i8 @returns_signed_char()
<dd>This indicates that the pointer parameter specifies the address of a
structure that is the return value of the function in the source program.
This pointer must be guaranteed by the caller to be valid: loads and
- stores to the structure may be assumed by the callee to not to trap. This
- may only be applied to the first parameter. This is not a valid attribute
- for return values. </dd>
+    stores to the structure may be assumed by the callee not to trap and
+ to be properly aligned. This may only be applied to the first parameter.
+ This is not a valid attribute for return values. </dd>
<dt><tt><b><a name="noalias">noalias</a></b></tt></dt>
<dd>This indicates that pointer values
@@ -1208,13 +1208,6 @@ define void @f() optsize { ... }
may make calls to the function faster, at the cost of extra program
startup time if the function is not called during program startup.</dd>
- <dt><tt><b>ia_nsdialect</b></tt></dt>
- <dd>This attribute indicates the associated inline assembly call is using a
- non-standard assembly dialect. The standard dialect is ATT, which is
- assumed when this attribute is not present. When present, the dialect
- is assumed to be Intel. Currently, ATT and Intel are the only supported
- dialects.</dd>
-
<dt><tt><b>inlinehint</b></tt></dt>
<dd>This attribute indicates that the source code contained a hint that inlining
this function is desirable (such as the "inline" keyword in C/C++). It
@@ -1371,11 +1364,13 @@ target datalayout = "<i>layout specification</i>"
8-bits. If omitted, the natural stack alignment defaults to "unspecified",
which does not prevent any alignment promotions.</dd>
- <dt><tt>p:<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
+ <dt><tt>p[n]:<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
<dd>This specifies the <i>size</i> of a pointer and its <i>abi</i> and
- <i>preferred</i> alignments. All sizes are in bits. Specifying
- the <i>pref</i> alignment is optional. If omitted, the
- preceding <tt>:</tt> should be omitted too.</dd>
+ <i>preferred</i> alignments for address space <i>n</i>. All sizes are in
+ bits. Specifying the <i>pref</i> alignment is optional. If omitted, the
+ preceding <tt>:</tt> should be omitted too. The address space,
+ <i>n</i> is optional, and if not specified, denotes the default address
+ space 0. The value of <i>n</i> must be in the range [1,2^23).</dd>
<dt><tt>i<i>size</i>:<i>abi</i>:<i>pref</i></tt></dt>
<dd>This specifies the alignment for an integer type of a given bit
@@ -1416,6 +1411,10 @@ target datalayout = "<i>layout specification</i>"
<ul>
<li><tt>E</tt> - big endian</li>
<li><tt>p:64:64:64</tt> - 64-bit pointers with 64-bit alignment</li>
+ <li><tt>p1:32:32:32</tt> - 32-bit pointers with 32-bit alignment for
+ address space 1</li>
+ <li><tt>p2:16:32:32</tt> - 16-bit pointers with 32-bit alignment for
+ address space 2</li>
<li><tt>i1:8:8</tt> - i1 is 8-bit (byte) aligned</li>
<li><tt>i8:8:8</tt> - i8 is 8-bit (byte) aligned</li>
<li><tt>i16:16:16</tt> - i16 is 16-bit aligned</li>
@@ -2111,7 +2110,7 @@ in signal handlers).</p>
<p>Structures may optionally be "packed" structures, which indicate that the
alignment of the struct is one byte, and that there is no padding between
the elements. In non-packed structs, padding between field types is inserted
- as defined by the TargetData string in the module, which is required to match
+ as defined by the DataLayout string in the module, which is required to match
what the underlying code generator expects.</p>
<p>Structures can either be "literal" or "identified". A literal structure is
@@ -2902,8 +2901,18 @@ call void asm sideeffect "eieio", ""()
call void asm alignstack "eieio", ""()
</pre>
-<p>If both keywords appear the '<tt>sideeffect</tt>' keyword must come
- first.</p>
+<p>Inline asms also support using non-standard assembly dialects. The assumed
+ dialect is ATT. When the '<tt>inteldialect</tt>' keyword is present, the
+ inline asm is using the Intel dialect. Currently, ATT and Intel are the
+ only supported dialects. An example is:</p>
+
+<pre class="doc_code">
+call void asm inteldialect "eieio", ""()
+</pre>
+
+<p>If multiple keywords appear the '<tt>sideeffect</tt>' keyword must come
+ first, the '<tt>alignstack</tt>' keyword second and the
+ '<tt>inteldialect</tt>' keyword last.</p>
<!--
<p>TODO: The format of the asm and constraints string still need to be
@@ -3050,6 +3059,44 @@ call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
<!-- _______________________________________________________________________ -->
<h4>
+ <a name="tbaa.struct">'<tt>tbaa.struct</tt>' Metadata</a>
+</h4>
+
+<div>
+
+<p>The <a href="#int_memcpy"><tt>llvm.memcpy</tt></a> intrinsic is often used to
+implement aggregate assignment operations in C and similar languages; however,
+it is defined to copy a contiguous region of memory, which is more than strictly
+necessary for aggregate types that contain holes due to padding. Also, it
+doesn't carry any TBAA information about the fields of the aggregate.</p>
+
+<p><tt>!tbaa.struct</tt> metadata can describe which memory subregions in a memcpy
+are padding and what the TBAA tags of the struct are.</p>
+
+<p>The current metadata format is very simple. <tt>!tbaa.struct</tt> metadata nodes
+ are a list of operands which are in conceptual groups of three. For each
+ group of three, the first operand gives the byte offset of a field in bytes,
+ the second gives its size in bytes, and the third gives its
+ tbaa tag. e.g.:</p>
+
+<div class="doc_code">
+<pre>
+!4 = metadata !{ i64 0, i64 4, metadata !1, i64 8, i64 4, metadata !2 }
+</pre>
+</div>
+
+<p>This describes a struct with two fields. The first is at offset 0 bytes
+ with size 4 bytes, and has tbaa tag !1. The second is at offset 8 bytes
+ and has size 4 bytes and has tbaa tag !2.</p>
+
+<p>Note that the fields need not be contiguous. In this example, there is a
+ 4 byte gap between the two fields. This gap represents padding which
+ does not carry useful data and need not be preserved.</p>
+
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
<a name="fpmath">'<tt>fpmath</tt>' Metadata</a>
</h4>
@@ -5013,7 +5060,7 @@ IfUnequal:
<p>The optional constant <tt>align</tt> argument specifies the alignment of the
operation (that is, the alignment of the memory address). A value of 0 or an
- omitted <tt>align</tt> argument means that the operation has the preferential
+  omitted <tt>align</tt> argument means that the operation has the ABI
alignment for the target. It is the responsibility of the code emitter to
ensure that the alignment information is correct. Overestimating the
alignment results in undefined behavior. Underestimating the alignment may
@@ -5094,7 +5141,7 @@ IfUnequal:
<p>The optional constant "align" argument specifies the alignment of the
operation (that is, the alignment of the memory address). A value of 0 or an
- omitted "align" argument means that the operation has the preferential
+ omitted "align" argument means that the operation has the abi
alignment for the target. It is the responsibility of the code emitter to
ensure that the alignment information is correct. Overestimating the
alignment results in an undefined behavior. Underestimating the alignment may
@@ -8722,7 +8769,7 @@ codegen.</p>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-08-10 02:00:22 +0200 (Fri, 10 Aug 2012) $
+ Last modified: $Date: 2012-10-29 15:12:44 +0100 (Mon, 29 Oct 2012) $
</address>
</body>
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 6ebe61429f96..d568c0b302ec 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -20,8 +20,10 @@ A
B
-
-**BURS**
+**BB Vectorization**
+ Basic Block Vectorization
+**BURS**
Bottom Up Rewriting System --- A method of instruction selection for code
generation. An example is the `BURG
<http://www.program-transformation.org/Transform/BURG>`_ tool.
@@ -156,7 +158,7 @@ R
In garbage collection, a pointer variable lying outside of the `heap`_ from
which the collector begins its reachability analysis. In the context of code
generation, "root" almost always refers to a "stack root" --- a local or
- temporary variable within an executing function.</dd>
+ temporary variable within an executing function.
**RPO**
Reverse postorder
@@ -192,3 +194,10 @@ S
**Stack Map**
In garbage collection, metadata emitted by the code generator which
identifies `roots`_ within the stack frame of an executing function.
+
+T
+-
+
+**TBAA**
+ Type-Based Alias Analysis
+
diff --git a/docs/LinkTimeOptimization.rst b/docs/LinkTimeOptimization.rst
index 53d673e40666..7eacf0bd0d01 100644
--- a/docs/LinkTimeOptimization.rst
+++ b/docs/LinkTimeOptimization.rst
@@ -29,6 +29,8 @@ bitcode files. This tight integration between the linker and LLVM optimizer
helps to do optimizations that are not possible in other models. The linker
input allows the optimizer to avoid relying on conservative escape analysis.
+.. _libLTO-example:
+
Example of link time optimization
---------------------------------
diff --git a/docs/Makefile.sphinx b/docs/Makefile.sphinx
index 21f66488b2b7..81c13de9cd9e 100644
--- a/docs/Makefile.sphinx
+++ b/docs/Makefile.sphinx
@@ -46,6 +46,10 @@ clean:
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
+ @# FIXME: Remove this `cp` once HTML->Sphinx transition is completed.
+ @# Kind of a hack, but HTML-formatted docs are on the way out anyway.
+ @echo "Copying legacy HTML-formatted docs into $(BUILDDIR)/html"
+ @cp -a *.html tutorial $(BUILDDIR)/html
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
diff --git a/docs/MarkedUpDisassembly.rst b/docs/MarkedUpDisassembly.rst
new file mode 100644
index 000000000000..e1282e102ebe
--- /dev/null
+++ b/docs/MarkedUpDisassembly.rst
@@ -0,0 +1,88 @@
+.. _marked_up_disassembly:
+
+=======================================
+LLVM's Optional Rich Disassembly Output
+=======================================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+LLVM's default disassembly output is raw text. To allow consumers to introspect
+the instructions' textual representation, or to reformat it for a more
+user-friendly display, there is an optional rich disassembly output.
+
+This optional output is sufficient to reference into individual portions of the
+instruction text. This is intended for clients like disassemblers, list file
+generators, and pretty-printers, which need more than the raw instructions and
+the ability to print them.
+
+To provide this functionality the assembly text is marked up with annotations.
+The markup is simple enough in syntax to be robust even in the case of version
+mismatches between consumers and producers. That is, the syntax generally does
+not carry semantics beyond "this text has an annotation," so consumers can
+simply ignore annotations they do not understand or do not care about.
+
+After calling ``LLVMCreateDisasm()`` to create a disassembler context, the
+optional output is enabled with this call:
+
+.. code-block:: c
+
+ LLVMSetDisasmOptions(DC, LLVMDisassembler_Option_UseMarkup);
+
+Then subsequent calls to ``LLVMDisasmInstruction()`` will return output strings
+with the marked up annotations.
+
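+For reference, a minimal end-to-end sketch in C looks like the following; the
+target triple, buffer contents, and buffer sizes are illustrative only, and it
+assumes the relevant targets and disassemblers have already been initialized:
+
+.. code-block:: c
+
+  #include <llvm-c/Disassembler.h>
+  #include <stdint.h>
+  #include <stdio.h>
+
+  /* Disassemble a single instruction with markup enabled (sketch only). */
+  void disasm_with_markup(uint8_t *Bytes, size_t Size, uint64_t PC) {
+    LLVMDisasmContextRef DC =
+        LLVMCreateDisasm("armv7-linux-gnueabi", NULL, 0, NULL, NULL);
+    LLVMSetDisasmOptions(DC, LLVMDisassembler_Option_UseMarkup);
+
+    char Text[128];
+    if (LLVMDisasmInstruction(DC, Bytes, Size, PC, Text, sizeof(Text)))
+      printf("%s\n", Text); /* e.g. "ldr <reg gpr:r0>, ..." */
+
+    LLVMDisasmDispose(DC);
+  }
+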
+Instruction Annotations
+=======================
+
+.. _contextual markups:
+
+Contextual markups
+------------------
+
+Annotated assembly display will supply contextual markup to help clients more
+efficiently implement things like pretty printers. Most markup will be target
+independent, so clients can effectively provide good display without any target
+specific knowledge.
+
+Annotated assembly goes through the normal instruction printer, but optionally
+includes contextual tags on portions of the instruction string. An annotation
+is any '<' '>' delimited section of text(1).
+
+.. code-block:: bat
+
+ annotation: '<' tag-name tag-modifier-list ':' annotated-text '>'
+ tag-name: identifier
+ tag-modifier-list: comma delimited identifier list
+
+The tag-name is an identifier which gives the type of the annotation. For the
+first pass, this will be very simple, with memory references, registers, and
+immediates having the tag names "mem", "reg", and "imm", respectively.
+
+The tag-modifier-list is typically additional target-specific context, such as
+register class.
+
+Clients should accept and ignore any tag-names or tag-modifiers they do not
+understand, allowing the annotations to grow in richness without breaking older
+clients.
+
+For example, an ARM load of a stack-relative location
+might be annotated as:
+
+.. code-block:: nasm
+
+ ldr <reg gpr:r0>, <mem regoffset:[<reg gpr:sp>, <imm:#4>]>
+
+
+1: For assembly dialects in which '<' and/or '>' are legal tokens, a literal token is escaped by following immediately with a repeat of the character. For example, a literal '<' character is output as '<<' in an annotated assembly string.
+
+C API Details
+-------------
+
+The intended consumers of this information use the C API; therefore, a new C
+API function for the disassembler, ``LLVMSetDisasmOptions()`` with the
+``LLVMDisassembler_Option_UseMarkup`` option (see above), provides the option
+to produce disassembled instructions with annotations.
diff --git a/docs/Passes.html b/docs/Passes.html
index e8048d5a45a5..16e8bd6f6b13 100644
--- a/docs/Passes.html
+++ b/docs/Passes.html
@@ -77,6 +77,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<tr><td><a href="#basicaa">-basicaa</a></td><td>Basic Alias Analysis (stateless AA impl)</td></tr>
<tr><td><a href="#basiccg">-basiccg</a></td><td>Basic CallGraph Construction</td></tr>
<tr><td><a href="#count-aa">-count-aa</a></td><td>Count Alias Analysis Query Responses</td></tr>
+<tr><td><a href="#da">-da</a></td><td>Dependence Analysis</td></tr>
<tr><td><a href="#debug-aa">-debug-aa</a></td><td>AA use debugger</td></tr>
<tr><td><a href="#domfrontier">-domfrontier</a></td><td>Dominance Frontier Construction</td></tr>
<tr><td><a href="#domtree">-domtree</a></td><td>Dominator Tree Construction</td></tr>
@@ -92,7 +93,6 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<tr><td><a href="#intervals">-intervals</a></td><td>Interval Partition Construction</td></tr>
<tr><td><a href="#iv-users">-iv-users</a></td><td>Induction Variable Users</td></tr>
<tr><td><a href="#lazy-value-info">-lazy-value-info</a></td><td>Lazy Value Information Analysis</td></tr>
-<tr><td><a href="#lda">-lda</a></td><td>Loop Dependence Analysis</td></tr>
<tr><td><a href="#libcall-aa">-libcall-aa</a></td><td>LibCall Alias Analysis</td></tr>
<tr><td><a href="#lint">-lint</a></td><td>Statically lint-checks LLVM IR</td></tr>
<tr><td><a href="#loops">-loops</a></td><td>Natural Loop Information</td></tr>
@@ -182,7 +182,6 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<tr><td><a href="#strip-debug-declare">-strip-debug-declare</a></td><td>Strip all llvm.dbg.declare intrinsics</td></tr>
<tr><td><a href="#strip-nondebug">-strip-nondebug</a></td><td>Strip all symbols, except dbg symbols, from a module</td></tr>
<tr><td><a href="#tailcallelim">-tailcallelim</a></td><td>Tail Call Elimination</td></tr>
-<tr><td><a href="#tailduplicate">-tailduplicate</a></td><td>Tail Duplication</td></tr>
<tr><th colspan="2"><b>UTILITY PASSES</b></th></tr>
@@ -251,6 +250,15 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<!-------------------------------------------------------------------------- -->
<h3>
+ <a name="da">-da: Dependence Analysis</a>
+</h3>
+<div>
+ <p>Dependence analysis framework, which is used to detect dependences in
+ memory accesses.</p>
+</div>
+
+<!-------------------------------------------------------------------------- -->
+<h3>
<a name="debug-aa">-debug-aa: AA use debugger</a>
</h3>
<div>
@@ -433,15 +441,6 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if !
<!-------------------------------------------------------------------------- -->
<h3>
- <a name="lda">-lda: Loop Dependence Analysis</a>
-</h3>
-<div>
- <p>Loop dependence analysis framework, which is used to detect dependences in
- memory accesses in loops.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
<a name="libcall-aa">-libcall-aa: LibCall Alias Analysis</a>
</h3>
<div>
@@ -1862,22 +1861,6 @@ if (X &lt; 3) {</pre>
</ul>
</div>
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="tailduplicate">-tailduplicate: Tail Duplication</a>
-</h3>
-<div>
- <p>
- This pass performs a limited form of tail duplication, intended to simplify
- CFGs by removing some unconditional branches. This pass is necessary to
- straighten out loops created by the C front-end, but also is capable of
- making other code nicer. After this pass is run, the CFG simplify pass
- should be run to clean up the mess.
- </p>
-</div>
-
-</div>
-
<!-- ======================================================================= -->
<h2><a name="utilities">Utility Passes</a></h2>
<div>
@@ -2059,7 +2042,7 @@ if (X &lt; 3) {</pre>
<a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-07-26 00:01:31 +0200 (Thu, 26 Jul 2012) $
+ Last modified: $Date: 2012-10-31 18:25:31 +0100 (Wed, 31 Oct 2012) $
</address>
</body>
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
new file mode 100644
index 000000000000..b45449793e0a
--- /dev/null
+++ b/docs/Phabricator.rst
@@ -0,0 +1,100 @@
+=============================
+Code Reviews with Phabricator
+=============================
+
+.. contents::
+ :local:
+
+If you prefer to use a web user interface for code reviews,
+you can now submit your patches for Clang and LLVM at
+`LLVM's Phabricator`_.
+
+Sign up
+-------
+
+There are two options to get an account on Phabricator. You can sign up
+immediately with one of the supported OAuth account types if you're comfortable
+with OAuth, but you can also email chandlerc@gmail.com to request an account to
+be created manually without using OAuth. We're working to get support in
+Phabricator to directly create new accounts, but currently this is a manual
+process.
+
+Note that if you use your Subversion user name as Phabricator user name,
+Phabricator will automatically connect your submits to your Phabricator user in
+the `Code Repository Browser`_.
+
+
+Requesting a review via the command line
+----------------------------------------
+
+Phabricator has a tool called *Arcanist* to upload patches from
+the command line. To get you set up, follow the
+`Arcanist Quick Start`_ instructions.
+
+You can learn more about how to use arc to interact with
+Phabricator in the `Arcanist User Guide`_.
+
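+As a rough sketch (the command names come from Arcanist; exact flags may vary
+by version), a typical command-line flow looks like:
+
+.. code-block:: bash
+
+  $ cd llvm
+  $ arc diff      # upload the current change as a new revision for review
+  $ arc list      # show the status of your pending revisions
+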
+Requesting a review via the web interface
+-----------------------------------------
+
+The tool to create and review patches in Phabricator is called
+*Differential*.
+
+Note that you can upload patches created through various diff tools,
+including git and svn. To make reviews easier, please always include
+**as much context as possible** with your diff! Don't worry, Phabricator
+will automatically send a diff with a smaller context in the review
+email, but having the full file in the web interface will help the
+reviewer understand your code.
+
+To get a full diff, use one of the following commands (or just use Arcanist
+to upload your patch):
+
+* ``git diff -U999999 other-branch``
+* ``svn diff --diff-cmd=diff -x -U999999``
+
+To upload a new patch:
+
+* Click *Differential*.
+* Click *Create Revision*.
+* Paste the text diff or upload the patch file.
+ Note that TODO
+* Leave the drop down on *Create a new Revision...* and click *Continue*.
+* Enter a descriptive title and summary; add reviewers and mailing
+ lists that you want to be included in the review. If your patch is
+ for LLVM, cc llvm-commits; if your patch is for Clang, cc cfe-commits.
+* Click *Save*.
+
+To submit an updated patch:
+
+* Click *Differential*.
+* Click *Create Revision*.
+* Paste the updated diff.
+* Select the review you want to update from the *Attach To* dropdown and click
+  *Continue*.
+* Click *Save*.
+
+Reviewing code with Phabricator
+-------------------------------
+
+Phabricator allows you to add inline comments as well as overall comments
+to a revision. To add an inline comment, select the lines of code you want
+to comment on by clicking and dragging the line numbers in the diff pane.
+
+You can add overall comments or submit your comments at the bottom of the page.
+
+Phabricator has many useful features, for example allowing you to select
+diffs between different versions of the patch as it was reviewed in the
+*Revision Update History*. Most features are self descriptive - explore, and
+if you have a question, drop by on #llvm in IRC to get help.
+
+Status
+------
+
+Currently, we're testing Phabricator for use with Clang/LLVM. Please let us
+know whether you like it and what could be improved!
+
+.. _LLVM's Phabricator: http://llvm-reviews.chandlerc.com
+.. _Code Repository Browser: http://llvm-reviews.chandlerc.com/diffusion/
+.. _Arcanist Quick Start: http://www.phabricator.com/docs/phabricator/article/Arcanist_Quick_Start.html
+.. _Arcanist User Guide: http://www.phabricator.com/docs/phabricator/article/Arcanist_User_Guide.html
diff --git a/docs/ProgrammersManual.html b/docs/ProgrammersManual.html
index 5bf499b2db4d..7c2e6c8aad92 100644
--- a/docs/ProgrammersManual.html
+++ b/docs/ProgrammersManual.html
@@ -98,6 +98,7 @@ option</a></li>
<li><a href="#dss_valuemap">"llvm/ADT/ValueMap.h"</a></li>
<li><a href="#dss_intervalmap">"llvm/ADT/IntervalMap.h"</a></li>
<li><a href="#dss_map">&lt;map&gt;</a></li>
+ <li><a href="#dss_mapvector">"llvm/ADT/MapVector.h"</a></li>
<li><a href="#dss_inteqclasses">"llvm/ADT/IntEqClasses.h"</a></li>
<li><a href="#dss_immutablemap">"llvm/ADT/ImmutableMap.h"</a></li>
<li><a href="#dss_othermap">Other Map-Like Container Options</a></li>
@@ -432,10 +433,10 @@ if (<a href="#AllocationInst">AllocationInst</a> *AI = dyn_cast&lt;<a href="#All
</dl>
<p>These five templates can be used with any classes, whether they have a
-v-table or not. To add support for these templates, you simply need to add
-<tt>classof</tt> static methods to the class you are interested casting
-to. Describing this is currently outside the scope of this document, but there
-are lots of examples in the LLVM source base.</p>
+v-table or not. If you want to add support for these templates, see the
+document <a href="HowToSetUpLLVMStyleRTTI.html">How to set up LLVM-style
+RTTI for your class hierarchy </a>.
+</p>
</div>
@@ -1848,6 +1849,24 @@ another element takes place).</p>
</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
+ <a name="dss_mapvector">"llvm/ADT/MapVector.h"</a>
+</h4>
+<div>
+
+<p> MapVector&lt;KeyT,ValueT&gt; provides a subset of the DenseMap interface.
+ The main difference is that the iteration order is guaranteed to be
+ the insertion order, making it an easy (but somewhat expensive) solution to
+ the problem of non-deterministic iteration over maps of pointers. </p>
+
+<p> It is implemented by mapping from each key to an index into a vector of
+ key/value pairs. This provides fast lookup and iteration, but has two main
+ drawbacks: the key is stored twice and it doesn't support removing elements. </p>
+
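+<p> As a minimal usage sketch (the <tt>I1</tt>/<tt>I2</tt> pointers and the
+ <tt>visit</tt> helper are illustrative only): </p>
+
+<div class="doc_code">
+<pre>
+// Iteration over a MapVector visits entries in insertion order.
+llvm::MapVector&lt;llvm::Instruction*, unsigned&gt; Order;
+Order[I1] = 0; // I1, I2: previously obtained Instruction pointers
+Order[I2] = 1;
+typedef llvm::MapVector&lt;llvm::Instruction*, unsigned&gt;::iterator map_iterator;
+for (map_iterator It = Order.begin(), E = Order.end(); It != E; ++It)
+  visit(It-&gt;first, It-&gt;second); // keys appear in the order they were inserted
+</pre>
+</div>
+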
+</div>
+
<!-- _______________________________________________________________________ -->
<h4>
<a name="dss_inteqclasses">"llvm/ADT/IntEqClasses.h"</a>
@@ -4130,7 +4149,7 @@ arguments. An argument has a pointer to the parent Function.</p>
<a href="mailto:dhurjati@cs.uiuc.edu">Dinakar Dhurjati</a> and
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-07-25 15:46:11 +0200 (Wed, 25 Jul 2012) $
+ Last modified: $Date: 2012-10-07 02:56:09 +0200 (Sun, 07 Oct 2012) $
</address>
</body>
diff --git a/docs/README.txt b/docs/README.txt
index 2fbbf987405d..5ddd599d8a78 100644
--- a/docs/README.txt
+++ b/docs/README.txt
@@ -6,7 +6,7 @@ The LLVM documentation is currently written in two formats:
* Plain HTML documentation.
* reStructured Text documentation using the Sphinx documentation generator. It
- is currently tested with Sphinx 1.1.3.
+ is currently tested with Sphinx 1.1.3.
For more information, see the "Sphinx Introduction for LLVM Developers"
document.
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 85448a5f3a47..a4b1d580b637 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -466,7 +466,45 @@ Release Notes</a>.</h1>
<p>In addition to many minor performance tweaks and bug fixes, this release
includes a few major enhancements and additions to the optimizers:</p>
+<p> Loop Vectorizer - We've added a loop vectorizer and we are now able to
+ vectorize small loops. The loop vectorizer is disabled by default and
+ can be enabled using the <b>-mllvm -vectorize-loops</b> flag.
+ The SIMD vector width can be specified using the flag
+ <b>-mllvm -force-vector-width=4</b>.
+ The default value is <b>0</b> which means auto-select.
+ <br/>
+ We can now vectorize this function:
+
+ <pre class="doc_code">
+ unsigned sum_arrays(int *A, int *B, int start, int end) {
+ unsigned sum = 0;
+ for (int i = start; i &lt; end; ++i)
+ sum += A[i] + B[i] + i;
+
+ return sum;
+ }
+ </pre>
+
+  We vectorize loops under the following conditions:
+  <ul>
+  <li>The innermost loops must have a single basic block.</li>
+  <li>The number of iterations is known before the loop starts to execute.</li>
+  <li>The loop counter needs to be incremented by one.</li>
+ <li>The loop trip count <b>can</b> be a variable.</li>
+ <li>Loops do <b>not</b> need to start at zero.</li>
+ <li>The induction variable can be used inside the loop.</li>
+ <li>Loop reductions are supported.</li>
+ <li>Arrays with affine access pattern do <b>not</b> need to be marked as 'noalias' and are checked at runtime.</li>
+ <li>...</li>
+ </ul>
+
+</p>
+
+<p>SROA - We've re-written SROA to be significantly more powerful.
+<!-- FIXME: Add more text here... --></p>
+
<ul>
+  <li>Branch weight metadata is preserved through more of the optimizer.</li>
<li>...</li>
</ul>
@@ -499,13 +537,14 @@ Release Notes</a>.</h1>
<div>
-<p>We have changed the way that the Type Legalizer legalizes vectors. The type
- legalizer now attempts to promote integer elements. This enabled the
- implementation of vector-select. Additionally, we see a performance boost on
- workloads which use vectors of chars and shorts, since they are now promoted
- to 32-bit types, which are better supported by the SIMD instruction set.
- Floating point types are still widened as before.</p>
+<p>Stack Coloring - We have implemented a new optimization pass
+  to merge stack objects which are used in disjoint areas of the code.
+ This optimization reduces the required stack space significantly, in cases
+ where it is clear to the optimizer that the stack slot is not shared.
+ We use the lifetime markers to tell the codegen that a certain alloca
+ is used within a region.</p>
+<p> We now merge consecutive loads and stores. </p>
<p>We have put a significant amount of work into the code generator
infrastructure, which allows us to implement more aggressive algorithms and
@@ -608,6 +647,46 @@ Release Notes</a>.</h1>
<!--=========================================================================-->
<h3>
+<a name="PowerPC">PowerPC Target Improvements</a>
+</h3>
+
+<div>
+
+<ul>
+<p>Many fixes and changes across LLVM (and Clang) for better compliance with
+ the 64-bit PowerPC ELF Application Binary Interface, interoperability with
+ GCC, and overall 64-bit PowerPC support. Some highlights include:</p>
+<ul>
+ <li> MCJIT support added.</li>
+ <li> PPC64 relocation support and (small code model) TOC handling
+ added.</li>
+ <li> Parameter passing and return value fixes (alignment issues,
+ padding, varargs support, proper register usage, odd-sized
+ structure support, float support, extension of return values
+ for i32 return values).</li>
+ <li> Fixes in spill and reload code for vector registers.</li>
+ <li> C++ exception handling enabled.</li>
+ <li> Changes to remediate double-rounding compatibility issues with
+ respect to GCC behavior.</li>
+ <li> Refactoring to disentangle ppc64-elf-linux ABI from Darwin
+ ppc64 ABI support.</li>
+ <li> Assorted new test cases and test case fixes (endian and word
+ size issues).</li>
+ <li> Fixes for big-endian codegen bugs, instruction encodings, and
+ instruction constraints.</li>
+ <li> Implemented -integrated-as support.</li>
+ <li> Additional support for Altivec compare operations.</li>
+ <li> IBM long double support.</li>
+</ul>
+<p>There have also been code generation improvements for both 32- and 64-bit
+ code. Instruction scheduling support for the Freescale e500mc and e5500
+ cores has been added.</p>
+</ul>
+
+</div>
+
+<!--=========================================================================-->
+<h3>
<a name="OtherTS">Other Target Specific Improvements</a>
</h3>
@@ -646,6 +725,14 @@ Release Notes</a>.</h1>
<p>In addition, many APIs have changed in this release. Some of the major
LLVM API changes are:</p>
+<p> We've added a new interface for allowing IR-level passes to access
+ target-specific information. A new IR-level pass, called
+ "TargetTransformInfo" provides a number of low-level interfaces.
+ LSR and LowerInvoke already use the new interface. </p>
+
+<p> The TargetData structure has been renamed to DataLayout and moved to VMCore
+to remove a dependency on Target. </p>
+
<ul>
<li>...</li>
</ul>
@@ -749,7 +836,7 @@ Release Notes</a>.</h1>
src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-07-13 14:44:23 +0200 (Fri, 13 Jul 2012) $
+ Last modified: $Date: 2012-11-20 05:22:44 +0100 (Tue, 20 Nov 2012) $
</address>
</body>
diff --git a/docs/SourceLevelDebugging.html b/docs/SourceLevelDebugging.html
index bb72bf3075c5..1dcee54f0bf9 100644
--- a/docs/SourceLevelDebugging.html
+++ b/docs/SourceLevelDebugging.html
@@ -2367,11 +2367,11 @@ bucket contents:
| HEADER.header_data_len | uint32_t
| HEADER_DATA | HeaderData
|-------------------------|
-| BUCKETS | uint32_t[n_buckets] // 32 bit hash indexes
+| BUCKETS | uint32_t[bucket_count] // 32 bit hash indexes
|-------------------------|
-| HASHES | uint32_t[n_buckets] // 32 bit hash values
+| HASHES | uint32_t[hashes_count] // 32 bit hash values
|-------------------------|
-| OFFSETS | uint32_t[n_buckets] // 32 bit offsets to hash value data
+| OFFSETS | uint32_t[hashes_count] // 32 bit offsets to hash value data
|-------------------------|
| ALL HASH DATA |
`-------------------------'
@@ -2851,7 +2851,7 @@ int main ()
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-06-02 12:20:22 +0200 (Sat, 02 Jun 2012) $
+ Last modified: $Date: 2012-10-09 01:54:10 +0200 (Tue, 09 Oct 2012) $
</address>
</body>
diff --git a/docs/SphinxQuickstartTemplate.rst b/docs/SphinxQuickstartTemplate.rst
new file mode 100644
index 000000000000..75d916368e33
--- /dev/null
+++ b/docs/SphinxQuickstartTemplate.rst
@@ -0,0 +1,125 @@
+==========================
+Sphinx Quickstart Template
+==========================
+
+.. sectionauthor:: Sean Silva <silvas@purdue.edu>
+
+Introduction and Quickstart
+===========================
+
+This document is meant to get you writing documentation as fast as possible
+even if you have no previous experience with Sphinx. The goal is to take
+someone in the state of "I want to write documentation and get it added to
+LLVM's docs" and turn that into useful documentation mailed to llvm-commits
+with as little nonsense as possible.
+
+You can find this document in ``docs/SphinxQuickstartTemplate.rst``. You
+should copy it, open the new file in your text editor, write your docs, and
+then send the new document to llvm-commits for review.
+
+Focus on *content*. It is easy to fix the Sphinx (reStructuredText) syntax
+later if necessary, although reStructuredText tries to imitate common
+plain-text conventions so it should be quite natural. A basic knowledge of
+reStructuredText syntax is useful when writing the document, so the last
+~half of this document (starting with `Example Section`_) gives examples
+which should cover 99% of use cases.
+
+Let me say that again: focus on *content*.
+
+Once you have finished with the content, please send the ``.rst`` file to
+llvm-commits for review.
+
+Guidelines
+==========
+
+Try to answer the following questions in your first section:
+
+#. Why would I want to read this document?
+
+#. What should I know to be able to follow along with this document?
+
+#. What will I have learned by the end of this document?
+
+Common names for the first section are ``Introduction``, ``Overview``, or
+``Background``.
+
+If possible, make your document a "how to". Give it a name ``HowTo*.rst``
+like the other "how to" documents. This format is usually the easiest
+for another person to understand and also the most useful.
+
+You generally should not be writing documentation other than a "how to"
+unless there is already a "how to" about your topic. The reason for this
+is that without a "how to" document to read first, it is difficult for a
+person to understand a more advanced document.
+
+Focus on content (yes, I had to say it again).
+
+The rest of this document shows example reStructuredText markup constructs
+that are meant to be read by you in your text editor after you have copied
+this file into a new file for the documentation you are about to write.
+
+Example Section
+===============
+
+Your text can be *emphasized*, **bold**, or ``monospace``.
+
+Use blank lines to separate paragraphs.
+
+Headings (like ``Example Section`` just above) give your document
+structure. Use the same kind of adornments (e.g. ``======`` vs. ``------``)
+as are used in this document. The adornment must be the same length as the
+text above it. For Vim users, variations of ``yypVr=`` might be handy.
+
+Example Subsection
+------------------
+
+Make a link `like this <http://llvm.org/>`_. There is also a more
+sophisticated syntax which `can be more readable`_ for longer links since
+it disrupts the flow less. You can put the ``.. _`link text`: <URL>`` block
+pretty much anywhere later in the document.
+
+.. _`can be more readable`: http://en.wikipedia.org/wiki/LLVM
+
+Lists can be made like this:
+
+#. A list starting with ``#.`` will be automatically numbered.
+
+#. This is a second list element.
+
+ #. They nest too.
+
+You can also use unordered lists.
+
+* Stuff.
+
+ + Deeper stuff.
+
+* More stuff.
+
+Example Subsubsection
+^^^^^^^^^^^^^^^^^^^^^
+
+You can make blocks of code like this:
+
+.. code-block:: c++
+
+ int main() {
+    return 0;
+ }
+
+For a shell session, use a ``bash`` code block:
+
+.. code-block:: bash
+
+ $ echo "Goodbye cruel world!"
+ $ rm -rf /
+
+If you need to show LLVM IR use the ``llvm`` code block.
+
+Hopefully you won't need to be this deep
+""""""""""""""""""""""""""""""""""""""""
+
+If you need to do fancier things than what has been shown in this document,
+you can mail the list or check Sphinx's `reStructuredText Primer`_.
+
+.. _`reStructuredText Primer`: http://sphinx.pocoo.org/rest.html
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index 804e929805c9..c313083fa76a 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -218,11 +218,11 @@ you can run the LLVM and Clang tests simultaneously using:</p>
<p>To run individual tests or subsets of tests, you can use the 'llvm-lit'
script which is built as part of LLVM. For example, to run the
-'Integer/BitCast.ll' test by itself you can run:</p>
+'Integer/BitPacked.ll' test by itself you can run:</p>
<div class="doc_code">
<pre>
-% llvm-lit ~/llvm/test/Integer/BitCast.ll
+% llvm-lit ~/llvm/test/Integer/BitPacked.ll
</pre>
</div>
@@ -798,14 +798,15 @@ define two separate CHECK lines that match on the same line.
<p>Sometimes it is necessary to mark a test case as "expected fail" or XFAIL.
You can easily mark a test as XFAIL just by including <tt>XFAIL: </tt> on a
line near the top of the file. This signals that the test case should succeed
- if the test fails. Such test cases are counted separately by the testing tool. To
- specify an expected fail, use the XFAIL keyword in the comments of the test
- program followed by a colon and one or more regular expressions (separated by
- a comma). The regular expressions allow you to XFAIL the test conditionally by
- host platform. The regular expressions following the : are matched against the
- target triplet for the host machine. If there is a match, the test is expected
- to fail. If not, the test is expected to succeed. To XFAIL everywhere just
- specify <tt>XFAIL: *</tt>. Here is an example of an <tt>XFAIL</tt> line:</p>
+ if the test fails. Such test cases are counted separately by the testing
+ tool. To specify an expected fail, use the XFAIL keyword in the comments of
+ the test program followed by a colon and one or more failure patterns. Each
+ failure pattern can be either '*' (to specify fail everywhere), or a part of a
+ target triple (indicating the test should fail on that platform), or the name
+ of a configurable feature (for example, "loadable_module"). If there is a
+ match, the test is expected to fail. If not, the test is expected to
+ succeed. To XFAIL everywhere just specify <tt>XFAIL: *</tt>. Here is an
+ example of an <tt>XFAIL</tt> line:</p>
<div class="doc_code">
<pre>
@@ -909,7 +910,7 @@ the <a href="TestSuiteMakefileGuide.html">Test Suite Makefile Guide.</a></p>
John T. Criswell, Daniel Dunbar, Reid Spencer, and Tanya Lattner<br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-08 20:26:07 +0200 (Tue, 08 May 2012) $
+ Last modified: $Date: 2012-11-07 18:00:18 +0100 (Wed, 07 Nov 2012) $
</address>
</body>
</html>
diff --git a/docs/WritingAnLLVMBackend.html b/docs/WritingAnLLVMBackend.html
index 11517c212401..b7fdce490472 100644
--- a/docs/WritingAnLLVMBackend.html
+++ b/docs/WritingAnLLVMBackend.html
@@ -32,6 +32,7 @@
<li><a href="#InstructionSet">Instruction Set</a>
<ul>
<li><a href="#operandMapping">Instruction Operand Mapping</a></li>
+ <li><a href="#relationMapping">Instruction Relation Mapping</a></li>
<li><a href="#implementInstr">Implement a subclass of TargetInstrInfo</a></li>
<li><a href="#branchFolding">Branch Folding and If Conversion</a></li>
</ul></li>
@@ -314,14 +315,14 @@ represent target components. These methods are named <tt>get*Info</tt>, and are
intended to obtain the instruction set (<tt>getInstrInfo</tt>), register set
(<tt>getRegisterInfo</tt>), stack frame layout (<tt>getFrameInfo</tt>), and
similar information. <tt>XXXTargetMachine</tt> must also implement the
-<tt>getTargetData</tt> method to access an object with target-specific data
+<tt>getDataLayout</tt> method to access an object with target-specific data
characteristics, such as data type size and alignment requirements.
</p>
<p>
For instance, for the SPARC target, the header file
<tt>SparcTargetMachine.h</tt> declares prototypes for several <tt>get*Info</tt>
-and <tt>getTargetData</tt> methods that simply return a class member.
+and <tt>getDataLayout</tt> methods that simply return a class member.
</p>
<div class="doc_code">
@@ -331,7 +332,7 @@ namespace llvm {
class Module;
class SparcTargetMachine : public LLVMTargetMachine {
- const TargetData DataLayout; // Calculates type size &amp; alignment
+ const DataLayout DataLayout; // Calculates type size &amp; alignment
SparcSubtarget Subtarget;
SparcInstrInfo InstrInfo;
TargetFrameInfo FrameInfo;
@@ -348,7 +349,7 @@ public:
virtual const TargetRegisterInfo *getRegisterInfo() const {
return &amp;InstrInfo.getRegisterInfo();
}
- virtual const TargetData *getTargetData() const { return &amp;DataLayout; }
+ virtual const DataLayout *getDataLayout() const { return &amp;DataLayout; }
static unsigned getModuleMatchQuality(const Module &amp;M);
// Pass Pipeline Configuration
@@ -364,7 +365,7 @@ public:
<li><tt>getInstrInfo()</tt></li>
<li><tt>getRegisterInfo()</tt></li>
<li><tt>getFrameInfo()</tt></li>
-<li><tt>getTargetData()</tt></li>
+<li><tt>getDataLayout()</tt></li>
<li><tt>getSubtargetImpl()</tt></li>
</ul>
@@ -1259,6 +1260,29 @@ the <tt>rd</tt>, <tt>rs1</tt>, and <tt>rs2</tt> fields respectively.
<!-- ======================================================================= -->
<h3>
+ <a name="relationMapping">Instruction Relation Mapping</a>
+</h3>
+
+<div>
+
+<p>
+This TableGen feature is used to relate instructions with each other. It is
+particularly useful when you have multiple instruction formats and need to
+switch between them after instruction selection. This entire feature is driven
+by relation models which can be defined in <tt>XXXInstrInfo.td</tt> files
+according to the target-specific instruction set. Relation models are defined
+using the <tt>InstrMapping</tt> class as a base. TableGen parses all the models
+and generates instruction relation maps using the specified information.
+Relation maps are emitted as tables in the <tt>XXXGenInstrInfo.inc</tt> file
+along with the functions to query them. For detailed information on how to
+use this feature, please refer to the
+<a href="HowToUseInstrMappings.html">How To Use Instruction Mappings</a>
+document.
+</p>
+</div>
+
+<!-- ======================================================================= -->
+<h3>
<a name="implementInstr">Implement a subclass of </a>
<a href="CodeGenerator.html#targetinstrinfo">TargetInstrInfo</a>
</h3>
@@ -2526,7 +2550,7 @@ with assembler.
<a href="http://www.woo.com">Mason Woo</a> and <a href="http://misha.brukman.net">Misha Brukman</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
<br>
- Last modified: $Date: 2012-04-19 22:20:34 +0200 (Thu, 19 Apr 2012) $
+ Last modified: $Date: 2012-10-25 17:54:06 +0200 (Thu, 25 Oct 2012) $
</address>
</body>
diff --git a/docs/llvm-theme/layout.html b/docs/_themes/llvm-theme/layout.html
index 746c2f56c82a..746c2f56c82a 100644
--- a/docs/llvm-theme/layout.html
+++ b/docs/_themes/llvm-theme/layout.html
diff --git a/docs/llvm-theme/static/contents.png b/docs/_themes/llvm-theme/static/contents.png
index 7fb82154a174..7fb82154a174 100644
--- a/docs/llvm-theme/static/contents.png
+++ b/docs/_themes/llvm-theme/static/contents.png
Binary files differ
diff --git a/docs/llvm-theme/static/llvm-theme.css b/docs/_themes/llvm-theme/static/llvm-theme.css
index f684d00ce437..beab2ca2512b 100644
--- a/docs/llvm-theme/static/llvm-theme.css
+++ b/docs/_themes/llvm-theme/static/llvm-theme.css
@@ -18,7 +18,6 @@ body {
font-family: 'Lucida Grande', 'Lucida Sans Unicode', 'Geneva',
'Verdana', sans-serif;
font-size: 14px;
- letter-spacing: -0.01em;
line-height: 150%;
text-align: center;
background-color: #BFD1D4;
@@ -239,7 +238,6 @@ cite, code, tt {
font-family: 'Consolas', 'Deja Vu Sans Mono',
'Bitstream Vera Sans Mono', monospace;
font-size: 0.95em;
- letter-spacing: 0.01em;
}
:not(a.reference) > tt {
@@ -274,7 +272,6 @@ pre {
font-family: 'Consolas', 'Deja Vu Sans Mono',
'Bitstream Vera Sans Mono', monospace;
font-size: 0.95em;
- letter-spacing: 0.015em;
line-height: 120%;
padding: 0.5em;
border: 1px solid #ccc;
diff --git a/docs/llvm-theme/static/logo.png b/docs/_themes/llvm-theme/static/logo.png
index 18d424c53c09..18d424c53c09 100644
--- a/docs/llvm-theme/static/logo.png
+++ b/docs/_themes/llvm-theme/static/logo.png
Binary files differ
diff --git a/docs/llvm-theme/static/navigation.png b/docs/_themes/llvm-theme/static/navigation.png
index 1081dc1439fb..1081dc1439fb 100644
--- a/docs/llvm-theme/static/navigation.png
+++ b/docs/_themes/llvm-theme/static/navigation.png
Binary files differ
diff --git a/docs/llvm-theme/theme.conf b/docs/_themes/llvm-theme/theme.conf
index 573fd78aba99..573fd78aba99 100644
--- a/docs/llvm-theme/theme.conf
+++ b/docs/_themes/llvm-theme/theme.conf
diff --git a/docs/conf.py b/docs/conf.py
index de0585ddb006..a1e9b5f6e286 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -98,7 +98,7 @@ html_theme = 'llvm-theme'
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
-html_theme_path = ["."]
+html_theme_path = ["_themes"]
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
@@ -134,18 +134,7 @@ html_sidebars = {'index': 'indexsidebar.html'}
# Additional templates that should be rendered to pages, maps page names to
# template names.
-#
-# We load all the old-school HTML documentation pages into Sphinx here.
-basedir = os.path.dirname(__file__)
-html_additional_pages = {}
-for directory in ('', 'tutorial'):
- for file in os.listdir(os.path.join(basedir, directory)):
- if not file.endswith('.html'):
- continue
-
- subpath = os.path.join(directory, file)
- name,_ = os.path.splitext(subpath)
- html_additional_pages[name] = subpath
+#html_additional_pages = {}
# If false, no module index is generated.
#html_domain_indices = True
@@ -226,6 +215,7 @@ man_pages = []
# Automatically derive the list of man pages from the contents of the command
# guide subdirectory.
+basedir = os.path.dirname(__file__)
man_page_authors = "Maintained by The LLVM Team (http://llvm.org/)."
command_guide_subpath = 'CommandGuide'
command_guide_path = os.path.join(basedir, command_guide_subpath)
@@ -237,9 +227,8 @@ for name in os.listdir(command_guide_path):
# Otherwise, automatically extract the description.
file_subpath = os.path.join(command_guide_subpath, name)
with open(os.path.join(command_guide_path, name)) as f:
- it = iter(f)
- title = it.next()[:-1]
- header = it.next()[:-1]
+ title = f.readline().rstrip('\n')
+ header = f.readline().rstrip('\n')
if len(header) != len(title):
print >>sys.stderr, (
diff --git a/docs/index.rst b/docs/index.rst
index 53d3e7c01b7c..d406b5257440 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,43 +15,43 @@ research projects.
Similarly, documentation is broken down into several high-level groupings
targeted at different audiences:
- * **Design & Overview**
+* **Design & Overview**
- Several introductory papers and presentations are available at
- :ref:`design_and_overview`.
+ Several introductory papers and presentations are available at
+ :ref:`design_and_overview`.
- * **Publications**
+* **Publications**
- The list of `publications <http://llvm.org/pubs>`_ based on LLVM.
+ The list of `publications <http://llvm.org/pubs>`_ based on LLVM.
- * **User Guides**
+* **User Guides**
- Those new to the LLVM system should first vist the :ref:`userguides`.
+ Those new to the LLVM system should first visit the :ref:`userguides`.
- NOTE: If you are a user who is only interested in using LLVM-based
- compilers, you should look into `Clang <http://clang.llvm.org>`_ or
- `DragonEgg <http://dragonegg.llvm.org>`_ instead. The documentation here is
- intended for users who have a need to work with the intermediate LLVM
- representation.
+ NOTE: If you are a user who is only interested in using LLVM-based
+ compilers, you should look into `Clang <http://clang.llvm.org>`_ or
+ `DragonEgg <http://dragonegg.llvm.org>`_ instead. The documentation here is
+ intended for users who have a need to work with the intermediate LLVM
+ representation.
- * **API Clients**
+* **API Clients**
- Developers of applications which use LLVM as a library should visit the
- :ref:`programming`.
+ Developers of applications which use LLVM as a library should visit the
+ :ref:`programming`.
- * **Subsystems**
+* **Subsystems**
- API clients and LLVM developers may be interested in the
- :ref:`subsystems` documentation.
+ API clients and LLVM developers may be interested in the
+ :ref:`subsystems` documentation.
- * **Development Process**
+* **Development Process**
- Additional documentation on the LLVM project can be found at
- :ref:`development_process`.
+ Additional documentation on the LLVM project can be found at
+ :ref:`development_process`.
- * **Mailing Lists**
+* **Mailing Lists**
- For more information, consider consulting the LLVM :ref:`mailing_lists`.
+ For more information, consider consulting the LLVM :ref:`mailing_lists`.
.. toctree::
:maxdepth: 2
diff --git a/docs/programming.rst b/docs/programming.rst
index 27e43014ee3c..c4eec59417e8 100644
--- a/docs/programming.rst
+++ b/docs/programming.rst
@@ -6,14 +6,22 @@ Programming Documentation
.. toctree::
:hidden:
+ Atomics
CodingStandards
CommandLine
+ CompilerWriterInfo
+ ExtendingLLVM
+ HowToSetUpLLVMStyleRTTI
* `LLVM Language Reference Manual <LangRef.html>`_
Defines the LLVM intermediate representation and the assembly form of the
different nodes.
+* :ref:`atomics`
+
+ Information about LLVM's concurrency model.
+
* `The LLVM Programmers Manual <ProgrammersManual.html>`_
Introduction to the general layout of the LLVM sourcebase, important classes
@@ -28,7 +36,12 @@ Programming Documentation
Details the LLVM coding standards and provides useful information on writing
efficient C++ code.
-* `Extending LLVM <ExtendingLLVM.html>`_
+* :doc:`HowToSetUpLLVMStyleRTTI`
+
+ How to make ``isa<>``, ``dyn_cast<>``, etc. available for clients of your
+ class hierarchy.
+
+* :ref:`extending_llvm`
Look here to see how to add instructions and intrinsics to LLVM.
@@ -38,3 +51,7 @@ Programming Documentation
(`tarball <http://llvm.org/doxygen/doxygen.tar.gz>`_)
* `ViewVC Repository Browser <http://llvm.org/viewvc/>`_
+
+* :ref:`compiler_writer_info`
+
+ A list of helpful links for compiler writers.
diff --git a/docs/subsystems.rst b/docs/subsystems.rst
index be33295a1510..80d0eed66339 100644
--- a/docs/subsystems.rst
+++ b/docs/subsystems.rst
@@ -15,6 +15,9 @@ Subsystem Documentation
LinkTimeOptimization
SegmentedStacks
TableGenFundamentals
+ DebuggingJITedCode
+ GoldPlugin
+ MarkedUpDisassembly
* `Writing an LLVM Pass <WritingAnLLVMPass.html>`_
@@ -74,11 +77,11 @@ Subsystem Documentation
This document describes the interface between LLVM intermodular optimizer
and the linker and its design
-* `The LLVM gold plugin <GoldPlugin.html>`_
+* :ref:`gold-plugin`
How to build your programs with link-time optimization on Linux.
-* `The GDB JIT interface <DebuggingJITedCode.html>`_
+* :ref:`debugging-jited-code`
How to debug JITed code with GDB.
@@ -89,3 +92,15 @@ Subsystem Documentation
* :ref:`segmented_stacks`
This document describes segmented stacks and how they are used in LLVM.
+
+* `Howto: Implementing LLVM Integrated Assembler`_
+
+  A simple guide to implementing an LLVM integrated assembler for an
+ architecture.
+
+.. _`Howto: Implementing LLVM Integrated Assembler`: http://www.embecosm.com/download/ean10.html
+
+* :ref:`marked_up_disassembly`
+
+ This document describes the optional rich disassembly output syntax.
+
diff --git a/docs/tutorial/LangImpl4.html b/docs/tutorial/LangImpl4.html
index 3f8d4a4498e7..5e9c65676c9e 100644
--- a/docs/tutorial/LangImpl4.html
+++ b/docs/tutorial/LangImpl4.html
@@ -173,7 +173,7 @@ add a set of optimizations to run. The code looks like this:</p>
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine->getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine->getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
@@ -523,7 +523,7 @@ at runtime.</p>
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
@@ -1103,7 +1103,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine-&gt;getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine-&gt;getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
@@ -1146,7 +1146,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
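The recurring change in the tutorial hunks above swaps the old TargetData header and class for DataLayout in the FunctionPassManager setup. A minimal sketch of the updated setup, assuming the tutorial's TheModule and TheExecutionEngine and the 2012-era LLVM headers shown in this patch (the helper function name below is hypothetical; the tutorials do this inline in main()):

#include "llvm/DataLayout.h"            // replaces llvm/Target/TargetData.h
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"

// Hypothetical helper for illustration only.
static void setUpPasses(llvm::Module *TheModule,
                        llvm::ExecutionEngine *TheExecutionEngine) {
  llvm::FunctionPassManager OurFPM(TheModule);
  // Register info about how the target lays out data structures
  // (DataLayout, formerly TargetData).
  OurFPM.add(new llvm::DataLayout(*TheExecutionEngine->getDataLayout()));
  // Provide basic AliasAnalysis support for GVN.
  OurFPM.add(llvm::createBasicAliasAnalysisPass());
  // ... remaining passes as in the tutorial text ...
  OurFPM.doInitialization();
}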
diff --git a/docs/tutorial/LangImpl5.html b/docs/tutorial/LangImpl5.html
index a7a37374a5d1..9a9fd8c14e09 100644
--- a/docs/tutorial/LangImpl5.html
+++ b/docs/tutorial/LangImpl5.html
@@ -901,7 +901,7 @@ clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
@@ -1723,7 +1723,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine-&gt;getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine-&gt;getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
@@ -1766,7 +1766,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl6.html b/docs/tutorial/LangImpl6.html
index 112889347815..7cd87da79229 100644
--- a/docs/tutorial/LangImpl6.html
+++ b/docs/tutorial/LangImpl6.html
@@ -840,7 +840,7 @@ library, although doing that will cause problems on Windows.</p>
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
@@ -1780,7 +1780,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine-&gt;getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine-&gt;getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
@@ -1823,7 +1823,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-07-31 09:05:57 +0200 (Tue, 31 Jul 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/LangImpl7.html b/docs/tutorial/LangImpl7.html
index f1fe4049b20f..4d5a4aa7e84a 100644
--- a/docs/tutorial/LangImpl7.html
+++ b/docs/tutorial/LangImpl7.html
@@ -524,7 +524,7 @@ good codegen once again:</p>
<pre>
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine-&gt;getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine-&gt;getDataLayout()));
<b>// Promote allocas to registers.
OurFPM.add(createPromoteMemoryToRegisterPass());</b>
// Do simple "peephole" optimizations and bit-twiddling optzns.
@@ -1008,7 +1008,7 @@ clang++ -g toy.cpp `llvm-config --cppflags --ldflags --libs core jit native` -O3
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include &lt;cstdio&gt;
@@ -2113,7 +2113,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine-&gt;getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine-&gt;getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Promote allocas to registers.
@@ -2158,7 +2158,7 @@ int main() {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-06-29 14:38:19 +0200 (Fri, 29 Jun 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl4.html b/docs/tutorial/OCamlLangImpl4.html
index e3e246954f21..d3cfd3d6736a 100644
--- a/docs/tutorial/OCamlLangImpl4.html
+++ b/docs/tutorial/OCamlLangImpl4.html
@@ -189,7 +189,7 @@ add a set of optimizations to run. The code looks like this:</p>
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combining the_fpm;
@@ -965,7 +965,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combination the_fpm;
@@ -1020,7 +1020,7 @@ extern double putchard(double X) {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl5.html b/docs/tutorial/OCamlLangImpl5.html
index 994957e5c1be..0a759ac66d67 100644
--- a/docs/tutorial/OCamlLangImpl5.html
+++ b/docs/tutorial/OCamlLangImpl5.html
@@ -1498,7 +1498,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combination the_fpm;
@@ -1554,7 +1554,7 @@ operators</a>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl6.html b/docs/tutorial/OCamlLangImpl6.html
index cef38846435c..db252406fed7 100644
--- a/docs/tutorial/OCamlLangImpl6.html
+++ b/docs/tutorial/OCamlLangImpl6.html
@@ -1506,7 +1506,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combination the_fpm;
@@ -1568,7 +1568,7 @@ SSA construction</a>
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date: 2012-07-31 09:05:57 +0200 (Tue, 31 Jul 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/tutorial/OCamlLangImpl7.html b/docs/tutorial/OCamlLangImpl7.html
index abe8913277f6..aa30555a1d40 100644
--- a/docs/tutorial/OCamlLangImpl7.html
+++ b/docs/tutorial/OCamlLangImpl7.html
@@ -545,7 +545,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
<b>(* Promote allocas to registers. *)
add_memory_to_register_promotion the_fpm;</b>
@@ -1834,7 +1834,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Promote allocas to registers. *)
add_memory_to_register_promotion the_fpm;
@@ -1898,7 +1898,7 @@ extern double printd(double X) {
<a href="mailto:sabre@nondot.org">Chris Lattner</a><br>
<a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
<a href="mailto:idadesub@users.sourceforge.net">Erick Tryzelaar</a><br>
- Last modified: $Date: 2012-05-03 00:46:36 +0200 (Thu, 03 May 2012) $
+ Last modified: $Date: 2012-10-08 18:39:34 +0200 (Mon, 08 Oct 2012) $
</address>
</body>
</html>
diff --git a/docs/userguides.rst b/docs/userguides.rst
index 26a5a8ccc237..8c1554dfce9c 100644
--- a/docs/userguides.rst
+++ b/docs/userguides.rst
@@ -7,14 +7,21 @@ User Guides
:hidden:
CMake
+ HowToBuildOnARM
CommandGuide/index
DeveloperPolicy
+ GettingStarted
GettingStartedVS
FAQ
Lexicon
Packaging
+ HowToAddABuilder
+ yaml2obj
+ HowToSubmitABug
+ SphinxQuickstartTemplate
+ Phabricator
-* `The LLVM Getting Started Guide <GettingStarted.html>`_
+* :ref:`getting_started`
Discusses how to get up and running quickly with the LLVM infrastructure.
Everything from unpacking and compilation of the distribution to execution
@@ -24,7 +31,11 @@ User Guides
An addendum to the main Getting Started guide for those using the `CMake
build system <http://www.cmake.org>`_.
-
+
+* :ref:`how_to_build_on_arm`
+
+ Notes on building and testing LLVM/Clang on ARM.
+
* `Getting Started with the LLVM System using Microsoft Visual Studio
<GettingStartedVS.html>`_
@@ -57,10 +68,14 @@ User Guides
This describes new features, known bugs, and other limitations.
-* `How to Submit A Bug Report <HowToSubmitABug.html>`_
+* :ref:`how-to-submit-a-bug-report`
Instructions for properly submitting information about any bugs you run into
in the LLVM system.
+* :doc:`SphinxQuickstartTemplate`
+
+ A template + tutorial for writing new Sphinx documentation. It is meant
+ to be read in source form.
* `LLVM Testing Infrastructure Guide <TestingGuide.html>`_
@@ -78,7 +93,7 @@ User Guides
Definition of acronyms, terms and concepts used in LLVM.
-* `How To Add Your Build Configuration To LLVM Buildbot Infrastructure <HowToAddABuilder.html>`_
+* :ref:`how_to_add_a_builder`
Instructions for adding new builder to LLVM buildbot master.
diff --git a/docs/yaml2obj.rst b/docs/yaml2obj.rst
index cb59162e5aca..d051e7e22c00 100644
--- a/docs/yaml2obj.rst
+++ b/docs/yaml2obj.rst
@@ -6,9 +6,9 @@ yaml2obj
yaml2obj takes a YAML description of an object file and converts it to a binary
file.
- $ yaml2py input-file
+ $ yaml2obj input-file
-.. program:: yaml2py
+.. program:: yaml2obj
Outputs the binary to stdout.
diff --git a/examples/ExceptionDemo/ExceptionDemo.cpp b/examples/ExceptionDemo/ExceptionDemo.cpp
index 6dbd6626de94..215cb4d3714f 100644
--- a/examples/ExceptionDemo/ExceptionDemo.cpp
+++ b/examples/ExceptionDemo/ExceptionDemo.cpp
@@ -10,13 +10,13 @@
// Demo program which implements an example LLVM exception implementation, and
// shows several test cases including the handling of foreign exceptions.
// It is run with type info types arguments to throw. A test will
-// be run for each given type info type. While type info types with the value
+// be run for each given type info type. While type info types with the value
// of -1 will trigger a foreign C++ exception to be thrown; type info types
-// <= 6 and >= 1 will cause the associated generated exceptions to be thrown
+// <= 6 and >= 1 will cause the associated generated exceptions to be thrown
// and caught by generated test functions; and type info types > 6
// will result in exceptions which pass through to the test harness. All other
// type info types are not supported and could cause a crash. In all cases,
-// the "finally" blocks of every generated test functions will executed
+// the "finally" blocks of every generated test functions will executed
// regardless of whether or not that test function ignores or catches the
// thrown exception.
//
@@ -25,25 +25,25 @@
// ExceptionDemo
//
// causes a usage to be printed to stderr
-//
+//
// ExceptionDemo 2 3 7 -1
//
// results in the following cases:
-// - Value 2 causes an exception with a type info type of 2 to be
+// - Value 2 causes an exception with a type info type of 2 to be
// thrown and caught by an inner generated test function.
-// - Value 3 causes an exception with a type info type of 3 to be
+// - Value 3 causes an exception with a type info type of 3 to be
// thrown and caught by an outer generated test function.
-// - Value 7 causes an exception with a type info type of 7 to be
+// - Value 7 causes an exception with a type info type of 7 to be
// thrown and NOT be caught by any generated function.
// - Value -1 causes a foreign C++ exception to be thrown and not be
// caught by any generated function
//
// Cases -1 and 7 are caught by a C++ test harness where the validity of
-// of a C++ catch(...) clause catching a generated exception with a
-// type info type of 7 is explained by: example in rules 1.6.4 in
+// a C++ catch(...) clause catching a generated exception with a
+// type info type of 7 is explained by the example in rules 1.6.4 in
// http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22)
//
-// This code uses code from the llvm compiler-rt project and the llvm
+// This code uses code from the llvm compiler-rt project and the llvm
// Kaleidoscope project.
//
//===----------------------------------------------------------------------===//
@@ -57,18 +57,18 @@
#include "llvm/PassManager.h"
#include "llvm/Intrinsics.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/TargetSelect.h"
-// FIXME: Although all systems tested with (Linux, OS X), do not need this
-// header file included. A user on ubuntu reported, undefined symbols
+// FIXME: Although none of the systems tested (Linux, OS X) require this
+// header to be included, a user on Ubuntu reported undefined symbols
// for stderr, and fprintf, and the addition of this include fixed the
-// issue for them. Given that LLVM's best practices include the goal
-// of reducing the number of redundant header files included, the
-// correct solution would be to find out why these symbols are not
+// issue for them. Given that LLVM's best practices include the goal
+// of reducing the number of redundant header files included, the
+// correct solution would be to find out why these symbols are not
// defined for the system in question, and fix the issue by finding out
// which LLVM header file, if any, would include these symbols.
#include <cstdio>
@@ -81,11 +81,11 @@
#define USE_GLOBAL_STR_CONSTS true
#endif
-// System C++ ABI unwind types from:
+// System C++ ABI unwind types from:
// http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22)
extern "C" {
-
+
typedef enum {
_URC_NO_REASON = 0,
_URC_FOREIGN_EXCEPTION_CAUGHT = 1,
@@ -97,7 +97,7 @@ extern "C" {
_URC_INSTALL_CONTEXT = 7,
_URC_CONTINUE_UNWIND = 8
} _Unwind_Reason_Code;
-
+
typedef enum {
_UA_SEARCH_PHASE = 1,
_UA_CLEANUP_PHASE = 2,
@@ -105,34 +105,34 @@ extern "C" {
_UA_FORCE_UNWIND = 8,
_UA_END_OF_STACK = 16
} _Unwind_Action;
-
+
struct _Unwind_Exception;
-
+
typedef void (*_Unwind_Exception_Cleanup_Fn) (_Unwind_Reason_Code,
struct _Unwind_Exception *);
-
+
struct _Unwind_Exception {
uint64_t exception_class;
_Unwind_Exception_Cleanup_Fn exception_cleanup;
-
- uintptr_t private_1;
- uintptr_t private_2;
-
+
+ uintptr_t private_1;
+ uintptr_t private_2;
+
// @@@ The IA-64 ABI says that this structure must be double-word aligned.
- // Taking that literally does not make much sense generically. Instead
+ // Taking that literally does not make much sense generically. Instead
// we provide the maximum alignment required by any type for the machine.
} __attribute__((__aligned__));
-
+
struct _Unwind_Context;
typedef struct _Unwind_Context *_Unwind_Context_t;
-
+
extern const uint8_t *_Unwind_GetLanguageSpecificData (_Unwind_Context_t c);
extern uintptr_t _Unwind_GetGR (_Unwind_Context_t c, int i);
extern void _Unwind_SetGR (_Unwind_Context_t c, int i, uintptr_t n);
extern void _Unwind_SetIP (_Unwind_Context_t, uintptr_t new_value);
extern uintptr_t _Unwind_GetIP (_Unwind_Context_t context);
extern uintptr_t _Unwind_GetRegionStart (_Unwind_Context_t context);
-
+
} // extern "C"
//
@@ -148,13 +148,13 @@ struct OurExceptionType_t {
/// This is our Exception class which relies on a negative offset to calculate
/// pointers to its instances from pointers to its unwindException member.
-///
+///
/// Note: The above unwind.h defines struct _Unwind_Exception to be aligned
/// on a double word boundary. This is necessary to match the standard:
/// http://refspecs.freestandards.org/abi-eh-1.21.html
struct OurBaseException_t {
struct OurExceptionType_t type;
-
+
// Note: This is properly aligned in unwind.h
struct _Unwind_Exception unwindException;
};
@@ -165,7 +165,7 @@ typedef struct OurBaseException_t OurException;
typedef struct _Unwind_Exception OurUnwindException;
//
-// Various globals used to support typeinfo and generatted exceptions in
+// Various globals used to support typeinfo and generated exceptions in
// general
//
@@ -173,7 +173,7 @@ static std::map<std::string, llvm::Value*> namedValues;
int64_t ourBaseFromUnwindOffset;
-const unsigned char ourBaseExcpClassChars[] =
+const unsigned char ourBaseExcpClassChars[] =
{'o', 'b', 'j', '\0', 'b', 'a', 's', '\0'};
@@ -203,7 +203,7 @@ typedef std::vector<llvm::Type*> ArgTypes;
/// @param retType function return type
/// @param theArgTypes function's ordered argument types
/// @param theArgNames function's ordered arguments needed if use of this
-/// function corresponds to a function definition. Use empty
+/// function corresponds to a function definition. Use empty
/// aggregate for function declarations.
/// @param functName function name
/// @param linkage function linkage
@@ -224,17 +224,17 @@ llvm::Function *createFunction(llvm::Module &module,
llvm::Function::Create(functType, linkage, functName, &module);
if (!ret || declarationOnly)
return(ret);
-
+
namedValues.clear();
- unsigned i = 0;
+ unsigned i = 0;
for (llvm::Function::arg_iterator argIndex = ret->arg_begin();
i != theArgNames.size();
++argIndex, ++i) {
-
+
argIndex->setName(theArgNames[i]);
namedValues[theArgNames[i]] = argIndex;
}
-
+
return(ret);
}
@@ -250,13 +250,13 @@ static llvm::AllocaInst *createEntryBlockAlloca(llvm::Function &function,
const std::string &varName,
llvm::Type *type,
llvm::Constant *initWith = 0) {
- llvm::BasicBlock &block = function.getEntryBlock();
+ llvm::BasicBlock &block = function.getEntryBlock();
llvm::IRBuilder<> tmp(&block, block.begin());
llvm::AllocaInst *ret = tmp.CreateAlloca(type, 0, varName.c_str());
-
- if (initWith)
+
+ if (initWith)
tmp.CreateStore(initWith, ret);
-
+
return(ret);
}
@@ -266,7 +266,7 @@ static llvm::AllocaInst *createEntryBlockAlloca(llvm::Function &function,
//
//
-// Runtime C Library functions
+// Runtime C Library functions
//
// Note: using an extern "C" block so that static functions can be used
@@ -275,7 +275,7 @@ extern "C" {
// Note: Better ways to decide on bit width
//
/// Prints a 32 bit number, according to the format, to stderr.
-/// @param intToPrint integer to print
+/// @param intToPrint integer to print
/// @param format printf like format to use when printing
void print32Int(int intToPrint, const char *format) {
if (format) {
@@ -292,7 +292,7 @@ void print32Int(int intToPrint, const char *format) {
// Note: Better ways to decide on bit width
//
/// Prints a 64 bit number, according to the format, to stderr.
-/// @param intToPrint integer to print
+/// @param intToPrint integer to print
/// @param format printf like format to use when printing
void print64Int(long int intToPrint, const char *format) {
if (format) {
@@ -327,19 +327,19 @@ void deleteOurException(OurUnwindException *expToDelete) {
fprintf(stderr,
"deleteOurException(...).\n");
#endif
-
+
if (expToDelete &&
(expToDelete->exception_class == ourBaseExceptionClass)) {
-
+
free(((char*) expToDelete) + ourBaseFromUnwindOffset);
}
}
-/// This function is the struct _Unwind_Exception API mandated delete function
-/// used by foreign exception handlers when deleting our exception
+/// This function is the struct _Unwind_Exception API mandated delete function
+/// used by foreign exception handlers when deleting our exception
/// (OurException), instances.
-/// @param reason @link http://refspecs.freestandards.org/abi-eh-1.21.html
+/// @param reason @link http://refspecs.freestandards.org/abi-eh-1.21.html
/// @unlink
/// @param expToDelete exception instance to delete
void deleteFromUnwindOurException(_Unwind_Reason_Code reason,
@@ -348,7 +348,7 @@ void deleteFromUnwindOurException(_Unwind_Reason_Code reason,
fprintf(stderr,
"deleteFromUnwindOurException(...).\n");
#endif
-
+
deleteOurException(expToDelete);
}
@@ -362,13 +362,13 @@ OurUnwindException *createOurException(int type) {
(ret->type).type = type;
(ret->unwindException).exception_class = ourBaseExceptionClass;
(ret->unwindException).exception_cleanup = deleteFromUnwindOurException;
-
+
return(&(ret->unwindException));
}
-/// Read a uleb128 encoded value and advance pointer
-/// See Variable Length Data in:
+/// Read a uleb128 encoded value and advance pointer
+/// See Variable Length Data in:
/// @link http://dwarfstd.org/Dwarf3.pdf @unlink
/// @param data reference variable holding memory pointer to decode from
/// @returns decoded value
@@ -377,22 +377,22 @@ static uintptr_t readULEB128(const uint8_t **data) {
uintptr_t shift = 0;
unsigned char byte;
const uint8_t *p = *data;
-
+
do {
byte = *p++;
result |= (byte & 0x7f) << shift;
shift += 7;
- }
+ }
while (byte & 0x80);
-
+
*data = p;
-
+
return result;
}
-/// Read a sleb128 encoded value and advance pointer
-/// See Variable Length Data in:
+/// Read a sleb128 encoded value and advance pointer
+/// See Variable Length Data in:
/// @link http://dwarfstd.org/Dwarf3.pdf @unlink
/// @param data reference variable holding memory pointer to decode from
/// @returns decoded value
@@ -401,26 +401,26 @@ static uintptr_t readSLEB128(const uint8_t **data) {
uintptr_t shift = 0;
unsigned char byte;
const uint8_t *p = *data;
-
+
do {
byte = *p++;
result |= (byte & 0x7f) << shift;
shift += 7;
- }
+ }
while (byte & 0x80);
-
+
*data = p;
-
+
if ((byte & 0x40) && (shift < (sizeof(result) << 3))) {
result |= (~0 << shift);
}
-
+
return result;
}
-/// Read a pointer encoded value and advance pointer
-/// See Variable Length Data in:
+/// Read a pointer encoded value and advance pointer
+/// See Variable Length Data in:
/// @link http://dwarfstd.org/Dwarf3.pdf @unlink
/// @param data reference variable holding memory pointer to decode from
/// @param encoding dwarf encoding type
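The readULEB128 and readSLEB128 helpers above decode DWARF's variable-length integers: seven payload bits per byte, with the high bit set on every byte except the last. A small self-contained sketch of the unsigned form, using the DWARF specification's standard sample encoding rather than anything produced by this demo (the decodeULEB128 name is illustrative; it mirrors readULEB128 above):

#include <cstdint>
#include <cstdio>

// Decode an unsigned LEB128 value and advance *data past the encoded bytes,
// exactly as readULEB128 above does.
static uintptr_t decodeULEB128(const uint8_t **data) {
  uintptr_t result = 0;
  uintptr_t shift = 0;
  unsigned char byte;
  const uint8_t *p = *data;
  do {
    byte = *p++;
    result |= (uintptr_t)(byte & 0x7f) << shift;
    shift += 7;
  } while (byte & 0x80);
  *data = p;
  return result;
}

int main() {
  // 0xE5 0x8E 0x26 is the DWARF specification's example encoding of 624485.
  const uint8_t encoded[] = { 0xE5, 0x8E, 0x26 };
  const uint8_t *p = encoded;
  printf("%lu\n", (unsigned long)decodeULEB128(&p));  // prints 624485
  return 0;
}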
@@ -428,11 +428,11 @@ static uintptr_t readSLEB128(const uint8_t **data) {
static uintptr_t readEncodedPointer(const uint8_t **data, uint8_t encoding) {
uintptr_t result = 0;
const uint8_t *p = *data;
-
- if (encoding == llvm::dwarf::DW_EH_PE_omit)
+
+ if (encoding == llvm::dwarf::DW_EH_PE_omit)
return(result);
-
- // first get value
+
+ // first get value
switch (encoding & 0x0F) {
case llvm::dwarf::DW_EH_PE_absptr:
result = *((uintptr_t*)p);
@@ -470,15 +470,15 @@ static uintptr_t readEncodedPointer(const uint8_t **data, uint8_t encoding) {
p += sizeof(int64_t);
break;
default:
- // not supported
+ // not supported
abort();
break;
}
-
- // then add relative offset
+
+ // then add relative offset
switch (encoding & 0x70) {
case llvm::dwarf::DW_EH_PE_absptr:
- // do nothing
+ // do nothing
break;
case llvm::dwarf::DW_EH_PE_pcrel:
result += (uintptr_t)(*data);
@@ -488,34 +488,34 @@ static uintptr_t readEncodedPointer(const uint8_t **data, uint8_t encoding) {
case llvm::dwarf::DW_EH_PE_funcrel:
case llvm::dwarf::DW_EH_PE_aligned:
default:
- // not supported
+ // not supported
abort();
break;
}
-
- // then apply indirection
+
+ // then apply indirection
if (encoding & llvm::dwarf::DW_EH_PE_indirect) {
result = *((uintptr_t*)result);
}
-
+
*data = p;
-
+
return result;
}
-/// Deals with Dwarf actions matching our type infos
-/// (OurExceptionType_t instances). Returns whether or not a dwarf emitted
-/// action matches the supplied exception type. If such a match succeeds,
-/// the resultAction argument will be set with > 0 index value. Only
-/// corresponding llvm.eh.selector type info arguments, cleanup arguments
+/// Deals with Dwarf actions matching our type infos
+/// (OurExceptionType_t instances). Returns whether or not a dwarf emitted
+/// action matches the supplied exception type. If such a match succeeds,
+/// the resultAction argument will be set with > 0 index value. Only
+/// corresponding llvm.eh.selector type info arguments, cleanup arguments
/// are supported. Filters are not supported.
-/// See Variable Length Data in:
+/// See Variable Length Data in:
/// @link http://dwarfstd.org/Dwarf3.pdf @unlink
/// Also see @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink
/// @param resultAction reference variable which will be set with result
/// @param classInfo our array of type info pointers (to globals)
-/// @param actionEntry index into above type info array or 0 (clean up).
+/// @param actionEntry index into above type info array or 0 (clean up).
/// We do not support filters.
/// @param exceptionClass exception class (_Unwind_Exception::exception_class)
/// of thrown exception.
@@ -523,22 +523,22 @@ static uintptr_t readEncodedPointer(const uint8_t **data, uint8_t encoding) {
/// @returns whether or not a type info was found. False is returned if only
/// a cleanup was found
static bool handleActionValue(int64_t *resultAction,
- struct OurExceptionType_t **classInfo,
- uintptr_t actionEntry,
- uint64_t exceptionClass,
+ struct OurExceptionType_t **classInfo,
+ uintptr_t actionEntry,
+ uint64_t exceptionClass,
struct _Unwind_Exception *exceptionObject) {
bool ret = false;
-
- if (!resultAction ||
- !exceptionObject ||
+
+ if (!resultAction ||
+ !exceptionObject ||
(exceptionClass != ourBaseExceptionClass))
return(ret);
-
+
struct OurBaseException_t *excp = (struct OurBaseException_t*)
(((char*) exceptionObject) + ourBaseFromUnwindOffset);
struct OurExceptionType_t *excpType = &(excp->type);
int type = excpType->type;
-
+
#ifdef DEBUG
fprintf(stderr,
"handleActionValue(...): exceptionObject = <%p>, "
@@ -546,12 +546,12 @@ static bool handleActionValue(int64_t *resultAction,
exceptionObject,
excp);
#endif
-
+
const uint8_t *actionPos = (uint8_t*) actionEntry,
*tempActionPos;
int64_t typeOffset = 0,
actionOffset;
-
+
for (int i = 0; true; ++i) {
// Each emitted dwarf action corresponds to a 2 tuple of
// type info address offset, and action offset to the next
@@ -559,7 +559,7 @@ static bool handleActionValue(int64_t *resultAction,
typeOffset = readSLEB128(&actionPos);
tempActionPos = actionPos;
actionOffset = readSLEB128(&tempActionPos);
-
+
#ifdef DEBUG
fprintf(stderr,
"handleActionValue(...):typeOffset: <%lld>, "
@@ -567,9 +567,9 @@ static bool handleActionValue(int64_t *resultAction,
typeOffset,
actionOffset);
#endif
- assert((typeOffset >= 0) &&
+ assert((typeOffset >= 0) &&
"handleActionValue(...):filters are not supported.");
-
+
// Note: A typeOffset == 0 implies that a cleanup llvm.eh.selector
// argument has been matched.
if ((typeOffset > 0) &&
@@ -583,17 +583,17 @@ static bool handleActionValue(int64_t *resultAction,
ret = true;
break;
}
-
+
#ifdef DEBUG
fprintf(stderr,
"handleActionValue(...):actionValue not found.\n");
#endif
if (!actionOffset)
break;
-
+
actionPos += actionOffset;
}
-
+
return(ret);
}
@@ -602,52 +602,52 @@ static bool handleActionValue(int64_t *resultAction,
/// See @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink
/// @param version unsupported (ignored), unwind version
/// @param lsda language specific data area
-/// @param _Unwind_Action actions minimally supported unwind stage
+/// @param _Unwind_Action actions minimally supported unwind stage
/// (forced specifically not supported)
/// @param exceptionClass exception class (_Unwind_Exception::exception_class)
/// of thrown exception.
/// @param exceptionObject thrown _Unwind_Exception instance.
/// @param context unwind system context
-/// @returns minimally supported unwinding control indicator
-static _Unwind_Reason_Code handleLsda(int version,
+/// @returns minimally supported unwinding control indicator
+static _Unwind_Reason_Code handleLsda(int version,
const uint8_t *lsda,
_Unwind_Action actions,
- uint64_t exceptionClass,
+ uint64_t exceptionClass,
struct _Unwind_Exception *exceptionObject,
_Unwind_Context_t context) {
_Unwind_Reason_Code ret = _URC_CONTINUE_UNWIND;
-
+
if (!lsda)
return(ret);
-
+
#ifdef DEBUG
- fprintf(stderr,
+ fprintf(stderr,
"handleLsda(...):lsda is non-zero.\n");
#endif
-
+
// Get the current instruction pointer and offset it before next
// instruction in the current frame which threw the exception.
uintptr_t pc = _Unwind_GetIP(context)-1;
-
- // Get beginning current frame's code (as defined by the
+
+  // Get the beginning of the current frame's code (as defined by the
// emitted dwarf code)
uintptr_t funcStart = _Unwind_GetRegionStart(context);
uintptr_t pcOffset = pc - funcStart;
struct OurExceptionType_t **classInfo = NULL;
-
+
// Note: See JITDwarfEmitter::EmitExceptionTable(...) for corresponding
// dwarf emission
-
+
// Parse LSDA header.
uint8_t lpStartEncoding = *lsda++;
-
+
if (lpStartEncoding != llvm::dwarf::DW_EH_PE_omit) {
- readEncodedPointer(&lsda, lpStartEncoding);
+ readEncodedPointer(&lsda, lpStartEncoding);
}
-
+
uint8_t ttypeEncoding = *lsda++;
uintptr_t classInfoOffset;
-
+
if (ttypeEncoding != llvm::dwarf::DW_EH_PE_omit) {
// Calculate type info locations in emitted dwarf code which
// were flagged by type info arguments to llvm.eh.selector
@@ -655,47 +655,47 @@ static _Unwind_Reason_Code handleLsda(int version,
classInfoOffset = readULEB128(&lsda);
classInfo = (struct OurExceptionType_t**) (lsda + classInfoOffset);
}
-
- // Walk call-site table looking for range that
- // includes current PC.
-
+
+ // Walk call-site table looking for range that
+ // includes current PC.
+
uint8_t callSiteEncoding = *lsda++;
uint32_t callSiteTableLength = readULEB128(&lsda);
const uint8_t *callSiteTableStart = lsda;
- const uint8_t *callSiteTableEnd = callSiteTableStart +
+ const uint8_t *callSiteTableEnd = callSiteTableStart +
callSiteTableLength;
const uint8_t *actionTableStart = callSiteTableEnd;
const uint8_t *callSitePtr = callSiteTableStart;
-
+
bool foreignException = false;
-
+
while (callSitePtr < callSiteTableEnd) {
- uintptr_t start = readEncodedPointer(&callSitePtr,
+ uintptr_t start = readEncodedPointer(&callSitePtr,
callSiteEncoding);
- uintptr_t length = readEncodedPointer(&callSitePtr,
+ uintptr_t length = readEncodedPointer(&callSitePtr,
callSiteEncoding);
- uintptr_t landingPad = readEncodedPointer(&callSitePtr,
+ uintptr_t landingPad = readEncodedPointer(&callSitePtr,
callSiteEncoding);
-
+
// Note: Action value
uintptr_t actionEntry = readULEB128(&callSitePtr);
-
+
if (exceptionClass != ourBaseExceptionClass) {
// We have been notified of a foreign exception being thrown,
// and we therefore need to execute cleanup landing pads
actionEntry = 0;
foreignException = true;
}
-
+
if (landingPad == 0) {
#ifdef DEBUG
fprintf(stderr,
"handleLsda(...): No landing pad found.\n");
#endif
-
+
continue; // no landing pad for this entry
}
-
+
if (actionEntry) {
actionEntry += ((uintptr_t) actionTableStart) - 1;
}
@@ -705,55 +705,55 @@ static _Unwind_Reason_Code handleLsda(int version,
"handleLsda(...):No action table found.\n");
#endif
}
-
+
bool exceptionMatched = false;
-
+
if ((start <= pcOffset) && (pcOffset < (start + length))) {
#ifdef DEBUG
fprintf(stderr,
"handleLsda(...): Landing pad found.\n");
#endif
int64_t actionValue = 0;
-
+
if (actionEntry) {
exceptionMatched = handleActionValue(&actionValue,
- classInfo,
- actionEntry,
- exceptionClass,
+ classInfo,
+ actionEntry,
+ exceptionClass,
exceptionObject);
}
-
+
if (!(actions & _UA_SEARCH_PHASE)) {
#ifdef DEBUG
fprintf(stderr,
"handleLsda(...): installed landing pad "
"context.\n");
#endif
-
+
// Found landing pad for the PC.
- // Set Instruction Pointer to so we re-enter function
- // at landing pad. The landing pad is created by the
+      // Set the Instruction Pointer so we re-enter the function
+      // at the landing pad. The landing pad is created by the
// compiler to take two parameters in registers.
- _Unwind_SetGR(context,
- __builtin_eh_return_data_regno(0),
+ _Unwind_SetGR(context,
+ __builtin_eh_return_data_regno(0),
(uintptr_t)exceptionObject);
-
+
// Note: this virtual register directly corresponds
// to the return of the llvm.eh.selector intrinsic
if (!actionEntry || !exceptionMatched) {
// We indicate cleanup only
- _Unwind_SetGR(context,
- __builtin_eh_return_data_regno(1),
+ _Unwind_SetGR(context,
+ __builtin_eh_return_data_regno(1),
0);
}
else {
// Matched type info index of llvm.eh.selector intrinsic
// passed here.
- _Unwind_SetGR(context,
- __builtin_eh_return_data_regno(1),
+ _Unwind_SetGR(context,
+ __builtin_eh_return_data_regno(1),
actionValue);
}
-
+
// To execute landing pad set here
_Unwind_SetIP(context, funcStart + landingPad);
ret = _URC_INSTALL_CONTEXT;
@@ -767,19 +767,19 @@ static _Unwind_Reason_Code handleLsda(int version,
}
else {
// Note: Only non-clean up handlers are marked as
- // found. Otherwise the clean up handlers will be
- // re-found and executed during the clean up
+ // found. Otherwise the clean up handlers will be
+ // re-found and executed during the clean up
// phase.
#ifdef DEBUG
fprintf(stderr,
"handleLsda(...): cleanup handler found.\n");
#endif
}
-
+
break;
}
}
-
+
return(ret);
}
@@ -788,23 +788,23 @@ static _Unwind_Reason_Code handleLsda(int version,
/// dwarf unwind info block. Again see: JITDwarfEmitter.cpp.
/// See @link http://refspecs.freestandards.org/abi-eh-1.21.html @unlink
/// @param version unsupported (ignored), unwind version
-/// @param _Unwind_Action actions minimally supported unwind stage
+/// @param _Unwind_Action actions minimally supported unwind stage
/// (forced specifically not supported)
/// @param exceptionClass exception class (_Unwind_Exception::exception_class)
/// of thrown exception.
/// @param exceptionObject thrown _Unwind_Exception instance.
/// @param context unwind system context
-/// @returns minimally supported unwinding control indicator
-_Unwind_Reason_Code ourPersonality(int version,
+/// @returns minimally supported unwinding control indicator
+_Unwind_Reason_Code ourPersonality(int version,
_Unwind_Action actions,
- uint64_t exceptionClass,
+ uint64_t exceptionClass,
struct _Unwind_Exception *exceptionObject,
_Unwind_Context_t context) {
#ifdef DEBUG
- fprintf(stderr,
+ fprintf(stderr,
"We are in ourPersonality(...):actions is <%d>.\n",
actions);
-
+
if (actions & _UA_SEARCH_PHASE) {
fprintf(stderr, "ourPersonality(...):In search phase.\n");
}
@@ -812,15 +812,15 @@ _Unwind_Reason_Code ourPersonality(int version,
fprintf(stderr, "ourPersonality(...):In non-search phase.\n");
}
#endif
-
+
const uint8_t *lsda = _Unwind_GetLanguageSpecificData(context);
-
+
#ifdef DEBUG
- fprintf(stderr,
+ fprintf(stderr,
"ourPersonality(...):lsda = <%p>.\n",
lsda);
#endif
-
+
// The real work of the personality function is captured here
return(handleLsda(version,
lsda,
@@ -841,12 +841,12 @@ _Unwind_Reason_Code ourPersonality(int version,
uint64_t genClass(const unsigned char classChars[], size_t classCharsSize)
{
uint64_t ret = classChars[0];
-
+
for (unsigned i = 1; i < classCharsSize; ++i) {
ret <<= 8;
ret += classChars[i];
}
-
+
return(ret);
}
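genClass above packs the eight ourBaseExcpClassChars bytes into the 64-bit exception_class value, first character in the most significant byte. A standalone sketch of the same packing, with the expected result worked out in a comment (the packClass name is illustrative; the loop mirrors genClass above):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Same packing as genClass above: shift left one byte per character and add.
static uint64_t packClass(const unsigned char chars[], size_t n) {
  uint64_t ret = chars[0];
  for (size_t i = 1; i < n; ++i) {
    ret <<= 8;
    ret += chars[i];
  }
  return ret;
}

int main() {
  const unsigned char chars[] = {'o', 'b', 'j', '\0', 'b', 'a', 's', '\0'};
  // 'o'=0x6F 'b'=0x62 'j'=0x6A 0x00 'b'=0x62 'a'=0x61 's'=0x73 0x00,
  // so the packed class value is 0x6F626A0062617300.
  printf("0x%016llX\n", (unsigned long long)packClass(chars, 8));
  return 0;
}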
@@ -865,37 +865,37 @@ uint64_t genClass(const unsigned char classChars[], size_t classCharsSize)
/// @param module code for module instance
/// @param builder builder instance
/// @param toPrint string to print
-/// @param useGlobal A value of true (default) indicates a GlobalValue is
-/// generated, and is used to hold the constant string. A value of
-/// false indicates that the constant string will be stored on the
+/// @param useGlobal A value of true (default) indicates a GlobalValue is
+/// generated, and is used to hold the constant string. A value of
+/// false indicates that the constant string will be stored on the
/// stack.
-void generateStringPrint(llvm::LLVMContext &context,
+void generateStringPrint(llvm::LLVMContext &context,
llvm::Module &module,
- llvm::IRBuilder<> &builder,
+ llvm::IRBuilder<> &builder,
std::string toPrint,
bool useGlobal = true) {
llvm::Function *printFunct = module.getFunction("printStr");
-
+
llvm::Value *stringVar;
- llvm::Constant *stringConstant =
+ llvm::Constant *stringConstant =
llvm::ConstantDataArray::getString(context, toPrint);
-
+
if (useGlobal) {
// Note: Does not work without allocation
- stringVar =
- new llvm::GlobalVariable(module,
+ stringVar =
+ new llvm::GlobalVariable(module,
stringConstant->getType(),
- true,
- llvm::GlobalValue::LinkerPrivateLinkage,
- stringConstant,
+ true,
+ llvm::GlobalValue::LinkerPrivateLinkage,
+ stringConstant,
"");
}
else {
stringVar = builder.CreateAlloca(stringConstant->getType());
builder.CreateStore(stringConstant, stringVar);
}
-
- llvm::Value *cast = builder.CreatePointerCast(stringVar,
+
+ llvm::Value *cast = builder.CreatePointerCast(stringVar,
builder.getInt8PtrTy());
builder.CreateCall(printFunct, cast);
}
@@ -909,49 +909,49 @@ void generateStringPrint(llvm::LLVMContext &context,
/// @param printFunct function used to "print" integer
/// @param toPrint string to print
/// @param format printf like formating string for print
-/// @param useGlobal A value of true (default) indicates a GlobalValue is
-/// generated, and is used to hold the constant string. A value of
-/// false indicates that the constant string will be stored on the
+/// @param useGlobal A value of true (default) indicates a GlobalValue is
+/// generated, and is used to hold the constant string. A value of
+/// false indicates that the constant string will be stored on the
/// stack.
-void generateIntegerPrint(llvm::LLVMContext &context,
+void generateIntegerPrint(llvm::LLVMContext &context,
llvm::Module &module,
- llvm::IRBuilder<> &builder,
+ llvm::IRBuilder<> &builder,
llvm::Function &printFunct,
llvm::Value &toPrint,
- std::string format,
+ std::string format,
bool useGlobal = true) {
llvm::Constant *stringConstant =
llvm::ConstantDataArray::getString(context, format);
llvm::Value *stringVar;
-
+
if (useGlobal) {
// Note: Does not seem to work without allocation
- stringVar =
- new llvm::GlobalVariable(module,
+ stringVar =
+ new llvm::GlobalVariable(module,
stringConstant->getType(),
- true,
- llvm::GlobalValue::LinkerPrivateLinkage,
- stringConstant,
+ true,
+ llvm::GlobalValue::LinkerPrivateLinkage,
+ stringConstant,
"");
}
else {
stringVar = builder.CreateAlloca(stringConstant->getType());
builder.CreateStore(stringConstant, stringVar);
}
-
- llvm::Value *cast = builder.CreateBitCast(stringVar,
+
+ llvm::Value *cast = builder.CreateBitCast(stringVar,
builder.getInt8PtrTy());
builder.CreateCall2(&printFunct, &toPrint, cast);
}
-/// Generates code to handle finally block type semantics: always runs
-/// regardless of whether a thrown exception is passing through or the
-/// parent function is simply exiting. In addition to printing some state
+/// to stderr, this code resumes the exception handling (runs the
+/// unwind resume block) if the exception has not been previously caught
-/// by a catch clause, and will otherwise execute the end block (terminator
-/// block). In addition this function creates the corresponding function's
+/// Generates code to handle finally block type semantics: always runs
+/// regardless of whether a thrown exception is passing through or the
+/// parent function is simply exiting. In addition to printing some state
+/// to stderr, this code will resume the exception handling--runs the
+/// unwind resume block, if the exception has not been previously caught
+/// by a catch clause, and will otherwise execute the end block (terminator
+/// block). In addition this function creates the corresponding function's
/// stack storage for the exception pointer and catch flag status.
/// @param context llvm context
/// @param module code for module instance
@@ -965,9 +965,9 @@ void generateIntegerPrint(llvm::LLVMContext &context,
/// @param exceptionStorage reference to exception pointer storage
/// @param caughtResultStorage reference to landingpad result storage
/// @returns newly created block
-static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context,
- llvm::Module &module,
- llvm::IRBuilder<> &builder,
+static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context,
+ llvm::Module &module,
+ llvm::IRBuilder<> &builder,
llvm::Function &toAddTo,
std::string &blockName,
std::string &functionId,
@@ -976,21 +976,21 @@ static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context,
llvm::Value **exceptionCaughtFlag,
llvm::Value **exceptionStorage,
llvm::Value **caughtResultStorage) {
- assert(exceptionCaughtFlag &&
+ assert(exceptionCaughtFlag &&
"ExceptionDemo::createFinallyBlock(...):exceptionCaughtFlag "
"is NULL");
- assert(exceptionStorage &&
+ assert(exceptionStorage &&
"ExceptionDemo::createFinallyBlock(...):exceptionStorage "
"is NULL");
- assert(caughtResultStorage &&
+ assert(caughtResultStorage &&
"ExceptionDemo::createFinallyBlock(...):caughtResultStorage "
"is NULL");
-
+
*exceptionCaughtFlag = createEntryBlockAlloca(toAddTo,
"exceptionCaught",
ourExceptionNotThrownState->getType(),
ourExceptionNotThrownState);
-
+
llvm::PointerType *exceptionStorageType = builder.getInt8PtrTy();
*exceptionStorage = createEntryBlockAlloca(toAddTo,
"exceptionStorage",
@@ -1002,35 +1002,35 @@ static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context,
ourCaughtResultType,
llvm::ConstantAggregateZero::get(
ourCaughtResultType));
-
+
llvm::BasicBlock *ret = llvm::BasicBlock::Create(context,
blockName,
&toAddTo);
-
+
builder.SetInsertPoint(ret);
-
+
std::ostringstream bufferToPrint;
bufferToPrint << "Gen: Executing finally block "
<< blockName << " in " << functionId << "\n";
- generateStringPrint(context,
- module,
- builder,
+ generateStringPrint(context,
+ module,
+ builder,
bufferToPrint.str(),
USE_GLOBAL_STR_CONSTS);
-
+
llvm::SwitchInst *theSwitch = builder.CreateSwitch(builder.CreateLoad(
- *exceptionCaughtFlag),
+ *exceptionCaughtFlag),
&terminatorBlock,
2);
theSwitch->addCase(ourExceptionCaughtState, &terminatorBlock);
theSwitch->addCase(ourExceptionThrownState, &unwindResumeBlock);
-
+
return(ret);
}
/// Generates catch block semantics which print a string to indicate type of
-/// catch executed, sets an exception caught flag, and executes passed in
+/// catch executed, sets an exception caught flag, and executes passed in
/// end block (terminator block).
/// @param context llvm context
/// @param module code for module instance
@@ -1041,52 +1041,52 @@ static llvm::BasicBlock *createFinallyBlock(llvm::LLVMContext &context,
/// @param terminatorBlock terminator "end" block
/// @param exceptionCaughtFlag exception caught/thrown status
/// @returns newly created block
-static llvm::BasicBlock *createCatchBlock(llvm::LLVMContext &context,
- llvm::Module &module,
- llvm::IRBuilder<> &builder,
+static llvm::BasicBlock *createCatchBlock(llvm::LLVMContext &context,
+ llvm::Module &module,
+ llvm::IRBuilder<> &builder,
llvm::Function &toAddTo,
std::string &blockName,
std::string &functionId,
llvm::BasicBlock &terminatorBlock,
llvm::Value &exceptionCaughtFlag) {
-
+
llvm::BasicBlock *ret = llvm::BasicBlock::Create(context,
blockName,
&toAddTo);
-
+
builder.SetInsertPoint(ret);
-
+
std::ostringstream bufferToPrint;
bufferToPrint << "Gen: Executing catch block "
<< blockName
<< " in "
<< functionId
<< std::endl;
- generateStringPrint(context,
- module,
- builder,
+ generateStringPrint(context,
+ module,
+ builder,
bufferToPrint.str(),
USE_GLOBAL_STR_CONSTS);
builder.CreateStore(ourExceptionCaughtState, &exceptionCaughtFlag);
builder.CreateBr(&terminatorBlock);
-
+
return(ret);
}
-/// Generates a function which invokes a function (toInvoke) and, whose
-/// unwind block will "catch" the type info types correspondingly held in the
-/// exceptionTypesToCatch argument. If the toInvoke function throws an
-/// exception which does not match any type info types contained in
-/// exceptionTypesToCatch, the generated code will call _Unwind_Resume
-/// with the raised exception. On the other hand the generated code will
+/// Generates a function which invokes a function (toInvoke) and whose
+/// unwind block will "catch" the type info types correspondingly held in the
+/// exceptionTypesToCatch argument. If the toInvoke function throws an
+/// exception which does not match any type info types contained in
+/// exceptionTypesToCatch, the generated code will call _Unwind_Resume
+/// with the raised exception. On the other hand the generated code will
/// normally exit if the toInvoke function does not throw an exception.
-/// The generated "finally" block is always run regardless of the cause of
+/// The generated "finally" block is always run regardless of the cause of
/// the generated function exit.
/// The generated function is returned after being verified.
/// @param module code for module instance
/// @param builder builder instance
-/// @param fpm a function pass manager holding optional IR to IR
+/// @param fpm a function pass manager holding optional IR to IR
/// transformations
/// @param toInvoke inner function to invoke
/// @param ourId id used to printing purposes
@@ -1094,76 +1094,76 @@ static llvm::BasicBlock *createCatchBlock(llvm::LLVMContext &context,
/// @param exceptionTypesToCatch array of type info types to "catch"
/// @returns generated function
static
-llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
- llvm::IRBuilder<> &builder,
+llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
+ llvm::IRBuilder<> &builder,
llvm::FunctionPassManager &fpm,
llvm::Function &toInvoke,
std::string ourId,
unsigned numExceptionsToCatch,
unsigned exceptionTypesToCatch[]) {
-
+
llvm::LLVMContext &context = module.getContext();
llvm::Function *toPrint32Int = module.getFunction("print32Int");
-
+
ArgTypes argTypes;
argTypes.push_back(builder.getInt32Ty());
-
+
ArgNames argNames;
argNames.push_back("exceptTypeToThrow");
-
- llvm::Function *ret = createFunction(module,
+
+ llvm::Function *ret = createFunction(module,
builder.getVoidTy(),
- argTypes,
- argNames,
+ argTypes,
+ argNames,
ourId,
- llvm::Function::ExternalLinkage,
- false,
+ llvm::Function::ExternalLinkage,
+ false,
false);
-
+
// Block which calls invoke
llvm::BasicBlock *entryBlock = llvm::BasicBlock::Create(context,
- "entry",
+ "entry",
ret);
// Normal block for invoke
- llvm::BasicBlock *normalBlock = llvm::BasicBlock::Create(context,
- "normal",
+ llvm::BasicBlock *normalBlock = llvm::BasicBlock::Create(context,
+ "normal",
ret);
// Unwind block for invoke
- llvm::BasicBlock *exceptionBlock = llvm::BasicBlock::Create(context,
- "exception",
+ llvm::BasicBlock *exceptionBlock = llvm::BasicBlock::Create(context,
+ "exception",
ret);
-
+
// Block which routes exception to correct catch handler block
- llvm::BasicBlock *exceptionRouteBlock = llvm::BasicBlock::Create(context,
- "exceptionRoute",
+ llvm::BasicBlock *exceptionRouteBlock = llvm::BasicBlock::Create(context,
+ "exceptionRoute",
ret);
-
+
// Foreign exception handler
- llvm::BasicBlock *externalExceptionBlock = llvm::BasicBlock::Create(context,
- "externalException",
+ llvm::BasicBlock *externalExceptionBlock = llvm::BasicBlock::Create(context,
+ "externalException",
ret);
-
+
// Block which calls _Unwind_Resume
- llvm::BasicBlock *unwindResumeBlock = llvm::BasicBlock::Create(context,
- "unwindResume",
+ llvm::BasicBlock *unwindResumeBlock = llvm::BasicBlock::Create(context,
+ "unwindResume",
ret);
-
+
// Clean up block which delete exception if needed
llvm::BasicBlock *endBlock = llvm::BasicBlock::Create(context, "end", ret);
-
+
std::string nextName;
std::vector<llvm::BasicBlock*> catchBlocks(numExceptionsToCatch);
llvm::Value *exceptionCaughtFlag = NULL;
llvm::Value *exceptionStorage = NULL;
llvm::Value *caughtResultStorage = NULL;
-
- // Finally block which will branch to unwindResumeBlock if
+
+ // Finally block which will branch to unwindResumeBlock if
// exception is not caught. Initializes/allocates stack locations.
- llvm::BasicBlock *finallyBlock = createFinallyBlock(context,
- module,
- builder,
- *ret,
- nextName = "finally",
+ llvm::BasicBlock *finallyBlock = createFinallyBlock(context,
+ module,
+ builder,
+ *ret,
+ nextName = "finally",
ourId,
*endBlock,
*unwindResumeBlock,
@@ -1171,74 +1171,74 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
&exceptionStorage,
&caughtResultStorage
);
-
+
for (unsigned i = 0; i < numExceptionsToCatch; ++i) {
nextName = ourTypeInfoNames[exceptionTypesToCatch[i]];
-
+
// One catch block per type info to be caught
- catchBlocks[i] = createCatchBlock(context,
- module,
- builder,
+ catchBlocks[i] = createCatchBlock(context,
+ module,
+ builder,
*ret,
- nextName,
+ nextName,
ourId,
*finallyBlock,
*exceptionCaughtFlag);
}
-
+
// Entry Block
-
+
builder.SetInsertPoint(entryBlock);
-
+
std::vector<llvm::Value*> args;
args.push_back(namedValues["exceptTypeToThrow"]);
- builder.CreateInvoke(&toInvoke,
- normalBlock,
- exceptionBlock,
+ builder.CreateInvoke(&toInvoke,
+ normalBlock,
+ exceptionBlock,
args);
-
+
// End Block
-
+
builder.SetInsertPoint(endBlock);
-
- generateStringPrint(context,
+
+ generateStringPrint(context,
module,
- builder,
+ builder,
"Gen: In end block: exiting in " + ourId + ".\n",
USE_GLOBAL_STR_CONSTS);
llvm::Function *deleteOurException = module.getFunction("deleteOurException");
-
+
// Note: function handles NULL exceptions
- builder.CreateCall(deleteOurException,
+ builder.CreateCall(deleteOurException,
builder.CreateLoad(exceptionStorage));
builder.CreateRetVoid();
-
+
// Normal Block
-
+
builder.SetInsertPoint(normalBlock);
-
- generateStringPrint(context,
+
+ generateStringPrint(context,
module,
- builder,
+ builder,
"Gen: No exception in " + ourId + "!\n",
USE_GLOBAL_STR_CONSTS);
-
+
// Finally block is always called
builder.CreateBr(finallyBlock);
-
+
// Unwind Resume Block
-
+
builder.SetInsertPoint(unwindResumeBlock);
-
+
builder.CreateResume(builder.CreateLoad(caughtResultStorage));
-
+
// Exception Block
-
+
builder.SetInsertPoint(exceptionBlock);
-
+
llvm::Function *personality = module.getFunction("ourPersonality");
-
- llvm::LandingPadInst *caughtResult =
+
+ llvm::LandingPadInst *caughtResult =
builder.CreateLandingPad(ourCaughtResultType,
personality,
numExceptionsToCatch,
@@ -1255,48 +1255,48 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
llvm::Value *unwindException = builder.CreateExtractValue(caughtResult, 0);
llvm::Value *retTypeInfoIndex = builder.CreateExtractValue(caughtResult, 1);
- // FIXME: Redundant storage which, beyond utilizing value of
- // caughtResultStore for unwindException storage, may be alleviated
+ // FIXME: Redundant storage which, beyond utilizing value of
+  // caughtResultStorage for unwindException storage, may be alleviated
// altogether with a block rearrangement
builder.CreateStore(caughtResult, caughtResultStorage);
builder.CreateStore(unwindException, exceptionStorage);
builder.CreateStore(ourExceptionThrownState, exceptionCaughtFlag);
-
- // Retrieve exception_class member from thrown exception
+
+ // Retrieve exception_class member from thrown exception
// (_Unwind_Exception instance). This member tells us whether or not
// the exception is foreign.
- llvm::Value *unwindExceptionClass =
+ llvm::Value *unwindExceptionClass =
builder.CreateLoad(builder.CreateStructGEP(
- builder.CreatePointerCast(unwindException,
- ourUnwindExceptionType->getPointerTo()),
+ builder.CreatePointerCast(unwindException,
+ ourUnwindExceptionType->getPointerTo()),
0));
-
+
// Branch to the externalExceptionBlock if the exception is foreign or
// to a catch router if not. Either way the finally block will be run.
builder.CreateCondBr(builder.CreateICmpEQ(unwindExceptionClass,
- llvm::ConstantInt::get(builder.getInt64Ty(),
+ llvm::ConstantInt::get(builder.getInt64Ty(),
ourBaseExceptionClass)),
exceptionRouteBlock,
externalExceptionBlock);
-
+
// External Exception Block
-
+
builder.SetInsertPoint(externalExceptionBlock);
-
- generateStringPrint(context,
+
+ generateStringPrint(context,
module,
- builder,
+ builder,
"Gen: Foreign exception received.\n",
USE_GLOBAL_STR_CONSTS);
-
+
// Branch to the finally block
builder.CreateBr(finallyBlock);
-
+
// Exception Route Block
-
+
builder.SetInsertPoint(exceptionRouteBlock);
-
- // Casts exception pointer (_Unwind_Exception instance) to parent
+
+ // Casts exception pointer (_Unwind_Exception instance) to parent
// (OurException instance).
//
// Note: ourBaseFromUnwindOffset is usually negative
@@ -1304,34 +1304,34 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
builder.CreateConstGEP1_64(unwindException,
ourBaseFromUnwindOffset),
ourExceptionType->getPointerTo());
-
+
// Retrieve thrown exception type info type
//
// Note: Index is not relative to pointer but instead to structure
// unlike a true getelementptr (GEP) instruction
typeInfoThrown = builder.CreateStructGEP(typeInfoThrown, 0);
-
- llvm::Value *typeInfoThrownType =
+
+ llvm::Value *typeInfoThrownType =
builder.CreateStructGEP(typeInfoThrown, 0);
-
- generateIntegerPrint(context,
+
+ generateIntegerPrint(context,
module,
- builder,
- *toPrint32Int,
+ builder,
+ *toPrint32Int,
*(builder.CreateLoad(typeInfoThrownType)),
- "Gen: Exception type <%d> received (stack unwound) "
- " in " +
- ourId +
+ "Gen: Exception type <%d> received (stack unwound) "
+ " in " +
+ ourId +
".\n",
USE_GLOBAL_STR_CONSTS);
-
+
// Route to matched type info catch block or run cleanup finally block
- llvm::SwitchInst *switchToCatchBlock = builder.CreateSwitch(retTypeInfoIndex,
- finallyBlock,
+ llvm::SwitchInst *switchToCatchBlock = builder.CreateSwitch(retTypeInfoIndex,
+ finallyBlock,
numExceptionsToCatch);
-
+
unsigned nextTypeToCatch;
-
+
for (unsigned i = 1; i <= numExceptionsToCatch; ++i) {
nextTypeToCatch = i - 1;
switchToCatchBlock->addCase(llvm::ConstantInt::get(
@@ -1341,18 +1341,18 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
llvm::verifyFunction(*ret);
fpm.run(*ret);
-
+
return(ret);
}
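For readers reconstructing the control flow built above, here is a minimal sketch of the same invoke/landingpad/resume skeleton that createCatchWrappedInvokeFunction assembles, written against the IRBuilder API of this LLVM version. It is illustrative only: the function name, the single catch clause, and the caller-supplied personality/typeInfo objects are assumptions, and the sketch assumes the same headers as ExceptionDemo.cpp.

// Minimal sketch (illustrative, not part of the demo): an entry block that
// invokes a callee, a landing pad that catches a single type info, and a
// resume that re-raises everything else.
static llvm::Function *sketchInvokeWrapper(llvm::Module &module,
                                           llvm::IRBuilder<> &builder,
                                           llvm::Function &callee,
                                           llvm::Function &personality,
                                           llvm::GlobalVariable &typeInfo) {
  llvm::LLVMContext &context = module.getContext();

  llvm::Function *wrapper = llvm::cast<llvm::Function>(
      module.getOrInsertFunction("sketchWrapper", builder.getVoidTy(),
                                 builder.getInt32Ty(), (llvm::Type *)0));

  llvm::BasicBlock *entry  = llvm::BasicBlock::Create(context, "entry", wrapper);
  llvm::BasicBlock *normal = llvm::BasicBlock::Create(context, "normal", wrapper);
  llvm::BasicBlock *unwind = llvm::BasicBlock::Create(context, "unwind", wrapper);

  // entry: invoke the callee; execution continues at 'normal' or unwinds.
  builder.SetInsertPoint(entry);
  std::vector<llvm::Value*> args;
  args.push_back(&*wrapper->arg_begin());
  builder.CreateInvoke(&callee, normal, unwind, args);

  // normal: nothing was thrown.
  builder.SetInsertPoint(normal);
  builder.CreateRetVoid();

  // unwind: landing pad yielding the {exception pointer, selector} pair.
  builder.SetInsertPoint(unwind);
  llvm::Type *caughtFieldTypes[] = { builder.getInt8PtrTy(),
                                     builder.getInt32Ty() };
  llvm::Type *caughtType =
      llvm::StructType::get(context,
                            llvm::ArrayRef<llvm::Type*>(caughtFieldTypes));
  llvm::LandingPadInst *caught =
      builder.CreateLandingPad(caughtType, &personality, 1, "caught");
  caught->addClause(&typeInfo);  // catch this single type info
  caught->setCleanup(true);      // also run cleanup for foreign exceptions

  // Anything not handled here is re-raised to the caller's frame.
  builder.CreateResume(caught);
  return wrapper;
}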
/// Generates function which throws either an exception matched to a runtime
-/// determined type info type (argument to generated function), or if this
-/// runtime value matches nativeThrowType, throws a foreign exception by
+/// determined type info type (argument to generated function), or if this
+/// runtime value matches nativeThrowType, throws a foreign exception by
/// calling nativeThrowFunct.
/// @param module code for module instance
/// @param builder builder instance
-/// @param fpm a function pass manager holding optional IR to IR
+/// @param fpm a function pass manager holding optional IR to IR
/// transformations
 /// @param ourId id used for printing purposes
/// @param nativeThrowType a runtime argument of this value results in
@@ -1361,8 +1361,8 @@ llvm::Function *createCatchWrappedInvokeFunction(llvm::Module &module,
/// if the above nativeThrowType matches generated function's arg.
/// @returns generated function
static
-llvm::Function *createThrowExceptionFunction(llvm::Module &module,
- llvm::IRBuilder<> &builder,
+llvm::Function *createThrowExceptionFunction(llvm::Module &module,
+ llvm::IRBuilder<> &builder,
llvm::FunctionPassManager &fpm,
std::string ourId,
int32_t nativeThrowType,
@@ -1373,7 +1373,7 @@ llvm::Function *createThrowExceptionFunction(llvm::Module &module,
unwindArgTypes.push_back(builder.getInt32Ty());
ArgNames unwindArgNames;
unwindArgNames.push_back("exceptTypeToThrow");
-
+
llvm::Function *ret = createFunction(module,
builder.getVoidTy(),
unwindArgTypes,
@@ -1382,88 +1382,88 @@ llvm::Function *createThrowExceptionFunction(llvm::Module &module,
llvm::Function::ExternalLinkage,
false,
false);
-
+
   // Throws either one of our exceptions or a native C++ exception depending
// on a runtime argument value containing a type info type.
llvm::BasicBlock *entryBlock = llvm::BasicBlock::Create(context,
- "entry",
+ "entry",
ret);
// Throws a foreign exception
llvm::BasicBlock *nativeThrowBlock = llvm::BasicBlock::Create(context,
- "nativeThrow",
+ "nativeThrow",
ret);
// Throws one of our Exceptions
llvm::BasicBlock *generatedThrowBlock = llvm::BasicBlock::Create(context,
- "generatedThrow",
+ "generatedThrow",
ret);
// Retrieved runtime type info type to throw
llvm::Value *exceptionType = namedValues["exceptTypeToThrow"];
-
+
// nativeThrowBlock block
-
+
builder.SetInsertPoint(nativeThrowBlock);
-
+
// Throws foreign exception
builder.CreateCall(&nativeThrowFunct, exceptionType);
builder.CreateUnreachable();
-
+
// entry block
-
+
builder.SetInsertPoint(entryBlock);
-
+
llvm::Function *toPrint32Int = module.getFunction("print32Int");
- generateIntegerPrint(context,
+ generateIntegerPrint(context,
module,
- builder,
- *toPrint32Int,
- *exceptionType,
- "\nGen: About to throw exception type <%d> in " +
- ourId +
+ builder,
+ *toPrint32Int,
+ *exceptionType,
+ "\nGen: About to throw exception type <%d> in " +
+ ourId +
".\n",
USE_GLOBAL_STR_CONSTS);
-
+
// Switches on runtime type info type value to determine whether or not
- // a foreign exception is thrown. Defaults to throwing one of our
+ // a foreign exception is thrown. Defaults to throwing one of our
// generated exceptions.
llvm::SwitchInst *theSwitch = builder.CreateSwitch(exceptionType,
generatedThrowBlock,
1);
-
- theSwitch->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context),
+
+ theSwitch->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(context),
nativeThrowType),
nativeThrowBlock);
-
+
// generatedThrow block
-
+
builder.SetInsertPoint(generatedThrowBlock);
-
+
llvm::Function *createOurException = module.getFunction("createOurException");
llvm::Function *raiseOurException = module.getFunction(
"_Unwind_RaiseException");
-
+
// Creates exception to throw with runtime type info type.
- llvm::Value *exception = builder.CreateCall(createOurException,
+ llvm::Value *exception = builder.CreateCall(createOurException,
namedValues["exceptTypeToThrow"]);
-
+
// Throw generated Exception
builder.CreateCall(raiseOurException, exception);
builder.CreateUnreachable();
-
+
llvm::verifyFunction(*ret);
fpm.run(*ret);
-
+
return(ret);
}
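As a reading aid, the function emitted by createThrowExceptionFunction behaves roughly like the following C++ sketch. This is illustrative only: the diagnostic print is omitted, and the extern declarations merely mirror the demo's runtime helpers.

#include <stdint.h>
#include <unwind.h>   // _Unwind_RaiseException, struct _Unwind_Exception

extern "C" void *createOurException(int32_t type); // demo runtime helper
extern void nativeThrowFunct(int32_t type);        // e.g. throwCppException
static const int32_t nativeThrowType = ~0;         // value the demo passes in

extern "C" void generatedThrowEquivalent(int32_t exceptTypeToThrow) {
  // (print32Int "About to throw exception type <%d> ..." diagnostic omitted)
  if (exceptTypeToThrow == nativeThrowType) {
    // Foreign (native C++) exception; does not return.
    nativeThrowFunct(exceptTypeToThrow);
  } else {
    // One of our generated exceptions; does not return.
    void *exc = createOurException(exceptTypeToThrow);
    _Unwind_RaiseException(reinterpret_cast<_Unwind_Exception *>(exc));
  }
  // Both paths end in an 'unreachable' instruction in the generated IR.
}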
static void createStandardUtilityFunctions(unsigned numTypeInfos,
- llvm::Module &module,
+ llvm::Module &module,
llvm::IRBuilder<> &builder);
-/// Creates test code by generating and organizing these functions into the
+/// Creates test code by generating and organizing these functions into the
/// test case. The test case consists of an outer function setup to invoke
-/// an inner function within an environment having multiple catch and single
+/// an inner function within an environment having multiple catch and single
/// finally blocks. This inner function is also setup to invoke a throw
-/// function within an evironment similar in nature to the outer function's
+/// function within an environment similar in nature to the outer function's
 /// catch and finally blocks. Each of these two functions catches mutually
/// exclusive subsets (even or odd) of the type info types configured
 /// for this test. All generated functions have a runtime argument which
@@ -1474,26 +1474,26 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
/// a supplied a function which in turn will throw a foreign exception.
/// @param module code for module instance
/// @param builder builder instance
-/// @param fpm a function pass manager holding optional IR to IR
+/// @param fpm a function pass manager holding optional IR to IR
/// transformations
/// @param nativeThrowFunctName name of external function which will throw
/// a foreign exception
/// @returns outermost generated test function.
-llvm::Function *createUnwindExceptionTest(llvm::Module &module,
- llvm::IRBuilder<> &builder,
+llvm::Function *createUnwindExceptionTest(llvm::Module &module,
+ llvm::IRBuilder<> &builder,
llvm::FunctionPassManager &fpm,
std::string nativeThrowFunctName) {
// Number of type infos to generate
unsigned numTypeInfos = 6;
-
+
   // Initialize intrinsics and external functions to use along with exception
// and type info globals.
createStandardUtilityFunctions(numTypeInfos,
module,
builder);
llvm::Function *nativeThrowFunct = module.getFunction(nativeThrowFunctName);
-
- // Create exception throw function using the value ~0 to cause
+
+ // Create exception throw function using the value ~0 to cause
// foreign exceptions to be thrown.
llvm::Function *throwFunct = createThrowExceptionFunction(module,
builder,
@@ -1503,9 +1503,9 @@ llvm::Function *createUnwindExceptionTest(llvm::Module &module,
*nativeThrowFunct);
// Inner function will catch even type infos
unsigned innerExceptionTypesToCatch[] = {6, 2, 4};
- size_t numExceptionTypesToCatch = sizeof(innerExceptionTypesToCatch) /
+ size_t numExceptionTypesToCatch = sizeof(innerExceptionTypesToCatch) /
sizeof(unsigned);
-
+
// Generate inner function.
llvm::Function *innerCatchFunct = createCatchWrappedInvokeFunction(module,
builder,
@@ -1514,12 +1514,12 @@ llvm::Function *createUnwindExceptionTest(llvm::Module &module,
"innerCatchFunct",
numExceptionTypesToCatch,
innerExceptionTypesToCatch);
-
+
// Outer function will catch odd type infos
unsigned outerExceptionTypesToCatch[] = {3, 1, 5};
- numExceptionTypesToCatch = sizeof(outerExceptionTypesToCatch) /
+ numExceptionTypesToCatch = sizeof(outerExceptionTypesToCatch) /
sizeof(unsigned);
-
+
// Generate outer function
llvm::Function *outerCatchFunct = createCatchWrappedInvokeFunction(module,
builder,
@@ -1528,7 +1528,7 @@ llvm::Function *createUnwindExceptionTest(llvm::Module &module,
"outerCatchFunct",
numExceptionTypesToCatch,
outerExceptionTypesToCatch);
-
+
// Return outer function to run
return(outerCatchFunct);
}
@@ -1539,15 +1539,15 @@ class OurCppRunException : public std::runtime_error {
public:
OurCppRunException(const std::string reason) :
std::runtime_error(reason) {}
-
+
OurCppRunException (const OurCppRunException &toCopy) :
std::runtime_error(toCopy) {}
-
+
OurCppRunException &operator = (const OurCppRunException &toCopy) {
return(reinterpret_cast<OurCppRunException&>(
std::runtime_error::operator=(toCopy)));
}
-
+
~OurCppRunException (void) throw () {}
};
@@ -1562,7 +1562,7 @@ void throwCppException (int32_t ignoreIt) {
typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow);
-/// This is a test harness which runs test by executing generated
+/// This is a test harness which runs a test by executing the generated
/// function with a type info type to throw. Harness wraps the execution
/// of generated function in a C++ try catch clause.
/// @param engine execution engine to use for executing generated function.
@@ -1572,15 +1572,15 @@ typedef void (*OurExceptionThrowFunctType) (int32_t typeToThrow);
/// @param typeToThrow type info type of generated exception to throw, or
/// indicator to cause foreign exception to be thrown.
static
-void runExceptionThrow(llvm::ExecutionEngine *engine,
- llvm::Function *function,
+void runExceptionThrow(llvm::ExecutionEngine *engine,
+ llvm::Function *function,
int32_t typeToThrow) {
-
+
// Find test's function pointer
- OurExceptionThrowFunctType functPtr =
+ OurExceptionThrowFunctType functPtr =
reinterpret_cast<OurExceptionThrowFunctType>(
reinterpret_cast<intptr_t>(engine->getPointerToFunction(function)));
-
+
try {
// Run test
(*functPtr)(typeToThrow);
@@ -1589,15 +1589,15 @@ void runExceptionThrow(llvm::ExecutionEngine *engine,
// Catch foreign C++ exception
fprintf(stderr,
"\nrunExceptionThrow(...):In C++ catch OurCppRunException "
- "with reason: %s.\n",
+ "with reason: %s.\n",
exc.what());
}
catch (...) {
- // Catch all exceptions including our generated ones. This latter
+ // Catch all exceptions including our generated ones. This latter
// functionality works according to the example in rules 1.6.4 of
- // http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22),
- // given that these will be exceptions foreign to C++
- // (the _Unwind_Exception::exception_class should be different from
+ // http://sourcery.mentor.com/public/cxx-abi/abi-eh.html (v1.22),
+ // given that these will be exceptions foreign to C++
+ // (the _Unwind_Exception::exception_class should be different from
// the one used by C++).
fprintf(stderr,
"\nrunExceptionThrow(...):In C++ catch all.\n");
@@ -1610,32 +1610,32 @@ void runExceptionThrow(llvm::ExecutionEngine *engine,
typedef llvm::ArrayRef<llvm::Type*> TypeArray;
-/// This initialization routine creates type info globals and
+/// This initialization routine creates type info globals and
/// adds external function declarations to module.
 /// @param numTypeInfos number of type info types to create as GlobalVariable
 /// instances, numbered linearly starting with the value 1.
/// @param module code for module instance
/// @param builder builder instance
static void createStandardUtilityFunctions(unsigned numTypeInfos,
- llvm::Module &module,
+ llvm::Module &module,
llvm::IRBuilder<> &builder) {
-
+
llvm::LLVMContext &context = module.getContext();
-
+
// Exception initializations
-
+
// Setup exception catch state
- ourExceptionNotThrownState =
+ ourExceptionNotThrownState =
llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 0),
- ourExceptionThrownState =
+ ourExceptionThrownState =
llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 1),
- ourExceptionCaughtState =
+ ourExceptionCaughtState =
llvm::ConstantInt::get(llvm::Type::getInt8Ty(context), 2),
-
-
-
+
+
+
// Create our type info type
- ourTypeInfoType = llvm::StructType::get(context,
+ ourTypeInfoType = llvm::StructType::get(context,
TypeArray(builder.getInt32Ty()));
llvm::Type *caughtResultFieldTypes[] = {
@@ -1648,47 +1648,47 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
TypeArray(caughtResultFieldTypes));
// Create OurException type
- ourExceptionType = llvm::StructType::get(context,
+ ourExceptionType = llvm::StructType::get(context,
TypeArray(ourTypeInfoType));
-
+
// Create portion of _Unwind_Exception type
//
// Note: Declaring only a portion of the _Unwind_Exception struct.
// Does this cause problems?
ourUnwindExceptionType =
- llvm::StructType::get(context,
+ llvm::StructType::get(context,
TypeArray(builder.getInt64Ty()));
struct OurBaseException_t dummyException;
-
+
// Calculate offset of OurException::unwindException member.
- ourBaseFromUnwindOffset = ((uintptr_t) &dummyException) -
+ ourBaseFromUnwindOffset = ((uintptr_t) &dummyException) -
((uintptr_t) &(dummyException.unwindException));
-
+
#ifdef DEBUG
fprintf(stderr,
"createStandardUtilityFunctions(...):ourBaseFromUnwindOffset "
"= %lld, sizeof(struct OurBaseException_t) - "
"sizeof(struct _Unwind_Exception) = %lu.\n",
ourBaseFromUnwindOffset,
- sizeof(struct OurBaseException_t) -
+ sizeof(struct OurBaseException_t) -
sizeof(struct _Unwind_Exception));
#endif
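  // Illustrative aside: the pointer arithmetic above is simply the negated
  // member offset, so with <cstddef> it could be written equivalently as
  //   ourBaseFromUnwindOffset =
  //       -(int64_t)offsetof(struct OurBaseException_t, unwindException);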
-
+
size_t numChars = sizeof(ourBaseExcpClassChars) / sizeof(char);
-
+
// Create our _Unwind_Exception::exception_class value
ourBaseExceptionClass = genClass(ourBaseExcpClassChars, numChars);
-
+
// Type infos
-
+
std::string baseStr = "typeInfo", typeInfoName;
std::ostringstream typeInfoNameBuilder;
std::vector<llvm::Constant*> structVals;
-
+
llvm::Constant *nextStruct;
llvm::GlobalVariable *nextGlobal = NULL;
-
+
// Generate each type info
//
// Note: First type info is not used.
@@ -1696,202 +1696,202 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
structVals.clear();
structVals.push_back(llvm::ConstantInt::get(builder.getInt32Ty(), i));
nextStruct = llvm::ConstantStruct::get(ourTypeInfoType, structVals);
-
+
typeInfoNameBuilder.str("");
typeInfoNameBuilder << baseStr << i;
typeInfoName = typeInfoNameBuilder.str();
-
+
// Note: Does not seem to work without allocation
- nextGlobal =
- new llvm::GlobalVariable(module,
- ourTypeInfoType,
- true,
- llvm::GlobalValue::ExternalLinkage,
- nextStruct,
+ nextGlobal =
+ new llvm::GlobalVariable(module,
+ ourTypeInfoType,
+ true,
+ llvm::GlobalValue::ExternalLinkage,
+ nextStruct,
typeInfoName);
-
+
ourTypeInfoNames.push_back(typeInfoName);
ourTypeInfoNamesIndex[i] = typeInfoName;
}
-
+
ArgNames argNames;
ArgTypes argTypes;
llvm::Function *funct = NULL;
-
+
// print32Int
-
+
llvm::Type *retType = builder.getVoidTy();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt32Ty());
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "print32Int",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "print32Int",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// print64Int
-
+
retType = builder.getVoidTy();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt64Ty());
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "print64Int",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "print64Int",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// printStr
-
+
retType = builder.getVoidTy();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "printStr",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "printStr",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// throwCppException
-
+
retType = builder.getVoidTy();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt32Ty());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "throwCppException",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "throwCppException",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// deleteOurException
-
+
retType = builder.getVoidTy();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "deleteOurException",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "deleteOurException",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// createOurException
-
+
retType = builder.getInt8PtrTy();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt32Ty());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "createOurException",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "createOurException",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// _Unwind_RaiseException
-
+
retType = builder.getInt32Ty();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- funct = createFunction(module,
- retType,
- argTypes,
- argNames,
- "_Unwind_RaiseException",
- llvm::Function::ExternalLinkage,
- true,
+
+ funct = createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "_Unwind_RaiseException",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
- funct->addFnAttr(llvm::Attribute::NoReturn);
-
+
+ funct->setDoesNotReturn();
+
// _Unwind_Resume
-
+
retType = builder.getInt32Ty();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- funct = createFunction(module,
- retType,
- argTypes,
- argNames,
- "_Unwind_Resume",
- llvm::Function::ExternalLinkage,
- true,
+
+ funct = createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "_Unwind_Resume",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
- funct->addFnAttr(llvm::Attribute::NoReturn);
-
+
+ funct->setDoesNotReturn();
+
// ourPersonality
-
+
retType = builder.getInt32Ty();
-
+
argTypes.clear();
argTypes.push_back(builder.getInt32Ty());
argTypes.push_back(builder.getInt32Ty());
argTypes.push_back(builder.getInt64Ty());
argTypes.push_back(builder.getInt8PtrTy());
argTypes.push_back(builder.getInt8PtrTy());
-
+
argNames.clear();
-
- createFunction(module,
- retType,
- argTypes,
- argNames,
- "ourPersonality",
- llvm::Function::ExternalLinkage,
- true,
+
+ createFunction(module,
+ retType,
+ argTypes,
+ argNames,
+ "ourPersonality",
+ llvm::Function::ExternalLinkage,
+ true,
false);
-
+
// llvm.eh.typeid.for intrinsic
-
+
getDeclaration(&module, llvm::Intrinsic::eh_typeid_for);
}
@@ -1901,7 +1901,7 @@ static void createStandardUtilityFunctions(unsigned numTypeInfos,
//===----------------------------------------------------------------------===//
/// Demo main routine which takes the type info types to throw. A test will
-/// be run for each given type info type. While type info types with the value
+/// be run for each given type info type. Type info types with the value
/// of -1 will trigger a foreign C++ exception to be thrown; type info types
/// <= 6 and >= 1 will be caught by test functions; and type info types > 6
/// will result in exceptions which pass through to the test harness. All other
@@ -1920,87 +1920,86 @@ int main(int argc, char *argv[]) {
" for a full test.\n\n");
return(0);
}
-
+
// If not set, exception handling will not be turned on
llvm::TargetOptions Opts;
Opts.JITExceptionHandling = true;
-
+
llvm::InitializeNativeTarget();
llvm::LLVMContext &context = llvm::getGlobalContext();
llvm::IRBuilder<> theBuilder(context);
-
+
// Make the module, which holds all the code.
llvm::Module *module = new llvm::Module("my cool jit", context);
-
+
// Build engine with JIT
llvm::EngineBuilder factory(module);
factory.setEngineKind(llvm::EngineKind::JIT);
factory.setAllocateGVsWithCode(false);
factory.setTargetOptions(Opts);
llvm::ExecutionEngine *executionEngine = factory.create();
-
+
{
llvm::FunctionPassManager fpm(module);
-
- // Set up the optimizer pipeline.
+
+ // Set up the optimizer pipeline.
// Start with registering info about how the
// target lays out data structures.
- fpm.add(new llvm::TargetData(*executionEngine->getTargetData()));
-
+ fpm.add(new llvm::DataLayout(*executionEngine->getDataLayout()));
+
// Optimizations turned on
#ifdef ADD_OPT_PASSES
-
+
     // Basic AliasAnalysis support for GVN.
fpm.add(llvm::createBasicAliasAnalysisPass());
-
+
// Promote allocas to registers.
fpm.add(llvm::createPromoteMemoryToRegisterPass());
-
+
// Do simple "peephole" optimizations and bit-twiddling optzns.
fpm.add(llvm::createInstructionCombiningPass());
-
+
// Reassociate expressions.
fpm.add(llvm::createReassociatePass());
-
+
// Eliminate Common SubExpressions.
fpm.add(llvm::createGVNPass());
-
- // Simplify the control flow graph (deleting unreachable
+
+ // Simplify the control flow graph (deleting unreachable
// blocks, etc).
fpm.add(llvm::createCFGSimplificationPass());
#endif // ADD_OPT_PASSES
-
+
fpm.doInitialization();
-
+
// Generate test code using function throwCppException(...) as
// the function which throws foreign exceptions.
- llvm::Function *toRun =
- createUnwindExceptionTest(*module,
- theBuilder,
+ llvm::Function *toRun =
+ createUnwindExceptionTest(*module,
+ theBuilder,
fpm,
"throwCppException");
-
+
fprintf(stderr, "\nBegin module dump:\n\n");
-
+
module->dump();
-
+
fprintf(stderr, "\nEnd module dump:\n");
-
+
fprintf(stderr, "\n\nBegin Test:\n");
-
+
for (int i = 1; i < argc; ++i) {
// Run test for each argument whose value is the exception
// type to throw.
- runExceptionThrow(executionEngine,
- toRun,
+ runExceptionThrow(executionEngine,
+ toRun,
(unsigned) strtoul(argv[i], NULL, 10));
}
-
+
fprintf(stderr, "\nEnd Test:\n\n");
- }
-
+ }
+
delete executionEngine;
-
+
return 0;
}
-
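Putting the pieces of the demo together, a hypothetical run routes each argument as sketched below. This is an expectation sketch derived from the doc comments above, not captured program output, and the binary name/path is assumed.

//   ./ExceptionDemo 2 3 7 -1
//
//   2  -> one of our exceptions, caught by innerCatchFunct (even type infos)
//   3  -> unwinds out of innerCatchFunct, caught by outerCatchFunct (odd)
//   7  -> not caught by either generated function; reaches the harness's
//         catch (...) in runExceptionThrow
//   -1 -> throwCppException raises a foreign C++ exception; the generated
//         landing pads run their finally/cleanup code and the harness's
//         catch (OurCppRunException &) reports it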
diff --git a/examples/Fibonacci/fibonacci.cpp b/examples/Fibonacci/fibonacci.cpp
index cfd9b1e33cf5..417ad6f4b602 100644
--- a/examples/Fibonacci/fibonacci.cpp
+++ b/examples/Fibonacci/fibonacci.cpp
@@ -37,7 +37,7 @@
using namespace llvm;
static Function *CreateFibFunction(Module *M, LLVMContext &Context) {
- // Create the fib function and insert it into module M. This function is said
+ // Create the fib function and insert it into module M. This function is said
// to return an int and take an int parameter.
Function *FibF =
cast<Function>(M->getOrInsertFunction("fib", Type::getInt32Ty(Context),
diff --git a/examples/Kaleidoscope/Chapter4/toy.cpp b/examples/Kaleidoscope/Chapter4/toy.cpp
index cce4466ed57a..bc6028c900e7 100644
--- a/examples/Kaleidoscope/Chapter4/toy.cpp
+++ b/examples/Kaleidoscope/Chapter4/toy.cpp
@@ -7,7 +7,7 @@
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
@@ -584,7 +584,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine->getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine->getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
diff --git a/examples/Kaleidoscope/Chapter5/toy.cpp b/examples/Kaleidoscope/Chapter5/toy.cpp
index 36dd760e5ff4..2b0b9d54feb8 100644
--- a/examples/Kaleidoscope/Chapter5/toy.cpp
+++ b/examples/Kaleidoscope/Chapter5/toy.cpp
@@ -7,7 +7,7 @@
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
@@ -829,7 +829,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine->getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine->getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
diff --git a/examples/Kaleidoscope/Chapter6/toy.cpp b/examples/Kaleidoscope/Chapter6/toy.cpp
index db3495dcc98b..b751e3516bf8 100644
--- a/examples/Kaleidoscope/Chapter6/toy.cpp
+++ b/examples/Kaleidoscope/Chapter6/toy.cpp
@@ -7,7 +7,7 @@
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
@@ -947,7 +947,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine->getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine->getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Do simple "peephole" optimizations and bit-twiddling optzns.
diff --git a/examples/Kaleidoscope/Chapter7/toy.cpp b/examples/Kaleidoscope/Chapter7/toy.cpp
index 143b30bf4766..0ac099659064 100644
--- a/examples/Kaleidoscope/Chapter7/toy.cpp
+++ b/examples/Kaleidoscope/Chapter7/toy.cpp
@@ -7,7 +7,7 @@
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/TargetSelect.h"
#include <cstdio>
@@ -1111,7 +1111,7 @@ int main() {
// Set up the optimizer pipeline. Start with registering info about how the
// target lays out data structures.
- OurFPM.add(new TargetData(*TheExecutionEngine->getTargetData()));
+ OurFPM.add(new DataLayout(*TheExecutionEngine->getDataLayout()));
// Provide basic AliasAnalysis support for GVN.
OurFPM.add(createBasicAliasAnalysisPass());
// Promote allocas to registers.
diff --git a/examples/OCaml-Kaleidoscope/Chapter4/toy.ml b/examples/OCaml-Kaleidoscope/Chapter4/toy.ml
index 5f9d912499c9..5a6bde9458cb 100644
--- a/examples/OCaml-Kaleidoscope/Chapter4/toy.ml
+++ b/examples/OCaml-Kaleidoscope/Chapter4/toy.ml
@@ -27,7 +27,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combination the_fpm;
diff --git a/examples/OCaml-Kaleidoscope/Chapter5/toy.ml b/examples/OCaml-Kaleidoscope/Chapter5/toy.ml
index 5f9d912499c9..5a6bde9458cb 100644
--- a/examples/OCaml-Kaleidoscope/Chapter5/toy.ml
+++ b/examples/OCaml-Kaleidoscope/Chapter5/toy.ml
@@ -27,7 +27,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combination the_fpm;
diff --git a/examples/OCaml-Kaleidoscope/Chapter6/toy.ml b/examples/OCaml-Kaleidoscope/Chapter6/toy.ml
index 5f9d912499c9..5a6bde9458cb 100644
--- a/examples/OCaml-Kaleidoscope/Chapter6/toy.ml
+++ b/examples/OCaml-Kaleidoscope/Chapter6/toy.ml
@@ -27,7 +27,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Do simple "peephole" optimizations and bit-twiddling optzn. *)
add_instruction_combination the_fpm;
diff --git a/examples/OCaml-Kaleidoscope/Chapter7/toy.ml b/examples/OCaml-Kaleidoscope/Chapter7/toy.ml
index babab28601dd..f2508a43576e 100644
--- a/examples/OCaml-Kaleidoscope/Chapter7/toy.ml
+++ b/examples/OCaml-Kaleidoscope/Chapter7/toy.ml
@@ -28,7 +28,7 @@ let main () =
(* Set up the optimizer pipeline. Start with registering info about how the
* target lays out data structures. *)
- TargetData.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
+ DataLayout.add (ExecutionEngine.target_data the_execution_engine) the_fpm;
(* Promote allocas to registers. *)
add_memory_to_register_promotion the_fpm;
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 0bd5db3774c6..620d0887be73 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -173,10 +173,11 @@ typedef enum {
LLVMUWTable = 1 << 30,
LLVMNonLazyBind = 1 << 31
- // FIXME: This attribute is currently not included in the C API as
- // a temporary measure until the API/ABI impact to the C API is understood
- // and the path forward agreed upon.
- //LLVMAddressSafety = 1ULL << 32
+ /* FIXME: This attribute is currently not included in the C API as
+ a temporary measure until the API/ABI impact to the C API is understood
+ and the path forward agreed upon.
+ LLVMAddressSafety = 1ULL << 32
+ */
} LLVMAttribute;
typedef enum {
@@ -282,6 +283,7 @@ typedef enum {
LLVMLinkOnceAnyLinkage, /**< Keep one copy of function when linking (inline)*/
LLVMLinkOnceODRLinkage, /**< Same, but only replaced by something
equivalent. */
+ LLVMLinkOnceODRAutoHideLinkage, /**< Like LinkOnceODR, but possibly hidden. */
LLVMWeakAnyLinkage, /**< Keep one copy of function when linking (weak) */
LLVMWeakODRLinkage, /**< Same, but only replaced by something
equivalent. */
@@ -295,9 +297,7 @@ typedef enum {
LLVMGhostLinkage, /**< Obsolete */
LLVMCommonLinkage, /**< Tentative definitions */
LLVMLinkerPrivateLinkage, /**< Like Private, but linker removes. */
- LLVMLinkerPrivateWeakLinkage, /**< Like LinkerPrivate, but is weak. */
- LLVMLinkerPrivateWeakDefAutoLinkage /**< Like LinkerPrivateWeak, but possibly
- hidden. */
+ LLVMLinkerPrivateWeakLinkage /**< Like LinkerPrivate, but is weak. */
} LLVMLinkage;
typedef enum {
@@ -1803,7 +1803,7 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg);
* Set the alignment for a function parameter.
*
* @see llvm::Argument::addAttr()
- * @see llvm::Attribute::constructAlignmentFromInt()
+ * @see llvm::AttrBuilder::addAlignmentAttr()
*/
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align);
@@ -1869,6 +1869,27 @@ LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count);
const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len);
/**
+ * Obtain the number of operands from an MDNode value.
+ *
+ * @param V MDNode to get number of operands from.
+ * @return Number of operands of the MDNode.
+ */
+unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V);
+
+/**
+ * Obtain the given MDNode's operands.
+ *
+ * The passed LLVMValueRef pointer should point to enough memory to hold all of
+ * the operands of the given MDNode (see LLVMGetMDNodeNumOperands) as
+ * LLVMValueRefs. This memory will be populated with the LLVMValueRefs of the
+ * MDNode's operands.
+ *
+ * @param V MDNode to get the operands from.
+ * @param Dest Destination array for operands.
+ */
+void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest);
+
+/**
* @}
*/
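A small usage sketch for the two new MDNode accessors follows. The function and variable names, and the malloc-based buffer, are illustrative; the caller owns the storage, as the comment above requires.

#include "llvm-c/Core.h"
#include <stdlib.h>

static void visitMDNodeOperands(LLVMValueRef MD) {
  unsigned NumOps = LLVMGetMDNodeNumOperands(MD);
  LLVMValueRef *Ops =
      (LLVMValueRef *)malloc(NumOps * sizeof(LLVMValueRef));
  LLVMGetMDNodeOperands(MD, Ops);  /* fills the caller-provided array */
  for (unsigned i = 0; i != NumOps; ++i) {
    LLVMValueRef Op = Ops[i];      /* i-th operand of MD */
    (void)Op;                      /* inspect the operand here */
  }
  free(Ops);
}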
@@ -2688,7 +2709,7 @@ namespace llvm {
template<typename T>
inline T **unwrap(LLVMValueRef *Vals, unsigned Length) {
- #if DEBUG
+ #ifdef DEBUG
for (LLVMValueRef *I = Vals, *E = Vals + Length; I != E; ++I)
cast<T>(*I);
#endif
diff --git a/include/llvm-c/Disassembler.h b/include/llvm-c/Disassembler.h
index 69fdc645669b..b8c4ad9ad738 100644
--- a/include/llvm-c/Disassembler.h
+++ b/include/llvm-c/Disassembler.h
@@ -146,6 +146,15 @@ LLVMDisasmContextRef LLVMCreateDisasm(const char *TripleName, void *DisInfo,
LLVMSymbolLookupCallback SymbolLookUp);
/**
+ * Set the disassembler's options. Returns 1 if it can set the Options and 0
+ * otherwise.
+ */
+int LLVMSetDisasmOptions(LLVMDisasmContextRef DC, uint64_t Options);
+
+/* The option to produce marked up assembly. */
+#define LLVMDisassembler_Option_UseMarkup 1
+
+/**
* Dispose of a disassembler context.
*/
void LLVMDisasmDispose(LLVMDisasmContextRef DC);
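An illustrative call sequence for the new option setter; the LLVMCreateDisasm callback arguments are left NULL here because they depend on the client (see the declaration above), and the helper name is an assumption.

#include "llvm-c/Disassembler.h"
#include <stddef.h>

static LLVMDisasmContextRef makeMarkupDisasm(const char *Triple) {
  LLVMDisasmContextRef DC =
      LLVMCreateDisasm(Triple, NULL, 0, NULL, NULL);  /* no callbacks */
  if (DC && !LLVMSetDisasmOptions(DC, LLVMDisassembler_Option_UseMarkup)) {
    /* returns 0 if the option could not be set; fall back to plain output */
  }
  return DC;
}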
diff --git a/include/llvm-c/Target.h b/include/llvm-c/Target.h
index 89150401ffe6..57abfa0207fe 100644
--- a/include/llvm-c/Target.h
+++ b/include/llvm-c/Target.h
@@ -145,7 +145,7 @@ static inline LLVMBool LLVMInitializeNativeTarget(void) {
/*===-- Target Data -------------------------------------------------------===*/
/** Creates target data from a target layout string.
- See the constructor llvm::TargetData::TargetData. */
+ See the constructor llvm::DataLayout::DataLayout. */
LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep);
/** Adds target data information to a pass manager. This does not take ownership
@@ -160,48 +160,58 @@ void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef, LLVMPassManagerRef);
/** Converts target data to a target layout string. The string must be disposed
with LLVMDisposeMessage.
- See the constructor llvm::TargetData::TargetData. */
+ See the constructor llvm::DataLayout::DataLayout. */
char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef);
/** Returns the byte order of a target, either LLVMBigEndian or
LLVMLittleEndian.
- See the method llvm::TargetData::isLittleEndian. */
+ See the method llvm::DataLayout::isLittleEndian. */
enum LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef);
/** Returns the pointer size in bytes for a target.
- See the method llvm::TargetData::getPointerSize. */
+ See the method llvm::DataLayout::getPointerSize. */
unsigned LLVMPointerSize(LLVMTargetDataRef);
+/** Returns the pointer size in bytes for a target for a specified
+ address space.
+ See the method llvm::DataLayout::getPointerSize. */
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef, unsigned AS);
+
/** Returns the integer type that is the same size as a pointer on a target.
- See the method llvm::TargetData::getIntPtrType. */
+ See the method llvm::DataLayout::getIntPtrType. */
LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef);
+/** Returns the integer type that is the same size as a pointer on a target.
+ This version allows the address space to be specified.
+ See the method llvm::DataLayout::getIntPtrType. */
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef, unsigned AS);
+
/** Computes the size of a type in bytes for a target.
- See the method llvm::TargetData::getTypeSizeInBits. */
+ See the method llvm::DataLayout::getTypeSizeInBits. */
unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef, LLVMTypeRef);
/** Computes the storage size of a type in bytes for a target.
- See the method llvm::TargetData::getTypeStoreSize. */
+ See the method llvm::DataLayout::getTypeStoreSize. */
unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef, LLVMTypeRef);
/** Computes the ABI size of a type in bytes for a target.
- See the method llvm::TargetData::getTypeAllocSize. */
+ See the method llvm::DataLayout::getTypeAllocSize. */
unsigned long long LLVMABISizeOfType(LLVMTargetDataRef, LLVMTypeRef);
/** Computes the ABI alignment of a type in bytes for a target.
- See the method llvm::TargetData::getTypeABISize. */
+ See the method llvm::DataLayout::getTypeABISize. */
unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef, LLVMTypeRef);
/** Computes the call frame alignment of a type in bytes for a target.
- See the method llvm::TargetData::getTypeABISize. */
+ See the method llvm::DataLayout::getTypeABISize. */
unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef, LLVMTypeRef);
/** Computes the preferred alignment of a type in bytes for a target.
- See the method llvm::TargetData::getTypeABISize. */
+ See the method llvm::DataLayout::getTypeABISize. */
unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef, LLVMTypeRef);
/** Computes the preferred alignment of a global variable in bytes for a target.
- See the method llvm::TargetData::getPreferredAlignment. */
+ See the method llvm::DataLayout::getPreferredAlignment. */
unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef,
LLVMValueRef GlobalVar);
@@ -216,7 +226,7 @@ unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef, LLVMTypeRef StructTy,
unsigned Element);
/** Deallocates a TargetData.
- See the destructor llvm::TargetData::~TargetData. */
+ See the destructor llvm::DataLayout::~DataLayout. */
void LLVMDisposeTargetData(LLVMTargetDataRef);
/**
@@ -227,15 +237,15 @@ void LLVMDisposeTargetData(LLVMTargetDataRef);
}
namespace llvm {
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
- inline TargetData *unwrap(LLVMTargetDataRef P) {
- return reinterpret_cast<TargetData*>(P);
+ inline DataLayout *unwrap(LLVMTargetDataRef P) {
+ return reinterpret_cast<DataLayout*>(P);
}
- inline LLVMTargetDataRef wrap(const TargetData *P) {
- return reinterpret_cast<LLVMTargetDataRef>(const_cast<TargetData*>(P));
+ inline LLVMTargetDataRef wrap(const DataLayout *P) {
+ return reinterpret_cast<LLVMTargetDataRef>(const_cast<DataLayout*>(P));
}
inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) {
diff --git a/include/llvm-c/TargetMachine.h b/include/llvm-c/TargetMachine.h
index 0d35d73a11df..29668de46529 100644
--- a/include/llvm-c/TargetMachine.h
+++ b/include/llvm-c/TargetMachine.h
@@ -104,7 +104,7 @@ char *LLVMGetTargetMachineCPU(LLVMTargetMachineRef T);
LLVMDisposeMessage. */
char *LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T);
-/** Returns the llvm::TargetData used for this llvm:TargetMachine. */
+/** Returns the llvm::DataLayout used for this llvm:TargetMachine. */
LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T);
/** Emits an asm or object file for the given module to the filename. This
diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h
index 9e7c7540d766..68a9bdd38854 100644
--- a/include/llvm-c/Transforms/Vectorize.h
+++ b/include/llvm-c/Transforms/Vectorize.h
@@ -36,6 +36,9 @@ extern "C" {
/** See llvm::createBBVectorizePass function. */
void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
+/** See llvm::createLoopVectorizePass function. */
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM);
+
/**
* @}
*/
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index 5a625a4c832f..31c6e6adbfc6 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -455,14 +455,11 @@ namespace llvm {
/* The sign bit of this number. */
unsigned int sign: 1;
-
- /* For PPCDoubleDouble, we have a second exponent and sign (the second
- significand is appended to the first one, although it would be wrong to
- regard these as a single number for arithmetic purposes). These fields
- are not meaningful for any other type. */
- exponent_t exponent2 : 11;
- unsigned int sign2: 1;
};
+
+ // See friend declaration above. This additional declaration is required in
+ // order to compile LLVM with IBM xlC compiler.
+ hash_code hash_value(const APFloat &Arg);
} /* namespace llvm */
#endif /* LLVM_FLOAT_H */
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index f30a6e3f081c..c7c8016b8339 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -251,7 +251,7 @@ public:
/// constructor.
APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);
- /// This constructor interprets the string \arg str in the given radix. The
+ /// This constructor interprets the string \p str in the given radix. The
/// interpretation stops when the first character that is not suitable for the
/// radix is encountered, or the end of the string. Acceptable radix values
/// are 2, 8, 10, 16, and 36. It is an error for the value implied by the
@@ -760,7 +760,7 @@ public:
APInt shl(unsigned shiftAmt) const {
assert(shiftAmt <= BitWidth && "Invalid shift amount");
if (isSingleWord()) {
- if (shiftAmt == BitWidth)
+ if (shiftAmt >= BitWidth)
return APInt(BitWidth, 0); // avoid undefined shift results
return APInt(BitWidth, VAL << shiftAmt);
}
@@ -1231,15 +1231,15 @@ public:
}
/// This method determines how many bits are required to hold the APInt
- /// equivalent of the string given by \arg str.
+ /// equivalent of the string given by \p str.
/// @brief Get bits required for string value.
static unsigned getBitsNeeded(StringRef str, uint8_t radix);
/// countLeadingZeros - This function is an APInt version of the
/// countLeadingZeros_{32,64} functions in MathExtras.h. It counts the number
/// of zeros from the most significant bit to the first one bit.
- /// @returns BitWidth if the value is zero.
- /// @returns the number of zeros from the most significant bit to the first
+ /// @returns BitWidth if the value is zero, otherwise
+ /// returns the number of zeros from the most significant bit to the first
/// one bits.
unsigned countLeadingZeros() const {
if (isSingleWord()) {
@@ -1252,8 +1252,8 @@ public:
/// countLeadingOnes - This function is an APInt version of the
/// countLeadingOnes_{32,64} functions in MathExtras.h. It counts the number
/// of ones from the most significant bit to the first zero bit.
- /// @returns 0 if the high order bit is not set
- /// @returns the number of 1 bits from the most significant to the least
+ /// @returns 0 if the high order bit is not set, otherwise
+ /// returns the number of 1 bits from the most significant to the least
/// @brief Count the number of leading one bits.
unsigned countLeadingOnes() const;
@@ -1266,8 +1266,8 @@ public:
/// countTrailingZeros - This function is an APInt version of the
/// countTrailingZeros_{32,64} functions in MathExtras.h. It counts
/// the number of zeros from the least significant bit to the first set bit.
- /// @returns BitWidth if the value is zero.
- /// @returns the number of zeros from the least significant bit to the first
+ /// @returns BitWidth if the value is zero, otherwise
+ /// returns the number of zeros from the least significant bit to the first
/// one bit.
/// @brief Count the number of trailing zero bits.
unsigned countTrailingZeros() const;
@@ -1275,8 +1275,8 @@ public:
/// countTrailingOnes - This function is an APInt version of the
/// countTrailingOnes_{32,64} functions in MathExtras.h. It counts
/// the number of ones from the least significant bit to the first zero bit.
- /// @returns BitWidth if the value is all ones.
- /// @returns the number of ones from the least significant bit to the first
+ /// @returns BitWidth if the value is all ones, otherwise
+ /// returns the number of ones from the least significant bit to the first
/// zero bit.
/// @brief Count the number of trailing one bits.
unsigned countTrailingOnes() const {
@@ -1288,8 +1288,8 @@ public:
/// countPopulation - This function is an APInt version of the
/// countPopulation_{32,64} functions in MathExtras.h. It counts the number
/// of 1 bits in the APInt value.
- /// @returns 0 if the value is zero.
- /// @returns the number of set bits.
+ /// @returns 0 if the value is zero, otherwise returns the number of set
+ /// bits.
/// @brief Count the number of bits set.
unsigned countPopulation() const {
if (isSingleWord())
@@ -1780,6 +1780,9 @@ inline APInt Not(const APInt& APIVal) {
} // End of APIntOps namespace
+ // See friend declaration above. This additional declaration is required in
+ // order to compile LLVM with IBM xlC compiler.
+ hash_code hash_value(const APInt &Arg);
} // End of llvm namespace
#endif
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index cf55aadef31a..1e35d6279219 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -59,12 +59,17 @@ namespace llvm {
ArrayRef(const T *begin, const T *end)
: Data(begin), Length(end - begin) {}
- /// Construct an ArrayRef from a SmallVector.
- /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<T> &Vec)
- : Data(Vec.data()), Length(Vec.size()) {}
+ /// Construct an ArrayRef from a SmallVector. This is templated in order to
+ /// avoid instantiating SmallVectorTemplateCommon<T> whenever we
+ /// copy-construct an ArrayRef.
+ template<typename U>
+ /*implicit*/ ArrayRef(const SmallVectorTemplateCommon<T, U> &Vec)
+ : Data(Vec.data()), Length(Vec.size()) {
+ }
/// Construct an ArrayRef from a std::vector.
- /*implicit*/ ArrayRef(const std::vector<T> &Vec)
+ template<typename A>
+ /*implicit*/ ArrayRef(const std::vector<T, A> &Vec)
: Data(Vec.empty() ? (T*)0 : &Vec[0]), Length(Vec.size()) {}
/// Construct an ArrayRef from a C array.
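The templated constructors keep the usual implicit conversions working; a small sketch (function and variable names are illustrative):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include <vector>

static int sumAll(llvm::ArrayRef<int> Vals) {
  int Sum = 0;
  for (unsigned i = 0, e = Vals.size(); i != e; ++i)
    Sum += Vals[i];
  return Sum;
}

static void arrayRefDemo() {
  llvm::SmallVector<int, 4> SV;
  SV.push_back(1);
  SV.push_back(2);
  std::vector<int> V(3, 7);   // default allocator is picked up by the new
                              // template parameter A
  int A = sumAll(SV);         // implicit ArrayRef from SmallVector
  int B = sumAll(V);          // implicit ArrayRef from std::vector
  (void)A;
  (void)B;
}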
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index 3e2e5f230a3a..9d6388f7ee61 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -172,7 +172,7 @@ public:
unsigned BitPos = Prev % BITWORD_SIZE;
BitWord Copy = Bits[WordPos];
// Mask off previous bits.
- Copy &= ~0L << BitPos;
+ Copy &= ~0UL << BitPos;
if (Copy != 0) {
if (sizeof(BitWord) == 4)
@@ -237,6 +237,34 @@ public:
return *this;
}
+ /// set - Efficiently set a range of bits in [I, E)
+ BitVector &set(unsigned I, unsigned E) {
+ assert(I <= E && "Attempted to set backwards range!");
+ assert(E <= size() && "Attempted to set out-of-bounds range!");
+
+ if (I == E) return *this;
+
+ if (I / BITWORD_SIZE == E / BITWORD_SIZE) {
+ BitWord EMask = 1UL << (E % BITWORD_SIZE);
+ BitWord IMask = 1UL << (I % BITWORD_SIZE);
+ BitWord Mask = EMask - IMask;
+ Bits[I / BITWORD_SIZE] |= Mask;
+ return *this;
+ }
+
+ BitWord PrefixMask = ~0UL << (I % BITWORD_SIZE);
+ Bits[I / BITWORD_SIZE] |= PrefixMask;
+ I = RoundUpToAlignment(I, BITWORD_SIZE);
+
+ for (; I + BITWORD_SIZE <= E; I += BITWORD_SIZE)
+ Bits[I / BITWORD_SIZE] = ~0UL;
+
+ BitWord PostfixMask = (1UL << (E % BITWORD_SIZE)) - 1;
+ Bits[I / BITWORD_SIZE] |= PostfixMask;
+
+ return *this;
+ }
+
BitVector &reset() {
init_words(Bits, Capacity, false);
return *this;
@@ -247,6 +275,34 @@ public:
return *this;
}
+ /// reset - Efficiently reset a range of bits in [I, E)
+ BitVector &reset(unsigned I, unsigned E) {
+ assert(I <= E && "Attempted to reset backwards range!");
+ assert(E <= size() && "Attempted to reset out-of-bounds range!");
+
+ if (I == E) return *this;
+
+ if (I / BITWORD_SIZE == E / BITWORD_SIZE) {
+ BitWord EMask = 1UL << (E % BITWORD_SIZE);
+ BitWord IMask = 1UL << (I % BITWORD_SIZE);
+ BitWord Mask = EMask - IMask;
+ Bits[I / BITWORD_SIZE] &= ~Mask;
+ return *this;
+ }
+
+ BitWord PrefixMask = ~0UL << (I % BITWORD_SIZE);
+ Bits[I / BITWORD_SIZE] &= ~PrefixMask;
+ I = RoundUpToAlignment(I, BITWORD_SIZE);
+
+ for (; I + BITWORD_SIZE <= E; I += BITWORD_SIZE)
+ Bits[I / BITWORD_SIZE] = 0UL;
+
+ BitWord PostfixMask = (1UL << (E % BITWORD_SIZE)) - 1;
+ Bits[I / BITWORD_SIZE] &= ~PostfixMask;
+
+ return *this;
+ }
+
BitVector &flip() {
for (unsigned i = 0; i < NumBitWords(size()); ++i)
Bits[i] = ~Bits[i];
@@ -311,7 +367,7 @@ public:
return !(*this == RHS);
}
- // Intersection, union, disjoint union.
+ /// Intersection, union, disjoint union.
BitVector &operator&=(const BitVector &RHS) {
unsigned ThisWords = NumBitWords(size());
unsigned RHSWords = NumBitWords(RHS.size());
@@ -328,7 +384,7 @@ public:
return *this;
}
- // reset - Reset bits that are set in RHS. Same as *this &= ~RHS.
+ /// reset - Reset bits that are set in RHS. Same as *this &= ~RHS.
BitVector &reset(const BitVector &RHS) {
unsigned ThisWords = NumBitWords(size());
unsigned RHSWords = NumBitWords(RHS.size());
@@ -338,6 +394,23 @@ public:
return *this;
}
+ /// test - Check if (This - RHS) is zero.
+ /// This is the same as reset(RHS) and any().
+ bool test(const BitVector &RHS) const {
+ unsigned ThisWords = NumBitWords(size());
+ unsigned RHSWords = NumBitWords(RHS.size());
+ unsigned i;
+ for (i = 0; i != std::min(ThisWords, RHSWords); ++i)
+ if ((Bits[i] & ~RHS.Bits[i]) != 0)
+ return true;
+
+ for (; i != ThisWords ; ++i)
+ if (Bits[i] != 0)
+ return true;
+
+ return false;
+ }
+
BitVector &operator|=(const BitVector &RHS) {
if (size() < RHS.size())
resize(RHS.size());
@@ -451,8 +524,11 @@ private:
// Then set any stray high bits of the last used word.
unsigned ExtraBits = Size % BITWORD_SIZE;
if (ExtraBits) {
- Bits[UsedWords-1] &= ~(~0L << ExtraBits);
- Bits[UsedWords-1] |= (0 - (BitWord)t) << ExtraBits;
+ BitWord ExtraBitMask = ~0UL << ExtraBits;
+ if (t)
+ Bits[UsedWords-1] |= ExtraBitMask;
+ else
+ Bits[UsedWords-1] &= ~ExtraBitMask;
}
}
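A short sketch exercising the new range-based set/reset and the new test(RHS) predicate (variable names are illustrative):

#include "llvm/ADT/BitVector.h"

static void bitVectorRangeDemo() {
  llvm::BitVector Live(128);
  Live.set(10, 20);    // set bits [10, 20)
  Live.reset(12, 14);  // clear bits [12, 14) again

  llvm::BitVector Kill(128);
  Kill.set(15);

  // test(RHS) reports whether (Live - Kill) has any bit set, i.e. the same
  // answer as reset(RHS) followed by any(), but without modifying Live.
  bool AnyLeft = Live.test(Kill);
  (void)AnyLeft;
}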
diff --git a/include/llvm/ADT/DAGDeltaAlgorithm.h b/include/llvm/ADT/DAGDeltaAlgorithm.h
index e502ac4348d0..2dfed075dea5 100644
--- a/include/llvm/ADT/DAGDeltaAlgorithm.h
+++ b/include/llvm/ADT/DAGDeltaAlgorithm.h
@@ -48,17 +48,18 @@ public:
public:
virtual ~DAGDeltaAlgorithm() {}
- /// Run - Minimize the DAG formed by the \arg Changes vertices and the \arg
- /// Dependencies edges by executing \see ExecuteOneTest() on subsets of
+ /// Run - Minimize the DAG formed by the \p Changes vertices and the
+ /// \p Dependencies edges by executing \see ExecuteOneTest() on subsets of
/// changes and returning the smallest set which still satisfies the test
- /// predicate and the input \arg Dependencies.
+ /// predicate and the input \p Dependencies.
///
/// \param Changes The list of changes.
///
/// \param Dependencies The list of dependencies amongst changes. For each
- /// (x,y) in \arg Dependencies, both x and y must be in \arg Changes. The
- /// minimization algorithm guarantees that for each tested changed set S, x
- /// \in S implies y \in S. It is an error to have cyclic dependencies.
+ /// (x,y) in \p Dependencies, both x and y must be in \p Changes. The
+ /// minimization algorithm guarantees that for each tested changed set S,
+ /// \f$ x \in S \f$ implies \f$ y \in S \f$. It is an error to have cyclic
+ /// dependencies.
changeset_ty Run(const changeset_ty &Changes,
const std::vector<edge_ty> &Dependencies);
@@ -67,7 +68,7 @@ public:
const changesetlist_ty &Sets,
const changeset_ty &Required) {}
- /// ExecuteOneTest - Execute a single test predicate on the change set \arg S.
+ /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
virtual bool ExecuteOneTest(const changeset_ty &S) = 0;
};
diff --git a/include/llvm/ADT/DeltaAlgorithm.h b/include/llvm/ADT/DeltaAlgorithm.h
index 45ba19891d4f..7bf7960c63a9 100644
--- a/include/llvm/ADT/DeltaAlgorithm.h
+++ b/include/llvm/ADT/DeltaAlgorithm.h
@@ -45,23 +45,23 @@ private:
/// since we always reduce following a success.
std::set<changeset_ty> FailedTestsCache;
- /// GetTestResult - Get the test result for the \arg Changes from the
+ /// GetTestResult - Get the test result for the \p Changes from the
/// cache, executing the test if necessary.
///
/// \param Changes - The change set to test.
/// \return - The test result.
bool GetTestResult(const changeset_ty &Changes);
- /// Split - Partition a set of changes \arg S into one or two subsets.
+ /// Split - Partition a set of changes \p S into one or two subsets.
void Split(const changeset_ty &S, changesetlist_ty &Res);
- /// Delta - Minimize a set of \arg Changes which has been partioned into
+  /// Delta - Minimize a set of \p Changes which has been partitioned into
/// smaller sets, by attempting to remove individual subsets.
changeset_ty Delta(const changeset_ty &Changes,
const changesetlist_ty &Sets);
- /// Search - Search for a subset (or subsets) in \arg Sets which can be
- /// removed from \arg Changes while still satisfying the predicate.
+ /// Search - Search for a subset (or subsets) in \p Sets which can be
+ /// removed from \p Changes while still satisfying the predicate.
///
/// \param Res - On success, a subset of Changes which satisfies the
/// predicate.
@@ -74,13 +74,13 @@ protected:
virtual void UpdatedSearchState(const changeset_ty &Changes,
const changesetlist_ty &Sets) {}
- /// ExecuteOneTest - Execute a single test predicate on the change set \arg S.
+ /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
virtual bool ExecuteOneTest(const changeset_ty &S) = 0;
public:
virtual ~DeltaAlgorithm();
- /// Run - Minimize the set \arg Changes by executing \see ExecuteOneTest() on
+ /// Run - Minimize the set \p Changes by executing \see ExecuteOneTest() on
/// subsets of changes and returning the smallest set which still satisfies
/// the test predicate.
changeset_ty Run(const changeset_ty &Changes);
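
The headers above only declare the pure-virtual ExecuteOneTest() hook; a client
supplies the actual test predicate by subclassing. A minimal driver sketch (not
part of this patch, and assuming the usual changeset_ty = std::set<unsigned>
typedef from DeltaAlgorithm.h):

#include "llvm/ADT/DeltaAlgorithm.h"
#include <iostream>

namespace {
// Treat a change set as "interesting" whenever it still contains change #3;
// Run() then minimizes toward the smallest such subset, here {3}.
class ContainsThree : public llvm::DeltaAlgorithm {
protected:
  virtual bool ExecuteOneTest(const changeset_ty &S) {
    return S.count(3) != 0;
  }
};
} // end anonymous namespace

int main() {
  ContainsThree DA;
  llvm::DeltaAlgorithm::changeset_ty Changes;
  for (unsigned i = 0; i != 8; ++i)
    Changes.insert(i);
  llvm::DeltaAlgorithm::changeset_ty Minimal = DA.Run(Changes);
  std::cout << "minimal size: " << Minimal.size() << "\n"; // expected: 1
  return 0;
}
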
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index f60d688c0dce..ac4bdbd126c5 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -420,9 +420,10 @@ private:
NumBuckets = getNumBuckets();
}
if (NumBuckets-(NewNumEntries+getNumTombstones()) <= NumBuckets/8) {
- this->grow(NumBuckets);
+ this->grow(NumBuckets * 2);
LookupBucketFor(Key, TheBucket);
}
+ assert(TheBucket);
// Only update the state after we've grown our bucket space appropriately
// so that when growing buckets we have self-consistent entry count.
@@ -599,7 +600,7 @@ public:
unsigned OldNumBuckets = NumBuckets;
BucketT *OldBuckets = Buckets;
- allocateBuckets(std::max<unsigned>(64, NextPowerOf2(AtLeast)));
+ allocateBuckets(std::max<unsigned>(64, NextPowerOf2(AtLeast-1)));
assert(Buckets);
if (!OldBuckets) {
this->BaseT::initEmpty();
@@ -825,11 +826,11 @@ public:
}
void grow(unsigned AtLeast) {
- if (AtLeast > InlineBuckets)
- AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast));
+ if (AtLeast >= InlineBuckets)
+ AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast-1));
if (Small) {
- if (AtLeast <= InlineBuckets)
+ if (AtLeast < InlineBuckets)
return; // Nothing to do.
// First move the inline buckets into a temporary storage.
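
For context on the grow() tweaks above (illustration only, not part of the
patch): llvm::NextPowerOf2(A) returns the next power of two strictly greater
than A, so the old NextPowerOf2(AtLeast) doubled a request that was already a
power of two, while the new NextPowerOf2(AtLeast-1) rounds up to the smallest
power of two >= AtLeast. A standalone sketch of that arithmetic, with
NextPowerOf2 re-implemented locally under the assumed MathExtras semantics:

#include <cassert>
#include <stdint.h>

// Local stand-in for llvm::NextPowerOf2: the next power of two strictly
// greater than A.
static uint64_t NextPowerOf2(uint64_t A) {
  A |= (A >> 1);  A |= (A >> 2);  A |= (A >> 4);
  A |= (A >> 8);  A |= (A >> 16); A |= (A >> 32);
  return A + 1;
}

int main() {
  assert(NextPowerOf2(64) == 128);     // old call: a 64-bucket request doubles
  assert(NextPowerOf2(64 - 1) == 64);  // new call: exact powers of two are kept
  assert(NextPowerOf2(65 - 1) == 128); // everything else still rounds up
  return 0;
}
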
diff --git a/include/llvm/ADT/DenseMapInfo.h b/include/llvm/ADT/DenseMapInfo.h
index 1559a35c39f9..6f17a647b63d 100644
--- a/include/llvm/ADT/DenseMapInfo.h
+++ b/include/llvm/ADT/DenseMapInfo.h
@@ -31,12 +31,12 @@ struct DenseMapInfo {
template<typename T>
struct DenseMapInfo<T*> {
static inline T* getEmptyKey() {
- intptr_t Val = -1;
+ uintptr_t Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<T*>::NumLowBitsAvailable;
return reinterpret_cast<T*>(Val);
}
static inline T* getTombstoneKey() {
- intptr_t Val = -2;
+ uintptr_t Val = static_cast<uintptr_t>(-2);
Val <<= PointerLikeTypeTraits<T*>::NumLowBitsAvailable;
return reinterpret_cast<T*>(Val);
}
@@ -105,7 +105,7 @@ template<> struct DenseMapInfo<int> {
// Provide DenseMapInfo for longs.
template<> struct DenseMapInfo<long> {
static inline long getEmptyKey() {
- return (1UL << (sizeof(long) * 8 - 1)) - 1L;
+ return (1UL << (sizeof(long) * 8 - 1)) - 1UL;
}
static inline long getTombstoneKey() { return getEmptyKey() - 1L; }
static unsigned getHashValue(const long& Val) {
diff --git a/include/llvm/ADT/EquivalenceClasses.h b/include/llvm/ADT/EquivalenceClasses.h
index 771476c30361..1d81772ee8ae 100644
--- a/include/llvm/ADT/EquivalenceClasses.h
+++ b/include/llvm/ADT/EquivalenceClasses.h
@@ -33,6 +33,7 @@ namespace llvm {
///
/// Here is a simple example using integers:
///
+/// \code
/// EquivalenceClasses<int> EC;
/// EC.unionSets(1, 2); // insert 1, 2 into the same set
/// EC.insert(4); EC.insert(5); // insert 4, 5 into own sets
@@ -46,6 +47,7 @@ namespace llvm {
/// cerr << *MI << " "; // Print member.
/// cerr << "\n"; // Finish set.
/// }
+/// \endcode
///
/// This example prints:
/// 4
diff --git a/include/llvm/ADT/FoldingSet.h b/include/llvm/ADT/FoldingSet.h
index ba415ac2d61f..375d84abebdd 100644
--- a/include/llvm/ADT/FoldingSet.h
+++ b/include/llvm/ADT/FoldingSet.h
@@ -278,6 +278,10 @@ public:
bool operator==(FoldingSetNodeIDRef) const;
+ /// Used to compare the "ordering" of two nodes as defined by the
+ /// profiled bits and their ordering defined by memcmp().
+ bool operator<(FoldingSetNodeIDRef) const;
+
const unsigned *getData() const { return Data; }
size_t getSize() const { return Size; }
};
@@ -327,6 +331,11 @@ public:
bool operator==(const FoldingSetNodeID &RHS) const;
bool operator==(const FoldingSetNodeIDRef RHS) const;
+ /// Used to compare the "ordering" of two nodes as defined by the
+ /// profiled bits and their ordering defined by memcmp().
+ bool operator<(const FoldingSetNodeID &RHS) const;
+ bool operator<(const FoldingSetNodeIDRef RHS) const;
+
/// Intern - Copy this node's data to a memory region allocated from the
/// given allocator and return a FoldingSetNodeIDRef describing the
/// interned data.
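
The new operator< overloads give FoldingSetNodeID a total order based on a
memcmp() of the profiled words. A tiny sketch (not part of the patch), e.g. for
keeping IDs in a sorted container:

#include "llvm/ADT/FoldingSet.h"
#include <iostream>

int main() {
  llvm::FoldingSetNodeID A, B;
  A.AddInteger(1u);
  B.AddInteger(2u);
  // The ordering is over the raw profiled bits, not a numeric comparison,
  // but it is a strict weak ordering: exactly one of these is true.
  std::cout << (A < B) << " " << (B < A) << "\n";
  return 0;
}
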
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index 6ab07254a21d..cda31a261df2 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -409,7 +409,6 @@ bool store_and_advance(char *&buffer_ptr, char *buffer_end, const T& value,
/// combining them, this (as an optimization) directly combines the integers.
template <typename InputIteratorT>
hash_code hash_combine_range_impl(InputIteratorT first, InputIteratorT last) {
- typedef typename std::iterator_traits<InputIteratorT>::value_type ValueT;
const size_t seed = get_execution_seed();
char buffer[64], *buffer_ptr = buffer;
char *const buffer_end = buffer_ptr + array_lengthof(buffer);
@@ -711,7 +710,7 @@ hash_code hash_combine(const T1 &arg1) {
#endif
-// Implementation details for implementatinos of hash_value overloads provided
+// Implementation details for implementations of hash_value overloads provided
// here.
namespace hashing {
namespace detail {
diff --git a/include/llvm/ADT/ImmutableList.h b/include/llvm/ADT/ImmutableList.h
index d7c0074a9f08..20bdd903f7a5 100644
--- a/include/llvm/ADT/ImmutableList.h
+++ b/include/llvm/ADT/ImmutableList.h
@@ -33,9 +33,8 @@ class ImmutableListImpl : public FoldingSetNode {
friend class ImmutableListFactory<T>;
- // Do not implement.
- void operator=(const ImmutableListImpl&);
- ImmutableListImpl(const ImmutableListImpl&);
+ void operator=(const ImmutableListImpl&) LLVM_DELETED_FUNCTION;
+ ImmutableListImpl(const ImmutableListImpl&) LLVM_DELETED_FUNCTION;
public:
const T& getHead() const { return Head; }
diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h
index 8346ffabff76..4883c5ba0a6b 100644
--- a/include/llvm/ADT/ImmutableMap.h
+++ b/include/llvm/ADT/ImmutableMap.h
@@ -122,8 +122,8 @@ public:
}
private:
- Factory(const Factory& RHS); // DO NOT IMPLEMENT
- void operator=(const Factory& RHS); // DO NOT IMPLEMENT
+ Factory(const Factory& RHS) LLVM_DELETED_FUNCTION;
+ void operator=(const Factory& RHS) LLVM_DELETED_FUNCTION;
};
bool contains(key_type_ref K) const {
diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h
index 949dc44daba6..3900f96be16a 100644
--- a/include/llvm/ADT/ImmutableSet.h
+++ b/include/llvm/ADT/ImmutableSet.h
@@ -22,7 +22,6 @@
#include <cassert>
#include <functional>
#include <vector>
-#include <stdio.h>
namespace llvm {
@@ -84,13 +83,13 @@ public:
}
return NULL;
}
-
+
/// getMaxElement - Find the subtree associated with the highest ranged
/// key value.
ImutAVLTree* getMaxElement() {
ImutAVLTree *T = this;
- ImutAVLTree *Right = T->getRight();
- while (Right) { T = right; right = T->getRight(); }
+ ImutAVLTree *Right = T->getRight();
+ while (Right) { T = Right; Right = T->getRight(); }
return T;
}
@@ -258,7 +257,7 @@ private:
/// method returns false for an instance of ImutAVLTree, all subtrees
/// will also have this method return false. The converse is not true.
bool isMutable() const { return IsMutable; }
-
+
/// hasCachedDigest - Returns true if the digest for this tree is cached.
/// This can only be true if the tree is immutable.
bool hasCachedDigest() const { return IsDigestCached; }
@@ -280,7 +279,7 @@ private:
assert(isMutable() && "Mutable flag already removed.");
IsMutable = false;
}
-
+
/// markedCachedDigest - Clears the NoCachedDigest flag for a tree.
void markedCachedDigest() {
assert(!hasCachedDigest() && "NoCachedDigest flag already removed.");
@@ -349,7 +348,7 @@ public:
else
factory->Cache[factory->maskCacheIndex(computeDigest())] = next;
}
-
+
// We need to clear the mutability bit in case we are
// destroying the node as part of a sweep in ImutAVLFactory::recoverNodes().
IsMutable = false;
@@ -415,7 +414,7 @@ public:
TreeTy* getEmptyTree() const { return NULL; }
protected:
-
+
//===--------------------------------------------------===//
// A bunch of quick helper functions used for reasoning
// about the properties of trees and their children.
@@ -461,7 +460,7 @@ protected:
// returned to the caller.
//===--------------------------------------------------===//
- TreeTy* createNode(TreeTy* L, value_type_ref V, TreeTy* R) {
+ TreeTy* createNode(TreeTy* L, value_type_ref V, TreeTy* R) {
BumpPtrAllocator& A = getAllocator();
TreeTy* T;
if (!freeNodes.empty()) {
@@ -469,8 +468,7 @@ protected:
freeNodes.pop_back();
assert(T != L);
assert(T != R);
- }
- else {
+ } else {
T = (TreeTy*) A.Allocate<TreeTy>();
}
new (T) TreeTy(this, L, R, V, incrementHeight(L,R));
@@ -513,7 +511,8 @@ protected:
return createNode(createNode(LL,L,LRL), LR, createNode(LRR,V,R));
}
- else if (hr > hl + 2) {
+
+ if (hr > hl + 2) {
assert(!isEmpty(R) && "Right tree cannot be empty to have a height >= 2");
TreeTy *RL = getLeft(R);
@@ -529,8 +528,8 @@ protected:
return createNode(createNode(L,V,RLL), RL, createNode(RLR,R,RR));
}
- else
- return createNode(L,V,R);
+
+ return createNode(L,V,R);
}
/// add_internal - Creates a new tree that includes the specified
@@ -604,7 +603,7 @@ protected:
markImmutable(getLeft(T));
markImmutable(getRight(T));
}
-
+
public:
TreeTy *getCanonicalTree(TreeTy *TNew) {
if (!TNew)
@@ -937,7 +936,7 @@ public:
private:
TreeTy *Root;
-
+
public:
/// Constructs a set from a pointer to a tree root. In general one
/// should use a Factory object to create sets instead of directly
@@ -1006,10 +1005,10 @@ public:
typename TreeTy::Factory *getTreeFactory() const {
return const_cast<typename TreeTy::Factory *>(&F);
}
-
+
private:
- Factory(const Factory& RHS); // DO NOT IMPLEMENT
- void operator=(const Factory& RHS); // DO NOT IMPLEMENT
+ Factory(const Factory& RHS) LLVM_DELETED_FUNCTION;
+ void operator=(const Factory& RHS) LLVM_DELETED_FUNCTION;
};
friend class Factory;
@@ -1027,11 +1026,11 @@ public:
return Root && RHS.Root ? Root->isNotEqual(*RHS.Root) : Root != RHS.Root;
}
- TreeTy *getRoot() {
+ TreeTy *getRoot() {
if (Root) { Root->retain(); }
return Root;
}
-
+
TreeTy *getRootWithoutRetain() const {
return Root;
}
@@ -1092,7 +1091,7 @@ public:
void validateTree() const { if (Root) Root->validateTree(); }
};
-
+
// NOTE: This may some day replace the current ImmutableSet.
template <typename ValT, typename ValInfo = ImutContainerInfo<ValT> >
class ImmutableSetRef {
@@ -1101,11 +1100,11 @@ public:
typedef typename ValInfo::value_type_ref value_type_ref;
typedef ImutAVLTree<ValInfo> TreeTy;
typedef typename TreeTy::Factory FactoryTy;
-
+
private:
TreeTy *Root;
FactoryTy *Factory;
-
+
public:
/// Constructs a set from a pointer to a tree root. In general one
/// should use a Factory object to create sets instead of directly
@@ -1133,44 +1132,44 @@ public:
~ImmutableSetRef() {
if (Root) { Root->release(); }
}
-
+
static inline ImmutableSetRef getEmptySet(FactoryTy *F) {
return ImmutableSetRef(0, F);
}
-
+
ImmutableSetRef add(value_type_ref V) {
return ImmutableSetRef(Factory->add(Root, V), Factory);
}
-
+
ImmutableSetRef remove(value_type_ref V) {
return ImmutableSetRef(Factory->remove(Root, V), Factory);
}
-
+
/// Returns true if the set contains the specified value.
bool contains(value_type_ref V) const {
return Root ? Root->contains(V) : false;
}
-
+
ImmutableSet<ValT> asImmutableSet(bool canonicalize = true) const {
return ImmutableSet<ValT>(canonicalize ?
Factory->getCanonicalTree(Root) : Root);
}
-
+
TreeTy *getRootWithoutRetain() const {
return Root;
}
-
+
bool operator==(const ImmutableSetRef &RHS) const {
return Root && RHS.Root ? Root->isEqual(*RHS.Root) : Root == RHS.Root;
}
-
+
bool operator!=(const ImmutableSetRef &RHS) const {
return Root && RHS.Root ? Root->isNotEqual(*RHS.Root) : Root != RHS.Root;
}
/// isEmpty - Return true if the set contains no elements.
bool isEmpty() const { return !Root; }
-
+
/// isSingleton - Return true if the set contains exactly one element.
/// This method runs in constant time.
bool isSingleton() const { return getHeight() == 1; }
@@ -1178,7 +1177,7 @@ public:
//===--------------------------------------------------===//
// Iterators.
//===--------------------------------------------------===//
-
+
class iterator {
typename TreeTy::iterator itr;
iterator(TreeTy* t) : itr(t) {}
@@ -1194,28 +1193,28 @@ public:
inline bool operator!=(const iterator& RHS) const { return RHS.itr != itr; }
inline value_type *operator->() const { return &(operator*()); }
};
-
+
iterator begin() const { return iterator(Root); }
iterator end() const { return iterator(); }
-
+
//===--------------------------------------------------===//
// Utility methods.
//===--------------------------------------------------===//
-
+
unsigned getHeight() const { return Root ? Root->getHeight() : 0; }
-
+
static inline void Profile(FoldingSetNodeID& ID, const ImmutableSetRef& S) {
ID.AddPointer(S.Root);
}
-
+
inline void Profile(FoldingSetNodeID& ID) const {
return Profile(ID,*this);
}
-
+
//===--------------------------------------------------===//
// For testing.
//===--------------------------------------------------===//
-
+
void validateTree() const { if (Root) Root->validateTree(); }
};
diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
new file mode 100644
index 000000000000..6aacca5a6f0f
--- /dev/null
+++ b/include/llvm/ADT/MapVector.h
@@ -0,0 +1,90 @@
+//===- llvm/ADT/MapVector.h - Map with deterministic value order *- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a map that provides insertion order iteration. The
+// interface is purposefully minimal. The key is assumed to be cheap to copy
+// and 2 copies are kept, one for indexing in a DenseMap, one for iteration in
+// a std::vector.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_MAPVECTOR_H
+#define LLVM_ADT_MAPVECTOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include <vector>
+
+namespace llvm {
+
+/// This class implements a map that also provides access to all stored values
+/// in a deterministic order. The values are kept in a std::vector and the
+/// mapping is done with DenseMap from Keys to indexes in that vector.
+template<typename KeyT, typename ValueT,
+ typename MapType = llvm::DenseMap<KeyT, unsigned>,
+ typename VectorType = std::vector<std::pair<KeyT, ValueT> > >
+class MapVector {
+ typedef typename VectorType::size_type SizeType;
+
+ MapType Map;
+ VectorType Vector;
+
+public:
+ typedef typename VectorType::iterator iterator;
+ typedef typename VectorType::const_iterator const_iterator;
+
+ SizeType size() const {
+ return Vector.size();
+ }
+
+ iterator begin() {
+ return Vector.begin();
+ }
+
+ const_iterator begin() const {
+ return Vector.begin();
+ }
+
+ iterator end() {
+ return Vector.end();
+ }
+
+ const_iterator end() const {
+ return Vector.end();
+ }
+
+ bool empty() const {
+ return Vector.empty();
+ }
+
+ void clear() {
+ Map.clear();
+ Vector.clear();
+ }
+
+ ValueT &operator[](const KeyT &Key) {
+ std::pair<KeyT, unsigned> Pair = std::make_pair(Key, 0);
+ std::pair<typename MapType::iterator, bool> Result = Map.insert(Pair);
+ unsigned &I = Result.first->second;
+ if (Result.second) {
+ Vector.push_back(std::make_pair(Key, ValueT()));
+ I = Vector.size() - 1;
+ }
+ return Vector[I].second;
+ }
+
+ unsigned count(const KeyT &Key) const {
+ typename MapType::const_iterator Pos = Map.find(Key);
+ return Pos == Map.end()? 0 : 1;
+ }
+};
+
+}
+
+#endif
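
A minimal usage sketch for the new MapVector (illustration only, not part of
the patch): lookups and operator[] go through the DenseMap, while iteration
walks the vector in insertion order.

#include "llvm/ADT/MapVector.h"
#include <iostream>
#include <string>

int main() {
  llvm::MapVector<int, std::string> MV;
  MV[42] = "answer";
  MV[7]  = "lucky";
  MV[42] = "still the answer";  // existing key: value updated, position kept

  // Visits entries in first-insertion order: 42 before 7.
  for (llvm::MapVector<int, std::string>::iterator I = MV.begin(),
                                                   E = MV.end();
       I != E; ++I)
    std::cout << I->first << " -> " << I->second << "\n";
  return 0;
}
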
diff --git a/include/llvm/ADT/Optional.h b/include/llvm/ADT/Optional.h
index ee8b69f3d12f..f43aeb1bc4d9 100644
--- a/include/llvm/ADT/Optional.h
+++ b/include/llvm/ADT/Optional.h
@@ -16,8 +16,13 @@
#ifndef LLVM_ADT_OPTIONAL
#define LLVM_ADT_OPTIONAL
+#include "llvm/Support/Compiler.h"
#include <cassert>
+#if LLVM_USE_RVALUE_REFERENCES
+#include <utility>
+#endif
+
namespace llvm {
template<typename T>
@@ -28,6 +33,10 @@ public:
explicit Optional() : x(), hasVal(false) {}
Optional(const T &y) : x(y), hasVal(true) {}
+#if LLVM_USE_RVALUE_REFERENCES
+ Optional(T &&y) : x(std::forward<T>(y)), hasVal(true) {}
+#endif
+
static inline Optional create(const T* y) {
return y ? Optional(*y) : Optional();
}
diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h
index 6d9c30597789..05bcd40d0862 100644
--- a/include/llvm/ADT/OwningPtr.h
+++ b/include/llvm/ADT/OwningPtr.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ADT_OWNING_PTR_H
#define LLVM_ADT_OWNING_PTR_H
+#include "llvm/Support/Compiler.h"
#include <cassert>
#include <cstddef>
@@ -25,12 +26,21 @@ namespace llvm {
/// pointee object can be taken away from OwningPtr by using the take method.
template<class T>
class OwningPtr {
- OwningPtr(OwningPtr const &); // DO NOT IMPLEMENT
- OwningPtr &operator=(OwningPtr const &); // DO NOT IMPLEMENT
+ OwningPtr(OwningPtr const &) LLVM_DELETED_FUNCTION;
+ OwningPtr &operator=(OwningPtr const &) LLVM_DELETED_FUNCTION;
T *Ptr;
public:
explicit OwningPtr(T *P = 0) : Ptr(P) {}
+#if LLVM_USE_RVALUE_REFERENCES
+ OwningPtr(OwningPtr &&Other) : Ptr(Other.take()) {}
+
+ OwningPtr &operator=(OwningPtr &&Other) {
+ reset(Other.take());
+ return *this;
+ }
+#endif
+
~OwningPtr() {
delete Ptr;
}
@@ -79,12 +89,21 @@ inline void swap(OwningPtr<T> &a, OwningPtr<T> &b) {
/// functionality as OwningPtr, except that it works for array types.
template<class T>
class OwningArrayPtr {
- OwningArrayPtr(OwningArrayPtr const &); // DO NOT IMPLEMENT
- OwningArrayPtr &operator=(OwningArrayPtr const &); // DO NOT IMPLEMENT
+ OwningArrayPtr(OwningArrayPtr const &) LLVM_DELETED_FUNCTION;
+ OwningArrayPtr &operator=(OwningArrayPtr const &) LLVM_DELETED_FUNCTION;
T *Ptr;
public:
explicit OwningArrayPtr(T *P = 0) : Ptr(P) {}
+#if LLVM_USE_RVALUE_REFERENCES
+ OwningArrayPtr(OwningArrayPtr &&Other) : Ptr(Other.take()) {}
+
+ OwningArrayPtr &operator=(OwningArrayPtr &&Other) {
+ reset(Other.take());
+ return *this;
+ }
+#endif
+
~OwningArrayPtr() {
delete [] Ptr;
}
diff --git a/include/llvm/ADT/PackedVector.h b/include/llvm/ADT/PackedVector.h
index 2eaddc2b4eea..1ae2a77e7eaf 100644
--- a/include/llvm/ADT/PackedVector.h
+++ b/include/llvm/ADT/PackedVector.h
@@ -19,32 +19,32 @@
namespace llvm {
-template <typename T, unsigned BitNum, bool isSigned>
+template <typename T, unsigned BitNum, typename BitVectorTy, bool isSigned>
class PackedVectorBase;
// This won't be necessary if we can specialize members without specializing
// the parent template.
-template <typename T, unsigned BitNum>
-class PackedVectorBase<T, BitNum, false> {
+template <typename T, unsigned BitNum, typename BitVectorTy>
+class PackedVectorBase<T, BitNum, BitVectorTy, false> {
protected:
- static T getValue(const llvm::BitVector &Bits, unsigned Idx) {
+ static T getValue(const BitVectorTy &Bits, unsigned Idx) {
T val = T();
for (unsigned i = 0; i != BitNum; ++i)
val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i));
return val;
}
- static void setValue(llvm::BitVector &Bits, unsigned Idx, T val) {
+ static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
assert((val >> BitNum) == 0 && "value is too big");
for (unsigned i = 0; i != BitNum; ++i)
Bits[(Idx << (BitNum-1)) + i] = val & (T(1) << i);
}
};
-template <typename T, unsigned BitNum>
-class PackedVectorBase<T, BitNum, true> {
+template <typename T, unsigned BitNum, typename BitVectorTy>
+class PackedVectorBase<T, BitNum, BitVectorTy, true> {
protected:
- static T getValue(const llvm::BitVector &Bits, unsigned Idx) {
+ static T getValue(const BitVectorTy &Bits, unsigned Idx) {
T val = T();
for (unsigned i = 0; i != BitNum-1; ++i)
val = T(val | ((Bits[(Idx << (BitNum-1)) + i] ? 1UL : 0UL) << i));
@@ -53,7 +53,7 @@ protected:
return val;
}
- static void setValue(llvm::BitVector &Bits, unsigned Idx, T val) {
+ static void setValue(BitVectorTy &Bits, unsigned Idx, T val) {
if (val < 0) {
val = ~val;
Bits.set((Idx << (BitNum-1)) + BitNum-1);
@@ -71,11 +71,12 @@ protected:
/// @endcode
/// will create a vector accepting values -2, -1, 0, 1. Any other value will hit
/// an assertion.
-template <typename T, unsigned BitNum>
-class PackedVector : public PackedVectorBase<T, BitNum,
+template <typename T, unsigned BitNum, typename BitVectorTy = BitVector>
+class PackedVector : public PackedVectorBase<T, BitNum, BitVectorTy,
std::numeric_limits<T>::is_signed> {
- llvm::BitVector Bits;
- typedef PackedVectorBase<T, BitNum, std::numeric_limits<T>::is_signed> base;
+ BitVectorTy Bits;
+ typedef PackedVectorBase<T, BitNum, BitVectorTy,
+ std::numeric_limits<T>::is_signed> base;
public:
class reference {
diff --git a/include/llvm/ADT/PointerIntPair.h b/include/llvm/ADT/PointerIntPair.h
index fcc758b43a27..71c379bad5a4 100644
--- a/include/llvm/ADT/PointerIntPair.h
+++ b/include/llvm/ADT/PointerIntPair.h
@@ -135,12 +135,12 @@ template<typename PointerTy, unsigned IntBits, typename IntType>
struct DenseMapInfo<PointerIntPair<PointerTy, IntBits, IntType> > {
typedef PointerIntPair<PointerTy, IntBits, IntType> Ty;
static Ty getEmptyKey() {
- intptr_t Val = -1;
+ uintptr_t Val = static_cast<uintptr_t>(-1);
Val <<= PointerLikeTypeTraits<PointerTy>::NumLowBitsAvailable;
return Ty(reinterpret_cast<PointerTy>(Val), IntType((1 << IntBits)-1));
}
static Ty getTombstoneKey() {
- intptr_t Val = -2;
+ uintptr_t Val = static_cast<uintptr_t>(-2);
Val <<= PointerLikeTypeTraits<PointerTy>::NumLowBitsAvailable;
return Ty(reinterpret_cast<PointerTy>(Val), IntType(0));
}
diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index a6803ee0eddf..efddd9f9b857 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h
@@ -90,8 +90,8 @@ class ScopedHashTableScope {
/// LastValInScope - This is the last value that was inserted for this scope
/// or null if none have been inserted yet.
ScopedHashTableVal<K, V> *LastValInScope;
- void operator=(ScopedHashTableScope&); // DO NOT IMPLEMENT
- ScopedHashTableScope(ScopedHashTableScope&); // DO NOT IMPLEMENT
+ void operator=(ScopedHashTableScope&) LLVM_DELETED_FUNCTION;
+ ScopedHashTableScope(ScopedHashTableScope&) LLVM_DELETED_FUNCTION;
public:
ScopedHashTableScope(ScopedHashTable<K, V, KInfo, AllocatorTy> &HT);
~ScopedHashTableScope();
diff --git a/include/llvm/ADT/SetVector.h b/include/llvm/ADT/SetVector.h
index 965f0deacaa2..d2f7286c2596 100644
--- a/include/llvm/ADT/SetVector.h
+++ b/include/llvm/ADT/SetVector.h
@@ -27,10 +27,11 @@
namespace llvm {
+/// \brief A vector that has set insertion semantics.
+///
/// This adapter class provides a way to keep a set of things that also has the
/// property of a deterministic iteration order. The order of iteration is the
/// order of insertion.
-/// @brief A vector that has set insertion semantics.
template <typename T, typename Vector = std::vector<T>,
typename Set = SmallSet<T, 16> >
class SetVector {
@@ -45,59 +46,59 @@ public:
typedef typename vector_type::const_iterator const_iterator;
typedef typename vector_type::size_type size_type;
- /// @brief Construct an empty SetVector
+ /// \brief Construct an empty SetVector
SetVector() {}
- /// @brief Initialize a SetVector with a range of elements
+ /// \brief Initialize a SetVector with a range of elements
template<typename It>
SetVector(It Start, It End) {
insert(Start, End);
}
- /// @brief Determine if the SetVector is empty or not.
+ /// \brief Determine if the SetVector is empty or not.
bool empty() const {
return vector_.empty();
}
- /// @brief Determine the number of elements in the SetVector.
+ /// \brief Determine the number of elements in the SetVector.
size_type size() const {
return vector_.size();
}
- /// @brief Get an iterator to the beginning of the SetVector.
+ /// \brief Get an iterator to the beginning of the SetVector.
iterator begin() {
return vector_.begin();
}
- /// @brief Get a const_iterator to the beginning of the SetVector.
+ /// \brief Get a const_iterator to the beginning of the SetVector.
const_iterator begin() const {
return vector_.begin();
}
- /// @brief Get an iterator to the end of the SetVector.
+ /// \brief Get an iterator to the end of the SetVector.
iterator end() {
return vector_.end();
}
- /// @brief Get a const_iterator to the end of the SetVector.
+ /// \brief Get a const_iterator to the end of the SetVector.
const_iterator end() const {
return vector_.end();
}
- /// @brief Return the last element of the SetVector.
+ /// \brief Return the last element of the SetVector.
const T &back() const {
assert(!empty() && "Cannot call back() on empty SetVector!");
return vector_.back();
}
- /// @brief Index into the SetVector.
+ /// \brief Index into the SetVector.
const_reference operator[](size_type n) const {
assert(n < vector_.size() && "SetVector access out of range!");
return vector_[n];
}
- /// @returns true iff the element was inserted into the SetVector.
- /// @brief Insert a new element into the SetVector.
+ /// \brief Insert a new element into the SetVector.
+ /// \returns true iff the element was inserted into the SetVector.
bool insert(const value_type &X) {
bool result = set_.insert(X);
if (result)
@@ -105,7 +106,7 @@ public:
return result;
}
- /// @brief Insert a range of elements into the SetVector.
+ /// \brief Insert a range of elements into the SetVector.
template<typename It>
void insert(It Start, It End) {
for (; Start != End; ++Start)
@@ -113,7 +114,7 @@ public:
vector_.push_back(*Start);
}
- /// @brief Remove an item from the set vector.
+ /// \brief Remove an item from the set vector.
bool remove(const value_type& X) {
if (set_.erase(X)) {
typename vector_type::iterator I =
@@ -125,20 +126,44 @@ public:
return false;
}
-
- /// @returns 0 if the element is not in the SetVector, 1 if it is.
- /// @brief Count the number of elements of a given key in the SetVector.
+ /// \brief Remove items from the set vector based on a predicate function.
+ ///
+ /// This is intended to be equivalent to the following code, if we could
+ /// write it:
+ ///
+ /// \code
+ /// V.erase(std::remove_if(V.begin(), V.end(), P), V.end());
+ /// \endcode
+ ///
+ /// However, SetVector doesn't expose non-const iterators, making any
+ /// algorithm like remove_if impossible to use.
+ ///
+ /// \returns true if any element is removed.
+ template <typename UnaryPredicate>
+ bool remove_if(UnaryPredicate P) {
+ typename vector_type::iterator I
+ = std::remove_if(vector_.begin(), vector_.end(),
+ TestAndEraseFromSet<UnaryPredicate>(P, set_));
+ if (I == vector_.end())
+ return false;
+ vector_.erase(I, vector_.end());
+ return true;
+ }
+
+
+ /// \brief Count the number of elements of a given key in the SetVector.
+ /// \returns 0 if the element is not in the SetVector, 1 if it is.
size_type count(const key_type &key) const {
return set_.count(key);
}
- /// @brief Completely clear the SetVector
+ /// \brief Completely clear the SetVector
void clear() {
set_.clear();
vector_.clear();
}
- /// @brief Remove the last element of the SetVector.
+ /// \brief Remove the last element of the SetVector.
void pop_back() {
assert(!empty() && "Cannot remove an element from an empty SetVector!");
set_.erase(back());
@@ -160,18 +185,41 @@ public:
}
private:
+ /// \brief A wrapper predicate designed for use with std::remove_if.
+ ///
+ /// This predicate wraps a predicate suitable for use with std::remove_if to
+ /// call set_.erase(x) on each element which is slated for removal.
+ template <typename UnaryPredicate>
+ class TestAndEraseFromSet {
+ UnaryPredicate P;
+ set_type &set_;
+
+ public:
+ typedef typename UnaryPredicate::argument_type argument_type;
+
+ TestAndEraseFromSet(UnaryPredicate P, set_type &set_) : P(P), set_(set_) {}
+
+ bool operator()(argument_type Arg) {
+ if (P(Arg)) {
+ set_.erase(Arg);
+ return true;
+ }
+ return false;
+ }
+ };
+
set_type set_; ///< The set.
vector_type vector_; ///< The vector.
};
-/// SmallSetVector - A SetVector that performs no allocations if smaller than
+/// \brief A SetVector that performs no allocations if smaller than
/// a certain size.
template <typename T, unsigned N>
class SmallSetVector : public SetVector<T, SmallVector<T, N>, SmallSet<T, N> > {
public:
SmallSetVector() {}
- /// @brief Initialize a SmallSetVector with a range of elements
+ /// \brief Initialize a SmallSetVector with a range of elements
template<typename It>
SmallSetVector(It Start, It End) {
this->insert(Start, End);
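
A sketch of the new SetVector::remove_if (not part of the patch). Because the
TestAndEraseFromSet wrapper uses UnaryPredicate::argument_type, the predicate
must provide that typedef; deriving from std::unary_function is one way to do
so here.

#include "llvm/ADT/SetVector.h"
#include <functional>
#include <iostream>

namespace {
struct IsOdd : public std::unary_function<int, bool> {
  bool operator()(int V) const { return (V & 1) != 0; }
};
} // end anonymous namespace

int main() {
  llvm::SetVector<int> SV;
  for (int i = 0; i < 6; ++i)
    SV.insert(i);                       // {0, 1, 2, 3, 4, 5}
  bool Erased = SV.remove_if(IsOdd());  // drops 1, 3, 5 from set and vector
  std::cout << "erased: " << Erased << ", size: " << SV.size() << "\n"; // 1, 3
  return 0;
}
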
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index 7a645e0c7241..a9cd54e13b38 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -300,6 +300,21 @@ public:
return *this;
}
+ /// set - Efficiently set a range of bits in [I, E)
+ SmallBitVector &set(unsigned I, unsigned E) {
+ assert(I <= E && "Attempted to set backwards range!");
+ assert(E <= size() && "Attempted to set out-of-bounds range!");
+ if (I == E) return *this;
+ if (isSmall()) {
+ uintptr_t EMask = ((uintptr_t)1) << E;
+ uintptr_t IMask = ((uintptr_t)1) << I;
+ uintptr_t Mask = EMask - IMask;
+ setSmallBits(getSmallBits() | Mask);
+ } else
+ getPointer()->set(I, E);
+ return *this;
+ }
+
SmallBitVector &reset() {
if (isSmall())
setSmallBits(0);
@@ -316,6 +331,21 @@ public:
return *this;
}
+ /// reset - Efficiently reset a range of bits in [I, E)
+ SmallBitVector &reset(unsigned I, unsigned E) {
+ assert(I <= E && "Attempted to reset backwards range!");
+ assert(E <= size() && "Attempted to reset out-of-bounds range!");
+ if (I == E) return *this;
+ if (isSmall()) {
+ uintptr_t EMask = ((uintptr_t)1) << E;
+ uintptr_t IMask = ((uintptr_t)1) << I;
+ uintptr_t Mask = EMask - IMask;
+ setSmallBits(getSmallBits() & ~Mask);
+ } else
+ getPointer()->reset(I, E);
+ return *this;
+ }
+
SmallBitVector &flip() {
if (isSmall())
setSmallBits(~getSmallBits());
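
A small sketch exercising the new ranged set/reset overloads (illustration
only; both operate on the half-open range [I, E) as documented above):

#include "llvm/ADT/SmallBitVector.h"
#include <iostream>

int main() {
  llvm::SmallBitVector BV(16, false);
  BV.set(2, 10);    // set bits [2, 10)
  BV.reset(4, 6);   // clear bits [4, 6) again
  std::cout << BV.count() << "\n";  // 6 bits remain set
  return 0;
}
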
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 498a0345d8bb..3bb883088c59 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -15,12 +15,13 @@
#ifndef LLVM_ADT_SMALLPTRSET_H
#define LLVM_ADT_SMALLPTRSET_H
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
#include <cassert>
#include <cstddef>
#include <cstring>
#include <iterator>
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/PointerLikeTypeTraits.h"
namespace llvm {
@@ -132,7 +133,7 @@ private:
/// Grow - Allocate a larger backing store for the buckets and move it over.
void Grow(unsigned NewSize);
- void operator=(const SmallPtrSetImpl &RHS); // DO NOT IMPLEMENT.
+ void operator=(const SmallPtrSetImpl &RHS) LLVM_DELETED_FUNCTION;
protected:
/// swap - Swaps the elements of two sets.
/// Note: This method assumes that both sets have the same small size.
diff --git a/include/llvm/ADT/SmallString.h b/include/llvm/ADT/SmallString.h
index c6f0a5bf1542..8da99d1c125c 100644
--- a/include/llvm/ADT/SmallString.h
+++ b/include/llvm/ADT/SmallString.h
@@ -44,25 +44,25 @@ public:
/// @name String Assignment
/// @{
- /// Assign from a repeated element
+ /// Assign from a repeated element.
void assign(size_t NumElts, char Elt) {
this->SmallVectorImpl<char>::assign(NumElts, Elt);
}
- /// Assign from an iterator pair
+ /// Assign from an iterator pair.
template<typename in_iter>
void assign(in_iter S, in_iter E) {
this->clear();
SmallVectorImpl<char>::append(S, E);
}
- /// Assign from a StringRef
+ /// Assign from a StringRef.
void assign(StringRef RHS) {
this->clear();
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
}
- /// Assign from a SmallVector
+ /// Assign from a SmallVector.
void assign(const SmallVectorImpl<char> &RHS) {
this->clear();
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
@@ -72,7 +72,7 @@ public:
/// @name String Concatenation
/// @{
- /// Append from an iterator pair
+ /// Append from an iterator pair.
template<typename in_iter>
void append(in_iter S, in_iter E) {
SmallVectorImpl<char>::append(S, E);
@@ -83,12 +83,12 @@ public:
}
- /// Append from a StringRef
+ /// Append from a StringRef.
void append(StringRef RHS) {
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
}
- /// Append from a SmallVector
+ /// Append from a SmallVector.
void append(const SmallVectorImpl<char> &RHS) {
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
}
@@ -97,19 +97,19 @@ public:
/// @name String Comparison
/// @{
- /// equals - Check for string equality, this is more efficient than
- /// compare() when the relative ordering of inequal strings isn't needed.
+ /// Check for string equality. This is more efficient than compare() when
+ /// the relative ordering of inequal strings isn't needed.
bool equals(StringRef RHS) const {
return str().equals(RHS);
}
- /// equals_lower - Check for string equality, ignoring case.
+ /// Check for string equality, ignoring case.
bool equals_lower(StringRef RHS) const {
return str().equals_lower(RHS);
}
- /// compare - Compare two strings; the result is -1, 0, or 1 if this string
- /// is lexicographically less than, equal to, or greater than the \arg RHS.
+ /// Compare two strings; the result is -1, 0, or 1 if this string is
+ /// lexicographically less than, equal to, or greater than the \p RHS.
int compare(StringRef RHS) const {
return str().compare(RHS);
}
@@ -129,12 +129,12 @@ public:
/// @name String Predicates
/// @{
- /// startswith - Check if this string starts with the given \arg Prefix.
+ /// startswith - Check if this string starts with the given \p Prefix.
bool startswith(StringRef Prefix) const {
return str().startswith(Prefix);
}
- /// endswith - Check if this string ends with the given \arg Suffix.
+ /// endswith - Check if this string ends with the given \p Suffix.
bool endswith(StringRef Suffix) const {
return str().endswith(Suffix);
}
@@ -143,76 +143,76 @@ public:
/// @name String Searching
/// @{
- /// find - Search for the first character \arg C in the string.
+ /// find - Search for the first character \p C in the string.
///
- /// \return - The index of the first occurrence of \arg C, or npos if not
+ /// \return - The index of the first occurrence of \p C, or npos if not
/// found.
size_t find(char C, size_t From = 0) const {
return str().find(C, From);
}
- /// find - Search for the first string \arg Str in the string.
+ /// Search for the first string \p Str in the string.
///
- /// \return - The index of the first occurrence of \arg Str, or npos if not
+ /// \returns The index of the first occurrence of \p Str, or npos if not
/// found.
size_t find(StringRef Str, size_t From = 0) const {
return str().find(Str, From);
}
- /// rfind - Search for the last character \arg C in the string.
+ /// Search for the last character \p C in the string.
///
- /// \return - The index of the last occurrence of \arg C, or npos if not
+ /// \returns The index of the last occurrence of \p C, or npos if not
/// found.
size_t rfind(char C, size_t From = StringRef::npos) const {
return str().rfind(C, From);
}
- /// rfind - Search for the last string \arg Str in the string.
+ /// Search for the last string \p Str in the string.
///
- /// \return - The index of the last occurrence of \arg Str, or npos if not
+ /// \returns The index of the last occurrence of \p Str, or npos if not
/// found.
size_t rfind(StringRef Str) const {
return str().rfind(Str);
}
- /// find_first_of - Find the first character in the string that is \arg C,
- /// or npos if not found. Same as find.
+ /// Find the first character in the string that is \p C, or npos if not
+ /// found. Same as find.
size_t find_first_of(char C, size_t From = 0) const {
return str().find_first_of(C, From);
}
- /// find_first_of - Find the first character in the string that is in \arg
- /// Chars, or npos if not found.
+ /// Find the first character in the string that is in \p Chars, or npos if
+ /// not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_t find_first_of(StringRef Chars, size_t From = 0) const {
return str().find_first_of(Chars, From);
}
- /// find_first_not_of - Find the first character in the string that is not
- /// \arg C or npos if not found.
+ /// Find the first character in the string that is not \p C or npos if not
+ /// found.
size_t find_first_not_of(char C, size_t From = 0) const {
return str().find_first_not_of(C, From);
}
- /// find_first_not_of - Find the first character in the string that is not
- /// in the string \arg Chars, or npos if not found.
+ /// Find the first character in the string that is not in the string
+ /// \p Chars, or npos if not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_t find_first_not_of(StringRef Chars, size_t From = 0) const {
return str().find_first_not_of(Chars, From);
}
- /// find_last_of - Find the last character in the string that is \arg C, or
- /// npos if not found.
+ /// Find the last character in the string that is \p C, or npos if not
+ /// found.
size_t find_last_of(char C, size_t From = StringRef::npos) const {
return str().find_last_of(C, From);
}
- /// find_last_of - Find the last character in the string that is in \arg C,
- /// or npos if not found.
+  /// Find the last character in the string that is in \p Chars, or npos if not
+ /// found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_t find_last_of(
StringRef Chars, size_t From = StringRef::npos) const {
return str().find_last_of(Chars, From);
@@ -222,13 +222,13 @@ public:
/// @name Helpful Algorithms
/// @{
- /// count - Return the number of occurrences of \arg C in the string.
+ /// Return the number of occurrences of \p C in the string.
size_t count(char C) const {
return str().count(C);
}
- /// count - Return the number of non-overlapped occurrences of \arg Str in
- /// the string.
+ /// Return the number of non-overlapped occurrences of \p Str in the
+ /// string.
size_t count(StringRef Str) const {
return str().count(Str);
}
@@ -237,36 +237,36 @@ public:
/// @name Substring Operations
/// @{
- /// substr - Return a reference to the substring from [Start, Start + N).
+ /// Return a reference to the substring from [Start, Start + N).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param N - The number of characters to included in the substring. If N
+  /// \param N The number of characters to include in the substring. If \p N
/// exceeds the number of characters remaining in the string, the string
- /// suffix (starting with \arg Start) will be returned.
+ /// suffix (starting with \p Start) will be returned.
StringRef substr(size_t Start, size_t N = StringRef::npos) const {
return str().substr(Start, N);
}
- /// slice - Return a reference to the substring from [Start, End).
+ /// Return a reference to the substring from [Start, End).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param End - The index following the last character to include in the
- /// substring. If this is npos, or less than \arg Start, or exceeds the
+ /// \param End The index following the last character to include in the
+ /// substring. If this is npos, or less than \p Start, or exceeds the
/// number of characters remaining in the string, the string suffix
- /// (starting with \arg Start) will be returned.
+ /// (starting with \p Start) will be returned.
StringRef slice(size_t Start, size_t End) const {
return str().slice(Start, End);
}
// Extra methods.
- /// Explicit conversion to StringRef
+ /// Explicit conversion to StringRef.
StringRef str() const { return StringRef(this->begin(), this->size()); }
// TODO: Make this const, if it's safe...
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 9fbbbe4f3b5d..6e0fd94dfe67 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ADT_SMALLVECTOR_H
#define LLVM_ADT_SMALLVECTOR_H
+#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/type_traits.h"
#include <algorithm>
@@ -32,44 +33,20 @@ class SmallVectorBase {
protected:
void *BeginX, *EndX, *CapacityX;
- // Allocate raw space for N elements of type T. If T has a ctor or dtor, we
- // don't want it to be automatically run, so we need to represent the space as
- // something else. An array of char would work great, but might not be
- // aligned sufficiently. Instead we use some number of union instances for
- // the space, which guarantee maximal alignment.
- union U {
- double D;
- long double LD;
- long long L;
- void *P;
- } FirstEl;
- // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
-
protected:
- SmallVectorBase(size_t Size)
- : BeginX(&FirstEl), EndX(&FirstEl), CapacityX((char*)&FirstEl+Size) {}
-
- /// isSmall - Return true if this is a smallvector which has not had dynamic
- /// memory allocated for it.
- bool isSmall() const {
- return BeginX == static_cast<const void*>(&FirstEl);
- }
-
- /// resetToSmall - Put this vector in a state of being small.
- void resetToSmall() {
- BeginX = EndX = CapacityX = &FirstEl;
- }
+ SmallVectorBase(void *FirstEl, size_t Size)
+ : BeginX(FirstEl), EndX(FirstEl), CapacityX((char*)FirstEl+Size) {}
/// grow_pod - This is an implementation of the grow() method which only works
/// on POD-like data types and is out of line to reduce code duplication.
- void grow_pod(size_t MinSizeInBytes, size_t TSize);
+ void grow_pod(void *FirstEl, size_t MinSizeInBytes, size_t TSize);
public:
/// size_in_bytes - This returns size()*sizeof(T).
size_t size_in_bytes() const {
return size_t((char*)EndX - (char*)BeginX);
}
-
+
/// capacity_in_bytes - This returns capacity()*sizeof(T).
size_t capacity_in_bytes() const {
return size_t((char*)CapacityX - (char*)BeginX);
@@ -78,11 +55,41 @@ public:
bool empty() const { return BeginX == EndX; }
};
+template <typename T, unsigned N> struct SmallVectorStorage;
-template <typename T>
+/// SmallVectorTemplateCommon - This is the part of SmallVectorTemplateBase
+/// which does not depend on whether the type T is a POD. The extra dummy
+/// template argument is used by ArrayRef to avoid unnecessarily requiring T
+/// to be complete.
+template <typename T, typename = void>
class SmallVectorTemplateCommon : public SmallVectorBase {
+private:
+ template <typename, unsigned> friend struct SmallVectorStorage;
+
+ // Allocate raw space for N elements of type T. If T has a ctor or dtor, we
+ // don't want it to be automatically run, so we need to represent the space as
+ // something else. Use an array of char of sufficient alignment.
+ typedef llvm::AlignedCharArrayUnion<T> U;
+ U FirstEl;
+ // Space after 'FirstEl' is clobbered, do not add any instance vars after it.
+
protected:
- SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(Size) {}
+ SmallVectorTemplateCommon(size_t Size) : SmallVectorBase(&FirstEl, Size) {}
+
+ void grow_pod(size_t MinSizeInBytes, size_t TSize) {
+ SmallVectorBase::grow_pod(&FirstEl, MinSizeInBytes, TSize);
+ }
+
+ /// isSmall - Return true if this is a smallvector which has not had dynamic
+ /// memory allocated for it.
+ bool isSmall() const {
+ return BeginX == static_cast<const void*>(&FirstEl);
+ }
+
+ /// resetToSmall - Put this vector in a state of being small.
+ void resetToSmall() {
+ BeginX = EndX = CapacityX = &FirstEl;
+ }
void setEnd(T *P) { this->EndX = P; }
public:
@@ -677,8 +684,8 @@ public:
RHS.begin(), RHS.end());
}
- /// set_size - Set the array size to \arg N, which the current array must have
- /// enough capacity for.
+ /// Set the array size to \p N, which the current array must have enough
+ /// capacity for.
///
/// This does not construct or destroy any elements in the vector.
///
@@ -844,6 +851,17 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
}
#endif
+/// Storage for the SmallVector elements which aren't contained in
+/// SmallVectorTemplateCommon. There are 'N-1' elements here. The remaining '1'
+/// element is in the base class. This is specialized for the N=1 and N=0 cases
+/// to avoid allocating unnecessary storage.
+template <typename T, unsigned N>
+struct SmallVectorStorage {
+ typename SmallVectorTemplateCommon<T>::U InlineElts[N - 1];
+};
+template <typename T> struct SmallVectorStorage<T, 1> {};
+template <typename T> struct SmallVectorStorage<T, 0> {};
+
/// SmallVector - This is a 'vector' (really, a variable-sized array), optimized
/// for the case when the array is small. It contains some number of elements
/// in-place, which allows it to avoid heap allocation when the actual number of
@@ -854,41 +872,23 @@ SmallVectorImpl<T> &SmallVectorImpl<T>::operator=(SmallVectorImpl<T> &&RHS) {
///
template <typename T, unsigned N>
class SmallVector : public SmallVectorImpl<T> {
- /// InlineElts - These are 'N-1' elements that are stored inline in the body
- /// of the vector. The extra '1' element is stored in SmallVectorImpl.
- typedef typename SmallVectorImpl<T>::U U;
- enum {
- // MinUs - The number of U's require to cover N T's.
- MinUs = (static_cast<unsigned int>(sizeof(T))*N +
- static_cast<unsigned int>(sizeof(U)) - 1) /
- static_cast<unsigned int>(sizeof(U)),
-
- // NumInlineEltsElts - The number of elements actually in this array. There
- // is already one in the parent class, and we have to round up to avoid
- // having a zero-element array.
- NumInlineEltsElts = MinUs > 1 ? (MinUs - 1) : 1,
-
- // NumTsAvailable - The number of T's we actually have space for, which may
- // be more than N due to rounding.
- NumTsAvailable = (NumInlineEltsElts+1)*static_cast<unsigned int>(sizeof(U))/
- static_cast<unsigned int>(sizeof(T))
- };
- U InlineElts[NumInlineEltsElts];
+ /// Storage - Inline space for elements which aren't stored in the base class.
+ SmallVectorStorage<T, N> Storage;
public:
- SmallVector() : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector() : SmallVectorImpl<T>(N) {
}
explicit SmallVector(unsigned Size, const T &Value = T())
- : SmallVectorImpl<T>(NumTsAvailable) {
+ : SmallVectorImpl<T>(N) {
this->assign(Size, Value);
}
template<typename ItTy>
- SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(N) {
this->append(S, E);
}
- SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(N) {
if (!RHS.empty())
SmallVectorImpl<T>::operator=(RHS);
}
@@ -899,7 +899,7 @@ public:
}
#if LLVM_USE_RVALUE_REFERENCES
- SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(NumTsAvailable) {
+ SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(N) {
if (!RHS.empty())
SmallVectorImpl<T>::operator=(::std::move(RHS));
}
@@ -912,48 +912,6 @@ public:
};
-/// Specialize SmallVector at N=0. This specialization guarantees
-/// that it can be instantiated at an incomplete T if none of its
-/// members are required.
-template <typename T>
-class SmallVector<T,0> : public SmallVectorImpl<T> {
-public:
- SmallVector() : SmallVectorImpl<T>(0) {
- }
-
- explicit SmallVector(unsigned Size, const T &Value = T())
- : SmallVectorImpl<T>(0) {
- this->assign(Size, Value);
- }
-
- template<typename ItTy>
- SmallVector(ItTy S, ItTy E) : SmallVectorImpl<T>(0) {
- this->append(S, E);
- }
-
- SmallVector(const SmallVector &RHS) : SmallVectorImpl<T>(0) {
- if (!RHS.empty())
- SmallVectorImpl<T>::operator=(RHS);
- }
-
- const SmallVector &operator=(const SmallVector &RHS) {
- SmallVectorImpl<T>::operator=(RHS);
- return *this;
- }
-
-#if LLVM_USE_RVALUE_REFERENCES
- SmallVector(SmallVector &&RHS) : SmallVectorImpl<T>(0) {
- if (!RHS.empty())
- SmallVectorImpl<T>::operator=(::std::move(RHS));
- }
-
- const SmallVector &operator=(SmallVector &&RHS) {
- SmallVectorImpl<T>::operator=(::std::move(RHS));
- return *this;
- }
-#endif
-};
-
template<typename T, unsigned N>
static inline size_t capacity_in_bytes(const SmallVector<T, N> &X) {
return X.capacity_in_bytes();
diff --git a/include/llvm/ADT/SparseBitVector.h b/include/llvm/ADT/SparseBitVector.h
index 89774c3f5628..306e92832f0b 100644
--- a/include/llvm/ADT/SparseBitVector.h
+++ b/include/llvm/ADT/SparseBitVector.h
@@ -158,7 +158,7 @@ public:
&& "Word Position outside of element");
// Mask off previous bits.
- Copy &= ~0L << BitPos;
+ Copy &= ~0UL << BitPos;
if (Copy != 0) {
if (sizeof(BitWord) == 4)
@@ -262,6 +262,22 @@ public:
}
};
+template <unsigned ElementSize>
+struct ilist_traits<SparseBitVectorElement<ElementSize> >
+ : public ilist_default_traits<SparseBitVectorElement<ElementSize> > {
+ typedef SparseBitVectorElement<ElementSize> Element;
+
+ Element *createSentinel() const { return static_cast<Element *>(&Sentinel); }
+ static void destroySentinel(Element *) {}
+
+ Element *provideInitialHead() const { return createSentinel(); }
+ Element *ensureHead(Element *) const { return createSentinel(); }
+ static void noteHead(Element *, Element *) {}
+
+private:
+ mutable ilist_half_node<Element> Sentinel;
+};
+
template <unsigned ElementSize = 128>
class SparseBitVector {
typedef ilist<SparseBitVectorElement<ElementSize> > ElementList;
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index 556963334894..063c6755c680 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -110,9 +110,9 @@ struct SparseSetValFunctor<KeyT, KeyT, KeyFunctorT> {
/// For sets that may grow to thousands of elements, SparseT should be set to
/// uint16_t or uint32_t.
///
-/// @param ValueT The type of objects in the set.
-/// @param KeyFunctorT A functor that computes an unsigned index from KeyT.
-/// @param SparseT An unsigned integer type. See above.
+/// @tparam ValueT The type of objects in the set.
+/// @tparam KeyFunctorT A functor that computes an unsigned index from KeyT.
+/// @tparam SparseT An unsigned integer type. See above.
///
template<typename ValueT,
typename KeyFunctorT = llvm::identity<unsigned>,
@@ -128,8 +128,8 @@ class SparseSet {
// Disable copy construction and assignment.
// This data structure is not meant to be used that way.
- SparseSet(const SparseSet&); // DO NOT IMPLEMENT.
- SparseSet &operator=(const SparseSet&); // DO NOT IMPLEMENT.
+ SparseSet(const SparseSet&) LLVM_DELETED_FUNCTION;
+ SparseSet &operator=(const SparseSet&) LLVM_DELETED_FUNCTION;
public:
typedef ValueT value_type;
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 655d884e7baa..bf27c4313f82 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -21,7 +21,7 @@ namespace llvm {
template<typename T> class SmallVectorImpl;
/// hexdigit - Return the hexadecimal character for the
-/// given number \arg X (which should be less than 16).
+/// given number \p X (which should be less than 16).
static inline char hexdigit(unsigned X, bool LowerCase = false) {
const char HexChar = LowerCase ? 'a' : 'A';
return X < 10 ? '0' + X : HexChar + X - 10;
@@ -125,10 +125,29 @@ void SplitString(StringRef Source,
// X*33+c -> X*33^c
static inline unsigned HashString(StringRef Str, unsigned Result = 0) {
for (unsigned i = 0, e = Str.size(); i != e; ++i)
- Result = Result * 33 + Str[i];
+ Result = Result * 33 + (unsigned char)Str[i];
return Result;
}
+/// Returns the English suffix for an ordinal integer (-st, -nd, -rd, -th).
+static inline StringRef getOrdinalSuffix(unsigned Val) {
+ // It is critically important that we do this perfectly for
+ // user-written sequences with over 100 elements.
+ switch (Val % 100) {
+ case 11:
+ case 12:
+ case 13:
+ return "th";
+ default:
+ switch (Val % 10) {
+ case 1: return "st";
+ case 2: return "nd";
+ case 3: return "rd";
+ default: return "th";
+ }
+ }
+}
+
} // End llvm namespace
#endif
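
A quick sketch of the new getOrdinalSuffix helper (not part of the patch);
StringRef::str() is used only to print via std::cout:

#include "llvm/ADT/StringExtras.h"
#include <iostream>

int main() {
  const unsigned Vals[] = { 1, 2, 3, 4, 11, 12, 13, 21, 102, 111 };
  for (unsigned i = 0; i != sizeof(Vals) / sizeof(Vals[0]); ++i)
    std::cout << Vals[i] << llvm::getOrdinalSuffix(Vals[i]).str() << " ";
  std::cout << "\n";  // 1st 2nd 3rd 4th 11th 12th 13th 21st 102nd 111th
  return 0;
}
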
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index cd846031c5a0..292bde0cd900 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -138,7 +138,7 @@ namespace llvm {
}
/// compare - Compare two strings; the result is -1, 0, or 1 if this string
- /// is lexicographically less than, equal to, or greater than the \arg RHS.
+ /// is lexicographically less than, equal to, or greater than the \p RHS.
int compare(StringRef RHS) const {
// Check the prefix for a mismatch.
if (int Res = compareMemory(Data, RHS.Data, min(Length, RHS.Length)))
@@ -205,13 +205,13 @@ namespace llvm {
/// @name String Predicates
/// @{
- /// startswith - Check if this string starts with the given \arg Prefix.
+ /// Check if this string starts with the given \p Prefix.
bool startswith(StringRef Prefix) const {
return Length >= Prefix.Length &&
compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
}
- /// endswith - Check if this string ends with the given \arg Suffix.
+ /// Check if this string ends with the given \p Suffix.
bool endswith(StringRef Suffix) const {
return Length >= Suffix.Length &&
compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
@@ -221,9 +221,9 @@ namespace llvm {
/// @name String Searching
/// @{
- /// find - Search for the first character \arg C in the string.
+ /// Search for the first character \p C in the string.
///
- /// \return - The index of the first occurrence of \arg C, or npos if not
+ /// \returns The index of the first occurrence of \p C, or npos if not
/// found.
size_t find(char C, size_t From = 0) const {
for (size_t i = min(From, Length), e = Length; i != e; ++i)
@@ -232,15 +232,15 @@ namespace llvm {
return npos;
}
- /// find - Search for the first string \arg Str in the string.
+ /// Search for the first string \p Str in the string.
///
- /// \return - The index of the first occurrence of \arg Str, or npos if not
+ /// \returns The index of the first occurrence of \p Str, or npos if not
/// found.
size_t find(StringRef Str, size_t From = 0) const;
- /// rfind - Search for the last character \arg C in the string.
+ /// Search for the last character \p C in the string.
///
- /// \return - The index of the last occurrence of \arg C, or npos if not
+ /// \returns The index of the last occurrence of \p C, or npos if not
/// found.
size_t rfind(char C, size_t From = npos) const {
From = min(From, Length);
@@ -253,61 +253,61 @@ namespace llvm {
return npos;
}
- /// rfind - Search for the last string \arg Str in the string.
+ /// Search for the last string \p Str in the string.
///
- /// \return - The index of the last occurrence of \arg Str, or npos if not
+ /// \returns The index of the last occurrence of \p Str, or npos if not
/// found.
size_t rfind(StringRef Str) const;
- /// find_first_of - Find the first character in the string that is \arg C,
- /// or npos if not found. Same as find.
+ /// Find the first character in the string that is \p C, or npos if not
+ /// found. Same as find.
size_type find_first_of(char C, size_t From = 0) const {
return find(C, From);
}
- /// find_first_of - Find the first character in the string that is in \arg
- /// Chars, or npos if not found.
+ /// Find the first character in the string that is in \p Chars, or npos if
+ /// not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_first_of(StringRef Chars, size_t From = 0) const;
- /// find_first_not_of - Find the first character in the string that is not
- /// \arg C or npos if not found.
+ /// Find the first character in the string that is not \p C or npos if not
+ /// found.
size_type find_first_not_of(char C, size_t From = 0) const;
- /// find_first_not_of - Find the first character in the string that is not
- /// in the string \arg Chars, or npos if not found.
+ /// Find the first character in the string that is not in the string
+ /// \p Chars, or npos if not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_first_not_of(StringRef Chars, size_t From = 0) const;
- /// find_last_of - Find the last character in the string that is \arg C, or
- /// npos if not found.
+ /// Find the last character in the string that is \p C, or npos if not
+ /// found.
size_type find_last_of(char C, size_t From = npos) const {
return rfind(C, From);
}
- /// find_last_of - Find the last character in the string that is in \arg C,
- /// or npos if not found.
+    /// Find the last character in the string that is in \p Chars, or npos if
+    /// not found.
+ /// found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_last_of(StringRef Chars, size_t From = npos) const;
- /// find_last_not_of - Find the last character in the string that is not
- /// \arg C, or npos if not found.
+ /// Find the last character in the string that is not \p C, or npos if not
+ /// found.
size_type find_last_not_of(char C, size_t From = npos) const;
- /// find_last_not_of - Find the last character in the string that is not in
- /// \arg Chars, or npos if not found.
+ /// Find the last character in the string that is not in \p Chars, or
+ /// npos if not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_last_not_of(StringRef Chars, size_t From = npos) const;
/// @}
/// @name Helpful Algorithms
/// @{
- /// count - Return the number of occurrences of \arg C in the string.
+ /// Return the number of occurrences of \p C in the string.
size_t count(char C) const {
size_t Count = 0;
for (size_t i = 0, e = Length; i != e; ++i)
@@ -316,18 +316,17 @@ namespace llvm {
return Count;
}
- /// count - Return the number of non-overlapped occurrences of \arg Str in
+ /// Return the number of non-overlapped occurrences of \p Str in
/// the string.
size_t count(StringRef Str) const;
- /// getAsInteger - Parse the current string as an integer of the specified
- /// radix. If Radix is specified as zero, this does radix autosensing using
+ /// Parse the current string as an integer of the specified radix. If
+ /// \p Radix is specified as zero, this does radix autosensing using
/// extended C rules: 0 is octal, 0x is hex, 0b is binary.
///
/// If the string is invalid or if only a subset of the string is valid,
/// this returns true to signify the error. The string is considered
/// erroneous if empty or if it overflows T.
- ///
template <typename T>
typename enable_if_c<std::numeric_limits<T>::is_signed, bool>::type
getAsInteger(unsigned Radix, T &Result) const {
@@ -350,13 +349,12 @@ namespace llvm {
return false;
}
- /// getAsInteger - Parse the current string as an integer of the
- /// specified radix, or of an autosensed radix if the radix given
- /// is 0. The current value in Result is discarded, and the
- /// storage is changed to be wide enough to store the parsed
- /// integer.
+ /// Parse the current string as an integer of the specified \p Radix, or of
+ /// an autosensed radix if the \p Radix given is 0. The current value in
+ /// \p Result is discarded, and the storage is changed to be wide enough to
+ /// store the parsed integer.
///
- /// Returns true if the string does not solely consist of a valid
+ /// \returns true if the string does not solely consist of a valid
/// non-empty number in the appropriate base.
///
/// APInt::fromString is superficially similar but assumes the
@@ -367,70 +365,70 @@ namespace llvm {
/// @name String Operations
/// @{
- // lower - Convert the given ASCII string to lowercase.
+ // Convert the given ASCII string to lowercase.
std::string lower() const;
- /// upper - Convert the given ASCII string to uppercase.
+ /// Convert the given ASCII string to uppercase.
std::string upper() const;
/// @}
/// @name Substring Operations
/// @{
- /// substr - Return a reference to the substring from [Start, Start + N).
+ /// Return a reference to the substring from [Start, Start + N).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param N - The number of characters to included in the substring. If N
+ /// \param N The number of characters to include in the substring. If N
/// exceeds the number of characters remaining in the string, the string
- /// suffix (starting with \arg Start) will be returned.
+ /// suffix (starting with \p Start) will be returned.
StringRef substr(size_t Start, size_t N = npos) const {
Start = min(Start, Length);
return StringRef(Data + Start, min(N, Length - Start));
}
- /// drop_front - Return a StringRef equal to 'this' but with the first
- /// elements dropped.
+ /// Return a StringRef equal to 'this' but with the first \p N elements
+ /// dropped.
StringRef drop_front(unsigned N = 1) const {
assert(size() >= N && "Dropping more elements than exist");
return substr(N);
}
- /// drop_back - Return a StringRef equal to 'this' but with the last
- /// elements dropped.
+ /// Return a StringRef equal to 'this' but with the last \p N elements
+ /// dropped.
StringRef drop_back(unsigned N = 1) const {
assert(size() >= N && "Dropping more elements than exist");
return substr(0, size()-N);
}
- /// slice - Return a reference to the substring from [Start, End).
+ /// Return a reference to the substring from [Start, End).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param End - The index following the last character to include in the
- /// substring. If this is npos, or less than \arg Start, or exceeds the
+ /// \param End The index following the last character to include in the
+ /// substring. If this is npos, or less than \p Start, or exceeds the
/// number of characters remaining in the string, the string suffix
- /// (starting with \arg Start) will be returned.
+ /// (starting with \p Start) will be returned.
StringRef slice(size_t Start, size_t End) const {
Start = min(Start, Length);
End = min(max(Start, End), Length);
return StringRef(Data + Start, End - Start);
}
- /// split - Split into two substrings around the first occurrence of a
- /// separator character.
+ /// Split into two substrings around the first occurrence of a separator
+ /// character.
///
- /// If \arg Separator is in the string, then the result is a pair (LHS, RHS)
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
/// such that (*this == LHS + Separator + RHS) is true and RHS is
- /// maximal. If \arg Separator is not in the string, then the result is a
+ /// maximal. If \p Separator is not in the string, then the result is a
/// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
///
- /// \param Separator - The character to split on.
- /// \return - The split substrings.
+ /// \param Separator The character to split on.
+ /// \returns The split substrings.
std::pair<StringRef, StringRef> split(char Separator) const {
size_t Idx = find(Separator);
if (Idx == npos)
@@ -438,12 +436,12 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
}
- /// split - Split into two substrings around the first occurrence of a
- /// separator string.
+ /// Split into two substrings around the first occurrence of a separator
+ /// string.
///
- /// If \arg Separator is in the string, then the result is a pair (LHS, RHS)
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
/// such that (*this == LHS + Separator + RHS) is true and RHS is
- /// maximal. If \arg Separator is not in the string, then the result is a
+ /// maximal. If \p Separator is not in the string, then the result is a
/// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
///
/// \param Separator - The string to split on.
@@ -455,14 +453,13 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx + Separator.size(), npos));
}
- /// split - Split into substrings around the occurrences of a separator
- /// string.
+ /// Split into substrings around the occurrences of a separator string.
///
- /// Each substring is stored in \arg A. If \arg MaxSplit is >= 0, at most
- /// \arg MaxSplit splits are done and consequently <= \arg MaxSplit
+ /// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most
+ /// \p MaxSplit splits are done and consequently <= \p MaxSplit
/// elements are added to A.
- /// If \arg KeepEmpty is false, empty strings are not added to \arg A. They
- /// still count when considering \arg MaxSplit
+ /// If \p KeepEmpty is false, empty strings are not added to \p A. They
+ /// still count when considering \p MaxSplit.
/// A useful invariant is that
/// Separator.join(A) == *this if MaxSplit == -1 and KeepEmpty == true
///
@@ -474,12 +471,12 @@ namespace llvm {
StringRef Separator, int MaxSplit = -1,
bool KeepEmpty = true) const;
- /// rsplit - Split into two substrings around the last occurrence of a
- /// separator character.
+ /// Split into two substrings around the last occurrence of a separator
+ /// character.
///
- /// If \arg Separator is in the string, then the result is a pair (LHS, RHS)
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
/// such that (*this == LHS + Separator + RHS) is true and RHS is
- /// minimal. If \arg Separator is not in the string, then the result is a
+ /// minimal. If \p Separator is not in the string, then the result is a
/// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
///
/// \param Separator - The character to split on.
@@ -491,20 +488,20 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
}
- /// ltrim - Return string with consecutive characters in \arg Chars starting
- /// from the left removed.
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the left removed.
StringRef ltrim(StringRef Chars = " \t\n\v\f\r") const {
return drop_front(std::min(Length, find_first_not_of(Chars)));
}
- /// rtrim - Return string with consecutive characters in \arg Chars starting
- /// from the right removed.
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the right removed.
StringRef rtrim(StringRef Chars = " \t\n\v\f\r") const {
return drop_back(Length - std::min(Length, find_last_not_of(Chars) + 1));
}
- /// trim - Return string with consecutive characters in \arg Chars starting
- /// from the left and right removed.
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the left and right removed.
StringRef trim(StringRef Chars = " \t\n\v\f\r") const {
return ltrim(Chars).rtrim(Chars);
}
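
The split/trim/getAsInteger interfaces documented above compose naturally; the following is only an illustrative sketch (not part of the patch), and the helper name parseOption is hypothetical:

    #include "llvm/ADT/StringRef.h"
    #include <utility>

    // Parse a "key = value" option where the value is a decimal integer.
    static bool parseOption(llvm::StringRef In, llvm::StringRef &Key, int &Value) {
      // split() yields (LHS, RHS) around the first '='; RHS is "" if '=' is absent.
      std::pair<llvm::StringRef, llvm::StringRef> Parts = In.split('=');
      Key = Parts.first.trim();           // drop surrounding whitespace
      // getAsInteger() returns true on error, so success is the negation.
      return !Parts.second.trim().getAsInteger(10, Value);
    }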
diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h
index 9c55f6b70e36..b69a964a23ba 100644
--- a/include/llvm/ADT/StringSet.h
+++ b/include/llvm/ADT/StringSet.h
@@ -29,8 +29,13 @@ namespace llvm {
assert(!InLang.empty());
const char *KeyStart = InLang.data();
const char *KeyEnd = KeyStart + InLang.size();
- return base::insert(llvm::StringMapEntry<char>::
- Create(KeyStart, KeyEnd, base::getAllocator(), '+'));
+ llvm::StringMapEntry<char> *Entry = llvm::StringMapEntry<char>::
+ Create(KeyStart, KeyEnd, base::getAllocator(), '+');
+ if (!base::insert(Entry)) {
+ Entry->Destroy(base::getAllocator());
+ return false;
+ }
+ return true;
}
};
}
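
The change above makes a duplicate insert() release the speculatively created entry instead of leaking it. A minimal sketch of the effect, assuming the bool-returning insert(StringRef) shown in the hunk (the function name rememberTargets is hypothetical):

    #include "llvm/ADT/StringSet.h"

    void rememberTargets() {
      llvm::StringSet<> Seen;
      bool Inserted = Seen.insert("x86");    // true: newly inserted
      bool AddedAgain = Seen.insert("x86");  // false: duplicate; the new entry is destroyed, not leaked
      (void)Inserted; (void)AddedAgain;
    }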
diff --git a/include/llvm/ADT/Trie.h b/include/llvm/ADT/Trie.h
deleted file mode 100644
index 845af015b052..000000000000
--- a/include/llvm/ADT/Trie.h
+++ /dev/null
@@ -1,334 +0,0 @@
-//===- llvm/ADT/Trie.h ---- Generic trie structure --------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class defines a generic trie structure. The trie structure
-// is immutable after creation, but the payload contained within it is not.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ADT_TRIE_H
-#define LLVM_ADT_TRIE_H
-
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/Support/DOTGraphTraits.h"
-
-#include <cassert>
-#include <vector>
-
-namespace llvm {
-
-// FIXME:
-// - Labels are usually small, maybe it's better to use SmallString
-// - Should we use char* during construction?
-// - Should we templatize Empty with traits-like interface?
-
-template<class Payload>
-class Trie {
- friend class GraphTraits<Trie<Payload> >;
- friend class DOTGraphTraits<Trie<Payload> >;
-public:
- class Node {
- friend class Trie;
-
- public:
- typedef std::vector<Node*> NodeVectorType;
- typedef typename NodeVectorType::iterator iterator;
- typedef typename NodeVectorType::const_iterator const_iterator;
-
- private:
- enum QueryResult {
- Same = -3,
- StringIsPrefix = -2,
- LabelIsPrefix = -1,
- DontMatch = 0,
- HaveCommonPart
- };
-
- struct NodeCmp {
- bool operator() (Node* N1, Node* N2) {
- return (N1->Label[0] < N2->Label[0]);
- }
- bool operator() (Node* N, char Id) {
- return (N->Label[0] < Id);
- }
- };
-
- std::string Label;
- Payload Data;
- NodeVectorType Children;
-
- // Do not implement
- Node(const Node&);
- Node& operator=(const Node&);
-
- inline void addEdge(Node* N) {
- if (Children.empty())
- Children.push_back(N);
- else {
- iterator I = std::lower_bound(Children.begin(), Children.end(),
- N, NodeCmp());
- // FIXME: no dups are allowed
- Children.insert(I, N);
- }
- }
-
- inline void setEdge(Node* N) {
- char Id = N->Label[0];
- iterator I = std::lower_bound(Children.begin(), Children.end(),
- Id, NodeCmp());
- assert(I != Children.end() && "Node does not exists!");
- *I = N;
- }
-
- QueryResult query(const std::string& s) const {
- unsigned i, l;
- unsigned l1 = s.length();
- unsigned l2 = Label.length();
-
- // Find the length of common part
- l = std::min(l1, l2);
- i = 0;
- while ((i < l) && (s[i] == Label[i]))
- ++i;
-
- if (i == l) { // One is prefix of another, find who is who
- if (l1 == l2)
- return Same;
- else if (i == l1)
- return StringIsPrefix;
- else
- return LabelIsPrefix;
- } else // s and Label have common (possible empty) part, return its length
- return (QueryResult)i;
- }
-
- public:
- inline explicit Node(const Payload& data, const std::string& label = ""):
- Label(label), Data(data) { }
-
- inline const Payload& data() const { return Data; }
- inline void setData(const Payload& data) { Data = data; }
-
- inline const std::string& label() const { return Label; }
-
-#if 0
- inline void dump() {
- llvm::cerr << "Node: " << this << "\n"
- << "Label: " << Label << "\n"
- << "Children:\n";
-
- for (iterator I = Children.begin(), E = Children.end(); I != E; ++I)
- llvm::cerr << (*I)->Label << "\n";
- }
-#endif
-
- inline Node* getEdge(char Id) {
- Node* fNode = NULL;
- iterator I = std::lower_bound(Children.begin(), Children.end(),
- Id, NodeCmp());
- if (I != Children.end() && (*I)->Label[0] == Id)
- fNode = *I;
-
- return fNode;
- }
-
- inline iterator begin() { return Children.begin(); }
- inline const_iterator begin() const { return Children.begin(); }
- inline iterator end () { return Children.end(); }
- inline const_iterator end () const { return Children.end(); }
-
- inline size_t size () const { return Children.size(); }
- inline bool empty() const { return Children.empty(); }
- inline const Node* &front() const { return Children.front(); }
- inline Node* &front() { return Children.front(); }
- inline const Node* &back() const { return Children.back(); }
- inline Node* &back() { return Children.back(); }
-
- };
-
-private:
- std::vector<Node*> Nodes;
- Payload Empty;
-
- inline Node* addNode(const Payload& data, const std::string label = "") {
- Node* N = new Node(data, label);
- Nodes.push_back(N);
- return N;
- }
-
- inline Node* splitEdge(Node* N, char Id, size_t index) {
- Node* eNode = N->getEdge(Id);
- assert(eNode && "Node doesn't exist");
-
- const std::string &l = eNode->Label;
- assert(index > 0 && index < l.length() && "Trying to split too far!");
- std::string l1 = l.substr(0, index);
- std::string l2 = l.substr(index);
-
- Node* nNode = addNode(Empty, l1);
- N->setEdge(nNode);
-
- eNode->Label = l2;
- nNode->addEdge(eNode);
-
- return nNode;
- }
-
- // Do not implement
- Trie(const Trie&);
- Trie& operator=(const Trie&);
-
-public:
- inline explicit Trie(const Payload& empty):Empty(empty) {
- addNode(Empty);
- }
- inline ~Trie() {
- for (unsigned i = 0, e = Nodes.size(); i != e; ++i)
- delete Nodes[i];
- }
-
- inline Node* getRoot() const { return Nodes[0]; }
-
- bool addString(const std::string& s, const Payload& data);
- const Payload& lookup(const std::string& s) const;
-
-};
-
-// Define this out-of-line to dissuade the C++ compiler from inlining it.
-template<class Payload>
-bool Trie<Payload>::addString(const std::string& s, const Payload& data) {
- Node* cNode = getRoot();
- Node* tNode = NULL;
- std::string s1(s);
-
- while (tNode == NULL) {
- char Id = s1[0];
- if (Node* nNode = cNode->getEdge(Id)) {
- typename Node::QueryResult r = nNode->query(s1);
-
- switch (r) {
- case Node::Same:
- case Node::StringIsPrefix:
- // Currently we don't allow to have two strings in the trie one
- // being a prefix of another. This should be fixed.
- assert(0 && "FIXME!");
- return false;
- case Node::DontMatch:
- llvm_unreachable("Impossible!");
- case Node::LabelIsPrefix:
- s1 = s1.substr(nNode->label().length());
- cNode = nNode;
- break;
- default:
- nNode = splitEdge(cNode, Id, r);
- tNode = addNode(data, s1.substr(r));
- nNode->addEdge(tNode);
- }
- } else {
- tNode = addNode(data, s1);
- cNode->addEdge(tNode);
- }
- }
-
- return true;
-}
-
-template<class Payload>
-const Payload& Trie<Payload>::lookup(const std::string& s) const {
- Node* cNode = getRoot();
- Node* tNode = NULL;
- std::string s1(s);
-
- while (tNode == NULL) {
- char Id = s1[0];
- if (Node* nNode = cNode->getEdge(Id)) {
- typename Node::QueryResult r = nNode->query(s1);
-
- switch (r) {
- case Node::Same:
- tNode = nNode;
- break;
- case Node::StringIsPrefix:
- return Empty;
- case Node::DontMatch:
- llvm_unreachable("Impossible!");
- case Node::LabelIsPrefix:
- s1 = s1.substr(nNode->label().length());
- cNode = nNode;
- break;
- default:
- return Empty;
- }
- } else
- return Empty;
- }
-
- return tNode->data();
-}
-
-template<class Payload>
-struct GraphTraits<Trie<Payload> > {
- typedef Trie<Payload> TrieType;
- typedef typename TrieType::Node NodeType;
- typedef typename NodeType::iterator ChildIteratorType;
-
- static inline NodeType *getEntryNode(const TrieType& T) {
- return T.getRoot();
- }
-
- static inline ChildIteratorType child_begin(NodeType *N) {
- return N->begin();
- }
- static inline ChildIteratorType child_end(NodeType *N) { return N->end(); }
-
- typedef typename std::vector<NodeType*>::const_iterator nodes_iterator;
-
- static inline nodes_iterator nodes_begin(const TrieType& G) {
- return G.Nodes.begin();
- }
- static inline nodes_iterator nodes_end(const TrieType& G) {
- return G.Nodes.end();
- }
-
-};
-
-template<class Payload>
-struct DOTGraphTraits<Trie<Payload> > : public DefaultDOTGraphTraits {
- typedef typename Trie<Payload>::Node NodeType;
- typedef typename GraphTraits<Trie<Payload> >::ChildIteratorType EdgeIter;
-
- static std::string getGraphName(const Trie<Payload>& T) {
- return "Trie";
- }
-
- static std::string getNodeLabel(NodeType* Node, const Trie<Payload>& T) {
- if (T.getRoot() == Node)
- return "<Root>";
- else
- return Node->label();
- }
-
- static std::string getEdgeSourceLabel(NodeType* Node, EdgeIter I) {
- NodeType* N = *I;
- return N->label().substr(0, 1);
- }
-
- static std::string getNodeAttributes(const NodeType* Node,
- const Trie<Payload>& T) {
- if (Node->data() != T.Empty)
- return "color=blue";
-
- return "";
- }
-
-};
-
-} // end of llvm namespace
-
-#endif // LLVM_ADT_TRIE_H
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 7f7061ab01b9..408d70cf76f8 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -65,7 +65,9 @@ public:
nvptx, // NVPTX: 32-bit
nvptx64, // NVPTX: 64-bit
le32, // le32: generic little-endian 32-bit CPU (PNaCl / Emscripten)
- amdil // amdil: amd IL
+ amdil, // amdil: amd IL
+ spir, // SPIR: standard portable IR for OpenCL 32-bit version
+ spir64 // SPIR: standard portable IR for OpenCL 64-bit version
};
enum VendorType {
UnknownVendor,
@@ -74,7 +76,9 @@ public:
PC,
SCEI,
BGP,
- BGQ
+ BGQ,
+ Freescale,
+ IBM
};
enum OSType {
UnknownOS,
@@ -99,7 +103,8 @@ public:
RTEMS,
NativeClient,
CNK, // BG/P Compute-Node Kernel
- Bitrig
+ Bitrig,
+ AIX
};
enum EnvironmentType {
UnknownEnvironment,
@@ -109,7 +114,8 @@ public:
GNUEABIHF,
EABI,
MachO,
- ANDROIDEABI
+ Android,
+ ELF
};
private:
@@ -341,7 +347,7 @@ public:
/// to a known type.
void setEnvironment(EnvironmentType Kind);
- /// setTriple - Set all components to the new triple \arg Str.
+ /// setTriple - Set all components to the new triple \p Str.
void setTriple(const Twine &Str);
/// setArchName - Set the architecture (first) component of the
@@ -392,11 +398,10 @@ public:
/// @name Static helpers for IDs.
/// @{
- /// getArchTypeName - Get the canonical name for the \arg Kind
- /// architecture.
+ /// getArchTypeName - Get the canonical name for the \p Kind architecture.
static const char *getArchTypeName(ArchType Kind);
- /// getArchTypePrefix - Get the "prefix" canonical name for the \arg Kind
+ /// getArchTypePrefix - Get the "prefix" canonical name for the \p Kind
/// architecture. This is the prefix used by the architecture specific
/// builtins, and is suitable for passing to \see
/// Intrinsic::getIntrinsicForGCCBuiltin().
@@ -404,15 +409,13 @@ public:
/// \return - The architecture prefix, or 0 if none is defined.
static const char *getArchTypePrefix(ArchType Kind);
- /// getVendorTypeName - Get the canonical name for the \arg Kind
- /// vendor.
+ /// getVendorTypeName - Get the canonical name for the \p Kind vendor.
static const char *getVendorTypeName(VendorType Kind);
- /// getOSTypeName - Get the canonical name for the \arg Kind operating
- /// system.
+ /// getOSTypeName - Get the canonical name for the \p Kind operating system.
static const char *getOSTypeName(OSType Kind);
- /// getEnvironmentTypeName - Get the canonical name for the \arg Kind
+ /// getEnvironmentTypeName - Get the canonical name for the \p Kind
/// environment.
static const char *getEnvironmentTypeName(EnvironmentType Kind);
@@ -424,11 +427,6 @@ public:
/// architecture name (e.g., "x86").
static ArchType getArchTypeForLLVMName(StringRef Str);
- /// getArchTypeForDarwinArchName - Get the architecture type for a "Darwin"
- /// architecture name, for example as accepted by "gcc -arch" (see also
- /// arch(3)).
- static ArchType getArchTypeForDarwinArchName(StringRef Str);
-
/// @}
};
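
The new enumerators are reachable through the usual Triple accessors; an illustrative sketch (not part of the patch; the helper names are hypothetical):

    #include "llvm/ADT/Triple.h"

    // Recognize the newly added SPIR architectures.
    bool isSPIRTarget(const llvm::Triple &T) {
      return T.getArch() == llvm::Triple::spir ||
             T.getArch() == llvm::Triple::spir64;
    }

    // Recognize the newly added IBM vendor / AIX OS combination.
    bool isIBMOnAIX(const llvm::Triple &T) {
      return T.getVendor() == llvm::Triple::IBM && T.getOS() == llvm::Triple::AIX;
    }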
diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h
index 9101df8cee37..cc290d51d272 100644
--- a/include/llvm/ADT/Twine.h
+++ b/include/llvm/ADT/Twine.h
@@ -44,7 +44,7 @@ namespace llvm {
/// itself, and renders as an empty string. This can be returned from APIs to
/// effectively nullify any concatenations performed on the result.
///
- /// \b Implementation \n
+ /// \b Implementation
///
/// Given the nature of a Twine, it is not possible for the Twine's
/// concatenation method to construct interior nodes; the result must be
@@ -67,7 +67,7 @@ namespace llvm {
///
/// These invariants are check by \see isValid().
///
- /// \b Efficiency Considerations \n
+ /// \b Efficiency Considerations
///
/// The Twine is designed to yield efficient and small code for common
/// situations. For this reason, the concat() method is inlined so that
@@ -303,37 +303,37 @@ namespace llvm {
LHS.character = static_cast<char>(Val);
}
- /// Construct a twine to print \arg Val as an unsigned decimal integer.
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
explicit Twine(unsigned Val)
: LHSKind(DecUIKind), RHSKind(EmptyKind) {
LHS.decUI = Val;
}
- /// Construct a twine to print \arg Val as a signed decimal integer.
+ /// Construct a twine to print \p Val as a signed decimal integer.
explicit Twine(int Val)
: LHSKind(DecIKind), RHSKind(EmptyKind) {
LHS.decI = Val;
}
- /// Construct a twine to print \arg Val as an unsigned decimal integer.
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
explicit Twine(const unsigned long &Val)
: LHSKind(DecULKind), RHSKind(EmptyKind) {
LHS.decUL = &Val;
}
- /// Construct a twine to print \arg Val as a signed decimal integer.
+ /// Construct a twine to print \p Val as a signed decimal integer.
explicit Twine(const long &Val)
: LHSKind(DecLKind), RHSKind(EmptyKind) {
LHS.decL = &Val;
}
- /// Construct a twine to print \arg Val as an unsigned decimal integer.
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
explicit Twine(const unsigned long long &Val)
: LHSKind(DecULLKind), RHSKind(EmptyKind) {
LHS.decULL = &Val;
}
- /// Construct a twine to print \arg Val as a signed decimal integer.
+ /// Construct a twine to print \p Val as a signed decimal integer.
explicit Twine(const long long &Val)
: LHSKind(DecLLKind), RHSKind(EmptyKind) {
LHS.decLL = &Val;
@@ -370,7 +370,7 @@ namespace llvm {
/// @name Numeric Conversions
/// @{
- // Construct a twine to print \arg Val as an unsigned hexadecimal integer.
+ // Construct a twine to print \p Val as an unsigned hexadecimal integer.
static Twine utohexstr(const uint64_t &Val) {
Child LHS, RHS;
LHS.uHex = &Val;
@@ -447,17 +447,17 @@ namespace llvm {
/// The returned StringRef's size does not include the null terminator.
StringRef toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const;
- /// print - Write the concatenated string represented by this twine to the
- /// stream \arg OS.
+ /// Write the concatenated string represented by this twine to the
+ /// stream \p OS.
void print(raw_ostream &OS) const;
- /// dump - Dump the concatenated string represented by this twine to stderr.
+ /// Dump the concatenated string represented by this twine to stderr.
void dump() const;
- /// print - Write the representation of this twine to the stream \arg OS.
+ /// Write the representation of this twine to the stream \p OS.
void printRepr(raw_ostream &OS) const;
- /// dumpRepr - Dump the representation of this twine to stderr.
+ /// Dump the representation of this twine to stderr.
void dumpRepr() const;
/// @}
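
The integer constructors and utohexstr() documented above are normally consumed in a single expression, since a Twine only references its operands rather than copying them; a hedged sketch (the helper emitLabel is hypothetical):

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/DataTypes.h"
    #include "llvm/Support/raw_ostream.h"

    // Render "BB<Index>_0x<Addr>" without building an intermediate std::string.
    // The whole Twine is built and printed in one statement so that none of the
    // temporary nodes outlives the operands it refers to.
    void emitLabel(llvm::raw_ostream &OS, unsigned Index, const uint64_t &Addr) {
      ("BB" + llvm::Twine(Index) + "_0x" + llvm::Twine::utohexstr(Addr)).print(OS);
    }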
diff --git a/include/llvm/ADT/ValueMap.h b/include/llvm/ADT/ValueMap.h
index f7e255181e23..d23fccf3e8cc 100644
--- a/include/llvm/ADT/ValueMap.h
+++ b/include/llvm/ADT/ValueMap.h
@@ -80,8 +80,8 @@ class ValueMap {
typedef typename Config::ExtraData ExtraData;
MapT Map;
ExtraData Data;
- ValueMap(const ValueMap&); // DO NOT IMPLEMENT
- ValueMap& operator=(const ValueMap&); // DO NOT IMPLEMENT
+ ValueMap(const ValueMap&) LLVM_DELETED_FUNCTION;
+ ValueMap& operator=(const ValueMap&) LLVM_DELETED_FUNCTION;
public:
typedef KeyT key_type;
typedef ValueT mapped_type;
diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index ba9864a98a7e..7f5cd1718142 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h
@@ -38,6 +38,7 @@
#ifndef LLVM_ADT_ILIST_H
#define LLVM_ADT_ILIST_H
+#include "llvm/Support/Compiler.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -331,8 +332,8 @@ class iplist : public Traits {
// No fundamental reason why iplist can't be copyable, but the default
// copy/copy-assign won't do.
- iplist(const iplist &); // do not implement
- void operator=(const iplist &); // do not implement
+ iplist(const iplist &) LLVM_DELETED_FUNCTION;
+ void operator=(const iplist &) LLVM_DELETED_FUNCTION;
public:
typedef NodeTy *pointer;
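
Both of the hunks above replace the old "do not implement" convention with LLVM_DELETED_FUNCTION from llvm/Support/Compiler.h, which is expected to expand to "= delete" when the compiler supports C++11 deleted functions and to nothing otherwise. A minimal sketch of the resulting non-copyable pattern (illustrative only; the class name is hypothetical):

    #include "llvm/Support/Compiler.h"

    class NonCopyableExample {
      NonCopyableExample(const NonCopyableExample &) LLVM_DELETED_FUNCTION;
      void operator=(const NonCopyableExample &) LLVM_DELETED_FUNCTION;
    public:
      NonCopyableExample() {}
    };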
diff --git a/include/llvm/AddressingMode.h b/include/llvm/AddressingMode.h
new file mode 100644
index 000000000000..70b3c05238c5
--- /dev/null
+++ b/include/llvm/AddressingMode.h
@@ -0,0 +1,41 @@
+//===--------- llvm/AddressingMode.h - Addressing Mode -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file contains addressing mode data structures which are shared
+// between LSR and a number of places in the codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADDRESSING_MODE_H
+#define LLVM_ADDRESSING_MODE_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+/// AddrMode - This represents an addressing mode of:
+/// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
+/// If BaseGV is null, there is no BaseGV.
+/// If BaseOffs is zero, there is no base offset.
+/// If HasBaseReg is false, there is no base register.
+/// If Scale is zero, there is no ScaleReg. Scale of 1 indicates a reg with
+/// no scale.
+///
+struct AddrMode {
+ GlobalValue *BaseGV;
+ int64_t BaseOffs;
+ bool HasBaseReg;
+ int64_t Scale;
+ AddrMode() : BaseGV(0), BaseOffs(0), HasBaseReg(false), Scale(0) {}
+};
+
+} // End llvm namespace
+
+#endif
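
To make the field meanings concrete, here is an illustrative (hypothetical) AddrMode describing GV + 16 + 4*IndexReg, i.e. a global base, a constant offset, and a scaled index register:

    #include "llvm/AddressingMode.h"

    llvm::AddrMode makeExampleMode(llvm::GlobalValue *GV) {
      llvm::AddrMode AM;       // the default constructor zeroes every field
      AM.BaseGV = GV;          // non-null: a global base
      AM.BaseOffs = 16;        // constant displacement
      AM.HasBaseReg = false;   // no unscaled base register
      AM.Scale = 4;            // Scale != 0: an index register scaled by 4
      return AM;
    }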
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 674868a02633..be274afd1552 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -45,7 +45,8 @@ namespace llvm {
class LoadInst;
class StoreInst;
class VAArgInst;
-class TargetData;
+class DataLayout;
+class TargetLibraryInfo;
class Pass;
class AnalysisUsage;
class MemTransferInst;
@@ -54,7 +55,8 @@ class DominatorTree;
class AliasAnalysis {
protected:
- const TargetData *TD;
+ const DataLayout *TD;
+ const TargetLibraryInfo *TLI;
private:
AliasAnalysis *AA; // Previous Alias Analysis to chain to.
@@ -73,7 +75,7 @@ protected:
public:
static char ID; // Class identification, replacement for typeinfo
- AliasAnalysis() : TD(0), AA(0) {}
+ AliasAnalysis() : TD(0), TLI(0), AA(0) {}
virtual ~AliasAnalysis(); // We want to be subclassed
/// UnknownSize - This is a special value which can be used with the
@@ -81,12 +83,17 @@ public:
/// know the sizes of the potential memory references.
static uint64_t const UnknownSize = ~UINT64_C(0);
- /// getTargetData - Return a pointer to the current TargetData object, or
- /// null if no TargetData object is available.
+ /// getDataLayout - Return a pointer to the current DataLayout object, or
+ /// null if no DataLayout object is available.
///
- const TargetData *getTargetData() const { return TD; }
+ const DataLayout *getDataLayout() const { return TD; }
- /// getTypeStoreSize - Return the TargetData store size for the given type,
+ /// getTargetLibraryInfo - Return a pointer to the current TargetLibraryInfo
+ /// object, or null if no TargetLibraryInfo object is available.
+ ///
+ const TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
+
+ /// getTypeStoreSize - Return the DataLayout store size for the given type,
/// if known, or a conservative value otherwise.
///
uint64_t getTypeStoreSize(Type *Ty);
@@ -187,6 +194,11 @@ public:
return isNoAlias(Location(V1, V1Size), Location(V2, V2Size));
}
+ /// isNoAlias - A convenience wrapper.
+ bool isNoAlias(const Value *V1, const Value *V2) {
+ return isNoAlias(Location(V1), Location(V2));
+ }
+
/// isMustAlias - A convenience wrapper.
bool isMustAlias(const Location &LocA, const Location &LocB) {
return alias(LocA, LocB) == MustAlias;
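
The new two-pointer isNoAlias overload simply wraps the Location-based query with default (unknown) sizes; a minimal client-side sketch, assuming an AliasAnalysis reference obtained in the usual pass manner (the helper name provablyDistinct is hypothetical):

    #include "llvm/Analysis/AliasAnalysis.h"

    // Returns true only when the analysis can prove the two pointers never alias.
    bool provablyDistinct(llvm::AliasAnalysis &AA,
                          const llvm::Value *P1, const llvm::Value *P2) {
      return AA.isNoAlias(P1, P2);
    }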
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index 95626d624a13..1e606c81d9c7 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -109,7 +109,6 @@ class AliasSet : public ilist_node<AliasSet> {
PointerRec *PtrList, **PtrListEnd; // Doubly linked list of nodes.
AliasSet *Forward; // Forwarding pointer.
- AliasSet *Next, *Prev; // Doubly linked list of AliasSets.
// All instructions without a specific address in this alias set.
std::vector<AssertingVH<Instruction> > UnknownInsts;
@@ -226,8 +225,8 @@ private:
AccessTy(NoModRef), AliasTy(MustAlias), Volatile(false) {
}
- AliasSet(const AliasSet &AS); // do not implement
- void operator=(const AliasSet &AS); // do not implement
+ AliasSet(const AliasSet &AS) LLVM_DELETED_FUNCTION;
+ void operator=(const AliasSet &AS) LLVM_DELETED_FUNCTION;
PointerRec *getSomePointer() const {
return PtrList;
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 006daa082946..c0567daa3a5e 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -28,11 +28,14 @@ class raw_ostream;
///
/// This is a function analysis pass which provides information on the relative
/// probabilities of each "edge" in the function's CFG where such an edge is
-/// defined by a pair of basic blocks. The probability for a given block and
-/// a successor block are always relative to the probabilities of the other
-/// successor blocks. Another way of looking at it is that the probabilities
-/// for a given block B and each of its successors should sum to exactly
-/// one (100%).
+/// defined by a pair (PredBlock and an index in the successors). The
+/// probability of an edge from one block is always relative to the
+/// probabilities of other edges from the block. The probabilities of all edges
+/// from a block sum to exactly one (100%).
+/// We use a pair (PredBlock and an index in the successors) to uniquely
+/// identify an edge, since we can have multiple edges from Src to Dst.
+/// As an example, we can have a switch which jumps to Dst with value 0 and
+/// value 10.
class BranchProbabilityInfo : public FunctionPass {
public:
static char ID;
@@ -52,6 +55,12 @@ public:
/// leaving the 'Src' block. The returned probability is never zero, and can
/// only be one if the source block has only one successor.
BranchProbability getEdgeProbability(const BasicBlock *Src,
+ unsigned IndexInSuccessors) const;
+
+ /// \brief Get the probability of going from Src to Dst.
+ ///
+ /// It returns the sum of all probabilities for edges from Src to Dst.
+ BranchProbability getEdgeProbability(const BasicBlock *Src,
const BasicBlock *Dst) const;
/// \brief Test if an edge is hot relative to other out-edges of the Src.
@@ -74,25 +83,34 @@ public:
raw_ostream &printEdgeProbability(raw_ostream &OS, const BasicBlock *Src,
const BasicBlock *Dst) const;
- /// \brief Get the raw edge weight calculated for the block pair.
+ /// \brief Get the raw edge weight calculated for the edge.
///
/// This returns the raw edge weight. It is guaranteed to fall between 1 and
/// UINT32_MAX. Note that the raw edge weight is not meaningful in isolation.
/// This interface should be used very carefully, and primarily by routines that
/// are updating the analysis by later calling setEdgeWeight.
+ uint32_t getEdgeWeight(const BasicBlock *Src,
+ unsigned IndexInSuccessors) const;
+
+ /// \brief Get the raw edge weight calculated for the block pair.
+ ///
+ /// This returns the sum of all raw edge weights from Src to Dst.
+ /// It is guaranteed to fall between 1 and UINT32_MAX.
uint32_t getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const;
- /// \brief Set the raw edge weight for the block pair.
+ /// \brief Set the raw edge weight for a given edge.
///
- /// This allows a pass to explicitly set the edge weight for a block. It can
+ /// This allows a pass to explicitly set the edge weight for an edge. It can
/// be used when updating the CFG to update and preserve the branch
/// probability information. Read the implementation of how these edge
/// weights are calculated carefully before using!
- void setEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst,
+ void setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors,
uint32_t Weight);
private:
- typedef std::pair<const BasicBlock *, const BasicBlock *> Edge;
+ // Since we allow duplicate edges from one basic block to another, we use
+ // a pair (PredBlock and an index in the successors) to specify an edge.
+ typedef std::pair<const BasicBlock *, unsigned> Edge;
// Default weight value. Used when we don't have information about the edge.
// TODO: DEFAULT_WEIGHT makes sense during static predication, when none of
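
With edges now keyed by (block, successor index), duplicate successors of a switch can be queried individually; a hedged client-side sketch using the accessors declared above (the helper name inspectOutEdges is hypothetical, and the successor count would normally come from the block's terminator):

    #include "llvm/Analysis/BranchProbabilityInfo.h"

    void inspectOutEdges(const llvm::BranchProbabilityInfo &BPI,
                         const llvm::BasicBlock *BB, unsigned NumSuccessors) {
      for (unsigned I = 0; I != NumSuccessors; ++I) {
        uint32_t Weight = BPI.getEdgeWeight(BB, I);                    // raw weight of this edge
        llvm::BranchProbability Prob = BPI.getEdgeProbability(BB, I);  // relative probability
        (void)Weight; (void)Prob;
      }
    }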
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index fb77da7b69ea..6a9ed310375a 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -185,9 +185,9 @@ private:
/// in the CalledFunctions array of this or other CallGraphNodes.
unsigned NumReferences;
- CallGraphNode(const CallGraphNode &); // DO NOT IMPLEMENT
- void operator=(const CallGraphNode &); // DO NOT IMPLEMENT
-
+ CallGraphNode(const CallGraphNode &) LLVM_DELETED_FUNCTION;
+ void operator=(const CallGraphNode &) LLVM_DELETED_FUNCTION;
+
void DropRef() { --NumReferences; }
void AddRef() { ++NumReferences; }
public:
diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h
index 9b5e8425ad29..2889269b957a 100644
--- a/include/llvm/Analysis/CaptureTracking.h
+++ b/include/llvm/Analysis/CaptureTracking.h
@@ -46,7 +46,7 @@ namespace llvm {
/// capture) return false. To search it, return true.
///
/// U->getUser() is always an Instruction.
- virtual bool shouldExplore(Use *U) = 0;
+ virtual bool shouldExplore(Use *U);
/// captured - Information about the pointer was captured by the user of
/// use U. Return true to stop the traversal or false to continue looking
diff --git a/include/llvm/Analysis/CodeMetrics.h b/include/llvm/Analysis/CodeMetrics.h
index 03c807cf8326..4398faa20a7b 100644
--- a/include/llvm/Analysis/CodeMetrics.h
+++ b/include/llvm/Analysis/CodeMetrics.h
@@ -22,11 +22,11 @@ namespace llvm {
class BasicBlock;
class Function;
class Instruction;
- class TargetData;
+ class DataLayout;
class Value;
/// \brief Check whether an instruction is likely to be "free" when lowered.
- bool isInstructionFree(const Instruction *I, const TargetData *TD = 0);
+ bool isInstructionFree(const Instruction *I, const DataLayout *TD = 0);
/// \brief Check whether a call will lower to something small.
///
@@ -85,10 +85,10 @@ namespace llvm {
NumRets(0) {}
/// \brief Add information about a block to the current state.
- void analyzeBasicBlock(const BasicBlock *BB, const TargetData *TD = 0);
+ void analyzeBasicBlock(const BasicBlock *BB, const DataLayout *TD = 0);
/// \brief Add information about a function to the current state.
- void analyzeFunction(Function *F, const TargetData *TD = 0);
+ void analyzeFunction(Function *F, const DataLayout *TD = 0);
};
}
diff --git a/include/llvm/Analysis/ConstantFolding.h b/include/llvm/Analysis/ConstantFolding.h
index 2fdef5f0836e..12e623ea9be4 100644
--- a/include/llvm/Analysis/ConstantFolding.h
+++ b/include/llvm/Analysis/ConstantFolding.h
@@ -12,7 +12,7 @@
//
// Also, to supplement the basic VMCore ConstantExpr simplifications,
// this file declares some additional folding routines that can make use of
-// TargetData information. These functions cannot go in VMCore due to library
+// DataLayout information. These functions cannot go in VMCore due to library
// dependency issues.
//
//===----------------------------------------------------------------------===//
@@ -24,7 +24,7 @@ namespace llvm {
class Constant;
class ConstantExpr;
class Instruction;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
class Function;
class Type;
@@ -36,14 +36,14 @@ namespace llvm {
/// Note that this fails if not all of the operands are constant. Otherwise,
/// this function can only fail when attempting to fold instructions like loads
/// and stores, which have no constant expression form.
-Constant *ConstantFoldInstruction(Instruction *I, const TargetData *TD = 0,
+Constant *ConstantFoldInstruction(Instruction *I, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0);
/// ConstantFoldConstantExpression - Attempt to fold the constant expression
-/// using the specified TargetData. If successful, the constant result is
+/// using the specified DataLayout. If successful, the constant result is
/// returned; if not, null is returned.
Constant *ConstantFoldConstantExpression(const ConstantExpr *CE,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0);
/// ConstantFoldInstOperands - Attempt to constant fold an instruction with the
@@ -54,7 +54,7 @@ Constant *ConstantFoldConstantExpression(const ConstantExpr *CE,
///
Constant *ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
ArrayRef<Constant *> Ops,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0);
/// ConstantFoldCompareInstOperands - Attempt to constant fold a compare
@@ -63,7 +63,7 @@ Constant *ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
///
Constant *ConstantFoldCompareInstOperands(unsigned Predicate,
Constant *LHS, Constant *RHS,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0);
/// ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue
@@ -75,7 +75,7 @@ Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
/// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
/// produce if it is constant and determinable. If this is not determinable,
/// return null.
-Constant *ConstantFoldLoadFromConstPtr(Constant *C, const TargetData *TD = 0);
+Constant *ConstantFoldLoadFromConstPtr(Constant *C, const DataLayout *TD = 0);
/// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
/// getelementptr constantexpr, return the constant value being addressed by the
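
The signature change here is mechanical: every entry point that used to take an optional TargetData now takes an optional DataLayout. An illustrative call site (not part of the patch; the helper name tryFold is hypothetical):

    #include "llvm/Analysis/ConstantFolding.h"

    // Returns the folded constant, or null if I cannot be reduced to one.
    llvm::Constant *tryFold(llvm::Instruction *I, const llvm::DataLayout *DL,
                            const llvm::TargetLibraryInfo *TLI) {
      return llvm::ConstantFoldInstruction(I, DL, TLI);
    }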
diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h
new file mode 100644
index 000000000000..b4327eeb0b1e
--- /dev/null
+++ b/include/llvm/Analysis/DependenceAnalysis.h
@@ -0,0 +1,885 @@
+//===-- llvm/Analysis/DependenceAnalysis.h -------------------- -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// DependenceAnalysis is an LLVM pass that analyses dependences between memory
+// accesses. Currently, it is an implementation of the approach described in
+//
+// Practical Dependence Testing
+// Goff, Kennedy, Tseng
+// PLDI 1991
+//
+// There's a single entry point that analyzes the dependence between a pair
+// of memory references in a function, returning either NULL, for no dependence,
+// or a more-or-less detailed description of the dependence between them.
+//
+// Please note that this is work in progress and the interface is subject to
+// change.
+//
+// Plausible changes:
+// Return a set of more precise dependences instead of just one dependence
+// summarizing all.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_DEPENDENCEANALYSIS_H
+#define LLVM_ANALYSIS_DEPENDENCEANALYSIS_H
+
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+namespace llvm {
+ class AliasAnalysis;
+ class Loop;
+ class LoopInfo;
+ class ScalarEvolution;
+ class SCEV;
+ class SCEVConstant;
+ class raw_ostream;
+
+ /// Dependence - This class represents a dependence between two memory
+ /// references in a function. It contains minimal information and
+ /// is used in the very common situation where the compiler is unable to
+ /// determine anything beyond the existence of a dependence; that is, it
+ /// represents a confused dependence (see also FullDependence). In most
+ /// cases (for output, flow, and anti dependences), the dependence implies
+ /// an ordering, where the source must precede the destination; in contrast,
+ /// input dependences are unordered.
+ class Dependence {
+ public:
+ Dependence(const Instruction *Source,
+ const Instruction *Destination) :
+ Src(Source), Dst(Destination) {}
+ virtual ~Dependence() {}
+
+ /// Dependence::DVEntry - Each level in the distance/direction vector
+ /// has a direction (or perhaps a union of several directions), and
+ /// perhaps a distance.
+ struct DVEntry {
+ enum { NONE = 0,
+ LT = 1,
+ EQ = 2,
+ LE = 3,
+ GT = 4,
+ NE = 5,
+ GE = 6,
+ ALL = 7 };
+ unsigned char Direction : 3; // Init to ALL, then refine.
+ bool Scalar : 1; // Init to true.
+ bool PeelFirst : 1; // Peeling the first iteration will break dependence.
+ bool PeelLast : 1; // Peeling the last iteration will break the dependence.
+ bool Splitable : 1; // Splitting the loop will break dependence.
+ const SCEV *Distance; // NULL implies no distance available.
+ DVEntry() : Direction(ALL), Scalar(true), PeelFirst(false),
+ PeelLast(false), Splitable(false), Distance(NULL) { }
+ };
+
+ /// getSrc - Returns the source instruction for this dependence.
+ ///
+ const Instruction *getSrc() const { return Src; }
+
+ /// getDst - Returns the destination instruction for this dependence.
+ ///
+ const Instruction *getDst() const { return Dst; }
+
+ /// isInput - Returns true if this is an input dependence.
+ ///
+ bool isInput() const;
+
+ /// isOutput - Returns true if this is an output dependence.
+ ///
+ bool isOutput() const;
+
+ /// isFlow - Returns true if this is a flow (aka true) dependence.
+ ///
+ bool isFlow() const;
+
+ /// isAnti - Returns true if this is an anti dependence.
+ ///
+ bool isAnti() const;
+
+ /// isOrdered - Returns true if dependence is Output, Flow, or Anti
+ ///
+ bool isOrdered() const { return isOutput() || isFlow() || isAnti(); }
+
+ /// isUnordered - Returns true if dependence is Input
+ ///
+ bool isUnordered() const { return isInput(); }
+
+ /// isLoopIndependent - Returns true if this is a loop-independent
+ /// dependence.
+ virtual bool isLoopIndependent() const { return true; }
+
+ /// isConfused - Returns true if this dependence is confused
+ /// (the compiler understands nothing and makes worst-case
+ /// assumptions).
+ virtual bool isConfused() const { return true; }
+
+ /// isConsistent - Returns true if this dependence is consistent
+ /// (occurs every time the source and destination are executed).
+ virtual bool isConsistent() const { return false; }
+
+ /// getLevels - Returns the number of common loops surrounding the
+ /// source and destination of the dependence.
+ virtual unsigned getLevels() const { return 0; }
+
+ /// getDirection - Returns the direction associated with a particular
+ /// level.
+ virtual unsigned getDirection(unsigned Level) const { return DVEntry::ALL; }
+
+ /// getDistance - Returns the distance (or NULL) associated with a
+ /// particular level.
+ virtual const SCEV *getDistance(unsigned Level) const { return NULL; }
+
+ /// isPeelFirst - Returns true if peeling the first iteration from
+ /// this loop will break this dependence.
+ virtual bool isPeelFirst(unsigned Level) const { return false; }
+
+ /// isPeelLast - Returns true if peeling the last iteration from
+ /// this loop will break this dependence.
+ virtual bool isPeelLast(unsigned Level) const { return false; }
+
+ /// isSplitable - Returns true if splitting this loop will break
+ /// the dependence.
+ virtual bool isSplitable(unsigned Level) const { return false; }
+
+ /// isScalar - Returns true if a particular level is scalar; that is,
+ /// if no subscript in the source or destination mention the induction
+ /// variable associated with the loop at this level.
+ virtual bool isScalar(unsigned Level) const;
+
+ /// dump - For debugging purposes, dumps a dependence to OS.
+ ///
+ void dump(raw_ostream &OS) const;
+ private:
+ const Instruction *Src, *Dst;
+ friend class DependenceAnalysis;
+ };
+
+
+ /// FullDependence - This class represents a dependence between two memory
+ /// references in a function. It contains detailed information about the
+ /// dependence (direction vectors, etc) and is used when the compiler is
+ /// able to accurately analyze the interaction of the references; that is,
+ /// it is not a confused dependence (see Dependence). In most cases
+ /// (for output, flow, and anti dependences), the dependence implies an
+ /// ordering, where the source must precede the destination; in contrast,
+ /// input dependences are unordered.
+ class FullDependence : public Dependence {
+ public:
+ FullDependence(const Instruction *Src,
+ const Instruction *Dst,
+ bool LoopIndependent,
+ unsigned Levels);
+ ~FullDependence() {
+ delete DV;
+ }
+
+ /// isLoopIndependent - Returns true if this is a loop-independent
+ /// dependence.
+ bool isLoopIndependent() const { return LoopIndependent; }
+
+ /// isConfused - Returns true if this dependence is confused
+ /// (the compiler understands nothing and makes worst-case
+ /// assumptions).
+ bool isConfused() const { return false; }
+
+ /// isConsistent - Returns true if this dependence is consistent
+ /// (occurs every time the source and destination are executed).
+ bool isConsistent() const { return Consistent; }
+
+ /// getLevels - Returns the number of common loops surrounding the
+ /// source and destination of the dependence.
+ unsigned getLevels() const { return Levels; }
+
+ /// getDirection - Returns the direction associated with a particular
+ /// level.
+ unsigned getDirection(unsigned Level) const;
+
+ /// getDistance - Returns the distance (or NULL) associated with a
+ /// particular level.
+ const SCEV *getDistance(unsigned Level) const;
+
+ /// isPeelFirst - Returns true if peeling the first iteration from
+ /// this loop will break this dependence.
+ bool isPeelFirst(unsigned Level) const;
+
+ /// isPeelLast - Returns true if peeling the last iteration from
+ /// this loop will break this dependence.
+ bool isPeelLast(unsigned Level) const;
+
+ /// isSplitable - Returns true if splitting the loop will break
+ /// the dependence.
+ bool isSplitable(unsigned Level) const;
+
+ /// isScalar - Returns true if a particular level is scalar; that is,
+ /// if no subscript in the source or destination mention the induction
+ /// variable associated with the loop at this level.
+ bool isScalar(unsigned Level) const;
+ private:
+ unsigned short Levels;
+ bool LoopIndependent;
+ bool Consistent; // Init to true, then refine.
+ DVEntry *DV;
+ friend class DependenceAnalysis;
+ };
+
+
+ /// DependenceAnalysis - This class is the main dependence-analysis driver.
+ ///
+ class DependenceAnalysis : public FunctionPass {
+ void operator=(const DependenceAnalysis &); // do not implement
+ DependenceAnalysis(const DependenceAnalysis &); // do not implement
+ public:
+ /// depends - Tests for a dependence between the Src and Dst instructions.
+ /// Returns NULL if no dependence; otherwise, returns a Dependence (or a
+ /// FullDependence) with as much information as can be gleaned.
+ /// The flag PossiblyLoopIndependent should be set by the caller
+ /// if it appears that control flow can reach from Src to Dst
+ /// without traversing a loop back edge.
+ Dependence *depends(const Instruction *Src,
+ const Instruction *Dst,
+ bool PossiblyLoopIndependent);
+
+ /// getSplitIteration - Give a dependence that's splitable at some
+ /// particular level, return the iteration that should be used to split
+ /// the loop.
+ ///
+ /// Generally, the dependence analyzer will be used to build
+ /// a dependence graph for a function (basically a map from instructions
+ /// to dependences). Looking for cycles in the graph shows us loops
+ /// that cannot be trivially vectorized/parallelized.
+ ///
+ /// We can try to improve the situation by examining all the dependences
+ /// that make up the cycle, looking for ones we can break.
+ /// Sometimes, peeling the first or last iteration of a loop will break
+ /// dependences, and there are flags for those possibilities.
+ /// Sometimes, splitting a loop at some other iteration will do the trick,
+ /// and we've got a flag for that case. Rather than waste the space to
+ /// record the exact iteration (since we rarely know), we provide
+ /// a method that calculates the iteration. It's a drag that it must work
+ /// from scratch, but wonderful in that it's possible.
+ ///
+ /// Here's an example:
+ ///
+ /// for (i = 0; i < 10; i++)
+ /// A[i] = ...
+ /// ... = A[11 - i]
+ ///
+ /// There's a loop-carried flow dependence from the store to the load,
+ /// found by the weak-crossing SIV test. The dependence will have a flag,
+ /// indicating that the dependence can be broken by splitting the loop.
+ /// Calling getSplitIteration will return 5.
+ /// Splitting the loop breaks the dependence, like so:
+ ///
+ /// for (i = 0; i <= 5; i++)
+ /// A[i] = ...
+ /// ... = A[11 - i]
+ /// for (i = 6; i < 10; i++)
+ /// A[i] = ...
+ /// ... = A[11 - i]
+ ///
+ /// breaks the dependence and allows us to vectorize/parallelize
+ /// both loops.
+ const SCEV *getSplitIteration(const Dependence *Dep, unsigned Level);
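
As an illustrative client-side sketch (not part of the new header, and assuming the caller owns the Dependence returned by depends()), the splitting workflow described above might look like this; the helper name findSplit is hypothetical:

    #include "llvm/Analysis/DependenceAnalysis.h"

    // Find the first splittable level of the dependence between Src and Dst and
    // return the iteration at which to split, or null if there is nothing to do.
    const llvm::SCEV *findSplit(llvm::DependenceAnalysis &DA,
                                llvm::Instruction *Src, llvm::Instruction *Dst) {
      llvm::Dependence *D = DA.depends(Src, Dst, /*PossiblyLoopIndependent=*/true);
      if (!D)
        return 0;                                 // no dependence at all
      const llvm::SCEV *Split = 0;
      for (unsigned L = 1, E = D->getLevels(); !Split && L <= E; ++L)
        if (D->isSplitable(L))
          Split = DA.getSplitIteration(D, L);     // e.g. 5 in the worked example above
      delete D;
      return Split;
    }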
+
+ private:
+ AliasAnalysis *AA;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ Function *F;
+
+ /// Subscript - This private struct represents a pair of subscripts from
+ /// a pair of potentially multi-dimensional array references. We use a
+ /// vector of them to guide subscript partitioning.
+ struct Subscript {
+ const SCEV *Src;
+ const SCEV *Dst;
+ enum ClassificationKind { ZIV, SIV, RDIV, MIV, NonLinear } Classification;
+ SmallBitVector Loops;
+ SmallBitVector GroupLoops;
+ SmallBitVector Group;
+ };
+
+ struct CoefficientInfo {
+ const SCEV *Coeff;
+ const SCEV *PosPart;
+ const SCEV *NegPart;
+ const SCEV *Iterations;
+ };
+
+ struct BoundInfo {
+ const SCEV *Iterations;
+ const SCEV *Upper[8];
+ const SCEV *Lower[8];
+ unsigned char Direction;
+ unsigned char DirSet;
+ };
+
+ /// Constraint - This private class represents a constraint, as defined
+ /// in the paper
+ ///
+ /// Practical Dependence Testing
+ /// Goff, Kennedy, Tseng
+ /// PLDI 1991
+ ///
+ /// There are 5 kinds of constraint, in a hierarchy.
+ /// 1) Any - indicates no constraint, any dependence is possible.
+ /// 2) Line - A line ax + by = c, where a, b, and c are parameters,
+ /// representing the dependence equation.
+ /// 3) Distance - The value d of the dependence distance;
+ /// 4) Point - A point <x, y> representing the dependence from
+ /// iteration x to iteration y.
+ /// 5) Empty - No dependence is possible.
+ class Constraint {
+ private:
+ enum ConstraintKind { Empty, Point, Distance, Line, Any } Kind;
+ ScalarEvolution *SE;
+ const SCEV *A;
+ const SCEV *B;
+ const SCEV *C;
+ const Loop *AssociatedLoop;
+ public:
+ /// isEmpty - Return true if the constraint is of kind Empty.
+ bool isEmpty() const { return Kind == Empty; }
+
+ /// isPoint - Return true if the constraint is of kind Point.
+ bool isPoint() const { return Kind == Point; }
+
+ /// isDistance - Return true if the constraint is of kind Distance.
+ bool isDistance() const { return Kind == Distance; }
+
+ /// isLine - Return true if the constraint is of kind Line.
+ /// Since Distances can also be represented as Lines, we also return
+ /// true if the constraint is of kind Distance.
+ bool isLine() const { return Kind == Line || Kind == Distance; }
+
+ /// isAny - Return true if the constraint is of kind Any.
+ bool isAny() const { return Kind == Any; }
+
+ /// getX - If constraint is a point <X, Y>, returns X.
+ /// Otherwise assert.
+ const SCEV *getX() const;
+
+ /// getY - If constraint is a point <X, Y>, returns Y.
+ /// Otherwise assert.
+ const SCEV *getY() const;
+
+ /// getA - If constraint is a line AX + BY = C, returns A.
+ /// Otherwise assert.
+ const SCEV *getA() const;
+
+ /// getB - If constraint is a line AX + BY = C, returns B.
+ /// Otherwise assert.
+ const SCEV *getB() const;
+
+ /// getC - If constraint is a line AX + BY = C, returns C.
+ /// Otherwise assert.
+ const SCEV *getC() const;
+
+ /// getD - If constraint is a distance, returns D.
+ /// Otherwise assert.
+ const SCEV *getD() const;
+
+ /// getAssociatedLoop - Returns the loop associated with this constraint.
+ const Loop *getAssociatedLoop() const;
+
+ /// setPoint - Change a constraint to Point.
+ void setPoint(const SCEV *X, const SCEV *Y, const Loop *CurrentLoop);
+
+ /// setLine - Change a constraint to Line.
+ void setLine(const SCEV *A, const SCEV *B,
+ const SCEV *C, const Loop *CurrentLoop);
+
+ /// setDistance - Change a constraint to Distance.
+ void setDistance(const SCEV *D, const Loop *CurrentLoop);
+
+ /// setEmpty - Change a constraint to Empty.
+ void setEmpty();
+
+ /// setAny - Change a constraint to Any.
+ void setAny(ScalarEvolution *SE);
+
+ /// dump - For debugging purposes. Dumps the constraint
+ /// out to OS.
+ void dump(raw_ostream &OS) const;
+ };
+
+
+ /// establishNestingLevels - Examines the loop nesting of the Src and Dst
+ /// instructions and establishes their shared loops. Sets the variables
+ /// CommonLevels, SrcLevels, and MaxLevels.
+ /// The source and destination instructions needn't be contained in the same
+ /// loop. The routine establishNestingLevels finds the level of most deeply
+ /// nested loop that contains them both, CommonLevels. An instruction that's
+ /// not contained in a loop is at level = 0. MaxLevels is equal to the level
+ /// of the source plus the level of the destination, minus CommonLevels.
+ /// This lets us allocate vectors MaxLevels in length, with room for every
+ /// distinct loop referenced in both the source and destination subscripts.
+ /// The variable SrcLevels is the nesting depth of the source instruction.
+ /// It's used to help calculate distinct loops referenced by the destination.
+ /// Here's the map from loops to levels:
+ /// 0 - unused
+ /// 1 - outermost common loop
+ /// ... - other common loops
+ /// CommonLevels - innermost common loop
+ /// ... - loops containing Src but not Dst
+ /// SrcLevels - innermost loop containing Src but not Dst
+ /// ... - loops containing Dst but not Src
+ /// MaxLevels - innermost loop containing Dst but not Src
+ /// Consider the following code fragment:
+ /// for (a = ...) {
+ /// for (b = ...) {
+ /// for (c = ...) {
+ /// for (d = ...) {
+ /// A[] = ...;
+ /// }
+ /// }
+ /// for (e = ...) {
+ /// for (f = ...) {
+ /// for (g = ...) {
+ /// ... = A[];
+ /// }
+ /// }
+ /// }
+ /// }
+ /// }
+ /// If we're looking at the possibility of a dependence between the store
+ /// to A (the Src) and the load from A (the Dst), we'll note that they
+ /// have 2 loops in common, so CommonLevels will equal 2 and the direction
+ /// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7.
+ /// A map from loop names to level indices would look like
+ /// a - 1
+ /// b - 2 = CommonLevels
+ /// c - 3
+ /// d - 4 = SrcLevels
+ /// e - 5
+ /// f - 6
+ /// g - 7 = MaxLevels
+ void establishNestingLevels(const Instruction *Src,
+ const Instruction *Dst);
+
+ unsigned CommonLevels, SrcLevels, MaxLevels;
+
+ /// mapSrcLoop - Given one of the loops containing the source, return
+ /// its level index in our numbering scheme.
+ unsigned mapSrcLoop(const Loop *SrcLoop) const;
+
+ /// mapDstLoop - Given one of the loops containing the destination,
+ /// return its level index in our numbering scheme.
+ unsigned mapDstLoop(const Loop *DstLoop) const;
+
+ /// isLoopInvariant - Returns true if Expression is loop invariant
+ /// in LoopNest.
+ bool isLoopInvariant(const SCEV *Expression, const Loop *LoopNest) const;
+
+ /// removeMatchingExtensions - Examines a subscript pair.
+ /// If the source and destination are identically sign (or zero)
+ /// extended, it strips off the extension in an effort to
+ /// simplify the actual analysis.
+ void removeMatchingExtensions(Subscript *Pair);
+
+ /// collectCommonLoops - Finds the set of loops from the LoopNest that
+ /// have a level <= CommonLevels and are referred to by the SCEV Expression.
+ void collectCommonLoops(const SCEV *Expression,
+ const Loop *LoopNest,
+ SmallBitVector &Loops) const;
+
+ /// checkSrcSubscript - Examines the SCEV Src, returning true iff it's
+ /// linear. Collects the set of loops mentioned by Src.
+ bool checkSrcSubscript(const SCEV *Src,
+ const Loop *LoopNest,
+ SmallBitVector &Loops);
+
+ /// checkDstSubscript - Examines the SCEV Dst, returning true iff it's
+ /// linear. Collects the set of loops mentioned by Dst.
+ bool checkDstSubscript(const SCEV *Dst,
+ const Loop *LoopNest,
+ SmallBitVector &Loops);
+
+ /// isKnownPredicate - Compare X and Y using the predicate Pred.
+ /// Basically a wrapper for SCEV::isKnownPredicate,
+ /// but tries harder, especially in the presence of sign and zero
+ /// extensions and symbolics.
+ bool isKnownPredicate(ICmpInst::Predicate Pred,
+ const SCEV *X,
+ const SCEV *Y) const;
+
+ /// collectUpperBound - All subscripts are the same type (on my machine,
+ /// an i64). The loop bound may be a smaller type. collectUpperBound
+ /// finds the bound, if available, and zero extends it to the type T.
+ /// (We zero extend since the bound should always be >= 0.)
+ /// If no upper bound is available, return NULL.
+ const SCEV *collectUpperBound(const Loop *l, Type *T) const;
+
+ /// collectConstantUpperBound - Calls collectUpperBound(), then
+ /// attempts to cast it to SCEVConstant. If the cast fails,
+ /// returns NULL.
+ const SCEVConstant *collectConstantUpperBound(const Loop *l, Type *T) const;
+
+ /// classifyPair - Examines the subscript pair (the Src and Dst SCEVs)
+ /// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
+ /// Collects the associated loops in a set.
+ Subscript::ClassificationKind classifyPair(const SCEV *Src,
+ const Loop *SrcLoopNest,
+ const SCEV *Dst,
+ const Loop *DstLoopNest,
+ SmallBitVector &Loops);
+
+ /// testZIV - Tests the ZIV subscript pair (Src and Dst) for dependence.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// If the dependence isn't proven to exist,
+ /// marks the Result as inconsistent.
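+ ///
+ /// For example (illustrative), the ZIV pair A[5] and A[n + 1], where n is
+ /// loop invariant, is proved independent whenever SCEV can show 5 != n + 1;
+ /// if it cannot, a dependence is conservatively assumed.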
+ bool testZIV(const SCEV *Src,
+ const SCEV *Dst,
+ FullDependence &Result) const;
+
+ /// testSIV - Tests the SIV subscript pair (Src and Dst) for dependence.
+ /// Things of the form [c1 + a1*i] and [c2 + a2*j], where
+ /// i and j are induction variables, c1 and c2 are loop invariant,
+ /// and a1 and a2 are constant.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Sets appropriate direction vector entry and, when possible,
+ /// the distance vector entry.
+ /// If the dependence isn't proven to exist,
+ /// marks the Result as inconsistent.
+ bool testSIV(const SCEV *Src,
+ const SCEV *Dst,
+ unsigned &Level,
+ FullDependence &Result,
+ Constraint &NewConstraint,
+ const SCEV *&SplitIter) const;
+
+ /// testRDIV - Tests the RDIV subscript pair (Src and Dst) for dependence.
+ /// Things of the form [c1 + a1*i] and [c2 + a2*j]
+ /// where i and j are induction variables, c1 and c2 are loop invariant,
+ /// and a1 and a2 are constant.
+ /// With minor algebra, this test can also be used for things like
+ /// [c1 + a1*i + a2*j][c2].
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Marks the Result as inconsistent.
+ bool testRDIV(const SCEV *Src,
+ const SCEV *Dst,
+ FullDependence &Result) const;
+
+ /// testMIV - Tests the MIV subscript pair (Src and Dst) for dependence.
+ /// Returns true if dependence disproved.
+ /// Can sometimes refine direction vectors.
+ bool testMIV(const SCEV *Src,
+ const SCEV *Dst,
+ const SmallBitVector &Loops,
+ FullDependence &Result) const;
+
+ /// strongSIVtest - Tests the strong SIV subscript pair (Src and Dst)
+ /// for dependence.
+ /// Things of the form [c1 + a*i] and [c2 + a*i],
+ /// where i is an induction variable, c1 and c2 are loop invariant,
+ /// and a is a constant.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Sets appropriate direction and distance.
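+ ///
+ /// Illustrative sketch: for [c1 + a*i] and [c2 + a*i], a dependence
+ /// requires that a evenly divide c2 - c1, and the resulting distance
+ /// must not exceed the loop's iteration count; otherwise the dependence
+ /// is disproved and true is returned.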
+ bool strongSIVtest(const SCEV *Coeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurrentLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const;
+
+ /// weakCrossingSIVtest - Tests the weak-crossing SIV subscript pair
+ /// (Src and Dst) for dependence.
+ /// Things of the form [c1 + a*i] and [c2 - a*i],
+ /// where i is an induction variable, c1 and c2 are loop invariant,
+ /// and a is a constant.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Sets appropriate direction entry.
+ /// Set consistent to false.
+ /// Marks the dependence as splittable.
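+ ///
+ /// Intuitively (a rough sketch), any dependences between [c1 + a*i] and
+ /// [c2 - a*i] cross around the iteration i = (c2 - c1)/(2*a); splitting
+ /// the loop there (the iteration returned in SplitIter) is what makes the
+ /// dependence splittable.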
+ bool weakCrossingSIVtest(const SCEV *SrcCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurrentLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint,
+ const SCEV *&SplitIter) const;
+
+ /// exactSIVtest - Tests the SIV subscript pair
+ /// (Src and Dst) for dependence.
+ /// Things of the form [c1 + a1*i] and [c2 + a2*i],
+ /// where i is an induction variable, c1 and c2 are loop invariant,
+ /// and a1 and a2 are constant.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Sets appropriate direction entry.
+ /// Set consistent to false.
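+ ///
+ /// Roughly, this solves the Diophantine equation a1*i - a2*i' = c2 - c1;
+ /// a solution can exist only if gcd(a1, a2) divides c2 - c1, and any
+ /// solution must also fall within the loop bounds.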
+ bool exactSIVtest(const SCEV *SrcCoeff,
+ const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurrentLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const;
+
+ /// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair
+ /// (Src and Dst) for dependence.
+ /// Things of the form [c1] and [c2 + a*i],
+ /// where i is an induction variable, c1 and c2 are loop invariant,
+ /// and a is a constant. See also weakZeroDstSIVtest.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Sets appropriate direction entry.
+ /// Set consistent to false.
+ /// If loop peeling will break the dependence, mark appropriately.
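+ ///
+ /// Sketch: [c1] and [c2 + a*i] can only conflict at the single iteration
+ /// i = (c1 - c2)/a; if that iteration is the first or last one of the
+ /// loop, peeling it off breaks the dependence.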
+ bool weakZeroSrcSIVtest(const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurrentLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const;
+
+ /// weakZeroDstSIVtest - Tests the weak-zero SIV subscript pair
+ /// (Src and Dst) for dependence.
+ /// Things of the form [c1 + a*i] and [c2],
+ /// where i is an induction variable, c1 and c2 are loop invariant,
+ /// and a is a constant. See also weakZeroSrcSIVtest.
+ /// Returns true if any possible dependence is disproved.
+ /// If there might be a dependence, returns false.
+ /// Sets appropriate direction entry.
+ /// Set consistent to false.
+ /// If loop peeling will break the dependence, mark appropriately.
+ bool weakZeroDstSIVtest(const SCEV *SrcCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurrentLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const;
+
+ /// exactRDIVtest - Tests the RDIV subscript pair for dependence.
+ /// Things of the form [c1 + a*i] and [c2 + b*j],
+ /// where i and j are induction variables, c1 and c2 are loop invariant,
+ /// and a and b are constants.
+ /// Returns true if any possible dependence is disproved.
+ /// Marks the result as inconsistent.
+ /// Works in some cases that symbolicRDIVtest doesn't,
+ /// and vice versa.
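+ ///
+ /// This is essentially the same Diophantine approach as the exact SIV
+ /// test, except that i and j range over two different loops, so each
+ /// variable is checked against its own loop's bounds.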
+ bool exactRDIVtest(const SCEV *SrcCoeff,
+ const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *SrcLoop,
+ const Loop *DstLoop,
+ FullDependence &Result) const;
+
+ /// symbolicRDIVtest - Tests the RDIV subscript pair for dependence.
+ /// Things of the form [c1 + a*i] and [c2 + b*j],
+ /// where i and j are induction variables, c1 and c2 are loop invariant,
+ /// and a and b are constants.
+ /// Returns true if any possible dependence is disproved.
+ /// Marks the result as inconsistent.
+ /// Works in some cases that exactRDIVtest doesn't,
+ /// and vice versa. Can also be used as a backup for
+ /// ordinary SIV tests.
+ bool symbolicRDIVtest(const SCEV *SrcCoeff,
+ const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *SrcLoop,
+ const Loop *DstLoop) const;
+
+ /// gcdMIVtest - Tests an MIV subscript pair for dependence.
+ /// Returns true if any possible dependence is disproved.
+ /// Marks the result as inconsistent.
+ /// Can sometimes disprove the equal direction for 1 or more loops.
+ /// Can handle some symbolics that even the SIV tests don't get,
+ /// so we use it as a backup for everything.
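+ ///
+ /// Classic example (illustrative): for A[2*i + 2*j] and A[2*i + 2*j + 1],
+ /// the gcd of the coefficients is 2, which does not divide the constant
+ /// difference 1, so no dependence is possible.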
+ bool gcdMIVtest(const SCEV *Src,
+ const SCEV *Dst,
+ FullDependence &Result) const;
+
+ /// banerjeeMIVtest - Tests an MIV subscript pair for dependence.
+ /// Returns true if any possible dependence is disproved.
+ /// Marks the result as inconsistent.
+ /// Computes directions.
+ bool banerjeeMIVtest(const SCEV *Src,
+ const SCEV *Dst,
+ const SmallBitVector &Loops,
+ FullDependence &Result) const;
+
+ /// collectCoeffInfo - Walks through the subscript,
+ /// collecting each coefficient and the associated loop bounds,
+ /// and recording its positive and negative parts for later use.
+ CoefficientInfo *collectCoeffInfo(const SCEV *Subscript,
+ bool SrcFlag,
+ const SCEV *&Constant) const;
+
+ /// getPositivePart - X^+ = max(X, 0).
+ ///
+ const SCEV *getPositivePart(const SCEV *X) const;
+
+ /// getNegativePart - X^- = min(X, 0).
+ ///
+ const SCEV *getNegativePart(const SCEV *X) const;
+
+ /// getLowerBound - Looks through all the bounds info and
+ /// computes the lower bound given the current direction settings
+ /// at each level.
+ const SCEV *getLowerBound(BoundInfo *Bound) const;
+
+ /// getUpperBound - Looks through all the bounds info and
+ /// computes the upper bound given the current direction settings
+ /// at each level.
+ const SCEV *getUpperBound(BoundInfo *Bound) const;
+
+ /// exploreDirections - Hierarchically expands the direction vector
+ /// search space, combining the directions of discovered dependences
+ /// in the DirSet field of Bound. Returns the number of distinct
+ /// dependences discovered. If the dependence is disproved,
+ /// it will return 0.
+ unsigned exploreDirections(unsigned Level,
+ CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ const SmallBitVector &Loops,
+ unsigned &DepthExpanded,
+ const SCEV *Delta) const;
+
+ /// testBounds - Returns true iff the current bounds are plausible.
+ ///
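+ /// Sketch of the idea: for the direction chosen at this level, the
+ /// Banerjee-style lower and upper bounds recorded in Bound must bracket
+ /// Delta (roughly, the difference of the subscripts' constant terms);
+ /// if Delta falls outside that range, the direction is refuted.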
+ bool testBounds(unsigned char DirKind,
+ unsigned Level,
+ BoundInfo *Bound,
+ const SCEV *Delta) const;
+
+ /// findBoundsALL - Computes the upper and lower bounds for level K
+ /// using the * direction. Records them in Bound.
+ void findBoundsALL(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const;
+
+ /// findBoundsLT - Computes the upper and lower bounds for level K
+ /// using the < direction. Records them in Bound.
+ void findBoundsLT(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const;
+
+ /// findBoundsGT - Computes the upper and lower bounds for level K
+ /// using the > direction. Records them in Bound.
+ void findBoundsGT(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const;
+
+ /// findBoundsEQ - Computes the upper and lower bounds for level K
+ /// using the = direction. Records them in Bound.
+ void findBoundsEQ(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const;
+
+ /// intersectConstraints - Updates X with the intersection
+ /// of the Constraints X and Y. Returns true if X has changed.
+ bool intersectConstraints(Constraint *X,
+ const Constraint *Y);
+
+ /// propagate - Review the constraints, looking for opportunities
+ /// to simplify a subscript pair (Src and Dst).
+ /// Return true if some simplification occurs.
+ /// If the simplification isn't exact (that is, if it is conservative
+ /// in terms of dependence), set consistent to false.
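+ ///
+ /// For instance, a Distance constraint d already established for one loop
+ /// can be substituted into the pair so that the terms in that loop's
+ /// induction variable combine or cancel, often reducing an MIV subscript
+ /// to a simpler SIV or ZIV one.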
+ bool propagate(const SCEV *&Src,
+ const SCEV *&Dst,
+ SmallBitVector &Loops,
+ SmallVector<Constraint, 4> &Constraints,
+ bool &Consistent);
+
+ /// propagateDistance - Attempt to propagate a distance
+ /// constraint into a subscript pair (Src and Dst).
+ /// Return true if some simplification occurs.
+ /// If the simplification isn't exact (that is, if it is conservative
+ /// in terms of dependence), set consistent to false.
+ bool propagateDistance(const SCEV *&Src,
+ const SCEV *&Dst,
+ Constraint &CurConstraint,
+ bool &Consistent);
+
+ /// propagatePoint - Attempt to propagate a point
+ /// constraint into a subscript pair (Src and Dst).
+ /// Return true if some simplification occurs.
+ bool propagatePoint(const SCEV *&Src,
+ const SCEV *&Dst,
+ Constraint &CurConstraint);
+
+ /// propagateLine - Attempt to propagate a line
+ /// constraint into a subscript pair (Src and Dst).
+ /// Return true if some simplification occurs.
+ /// If the simplification isn't exact (that is, if it is conservative
+ /// in terms of dependence), set consistent to false.
+ bool propagateLine(const SCEV *&Src,
+ const SCEV *&Dst,
+ Constraint &CurConstraint,
+ bool &Consistent);
+
+ /// findCoefficient - Given a linear SCEV,
+ /// return the coefficient corresponding to the specified loop.
+ /// If there isn't one, return the SCEV constant 0.
+ /// For example, given a*i + b*j + c*k, returning the coefficient
+ /// corresponding to the j loop would yield b.
+ const SCEV *findCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop) const;
+
+ /// zeroCoefficient - Given a linear SCEV,
+ /// return the SCEV given by zeroing out the coefficient
+ /// corresponding to the specified loop.
+ /// For example, given a*i + b*j + c*k, zeroing the coefficient
+ /// corresponding to the j loop would yield a*i + c*k.
+ const SCEV *zeroCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop) const;
+
+ /// addToCoefficient - Given a linear SCEV Expr,
+ /// return the SCEV given by adding some Value to the
+ /// coefficient corresponding to the specified TargetLoop.
+ /// For example, given a*i + b*j + c*k, adding 1 to the coefficient
+ /// corresponding to the j loop would yield a*i + (b+1)*j + c*k.
+ const SCEV *addToCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop,
+ const SCEV *Value) const;
+
+ /// updateDirection - Update direction vector entry
+ /// based on the current constraint.
+ void updateDirection(Dependence::DVEntry &Level,
+ const Constraint &CurConstraint) const;
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ DependenceAnalysis() : FunctionPass(ID) {
+ initializeDependenceAnalysisPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F);
+ void releaseMemory();
+ void getAnalysisUsage(AnalysisUsage &) const;
+ void print(raw_ostream &, const Module * = 0) const;
+ }; // class DependenceAnalysis
+
+ /// createDependenceAnalysisPass - This creates an instance of the
+ /// DependenceAnalysis pass.
+ FunctionPass *createDependenceAnalysisPass();
+
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/Analysis/Dominators.h b/include/llvm/Analysis/Dominators.h
index a1cc196eae30..8940971558a3 100644
--- a/include/llvm/Analysis/Dominators.h
+++ b/include/llvm/Analysis/Dominators.h
@@ -346,7 +346,7 @@ public:
DomTreeNodeBase<NodeT> *getRootNode() { return RootNode; }
const DomTreeNodeBase<NodeT> *getRootNode() const { return RootNode; }
- /// properlyDominates - Returns true iff this dominates N and this != N.
+ /// properlyDominates - Returns true iff A dominates B and A != B.
/// Note that this is not a constant time operation!
///
bool properlyDominates(const DomTreeNodeBase<NodeT> *A,
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
index 2bf79b9c932b..9b98013a1683 100644
--- a/include/llvm/Analysis/IVUsers.h
+++ b/include/llvm/Analysis/IVUsers.h
@@ -28,7 +28,7 @@ class IVUsers;
class ScalarEvolution;
class SCEV;
class IVUsers;
-class TargetData;
+class DataLayout;
/// IVStrideUse - Keep track of one use of a strided induction variable.
/// The Expr member keeps track of the expression, User is the actual user
@@ -123,7 +123,7 @@ class IVUsers : public LoopPass {
LoopInfo *LI;
DominatorTree *DT;
ScalarEvolution *SE;
- TargetData *TD;
+ DataLayout *TD;
SmallPtrSet<Instruction*,16> Processed;
/// IVUses - A list of all tracked IV uses of induction variable expressions
diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index 0cba135222b9..a075db33427d 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -26,7 +26,7 @@
namespace llvm {
class CallSite;
- class TargetData;
+ class DataLayout;
namespace InlineConstants {
// Various magic constants used to adjust heuristics.
@@ -36,6 +36,9 @@ namespace llvm {
const int LastCallToStaticBonus = -15000;
const int ColdccPenalty = 2000;
const int NoreturnPenalty = 10000;
+ /// Do not inline functions which allocate this many bytes on the stack
+ /// when the caller is recursive.
+ const unsigned TotalAllocaSizeRecursiveCaller = 1024;
}
/// \brief Represents the cost of inlining a function.
@@ -101,13 +104,13 @@ namespace llvm {
/// InlineCostAnalyzer - Cost analyzer used by inliner.
class InlineCostAnalyzer {
- // TargetData if available, or null.
- const TargetData *TD;
+ // DataLayout if available, or null.
+ const DataLayout *TD;
public:
InlineCostAnalyzer(): TD(0) {}
- void setTargetData(const TargetData *TData) { TD = TData; }
+ void setDataLayout(const DataLayout *TData) { TD = TData; }
/// \brief Get an InlineCost object representing the cost of inlining this
/// callsite.
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index 152e885bf667..e561e3742b64 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -24,7 +24,7 @@ namespace llvm {
class ArrayRef;
class DominatorTree;
class Instruction;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
class Type;
class Value;
@@ -32,122 +32,122 @@ namespace llvm {
/// SimplifyAddInst - Given operands for an Add, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifySubInst - Given operands for a Sub, see if we can
/// fold the result. If not, this returns null.
Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyMulInst - Given operands for a Mul, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyMulInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyMulInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifySDivInst - Given operands for an SDiv, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifySDivInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifySDivInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyUDivInst - Given operands for a UDiv, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyUDivInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyUDivInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyFDivInst - Given operands for an FDiv, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyFDivInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyFDivInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifySRemInst - Given operands for an SRem, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifySRemInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifySRemInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyURemInst - Given operands for a URem, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyURemInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyURemInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyFRemInst - Given operands for an FRem, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyFRemInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyFRemInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyShlInst - Given operands for a Shl, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyLShrInst - Given operands for a LShr, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyAShrInst - Given operands for a AShr, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyAndInst - Given operands for an And, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyAndInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyAndInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyOrInst - Given operands for an Or, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyOrInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyOrInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyXorInst - Given operands for a Xor, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyXorInst(Value *LHS, Value *RHS, const TargetData *TD = 0,
+ Value *SimplifyXorInst(Value *LHS, Value *RHS, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyICmpInst - Given operands for an ICmpInst, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyFCmpInst - Given operands for an FCmpInst, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifySelectInst - Given operands for a SelectInst, see if we can fold
/// the result. If not, this returns null.
Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyGEPInst - Given operands for an GetElementPtrInst, see if we can
/// fold the result. If not, this returns null.
- Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const TargetData *TD = 0,
+ Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
@@ -155,13 +155,13 @@ namespace llvm {
/// can fold the result. If not, this returns null.
Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyTruncInst - Given operands for an TruncInst, see if we can fold
/// the result. If not, this returns null.
- Value *SimplifyTruncInst(Value *Op, Type *Ty, const TargetData *TD = 0,
+ Value *SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
@@ -171,20 +171,20 @@ namespace llvm {
/// SimplifyCmpInst - Given operands for a CmpInst, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyBinOp - Given operands for a BinaryOperator, see if we can
/// fold the result. If not, this returns null.
Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
/// SimplifyInstruction - See if we can compute a simplified version of this
/// instruction. If not, this returns null.
- Value *SimplifyInstruction(Instruction *I, const TargetData *TD = 0,
+ Value *SimplifyInstruction(Instruction *I, const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
@@ -198,7 +198,7 @@ namespace llvm {
///
/// The function returns true if any simplifications were performed.
bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
@@ -209,7 +209,7 @@ namespace llvm {
/// of the users impacted. It returns true if any simplifications were
/// performed.
bool recursivelySimplifyInstruction(Instruction *I,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
const TargetLibraryInfo *TLI = 0,
const DominatorTree *DT = 0);
} // end namespace llvm
diff --git a/include/llvm/Analysis/IntervalPartition.h b/include/llvm/Analysis/IntervalPartition.h
index df7313f18f3d..bce84be2f4fd 100644
--- a/include/llvm/Analysis/IntervalPartition.h
+++ b/include/llvm/Analysis/IntervalPartition.h
@@ -33,8 +33,8 @@ namespace llvm {
//
// IntervalPartition - This class builds and holds an "interval partition" for
// a function. This partition divides the control flow graph into a set of
-// maximal intervals, as defined with the properties above. Intuitively, a
-// BasicBlock is a (possibly nonexistent) loop with a "tail" of non looping
+// maximal intervals, as defined with the properties above. Intuitively, an
+// interval is a (possibly nonexistent) loop with a "tail" of non looping
// nodes following it.
//
class IntervalPartition : public FunctionPass {
diff --git a/include/llvm/Analysis/LazyValueInfo.h b/include/llvm/Analysis/LazyValueInfo.h
index 065c230fb2fd..197e94e5fd32 100644
--- a/include/llvm/Analysis/LazyValueInfo.h
+++ b/include/llvm/Analysis/LazyValueInfo.h
@@ -19,18 +19,18 @@
namespace llvm {
class Constant;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
class Value;
/// LazyValueInfo - This pass computes, caches, and vends lazy value constraint
/// information.
class LazyValueInfo : public FunctionPass {
- class TargetData *TD;
+ class DataLayout *TD;
class TargetLibraryInfo *TLI;
void *PImpl;
- LazyValueInfo(const LazyValueInfo&); // DO NOT IMPLEMENT.
- void operator=(const LazyValueInfo&); // DO NOT IMPLEMENT.
+ LazyValueInfo(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
+ void operator=(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
public:
static char ID;
LazyValueInfo() : FunctionPass(ID), PImpl(0) {
diff --git a/include/llvm/Analysis/Loads.h b/include/llvm/Analysis/Loads.h
index 5f0aefbeb015..afc90c2f7441 100644
--- a/include/llvm/Analysis/Loads.h
+++ b/include/llvm/Analysis/Loads.h
@@ -19,7 +19,7 @@
namespace llvm {
class AliasAnalysis;
-class TargetData;
+class DataLayout;
class MDNode;
/// isSafeToLoadUnconditionally - Return true if we know that executing a load
@@ -27,7 +27,7 @@ class MDNode;
/// specified pointer, we do a quick local scan of the basic block containing
/// ScanFrom, to determine if the address is already accessed.
bool isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
- unsigned Align, const TargetData *TD = 0);
+ unsigned Align, const DataLayout *TD = 0);
/// FindAvailableLoadedValue - Scan the ScanBB block backwards (starting at
/// the instruction before ScanFrom) checking to see if we have the value at
diff --git a/include/llvm/Analysis/LoopDependenceAnalysis.h b/include/llvm/Analysis/LoopDependenceAnalysis.h
deleted file mode 100644
index f195d2782418..000000000000
--- a/include/llvm/Analysis/LoopDependenceAnalysis.h
+++ /dev/null
@@ -1,124 +0,0 @@
-//===- llvm/Analysis/LoopDependenceAnalysis.h --------------- -*- C++ -*---===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// LoopDependenceAnalysis is an LLVM pass that analyses dependences in memory
-// accesses in loops.
-//
-// Please note that this is work in progress and the interface is subject to
-// change.
-//
-// TODO: adapt as interface progresses
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_LOOP_DEPENDENCE_ANALYSIS_H
-#define LLVM_ANALYSIS_LOOP_DEPENDENCE_ANALYSIS_H
-
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Support/Allocator.h"
-
-namespace llvm {
-
-class AliasAnalysis;
-class AnalysisUsage;
-class ScalarEvolution;
-class SCEV;
-class Value;
-class raw_ostream;
-
-class LoopDependenceAnalysis : public LoopPass {
- AliasAnalysis *AA;
- ScalarEvolution *SE;
-
- /// L - The loop we are currently analysing.
- Loop *L;
-
- /// TODO: doc
- enum DependenceResult { Independent = 0, Dependent = 1, Unknown = 2 };
-
- /// TODO: doc
- struct Subscript {
- /// TODO: Add distance, direction, breaking conditions, ...
- };
-
- /// DependencePair - Represents a data dependence relation between two memory
- /// reference instructions.
- struct DependencePair : public FastFoldingSetNode {
- Value *A;
- Value *B;
- DependenceResult Result;
- SmallVector<Subscript, 4> Subscripts;
-
- DependencePair(const FoldingSetNodeID &ID, Value *a, Value *b) :
- FastFoldingSetNode(ID), A(a), B(b), Result(Unknown), Subscripts() {}
- };
-
- /// findOrInsertDependencePair - Return true if a DependencePair for the
- /// given Values already exists, false if a new DependencePair had to be
- /// created. The third argument is set to the pair found or created.
- bool findOrInsertDependencePair(Value*, Value*, DependencePair*&);
-
- /// getLoops - Collect all loops of the loop nest L in which
- /// a given SCEV is variant.
- void getLoops(const SCEV*, DenseSet<const Loop*>*) const;
-
- /// isLoopInvariant - True if a given SCEV is invariant in all loops of the
- /// loop nest starting at the innermost loop L.
- bool isLoopInvariant(const SCEV*) const;
-
- /// isAffine - An SCEV is affine with respect to the loop nest starting at
- /// the innermost loop L if it is of the form A+B*X where A, B are invariant
- /// in the loop nest and X is an induction variable in the loop nest.
- bool isAffine(const SCEV*) const;
-
- /// TODO: doc
- bool isZIVPair(const SCEV*, const SCEV*) const;
- bool isSIVPair(const SCEV*, const SCEV*) const;
- DependenceResult analyseZIV(const SCEV*, const SCEV*, Subscript*) const;
- DependenceResult analyseSIV(const SCEV*, const SCEV*, Subscript*) const;
- DependenceResult analyseMIV(const SCEV*, const SCEV*, Subscript*) const;
- DependenceResult analyseSubscript(const SCEV*, const SCEV*, Subscript*) const;
- DependenceResult analysePair(DependencePair*) const;
-
-public:
- static char ID; // Class identification, replacement for typeinfo
- LoopDependenceAnalysis() : LoopPass(ID) {
- initializeLoopDependenceAnalysisPass(*PassRegistry::getPassRegistry());
- }
-
- /// isDependencePair - Check whether two values can possibly give rise to
- /// a data dependence: that is the case if both are instructions accessing
- /// memory and at least one of those accesses is a write.
- bool isDependencePair(const Value*, const Value*) const;
-
- /// depends - Return a boolean indicating if there is a data dependence
- /// between two instructions.
- bool depends(Value*, Value*);
-
- bool runOnLoop(Loop*, LPPassManager&);
- virtual void releaseMemory();
- virtual void getAnalysisUsage(AnalysisUsage&) const;
- void print(raw_ostream&, const Module* = 0) const;
-
-private:
- FoldingSet<DependencePair> Pairs;
- BumpPtrAllocator PairAllocator;
-}; // class LoopDependenceAnalysis
-
-// createLoopDependenceAnalysisPass - This creates an instance of the
-// LoopDependenceAnalysis pass.
-//
-LoopPass *createLoopDependenceAnalysisPass();
-
-} // namespace llvm
-
-#endif /* LLVM_ANALYSIS_LOOP_DEPENDENCE_ANALYSIS_H */
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index eeb482d82a2b..c5d7b0128e74 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -72,10 +72,9 @@ class LoopBase {
// Blocks - The list of blocks in this loop. First entry is the header node.
std::vector<BlockT*> Blocks;
- // DO NOT IMPLEMENT
- LoopBase(const LoopBase<BlockT, LoopT> &);
- // DO NOT IMPLEMENT
- const LoopBase<BlockT, LoopT>&operator=(const LoopBase<BlockT, LoopT> &);
+ LoopBase(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
+ const LoopBase<BlockT, LoopT>&
+ operator=(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
public:
/// Loop ctor - This creates an empty loop.
LoopBase() : ParentLoop(0) {}
@@ -416,8 +415,8 @@ class LoopInfoBase {
friend class LoopBase<BlockT, LoopT>;
friend class LoopInfo;
- void operator=(const LoopInfoBase &); // do not implement
- LoopInfoBase(const LoopInfo &); // do not implement
+ void operator=(const LoopInfoBase &) LLVM_DELETED_FUNCTION;
+ LoopInfoBase(const LoopInfo &) LLVM_DELETED_FUNCTION;
public:
LoopInfoBase() { }
~LoopInfoBase() { releaseMemory(); }
@@ -550,8 +549,8 @@ class LoopInfo : public FunctionPass {
LoopInfoBase<BasicBlock, Loop> LI;
friend class LoopBase<BasicBlock, Loop>;
- void operator=(const LoopInfo &); // do not implement
- LoopInfo(const LoopInfo &); // do not implement
+ void operator=(const LoopInfo &) LLVM_DELETED_FUNCTION;
+ LoopInfo(const LoopInfo &) LLVM_DELETED_FUNCTION;
public:
static char ID; // Pass identification, replacement for typeid
diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h
index c07fbf7aa827..3bb96f96bf52 100644
--- a/include/llvm/Analysis/LoopInfoImpl.h
+++ b/include/llvm/Analysis/LoopInfoImpl.h
@@ -145,7 +145,6 @@ BlockT *LoopBase<BlockT, LoopT>::getLoopPredecessor() const {
// Loop over the predecessors of the header node...
BlockT *Header = getHeader();
- typedef GraphTraits<BlockT*> BlockTraits;
typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
for (typename InvBlockTraits::ChildIteratorType PI =
InvBlockTraits::child_begin(Header),
diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h
index e674e74520d2..a842898e4100 100644
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@@ -27,7 +27,8 @@
namespace llvm {
class CallInst;
class PointerType;
-class TargetData;
+class DataLayout;
+class TargetLibraryInfo;
class Type;
class Value;
@@ -35,27 +36,33 @@ class Value;
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
-bool isAllocationFn(const Value *V, bool LookThroughBitCast = false);
+bool isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a function that returns a
/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
-bool isNoAliasFn(const Value *V, bool LookThroughBitCast = false);
+bool isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
-bool isMallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
-bool isCallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
-bool isAllocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
/// \brief Tests if a value is a call or invoke to a library function that
/// reallocates memory (such as realloc).
-bool isReallocLikeFn(const Value *V, bool LookThroughBitCast = false);
+bool isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast = false);
//===----------------------------------------------------------------------===//
@@ -65,36 +72,39 @@ bool isReallocLikeFn(const Value *V, bool LookThroughBitCast = false);
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
-const CallInst *extractMallocCall(const Value *I);
-static inline CallInst *extractMallocCall(Value *I) {
- return const_cast<CallInst*>(extractMallocCall((const Value*)I));
+const CallInst *extractMallocCall(const Value *I, const TargetLibraryInfo *TLI);
+static inline CallInst *extractMallocCall(Value *I,
+ const TargetLibraryInfo *TLI) {
+ return const_cast<CallInst*>(extractMallocCall((const Value*)I, TLI));
}
/// isArrayMalloc - Returns the corresponding CallInst if the instruction
/// is a call to malloc whose array size can be determined and the array size
/// is not constant 1. Otherwise, return NULL.
-const CallInst *isArrayMalloc(const Value *I, const TargetData *TD);
+const CallInst *isArrayMalloc(const Value *I, const DataLayout *TD,
+ const TargetLibraryInfo *TLI);
/// getMallocType - Returns the PointerType resulting from the malloc call.
/// The PointerType depends on the number of bitcast uses of the malloc call:
/// 0: PointerType is the malloc call's return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-PointerType *getMallocType(const CallInst *CI);
+PointerType *getMallocType(const CallInst *CI, const TargetLibraryInfo *TLI);
/// getMallocAllocatedType - Returns the Type allocated by malloc call.
/// The Type depends on the number of bitcast uses of the malloc call:
/// 0: PointerType is the malloc call's return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-Type *getMallocAllocatedType(const CallInst *CI);
+Type *getMallocAllocatedType(const CallInst *CI, const TargetLibraryInfo *TLI);
/// getMallocArraySize - Returns the array size of a malloc call. If the
/// argument passed to malloc is a multiple of the size of the malloced type,
/// then return that multiple. For non-array mallocs, the multiple is
/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be
/// determined.
-Value *getMallocArraySize(CallInst *CI, const TargetData *TD,
+Value *getMallocArraySize(CallInst *CI, const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
bool LookThroughSExt = false);
@@ -104,9 +114,10 @@ Value *getMallocArraySize(CallInst *CI, const TargetData *TD,
/// extractCallocCall - Returns the corresponding CallInst if the instruction
/// is a calloc call.
-const CallInst *extractCallocCall(const Value *I);
-static inline CallInst *extractCallocCall(Value *I) {
- return const_cast<CallInst*>(extractCallocCall((const Value*)I));
+const CallInst *extractCallocCall(const Value *I, const TargetLibraryInfo *TLI);
+static inline CallInst *extractCallocCall(Value *I,
+ const TargetLibraryInfo *TLI) {
+ return const_cast<CallInst*>(extractCallocCall((const Value*)I, TLI));
}
@@ -115,10 +126,10 @@ static inline CallInst *extractCallocCall(Value *I) {
//
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
-const CallInst *isFreeCall(const Value *I);
+const CallInst *isFreeCall(const Value *I, const TargetLibraryInfo *TLI);
-static inline CallInst *isFreeCall(Value *I) {
- return const_cast<CallInst*>(isFreeCall((const Value*)I));
+static inline CallInst *isFreeCall(Value *I, const TargetLibraryInfo *TLI) {
+ return const_cast<CallInst*>(isFreeCall((const Value*)I, TLI));
}
@@ -130,8 +141,8 @@ static inline CallInst *isFreeCall(Value *I) {
/// object size in Size if successful, and false otherwise.
/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
/// byval arguments, and global variables.
-bool getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD,
- bool RoundToAlign = false);
+bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+ const TargetLibraryInfo *TLI, bool RoundToAlign = false);
@@ -142,10 +153,12 @@ typedef std::pair<APInt, APInt> SizeOffsetType;
class ObjectSizeOffsetVisitor
: public InstVisitor<ObjectSizeOffsetVisitor, SizeOffsetType> {
- const TargetData *TD;
+ const DataLayout *TD;
+ const TargetLibraryInfo *TLI;
bool RoundToAlign;
unsigned IntTyBits;
APInt Zero;
+ SmallPtrSet<Instruction *, 8> SeenInsts;
APInt align(APInt Size, uint64_t Align);
@@ -154,8 +167,8 @@ class ObjectSizeOffsetVisitor
}
public:
- ObjectSizeOffsetVisitor(const TargetData *TD, LLVMContext &Context,
- bool RoundToAlign = false);
+ ObjectSizeOffsetVisitor(const DataLayout *TD, const TargetLibraryInfo *TLI,
+ LLVMContext &Context, bool RoundToAlign = false);
SizeOffsetType compute(Value *V);
@@ -200,10 +213,10 @@ class ObjectSizeOffsetEvaluator
typedef DenseMap<const Value*, WeakEvalType> CacheMapTy;
typedef SmallPtrSet<const Value*, 8> PtrSetTy;
- const TargetData *TD;
+ const DataLayout *TD;
+ const TargetLibraryInfo *TLI;
LLVMContext &Context;
BuilderTy Builder;
- ObjectSizeOffsetVisitor Visitor;
IntegerType *IntTy;
Value *Zero;
CacheMapTy CacheMap;
@@ -215,7 +228,8 @@ class ObjectSizeOffsetEvaluator
SizeOffsetEvalType compute_(Value *V);
public:
- ObjectSizeOffsetEvaluator(const TargetData *TD, LLVMContext &Context);
+ ObjectSizeOffsetEvaluator(const DataLayout *TD, const TargetLibraryInfo *TLI,
+ LLVMContext &Context);
SizeOffsetEvalType compute(Value *V);
bool knownSize(SizeOffsetEvalType SizeOffset) {
diff --git a/include/llvm/Analysis/MemoryDependenceAnalysis.h b/include/llvm/Analysis/MemoryDependenceAnalysis.h
index 7e049d633b49..a715eaeee11c 100644
--- a/include/llvm/Analysis/MemoryDependenceAnalysis.h
+++ b/include/llvm/Analysis/MemoryDependenceAnalysis.h
@@ -29,7 +29,7 @@ namespace llvm {
class Instruction;
class CallSite;
class AliasAnalysis;
- class TargetData;
+ class DataLayout;
class MemoryDependenceAnalysis;
class PredIteratorCache;
class DominatorTree;
@@ -323,7 +323,7 @@ namespace llvm {
/// Current AA implementation, just a cache.
AliasAnalysis *AA;
- TargetData *TD;
+ DataLayout *TD;
DominatorTree *DT;
OwningPtr<PredIteratorCache> PredCache;
public:
@@ -412,7 +412,7 @@ namespace llvm {
int64_t MemLocOffs,
unsigned MemLocSize,
const LoadInst *LI,
- const TargetData &TD);
+ const DataLayout &TD);
private:
MemDepResult getCallSiteDependencyFrom(CallSite C, bool isReadOnlyCall,
diff --git a/include/llvm/Analysis/PHITransAddr.h b/include/llvm/Analysis/PHITransAddr.h
index ff9a24790a99..5a77fcebafa0 100644
--- a/include/llvm/Analysis/PHITransAddr.h
+++ b/include/llvm/Analysis/PHITransAddr.h
@@ -19,7 +19,7 @@
namespace llvm {
class DominatorTree;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
/// PHITransAddr - An address value which tracks and handles phi translation.
@@ -37,7 +37,7 @@ class PHITransAddr {
Value *Addr;
/// TD - The target data we are playing with if known, otherwise null.
- const TargetData *TD;
+ const DataLayout *TD;
/// TLI - The target library info if known, otherwise null.
const TargetLibraryInfo *TLI;
@@ -45,7 +45,7 @@ class PHITransAddr {
/// InstInputs - The inputs for our symbolic address.
SmallVector<Instruction*, 4> InstInputs;
public:
- PHITransAddr(Value *addr, const TargetData *td) : Addr(addr), TD(td), TLI(0) {
+ PHITransAddr(Value *addr, const DataLayout *td) : Addr(addr), TD(td), TLI(0) {
// If the address is an instruction, the whole thing is considered an input.
if (Instruction *I = dyn_cast<Instruction>(Addr))
InstInputs.push_back(I);
diff --git a/include/llvm/Analysis/Passes.h b/include/llvm/Analysis/Passes.h
index a22bd12dec1e..27726f49bcce 100644
--- a/include/llvm/Analysis/Passes.h
+++ b/include/llvm/Analysis/Passes.h
@@ -103,6 +103,14 @@ namespace llvm {
//===--------------------------------------------------------------------===//
//
+ // createProfileMetadataLoaderPass - This pass loads information from a
+ // profile dump file and sets branch weight metadata.
+ //
+ ModulePass *createProfileMetadataLoaderPass();
+ extern char &ProfileMetadataLoaderPassID;
+
+ //===--------------------------------------------------------------------===//
+ //
// createNoProfileInfoPass - This pass implements the default "no profile".
//
ImmutablePass *createNoProfileInfoPass();
@@ -172,11 +180,20 @@ namespace llvm {
//===--------------------------------------------------------------------===//
//
- // createLoopDependenceAnalysisPass - This creates an instance of the
- // LoopDependenceAnalysis pass.
+ // createDependenceAnalysisPass - This creates an instance of the
+ // DependenceAnalysis pass.
+ //
+ FunctionPass *createDependenceAnalysisPass();
+
+ //===--------------------------------------------------------------------===//
+ //
+ // createCostModelAnalysisPass - This creates an instance of the
+ // CostModelAnalysis pass.
//
- LoopPass *createLoopDependenceAnalysisPass();
+ FunctionPass *createCostModelAnalysisPass();
+ //===--------------------------------------------------------------------===//
+ //
// Minor pass prototypes, allowing us to expose them through bugpoint and
// analyze.
FunctionPass *createInstCountPass();
diff --git a/include/llvm/Analysis/ProfileDataLoader.h b/include/llvm/Analysis/ProfileDataLoader.h
new file mode 100644
index 000000000000..9efbafcef41c
--- /dev/null
+++ b/include/llvm/Analysis/ProfileDataLoader.h
@@ -0,0 +1,139 @@
+//===- ProfileDataLoader.h - Load & convert profile info ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileDataLoader class is used to load profiling data from a dump file.
+// The ProfileDataT<FType, BType> class is used to store the mapping of this
+// data to control flow edges.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILEDATALOADER_H
+#define LLVM_ANALYSIS_PROFILEDATALOADER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <string>
+
+namespace llvm {
+
+class ModulePass;
+class Function;
+class BasicBlock;
+
+// Helper for dumping edges to dbgs().
+raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *,
+ const BasicBlock *> E);
+
+/// \brief The ProfileDataT<FType, BType> class is used to store the mapping of
+/// profiling data to control flow edges.
+///
+/// An edge is defined by its source and sink basic blocks.
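+///
+/// A minimal usage sketch (the block names are illustrative):
+/// \code
+///   ProfileData PD;
+///   ProfileData::Edge E = ProfileData::getEdge(PredBB, SuccBB);
+///   PD.addEdgeWeight(E, 42);
+///   unsigned Count = PD.getEdgeWeight(E); // yields 42
+/// \endcode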
+template<class FType, class BType>
+class ProfileDataT {
+public:
+ // The profiling information defines an Edge by its source and sink basic
+ // blocks.
+ typedef std::pair<const BType*, const BType*> Edge;
+
+private:
+ typedef DenseMap<Edge, unsigned> EdgeWeights;
+
+ /// \brief Count the number of times a transition between two blocks is
+ /// executed.
+ ///
+ /// As a special case, we also hold an edge from the null BasicBlock to the
+ /// entry block to indicate how many times the function was entered.
+ DenseMap<const FType*, EdgeWeights> EdgeInformation;
+
+public:
+ /// getFunction() - Returns the Function for an Edge.
+ static const FType *getFunction(Edge e) {
+ // e.first may be NULL
+ assert(((!e.first) || (e.first->getParent() == e.second->getParent()))
+ && "A ProfileData::Edge can not be between two functions");
+ assert(e.second && "A ProfileData::Edge must have a real sink");
+ return e.second->getParent();
+ }
+
+ /// getEdge() - Creates an Edge between two BasicBlocks.
+ static Edge getEdge(const BType *Src, const BType *Dest) {
+ return Edge(Src, Dest);
+ }
+
+ /// getEdgeWeight - Return the number of times that a given edge was
+ /// executed.
+ unsigned getEdgeWeight(Edge e) const {
+ const FType *f = getFunction(e);
+ assert((EdgeInformation.find(f) != EdgeInformation.end())
+ && "No profiling information for function");
+ EdgeWeights weights = EdgeInformation.find(f)->second;
+
+ assert((weights.find(e) != weights.end())
+ && "No profiling information for edge");
+ return weights.find(e)->second;
+ }
+
+ /// addEdgeWeight - Add 'weight' to the already stored execution count for
+ /// this edge.
+ void addEdgeWeight(Edge e, unsigned weight) {
+ EdgeInformation[getFunction(e)][e] += weight;
+ }
+};
+
+typedef ProfileDataT<Function, BasicBlock> ProfileData;
+//typedef ProfileDataT<MachineFunction, MachineBasicBlock> MachineProfileData;
+
+/// The ProfileDataLoader class is used to load raw profiling data from the
+/// dump file.
+class ProfileDataLoader {
+private:
+ /// The name of the file where the raw profiling data is stored.
+ const std::string &Filename;
+
+ /// A vector of the command line arguments used when the target program was
+ /// run to generate profiling data. One entry per program run.
+ SmallVector<std::string, 1> CommandLines;
+
+ /// The raw values for how many times each edge was traversed; values from
+ /// multiple program runs are accumulated.
+ SmallVector<unsigned, 32> EdgeCounts;
+
+public:
+ /// ProfileDataLoader ctor - Read the specified profiling data file, exiting
+ /// the program if the file is invalid or broken.
+ ProfileDataLoader(const char *ToolName, const std::string &Filename);
+
+ /// A special value used to represent the weight of an edge which has not
+ /// been counted yet.
+ static const unsigned Uncounted;
+
+ /// getNumExecutions - Return the number of times the target program was run
+ /// to generate this profiling data.
+ unsigned getNumExecutions() const { return CommandLines.size(); }
+
+ /// getExecution - Return the command line parameters used to generate the
+ /// i'th set of profiling data.
+ const std::string &getExecution(unsigned i) const { return CommandLines[i]; }
+
+ const std::string &getFileName() const { return Filename; }
+
+ /// getRawEdgeCounts - Return the raw profiling data; this is just a list of
+ /// numbers with no mappings to edges.
+ ArrayRef<unsigned> getRawEdgeCounts() const { return EdgeCounts; }
+};
+
+/// createProfileMetadataLoaderPass - This function returns a Pass that loads
+/// the profiling information for the module from the specified filename.
+ModulePass *createProfileMetadataLoaderPass(const std::string &Filename);
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/Analysis/ProfileDataTypes.h b/include/llvm/Analysis/ProfileDataTypes.h
new file mode 100644
index 000000000000..1be15e025da9
--- /dev/null
+++ b/include/llvm/Analysis/ProfileDataTypes.h
@@ -0,0 +1,39 @@
+/*===-- ProfileDataTypes.h - Profiling info shared constants --------------===*\
+|*
+|* The LLVM Compiler Infrastructure
+|*
+|* This file is distributed under the University of Illinois Open Source
+|* License. See LICENSE.TXT for details.
+|*
+|*===----------------------------------------------------------------------===*|
+|*
+|* This file defines constants shared by the various different profiling
+|* runtime libraries and the LLVM C++ profile metadata loader. It must be a
+|* C header because, at present, the profiling runtimes are written in C.
+|*
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_ANALYSIS_PROFILEDATATYPES_H
+#define LLVM_ANALYSIS_PROFILEDATATYPES_H
+
+/* Included by libprofile. */
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* TODO: Strip out unused entries once ProfileInfo etc has been removed. */
+enum ProfilingType {
+ ArgumentInfo = 1, /* The command line argument block */
+ FunctionInfo = 2, /* Function profiling information */
+ BlockInfo = 3, /* Block profiling information */
+ EdgeInfo = 4, /* Edge profiling information */
+ PathInfo = 5, /* Path profiling information */
+ BBTraceInfo = 6, /* Basic block trace information */
+ OptEdgeInfo = 7 /* Edge profiling information, optimal version */
+};
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* LLVM_ANALYSIS_PROFILEDATATYPES_H */
diff --git a/include/llvm/Analysis/ProfileInfoTypes.h b/include/llvm/Analysis/ProfileInfoTypes.h
index 6b4ac85082b0..45aab5b70d2b 100644
--- a/include/llvm/Analysis/ProfileInfoTypes.h
+++ b/include/llvm/Analysis/ProfileInfoTypes.h
@@ -27,15 +27,7 @@ enum ProfilingStorageType {
ProfilingHash = 2
};
-enum ProfilingType {
- ArgumentInfo = 1, /* The command line argument block */
- FunctionInfo = 2, /* Function profiling information */
- BlockInfo = 3, /* Block profiling information */
- EdgeInfo = 4, /* Edge profiling information */
- PathInfo = 5, /* Path profiling information */
- BBTraceInfo = 6, /* Basic block trace information */
- OptEdgeInfo = 7 /* Edge profiling information, optimal version */
-};
+#include "llvm/Analysis/ProfileDataTypes.h"
/*
* The header for tables that map path numbers to path counters.
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index 188d11c2833c..48d7ee6b5476 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -54,10 +54,8 @@ class FlatIt {};
/// @brief A RegionNode represents a subregion or a BasicBlock that is part of a
/// Region.
class RegionNode {
- // DO NOT IMPLEMENT
- RegionNode(const RegionNode &);
- // DO NOT IMPLEMENT
- const RegionNode &operator=(const RegionNode &);
+ RegionNode(const RegionNode &) LLVM_DELETED_FUNCTION;
+ const RegionNode &operator=(const RegionNode &) LLVM_DELETED_FUNCTION;
protected:
/// This is the entry basic block that starts this region node. If this is a
@@ -203,10 +201,8 @@ inline Region* RegionNode::getNodeAs<Region>() const {
/// tree, the second one creates a graphical representation using graphviz.
class Region : public RegionNode {
friend class RegionInfo;
- // DO NOT IMPLEMENT
- Region(const Region &);
- // DO NOT IMPLEMENT
- const Region &operator=(const Region &);
+ Region(const Region &) LLVM_DELETED_FUNCTION;
+ const Region &operator=(const Region &) LLVM_DELETED_FUNCTION;
// Information necessary to manage this Region.
RegionInfo* RI;
@@ -473,27 +469,6 @@ public:
const_iterator end() const { return children.end(); }
//@}
- /// @name BasicBlock Node Iterators
- ///
- /// These iterators iterate over all BasicBlock RegionNodes that are
- /// contained in this Region. The iterator also iterates over BasicBlock
- /// RegionNodes that are elements of a subregion of this Region. It is
- /// therefore called a flat iterator.
- //@{
- typedef df_iterator<RegionNode*, SmallPtrSet<RegionNode*, 8>, false,
- GraphTraits<FlatIt<RegionNode*> > > block_node_iterator;
-
- typedef df_iterator<const RegionNode*, SmallPtrSet<const RegionNode*, 8>,
- false, GraphTraits<FlatIt<const RegionNode*> > >
- const_block_node_iterator;
-
- block_node_iterator block_node_begin();
- block_node_iterator block_node_end();
-
- const_block_node_iterator block_node_begin() const;
- const_block_node_iterator block_node_end() const;
- //@}
-
/// @name BasicBlock Iterators
///
/// These iterators iterate over all BasicBlocks that are contained in this
@@ -586,10 +561,8 @@ class RegionInfo : public FunctionPass {
typedef DenseMap<BasicBlock*, Region*> BBtoRegionMap;
typedef SmallPtrSet<Region*, 4> RegionSet;
- // DO NOT IMPLEMENT
- RegionInfo(const RegionInfo &);
- // DO NOT IMPLEMENT
- const RegionInfo &operator=(const RegionInfo &);
+ RegionInfo(const RegionInfo &) LLVM_DELETED_FUNCTION;
+ const RegionInfo &operator=(const RegionInfo &) LLVM_DELETED_FUNCTION;
DominatorTree *DT;
PostDominatorTree *PDT;
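The LLVM_DELETED_FUNCTION lines above replace the older "declare privately, never define" convention. The sketch below shows the idiom in isolation, with an illustrative fallback definition; the real macro is expected to come from llvm/Support/Compiler.h.

// Sketch of the uncopyable-class idiom this patch standardizes on.
#ifndef LLVM_DELETED_FUNCTION
# if __cplusplus >= 201103L
#  define LLVM_DELETED_FUNCTION = delete   // C++11: reject copies at compile time
# else
#  define LLVM_DELETED_FUNCTION            // C++03: private and unimplemented
# endif
#endif

class Uncopyable {
  Uncopyable(const Uncopyable &) LLVM_DELETED_FUNCTION;
  Uncopyable &operator=(const Uncopyable &) LLVM_DELETED_FUNCTION;
public:
  Uncopyable() {}
};
// With C++11 the compiler rejects copies outright; pre-C++11 the private,
// unimplemented declarations still fail at compile or link time.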
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index c213ade5e8e3..235adca02175 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -40,7 +40,7 @@ namespace llvm {
class DominatorTree;
class Type;
class ScalarEvolution;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
class LLVMContext;
class Loop;
@@ -70,8 +70,8 @@ namespace llvm {
unsigned short SubclassData;
private:
- SCEV(const SCEV &); // DO NOT IMPLEMENT
- void operator=(const SCEV &); // DO NOT IMPLEMENT
+ SCEV(const SCEV &) LLVM_DELETED_FUNCTION;
+ void operator=(const SCEV &) LLVM_DELETED_FUNCTION;
public:
/// NoWrapFlags are bitfield indices into SubclassData.
@@ -162,7 +162,6 @@ namespace llvm {
SCEVCouldNotCompute();
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVCouldNotCompute *S) { return true; }
static bool classof(const SCEV *S);
};
@@ -227,7 +226,7 @@ namespace llvm {
/// TD - The target data information for the target we are targeting.
///
- TargetData *TD;
+ DataLayout *TD;
/// TLI - The target library information for the target we are targeting.
///
@@ -874,6 +873,7 @@ namespace llvm {
virtual void releaseMemory();
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
virtual void print(raw_ostream &OS, const Module* = 0) const;
+ virtual void verifyAnalysis() const;
private:
FoldingSet<SCEV> UniqueSCEVs;
diff --git a/include/llvm/Analysis/ScalarEvolutionExpressions.h b/include/llvm/Analysis/ScalarEvolutionExpressions.h
index ded12974face..54db7d6bcf0d 100644
--- a/include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ b/include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -46,7 +46,6 @@ namespace llvm {
Type *getType() const { return V->getType(); }
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVConstant *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scConstant;
}
@@ -68,7 +67,6 @@ namespace llvm {
Type *getType() const { return Ty; }
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVCastExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scTruncate ||
S->getSCEVType() == scZeroExtend ||
@@ -88,7 +86,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVTruncateExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scTruncate;
}
@@ -106,7 +103,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVZeroExtendExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scZeroExtend;
}
@@ -124,7 +120,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVSignExtendExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scSignExtend;
}
@@ -166,7 +161,6 @@ namespace llvm {
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVNAryExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scAddExpr ||
S->getSCEVType() == scMulExpr ||
@@ -188,7 +182,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVCommutativeExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scAddExpr ||
S->getSCEVType() == scMulExpr ||
@@ -223,7 +216,6 @@ namespace llvm {
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVAddExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scAddExpr;
}
@@ -242,7 +234,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVMulExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scMulExpr;
}
@@ -274,7 +265,6 @@ namespace llvm {
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVUDivExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scUDivExpr;
}
@@ -358,7 +348,6 @@ namespace llvm {
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVAddRecExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scAddRecExpr;
}
@@ -380,7 +369,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVSMaxExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scSMaxExpr;
}
@@ -402,7 +390,6 @@ namespace llvm {
public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVUMaxExpr *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scUMaxExpr;
}
@@ -449,7 +436,6 @@ namespace llvm {
Type *getType() const { return getValPtr()->getType(); }
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SCEVUnknown *S) { return true; }
static inline bool classof(const SCEV *S) {
return S->getSCEVType() == scUnknown;
}
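Removing the classof(const X *) overloads relies on the updated isa<> machinery, which no longer needs a self-returning overload. A hedged sketch of the remaining pattern, using a made-up two-kind hierarchy rather than the SCEV classes themselves:

// Illustrative only: the single classof(const Base *) form that the deletions
// above leave behind. Shape and Circle are hypothetical types.
#include "llvm/Support/Casting.h"

class Shape {
public:
  enum ShapeKind { SK_Circle, SK_Square };
  ShapeKind getKind() const { return Kind; }
  Shape(ShapeKind K) : Kind(K) {}
private:
  const ShapeKind Kind;
};

class Circle : public Shape {
public:
  Circle() : Shape(SK_Circle) {}
  // One classof taking the base class is now sufficient for isa/cast/dyn_cast.
  static bool classof(const Shape *S) { return S->getKind() == SK_Circle; }
};

bool isCircle(const Shape *S) { return llvm::isa<Circle>(S); }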
diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h
index c3c2f4b0668c..b758eca42e78 100644
--- a/include/llvm/Analysis/SparsePropagation.h
+++ b/include/llvm/Analysis/SparsePropagation.h
@@ -130,9 +130,9 @@ class SparseSolver {
/// PHI nodes retriggered.
typedef std::pair<BasicBlock*,BasicBlock*> Edge;
std::set<Edge> KnownFeasibleEdges;
-
- SparseSolver(const SparseSolver&); // DO NOT IMPLEMENT
- void operator=(const SparseSolver&); // DO NOT IMPLEMENT
+
+ SparseSolver(const SparseSolver&) LLVM_DELETED_FUNCTION;
+ void operator=(const SparseSolver&) LLVM_DELETED_FUNCTION;
public:
explicit SparseSolver(AbstractLatticeFunction *Lattice)
: LatticeFunc(Lattice) {}
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index e8d45f6bb8d4..a85752446bb0 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -22,7 +22,7 @@ namespace llvm {
class Value;
class Instruction;
class APInt;
- class TargetData;
+ class DataLayout;
class StringRef;
class MDNode;
@@ -37,27 +37,27 @@ namespace llvm {
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
void ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
- const TargetData *TD = 0, unsigned Depth = 0);
+ const DataLayout *TD = 0, unsigned Depth = 0);
void computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero);
/// ComputeSignBit - Determine whether the sign bit is known to be zero or
/// one. Convenience wrapper around ComputeMaskedBits.
void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- const TargetData *TD = 0, unsigned Depth = 0);
+ const DataLayout *TD = 0, unsigned Depth = 0);
/// isPowerOfTwo - Return true if the given value is known to have exactly one
/// bit set when defined. For vectors return true if every element is known to
/// be a power of two when defined. Supports values with integer or pointer
/// type and vectors of integers. If 'OrZero' is set then returns true if the
/// given value is either a power of two or zero.
- bool isPowerOfTwo(Value *V, const TargetData *TD = 0, bool OrZero = false,
+ bool isPowerOfTwo(Value *V, const DataLayout *TD = 0, bool OrZero = false,
unsigned Depth = 0);
/// isKnownNonZero - Return true if the given value is known to be non-zero
/// when defined. For vectors return true if every element is known to be
/// non-zero when defined. Supports values with integer or pointer type and
/// vectors of integers.
- bool isKnownNonZero(Value *V, const TargetData *TD = 0, unsigned Depth = 0);
+ bool isKnownNonZero(Value *V, const DataLayout *TD = 0, unsigned Depth = 0);
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
/// this predicate to simplify operations downstream. Mask is known to be
@@ -69,7 +69,7 @@ namespace llvm {
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
bool MaskedValueIsZero(Value *V, const APInt &Mask,
- const TargetData *TD = 0, unsigned Depth = 0);
+ const DataLayout *TD = 0, unsigned Depth = 0);
/// ComputeNumSignBits - Return the number of times the sign bit of the
@@ -80,7 +80,7 @@ namespace llvm {
///
/// 'Op' must have a scalar integer type.
///
- unsigned ComputeNumSignBits(Value *Op, const TargetData *TD = 0,
+ unsigned ComputeNumSignBits(Value *Op, const DataLayout *TD = 0,
unsigned Depth = 0);
/// ComputeMultiple - This function computes the integer multiple of Base that
@@ -118,10 +118,10 @@ namespace llvm {
/// it can be expressed as a base pointer plus a constant offset. Return the
/// base and offset to the caller.
Value *GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
- const TargetData &TD);
+ const DataLayout &TD);
static inline const Value *
GetPointerBaseWithConstantOffset(const Value *Ptr, int64_t &Offset,
- const TargetData &TD) {
+ const DataLayout &TD) {
return GetPointerBaseWithConstantOffset(const_cast<Value*>(Ptr), Offset,TD);
}
@@ -143,10 +143,10 @@ namespace llvm {
/// being addressed. Note that the returned value has pointer type if the
/// specified value does. If the MaxLookup value is non-zero, it limits the
/// number of instructions to be stripped off.
- Value *GetUnderlyingObject(Value *V, const TargetData *TD = 0,
+ Value *GetUnderlyingObject(Value *V, const DataLayout *TD = 0,
unsigned MaxLookup = 6);
static inline const Value *
- GetUnderlyingObject(const Value *V, const TargetData *TD = 0,
+ GetUnderlyingObject(const Value *V, const DataLayout *TD = 0,
unsigned MaxLookup = 6) {
return GetUnderlyingObject(const_cast<Value *>(V), TD, MaxLookup);
}
@@ -156,7 +156,7 @@ namespace llvm {
/// multiple objects.
void GetUnderlyingObjects(Value *V,
SmallVectorImpl<Value *> &Objects,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
unsigned MaxLookup = 6);
/// onlyUsedByLifetimeMarkers - Return true if the only users of this pointer
@@ -182,7 +182,7 @@ namespace llvm {
/// However, this method can return true for instructions that read memory;
/// for such instructions, moving them may change the resulting value.
bool isSafeToSpeculativelyExecute(const Value *V,
- const TargetData *TD = 0);
+ const DataLayout *TD = 0);
} // end namespace llvm
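These are mechanical renames (TargetData becomes DataLayout); call sites keep the same shape. A hedged sketch of a typical query, assuming V is an i32 value and that DL may legitimately be null when no layout information is available:

// Sketch only: exercises the renamed DataLayout parameters declared above.
#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/ValueTracking.h"
using namespace llvm;

static bool lowBitKnownZero(Value *V, const DataLayout *DL) {
  APInt KnownZero(32, 0), KnownOne(32, 0);      // widths match V's type (i32 here)
  ComputeMaskedBits(V, KnownZero, KnownOne, DL); // DL == 0 is allowed
  return KnownZero[0];
}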
diff --git a/include/llvm/Argument.h b/include/llvm/Argument.h
index e66075c1f235..b1c22185191d 100644
--- a/include/llvm/Argument.h
+++ b/include/llvm/Argument.h
@@ -68,8 +68,8 @@ public:
/// attribute on it in its containing function.
bool hasNoCaptureAttr() const;
- /// hasSRetAttr - Return true if this argument has the sret attribute on it in
- /// its containing function.
+ /// hasStructRetAttr - Return true if this argument has the sret attribute on
+ /// it in its containing function.
bool hasStructRetAttr() const;
 /// addAttr - Add an Attribute to an argument
@@ -81,7 +81,6 @@ public:
/// classof - Methods for support type inquiry through isa, cast, and
/// dyn_cast:
///
- static inline bool classof(const Argument *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == ArgumentVal;
}
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index 223aa0063906..a9c2d743ff4a 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -21,268 +21,280 @@
#include <string>
namespace llvm {
-class Type;
-
-namespace Attribute {
-/// We use this proxy POD type to allow constructing Attributes constants
-/// using initializer lists. Do not use this class directly.
-struct AttrConst {
- uint64_t v;
- AttrConst operator | (const AttrConst Attrs) const {
- AttrConst Res = {v | Attrs.v};
- return Res;
- }
- AttrConst operator ~ () const {
- AttrConst Res = {~v};
- return Res;
- }
-};
-} // namespace Attribute
+class AttrBuilder;
+class AttributesImpl;
+class LLVMContext;
+class Type;
/// Attributes - A bitset of attributes.
class Attributes {
- public:
- Attributes() : Bits(0) { }
- explicit Attributes(uint64_t Val) : Bits(Val) { }
- /*implicit*/ Attributes(Attribute::AttrConst Val) : Bits(Val.v) { }
- // This is a "safe bool() operator".
- operator const void *() const { return Bits ? this : 0; }
- bool isEmptyOrSingleton() const { return (Bits & (Bits - 1)) == 0; }
- bool operator == (const Attributes &Attrs) const {
- return Bits == Attrs.Bits;
+public:
+ /// Function parameters and results can have attributes to indicate how they
+ /// should be treated by optimizations and code generation. This enumeration
+ /// lists the attributes that can be associated with parameters, function
+ /// results or the function itself.
+ ///
+ /// Note that uwtable is about the ABI or the user mandating an entry in the
+ /// unwind table. The nounwind attribute is about an exception passing by the
+ /// function.
+ ///
+ /// In a theoretical system that uses tables for profiling and sjlj for
+ /// exceptions, they would be fully independent. In a normal system that uses
+ /// tables for both, the semantics are:
+ ///
+ /// nil = Needs an entry because an exception might pass by.
+ /// nounwind = No need for an entry
+ /// uwtable = Needs an entry because the ABI says so and because
+ /// an exception might pass by.
+ /// uwtable + nounwind = Needs an entry because the ABI says so.
+
+ enum AttrVal {
+ // IR-Level Attributes
+ None, ///< No attributes have been set
+ AddressSafety, ///< Address safety checking is on.
+ Alignment, ///< Alignment of parameter (5 bits)
+ ///< stored as log2 of alignment with +1 bias
+ ///< 0 means unaligned (different from align 1)
+ AlwaysInline, ///< inline=always
+ ByVal, ///< Pass structure by value
+ InlineHint, ///< Source said inlining was desirable
+ InReg, ///< Force argument to be passed in register
+ MinSize, ///< Function must be optimized for size first
+ Naked, ///< Naked function
+ Nest, ///< Nested function static chain
+ NoAlias, ///< Considered to not alias after call
+ NoCapture, ///< Function creates no aliases of pointer
+ NoImplicitFloat, ///< Disable implicit floating point insts
+ NoInline, ///< inline=never
+ NonLazyBind, ///< Function is called early and/or
+ ///< often, so lazy binding isn't worthwhile
+ NoRedZone, ///< Disable redzone
+ NoReturn, ///< Mark the function as not returning
+ NoUnwind, ///< Function doesn't unwind stack
+ OptimizeForSize, ///< opt_size
+ ReadNone, ///< Function does not access memory
+ ReadOnly, ///< Function only reads from memory
+ ReturnsTwice, ///< Function can return twice
+ SExt, ///< Sign extended before/after call
+ StackAlignment, ///< Alignment of stack for function (3 bits)
+ ///< stored as log2 of alignment with +1 bias;
+ ///< 0 means unaligned (different from
+ ///< alignstack=1)
+ StackProtect, ///< Stack protection.
+ StackProtectReq, ///< Stack protection required.
+ StructRet, ///< Hidden pointer to structure to return
+ UWTable, ///< Function must be in a unwind table
+ ZExt ///< Zero extended before/after call
+ };
+private:
+ AttributesImpl *Attrs;
+ Attributes(AttributesImpl *A) : Attrs(A) {}
+public:
+ Attributes() : Attrs(0) {}
+ Attributes(const Attributes &A) : Attrs(A.Attrs) {}
+ Attributes &operator=(const Attributes &A) {
+ Attrs = A.Attrs;
+ return *this;
}
- bool operator != (const Attributes &Attrs) const {
- return Bits != Attrs.Bits;
+
+ /// get - Return a uniquified Attributes object. This takes the uniquified
+ /// value from the Builder and wraps it in the Attributes class.
+ static Attributes get(LLVMContext &Context, ArrayRef<AttrVal> Vals);
+ static Attributes get(LLVMContext &Context, AttrBuilder &B);
+
+ /// @brief Return true if the attribute is present.
+ bool hasAttribute(AttrVal Val) const;
+
+ /// @brief Return true if attributes exist
+ bool hasAttributes() const;
+
+ /// @brief Return true if the attributes are a non-null intersection.
+ bool hasAttributes(const Attributes &A) const;
+
+ /// @brief Returns the alignment field of an attribute as a byte alignment
+ /// value.
+ unsigned getAlignment() const;
+
+ /// @brief Returns the stack alignment field of an attribute as a byte
+ /// alignment value.
+ unsigned getStackAlignment() const;
+
+ /// @brief Parameter attributes that do not apply to vararg call arguments.
+ bool hasIncompatibleWithVarArgsAttrs() const {
+ return hasAttribute(Attributes::StructRet);
}
- Attributes operator | (const Attributes &Attrs) const {
- return Attributes(Bits | Attrs.Bits);
+
+ /// @brief Attributes that only apply to function parameters.
+ bool hasParameterOnlyAttrs() const {
+ return hasAttribute(Attributes::ByVal) ||
+ hasAttribute(Attributes::Nest) ||
+ hasAttribute(Attributes::StructRet) ||
+ hasAttribute(Attributes::NoCapture);
}
- Attributes operator & (const Attributes &Attrs) const {
- return Attributes(Bits & Attrs.Bits);
+
+ /// @brief Attributes that may be applied to the function itself. These cannot
+ /// be used on return values or function parameters.
+ bool hasFunctionOnlyAttrs() const {
+ return hasAttribute(Attributes::NoReturn) ||
+ hasAttribute(Attributes::NoUnwind) ||
+ hasAttribute(Attributes::ReadNone) ||
+ hasAttribute(Attributes::ReadOnly) ||
+ hasAttribute(Attributes::NoInline) ||
+ hasAttribute(Attributes::AlwaysInline) ||
+ hasAttribute(Attributes::OptimizeForSize) ||
+ hasAttribute(Attributes::StackProtect) ||
+ hasAttribute(Attributes::StackProtectReq) ||
+ hasAttribute(Attributes::NoRedZone) ||
+ hasAttribute(Attributes::NoImplicitFloat) ||
+ hasAttribute(Attributes::Naked) ||
+ hasAttribute(Attributes::InlineHint) ||
+ hasAttribute(Attributes::StackAlignment) ||
+ hasAttribute(Attributes::UWTable) ||
+ hasAttribute(Attributes::NonLazyBind) ||
+ hasAttribute(Attributes::ReturnsTwice) ||
+ hasAttribute(Attributes::AddressSafety) ||
+ hasAttribute(Attributes::MinSize);
}
- Attributes operator ^ (const Attributes &Attrs) const {
- return Attributes(Bits ^ Attrs.Bits);
+
+ bool operator==(const Attributes &A) const {
+ return Attrs == A.Attrs;
}
- Attributes &operator |= (const Attributes &Attrs) {
- Bits |= Attrs.Bits;
- return *this;
+ bool operator!=(const Attributes &A) const {
+ return Attrs != A.Attrs;
}
- Attributes &operator &= (const Attributes &Attrs) {
- Bits &= Attrs.Bits;
- return *this;
+
+ uint64_t Raw() const;
+
+ /// @brief Which attributes cannot be applied to a type.
+ static Attributes typeIncompatible(Type *Ty);
+
+ /// encodeLLVMAttributesForBitcode - This returns an integer containing an
+ /// encoding of all the LLVM attributes found in the given attribute bitset.
+ /// Any change to this encoding is a breaking change to bitcode compatibility.
+ static uint64_t encodeLLVMAttributesForBitcode(Attributes Attrs);
+
+ /// decodeLLVMAttributesForBitcode - This returns an attribute bitset
+ /// containing the LLVM attributes that have been decoded from the given
+ /// integer. This function must stay in sync with
+ /// 'encodeLLVMAttributesForBitcode'.
+ static Attributes decodeLLVMAttributesForBitcode(LLVMContext &C,
+ uint64_t EncodedAttrs);
+
+ /// getAsString - The set of Attributes set in Attributes is converted to a
+ /// string of equivalent mnemonics. This is, presumably, for writing out the
+ /// mnemonics for the assembly writer.
+ /// @brief Convert attribute bits to text
+ std::string getAsString() const;
+};
+
+//===----------------------------------------------------------------------===//
+/// AttrBuilder - This class is used in conjunction with the Attributes::get
+/// method to create an Attributes object. The object itself is uniquified. The
+/// Builder's value, however, is not. So this can be used as a quick way to test
+/// for equality, presence of attributes, etc.
+class AttrBuilder {
+ uint64_t Bits;
+public:
+ AttrBuilder() : Bits(0) {}
+ explicit AttrBuilder(uint64_t B) : Bits(B) {}
+ AttrBuilder(const Attributes &A) : Bits(A.Raw()) {}
+ AttrBuilder(const AttrBuilder &B) : Bits(B.Bits) {}
+
+ void clear() { Bits = 0; }
+
+ /// addAttribute - Add an attribute to the builder.
+ AttrBuilder &addAttribute(Attributes::AttrVal Val);
+
+ /// removeAttribute - Remove an attribute from the builder.
+ AttrBuilder &removeAttribute(Attributes::AttrVal Val);
+
+ /// addAttribute - Add the attributes from A to the builder.
+ AttrBuilder &addAttributes(const Attributes &A);
+
+ /// removeAttribute - Remove the attributes from A from the builder.
+ AttrBuilder &removeAttributes(const Attributes &A);
+
+ /// hasAttribute - Return true if the builder has the specified attribute.
+ bool hasAttribute(Attributes::AttrVal A) const;
+
+ /// hasAttributes - Return true if the builder has IR-level attributes.
+ bool hasAttributes() const;
+
+ /// hasAttributes - Return true if the builder has any attribute that is also
+ /// present in the specified attribute set.
+ bool hasAttributes(const Attributes &A) const;
+
+ /// hasAlignmentAttr - Return true if the builder has an alignment attribute.
+ bool hasAlignmentAttr() const;
+
+ /// getAlignment - Retrieve the alignment attribute, if it exists.
+ uint64_t getAlignment() const;
+
+ /// getStackAlignment - Retrieve the stack alignment attribute, if it exists.
+ uint64_t getStackAlignment() const;
+
+ /// addAlignmentAttr - This turns an int alignment (which must be a power of
+ /// 2) into the form used internally in Attributes.
+ AttrBuilder &addAlignmentAttr(unsigned Align);
+
+ /// addStackAlignmentAttr - This turns an int stack alignment (which must be a
+ /// power of 2) into the form used internally in Attributes.
+ AttrBuilder &addStackAlignmentAttr(unsigned Align);
+
+ /// addRawValue - Add the raw value to the internal representation.
+ /// N.B. This should be used ONLY for decoding LLVM bitcode!
+ AttrBuilder &addRawValue(uint64_t Val);
+
+ /// @brief Remove attributes that are used on functions only.
+ void removeFunctionOnlyAttrs() {
+ removeAttribute(Attributes::NoReturn)
+ .removeAttribute(Attributes::NoUnwind)
+ .removeAttribute(Attributes::ReadNone)
+ .removeAttribute(Attributes::ReadOnly)
+ .removeAttribute(Attributes::NoInline)
+ .removeAttribute(Attributes::AlwaysInline)
+ .removeAttribute(Attributes::OptimizeForSize)
+ .removeAttribute(Attributes::StackProtect)
+ .removeAttribute(Attributes::StackProtectReq)
+ .removeAttribute(Attributes::NoRedZone)
+ .removeAttribute(Attributes::NoImplicitFloat)
+ .removeAttribute(Attributes::Naked)
+ .removeAttribute(Attributes::InlineHint)
+ .removeAttribute(Attributes::StackAlignment)
+ .removeAttribute(Attributes::UWTable)
+ .removeAttribute(Attributes::NonLazyBind)
+ .removeAttribute(Attributes::ReturnsTwice)
+ .removeAttribute(Attributes::AddressSafety)
+ .removeAttribute(Attributes::MinSize);
}
- Attributes operator ~ () const { return Attributes(~Bits); }
+
uint64_t Raw() const { return Bits; }
- private:
- // Currently, we need less than 64 bits.
- uint64_t Bits;
-};
-namespace Attribute {
-
-/// Function parameters and results can have attributes to indicate how they
-/// should be treated by optimizations and code generation. This enumeration
-/// lists the attributes that can be associated with parameters, function
-/// results or the function itself.
-/// @brief Function attributes.
-
-// We declare AttrConst objects that will be used throughout the code
-// and also raw uint64_t objects with _i suffix to be used below for other
-// constant declarations. This is done to avoid static CTORs and at the same
-// time to keep type-safety of Attributes.
-#define DECLARE_LLVM_ATTRIBUTE(name, value) \
- const uint64_t name##_i = value; \
- const AttrConst name = {value};
-
-DECLARE_LLVM_ATTRIBUTE(None,0) ///< No attributes have been set
-DECLARE_LLVM_ATTRIBUTE(ZExt,1<<0) ///< Zero extended before/after call
-DECLARE_LLVM_ATTRIBUTE(SExt,1<<1) ///< Sign extended before/after call
-DECLARE_LLVM_ATTRIBUTE(NoReturn,1<<2) ///< Mark the function as not returning
-DECLARE_LLVM_ATTRIBUTE(InReg,1<<3) ///< Force argument to be passed in register
-DECLARE_LLVM_ATTRIBUTE(StructRet,1<<4) ///< Hidden pointer to structure to return
-DECLARE_LLVM_ATTRIBUTE(NoUnwind,1<<5) ///< Function doesn't unwind stack
-DECLARE_LLVM_ATTRIBUTE(NoAlias,1<<6) ///< Considered to not alias after call
-DECLARE_LLVM_ATTRIBUTE(ByVal,1<<7) ///< Pass structure by value
-DECLARE_LLVM_ATTRIBUTE(Nest,1<<8) ///< Nested function static chain
-DECLARE_LLVM_ATTRIBUTE(ReadNone,1<<9) ///< Function does not access memory
-DECLARE_LLVM_ATTRIBUTE(ReadOnly,1<<10) ///< Function only reads from memory
-DECLARE_LLVM_ATTRIBUTE(NoInline,1<<11) ///< inline=never
-DECLARE_LLVM_ATTRIBUTE(AlwaysInline,1<<12) ///< inline=always
-DECLARE_LLVM_ATTRIBUTE(OptimizeForSize,1<<13) ///< opt_size
-DECLARE_LLVM_ATTRIBUTE(StackProtect,1<<14) ///< Stack protection.
-DECLARE_LLVM_ATTRIBUTE(StackProtectReq,1<<15) ///< Stack protection required.
-DECLARE_LLVM_ATTRIBUTE(Alignment,31<<16) ///< Alignment of parameter (5 bits)
- // stored as log2 of alignment with +1 bias
- // 0 means unaligned different from align 1
-DECLARE_LLVM_ATTRIBUTE(NoCapture,1<<21) ///< Function creates no aliases of pointer
-DECLARE_LLVM_ATTRIBUTE(NoRedZone,1<<22) /// disable redzone
-DECLARE_LLVM_ATTRIBUTE(NoImplicitFloat,1<<23) /// disable implicit floating point
- /// instructions.
-DECLARE_LLVM_ATTRIBUTE(Naked,1<<24) ///< Naked function
-DECLARE_LLVM_ATTRIBUTE(InlineHint,1<<25) ///< source said inlining was
- ///desirable
-DECLARE_LLVM_ATTRIBUTE(StackAlignment,7<<26) ///< Alignment of stack for
- ///function (3 bits) stored as log2
- ///of alignment with +1 bias
- ///0 means unaligned (different from
- ///alignstack= {1))
-DECLARE_LLVM_ATTRIBUTE(ReturnsTwice,1<<29) ///< Function can return twice
-DECLARE_LLVM_ATTRIBUTE(UWTable,1<<30) ///< Function must be in a unwind
- ///table
-DECLARE_LLVM_ATTRIBUTE(NonLazyBind,1U<<31) ///< Function is called early and/or
- /// often, so lazy binding isn't
- /// worthwhile.
-DECLARE_LLVM_ATTRIBUTE(AddressSafety,1ULL<<32) ///< Address safety checking is on.
-DECLARE_LLVM_ATTRIBUTE(IANSDialect,1ULL<<33) ///< Inline asm non-standard dialect.
- /// When not set, ATT dialect assumed.
- /// When set implies the Intel dialect.
-
-#undef DECLARE_LLVM_ATTRIBUTE
-
-/// Note that uwtable is about the ABI or the user mandating an entry in the
-/// unwind table. The nounwind attribute is about an exception passing by the
-/// function.
-/// In a theoretical system that uses tables for profiling and sjlj for
-/// exceptions, they would be fully independent. In a normal system that
-/// uses tables for both, the semantics are:
-/// nil = Needs an entry because an exception might pass by.
-/// nounwind = No need for an entry
-/// uwtable = Needs an entry because the ABI says so and because
-/// an exception might pass by.
-/// uwtable + nounwind = Needs an entry because the ABI says so.
-
-/// @brief Attributes that only apply to function parameters.
-const AttrConst ParameterOnly = {ByVal_i | Nest_i |
- StructRet_i | NoCapture_i};
-
-/// @brief Attributes that may be applied to the function itself. These cannot
-/// be used on return values or function parameters.
-const AttrConst FunctionOnly = {NoReturn_i | NoUnwind_i | ReadNone_i |
- ReadOnly_i | NoInline_i | AlwaysInline_i | OptimizeForSize_i |
- StackProtect_i | StackProtectReq_i | NoRedZone_i | NoImplicitFloat_i |
- Naked_i | InlineHint_i | StackAlignment_i |
- UWTable_i | NonLazyBind_i | ReturnsTwice_i | AddressSafety_i |
- IANSDialect_i};
-
-/// @brief Parameter attributes that do not apply to vararg call arguments.
-const AttrConst VarArgsIncompatible = {StructRet_i};
-
-/// @brief Attributes that are mutually incompatible.
-const AttrConst MutuallyIncompatible[5] = {
- {ByVal_i | Nest_i | StructRet_i},
- {ByVal_i | Nest_i | InReg_i },
- {ZExt_i | SExt_i},
- {ReadNone_i | ReadOnly_i},
- {NoInline_i | AlwaysInline_i}
+ bool operator==(const AttrBuilder &B) {
+ return Bits == B.Bits;
+ }
+ bool operator!=(const AttrBuilder &B) {
+ return Bits != B.Bits;
+ }
};
-/// @brief Which attributes cannot be applied to a type.
-Attributes typeIncompatible(Type *Ty);
-
-/// This turns an int alignment (a power of 2, normally) into the
-/// form used internally in Attributes.
-inline Attributes constructAlignmentFromInt(unsigned i) {
- // Default alignment, allow the target to define how to align it.
- if (i == 0)
- return None;
-
- assert(isPowerOf2_32(i) && "Alignment must be a power of two.");
- assert(i <= 0x40000000 && "Alignment too large.");
- return Attributes((Log2_32(i)+1) << 16);
-}
-
-/// This returns the alignment field of an attribute as a byte alignment value.
-inline unsigned getAlignmentFromAttrs(Attributes A) {
- Attributes Align = A & Attribute::Alignment;
- if (!Align)
- return 0;
-
- return 1U << ((Align.Raw() >> 16) - 1);
-}
-
-/// This turns an int stack alignment (which must be a power of 2) into
-/// the form used internally in Attributes.
-inline Attributes constructStackAlignmentFromInt(unsigned i) {
- // Default alignment, allow the target to define how to align it.
- if (i == 0)
- return None;
-
- assert(isPowerOf2_32(i) && "Alignment must be a power of two.");
- assert(i <= 0x100 && "Alignment too large.");
- return Attributes((Log2_32(i)+1) << 26);
-}
-
-/// This returns the stack alignment field of an attribute as a byte alignment
-/// value.
-inline unsigned getStackAlignmentFromAttrs(Attributes A) {
- Attributes StackAlign = A & Attribute::StackAlignment;
- if (!StackAlign)
- return 0;
-
- return 1U << ((StackAlign.Raw() >> 26) - 1);
-}
-
-/// This returns an integer containing an encoding of all the
-/// LLVM attributes found in the given attribute bitset. Any
-/// change to this encoding is a breaking change to bitcode
-/// compatibility.
-inline uint64_t encodeLLVMAttributesForBitcode(Attributes Attrs) {
- // FIXME: It doesn't make sense to store the alignment information as an
- // expanded out value, we should store it as a log2 value. However, we can't
- // just change that here without breaking bitcode compatibility. If this ever
- // becomes a problem in practice, we should introduce new tag numbers in the
- // bitcode file and have those tags use a more efficiently encoded alignment
- // field.
-
- // Store the alignment in the bitcode as a 16-bit raw value instead of a
- // 5-bit log2 encoded value. Shift the bits above the alignment up by
- // 11 bits.
-
- uint64_t EncodedAttrs = Attrs.Raw() & 0xffff;
- if (Attrs & Attribute::Alignment)
- EncodedAttrs |= (1ull << 16) <<
- (((Attrs & Attribute::Alignment).Raw()-1) >> 16);
- EncodedAttrs |= (Attrs.Raw() & (0xfffull << 21)) << 11;
-
- return EncodedAttrs;
-}
-
-/// This returns an attribute bitset containing the LLVM attributes
-/// that have been decoded from the given integer. This function
-/// must stay in sync with 'encodeLLVMAttributesForBitcode'.
-inline Attributes decodeLLVMAttributesForBitcode(uint64_t EncodedAttrs) {
- // The alignment is stored as a 16-bit raw value from bits 31--16.
- // We shift the bits above 31 down by 11 bits.
-
- unsigned Alignment = (EncodedAttrs & (0xffffull << 16)) >> 16;
- assert((!Alignment || isPowerOf2_32(Alignment)) &&
- "Alignment must be a power of two.");
-
- Attributes Attrs(EncodedAttrs & 0xffff);
- if (Alignment)
- Attrs |= Attribute::constructAlignmentFromInt(Alignment);
- Attrs |= Attributes((EncodedAttrs & (0xfffull << 32)) >> 11);
-
- return Attrs;
-}
-
-
-/// The set of Attributes set in Attributes is converted to a
-/// string of equivalent mnemonics. This is, presumably, for writing out
-/// the mnemonics for the assembly writer.
-/// @brief Convert attribute bits to text
-std::string getAsString(Attributes Attrs);
-} // end namespace Attribute
-
-/// This is just a pair of values to associate a set of attributes
-/// with an index.
-struct AttributeWithIndex {
- Attributes Attrs; ///< The attributes that are set, or'd together.
- unsigned Index; ///< Index of the parameter for which the attributes apply.
- ///< Index 0 is used for return value attributes.
- ///< Index ~0U is used for function attributes.
+//===----------------------------------------------------------------------===//
+// AttributeWithIndex
+//===----------------------------------------------------------------------===//
+/// AttributeWithIndex - This is just a pair of values to associate a set of
+/// attributes with an index.
+struct AttributeWithIndex {
+ Attributes Attrs; ///< The attributes that are set, or'd together.
+ unsigned Index; ///< Index of the parameter for which the attributes apply.
+ ///< Index 0 is used for return value attributes.
+ ///< Index ~0U is used for function attributes.
+
+ static AttributeWithIndex get(LLVMContext &C, unsigned Idx,
+ ArrayRef<Attributes::AttrVal> Attrs) {
+ return get(Idx, Attributes::get(C, Attrs));
+ }
static AttributeWithIndex get(unsigned Idx, Attributes Attrs) {
AttributeWithIndex P;
P.Index = Idx;
@@ -300,31 +312,42 @@ class AttributeListImpl;
/// AttrListPtr - This class manages the ref count for the opaque
/// AttributeListImpl object and provides accessors for it.
class AttrListPtr {
- /// AttrList - The attributes that we are managing. This can be null
- /// to represent the empty attributes list.
+public:
+ enum AttrIndex {
+ ReturnIndex = 0U,
+ FunctionIndex = ~0U
+ };
+private:
+ /// @brief The attributes that we are managing. This can be null to represent
+ /// the empty attributes list.
AttributeListImpl *AttrList;
+
+ /// @brief The attributes for the specified index are returned. Attributes
+ /// for the result are denoted with Idx = 0.
+ Attributes getAttributes(unsigned Idx) const;
+
+ explicit AttrListPtr(AttributeListImpl *LI) : AttrList(LI) {}
public:
AttrListPtr() : AttrList(0) {}
- AttrListPtr(const AttrListPtr &P);
+ AttrListPtr(const AttrListPtr &P) : AttrList(P.AttrList) {}
const AttrListPtr &operator=(const AttrListPtr &RHS);
- ~AttrListPtr();
//===--------------------------------------------------------------------===//
// Attribute List Construction and Mutation
//===--------------------------------------------------------------------===//
/// get - Return a Attributes list with the specified parameters in it.
- static AttrListPtr get(ArrayRef<AttributeWithIndex> Attrs);
+ static AttrListPtr get(LLVMContext &C, ArrayRef<AttributeWithIndex> Attrs);
/// addAttr - Add the specified attribute at the specified index to this
/// attribute list. Since attribute lists are immutable, this
/// returns the new list.
- AttrListPtr addAttr(unsigned Idx, Attributes Attrs) const;
+ AttrListPtr addAttr(LLVMContext &C, unsigned Idx, Attributes Attrs) const;
/// removeAttr - Remove the specified attribute at the specified index from
/// this attribute list. Since attribute lists are immutable, this
/// returns the new list.
- AttrListPtr removeAttr(unsigned Idx, Attributes Attrs) const;
+ AttrListPtr removeAttr(LLVMContext &C, unsigned Idx, Attributes Attrs) const;
//===--------------------------------------------------------------------===//
// Attribute List Accessors
@@ -332,36 +355,38 @@ public:
/// getParamAttributes - The attributes for the specified index are
/// returned.
Attributes getParamAttributes(unsigned Idx) const {
- assert (Idx && Idx != ~0U && "Invalid parameter index!");
return getAttributes(Idx);
}
/// getRetAttributes - The attributes for the ret value are
/// returned.
Attributes getRetAttributes() const {
- return getAttributes(0);
+ return getAttributes(ReturnIndex);
}
/// getFnAttributes - The function attributes are returned.
Attributes getFnAttributes() const {
- return getAttributes(~0U);
+ return getAttributes(FunctionIndex);
}
/// paramHasAttr - Return true if the specified parameter index has the
/// specified attribute set.
bool paramHasAttr(unsigned Idx, Attributes Attr) const {
- return getAttributes(Idx) & Attr;
+ return getAttributes(Idx).hasAttributes(Attr);
}
/// getParamAlignment - Return the alignment for the specified function
/// parameter.
unsigned getParamAlignment(unsigned Idx) const {
- return Attribute::getAlignmentFromAttrs(getAttributes(Idx));
+ return getAttributes(Idx).getAlignment();
}
/// hasAttrSomewhere - Return true if the specified attribute is set for at
/// least one parameter or for the return value.
- bool hasAttrSomewhere(Attributes Attr) const;
+ bool hasAttrSomewhere(Attributes::AttrVal Attr) const;
+
+ unsigned getNumAttrs() const;
+ Attributes &getAttributesAtIndex(unsigned i) const;
/// operator==/!= - Provide equality predicates.
bool operator==(const AttrListPtr &RHS) const
@@ -369,8 +394,6 @@ public:
bool operator!=(const AttrListPtr &RHS) const
{ return AttrList != RHS.AttrList; }
- void dump() const;
-
//===--------------------------------------------------------------------===//
// Attribute List Introspection
//===--------------------------------------------------------------------===//
@@ -400,13 +423,7 @@ public:
/// holds a index number plus a set of attributes.
const AttributeWithIndex &getSlot(unsigned Slot) const;
-private:
- explicit AttrListPtr(AttributeListImpl *L);
-
- /// getAttributes - The attributes for the specified index are
- /// returned. Attributes for the result are denoted with Idx = 0.
- Attributes getAttributes(unsigned Idx) const;
-
+ void dump() const;
};
} // End llvm namespace
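To make the intended workflow concrete, here is a hedged sketch that builds a parameter attribute set with AttrBuilder and attaches it through the new context-aware entry points. Ctx stands for an LLVMContext the caller already has, and the chosen values (ByVal, 16-byte alignment, NoUnwind) are arbitrary.

// Sketch only: the AttrBuilder -> Attributes::get -> AttrListPtr::get flow.
AttrBuilder B;
B.addAttribute(Attributes::ByVal).addAlignmentAttr(16);
Attributes ParamAttrs = Attributes::get(Ctx, B);

AttributeWithIndex AWI[] = {
  AttributeWithIndex::get(1, ParamAttrs),                        // first parameter
  AttributeWithIndex::get(Ctx, AttrListPtr::FunctionIndex,
                          Attributes::NoUnwind)                  // whole function
};
AttrListPtr PAL = AttrListPtr::get(Ctx, AWI);

unsigned Align = PAL.getParamAlignment(1);                       // yields 16
bool IsByVal = PAL.paramHasAttr(1, Attributes::get(Ctx, Attributes::ByVal));
(void)Align; (void)IsByVal;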
diff --git a/include/llvm/BasicBlock.h b/include/llvm/BasicBlock.h
index d2aa1673d921..02c2a96b6c64 100644
--- a/include/llvm/BasicBlock.h
+++ b/include/llvm/BasicBlock.h
@@ -79,8 +79,8 @@ private:
void setParent(Function *parent);
friend class SymbolTableListTraits<BasicBlock, Function>;
- BasicBlock(const BasicBlock &); // Do not implement
- void operator=(const BasicBlock &); // Do not implement
+ BasicBlock(const BasicBlock &) LLVM_DELETED_FUNCTION;
+ void operator=(const BasicBlock &) LLVM_DELETED_FUNCTION;
/// BasicBlock ctor - If the function parameter is specified, the basic block
/// is automatically inserted at either the end of the function (if
@@ -213,7 +213,6 @@ public:
ValueSymbolTable *getValueSymbolTable();
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const BasicBlock *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == Value::BasicBlockVal;
}
diff --git a/include/llvm/Bitcode/Archive.h b/include/llvm/Bitcode/Archive.h
index 3c75e5882dbd..4fd4b5d90a9e 100644
--- a/include/llvm/Bitcode/Archive.h
+++ b/include/llvm/Bitcode/Archive.h
@@ -415,8 +415,8 @@ class Archive {
/// name will be truncated at 15 characters. If \p Compress is specified,
/// all archive members will be compressed before being written. If
/// \p PrintSymTab is true, the symbol table will be printed to std::cout.
- /// @returns true if an error occurred, \p error set to error message
- /// @returns false if the writing succeeded.
+ /// @returns true if an error occurred, \p error set to error message;
+ /// returns false if the writing succeeded.
/// @brief Write (possibly modified) archive contents to disk
bool writeToDisk(
bool CreateSymbolTable=false, ///< Create Symbol table
@@ -480,8 +480,8 @@ class Archive {
/// Writes one ArchiveMember to an ofstream. If an error occurs, returns
/// false, otherwise true. If an error occurs and error is non-null then
/// it will be set to an error message.
- /// @returns false Writing member succeeded
- /// @returns true Writing member failed, \p error set to error message
+ /// @returns false if writing member succeeded,
+ /// returns true if writing member failed, \p error set to error message.
bool writeMember(
const ArchiveMember& member, ///< The member to be written
std::ofstream& ARFile, ///< The file to write member onto
@@ -527,9 +527,9 @@ class Archive {
/// @name Hidden
/// @{
private:
- Archive(); ///< Do not implement
- Archive(const Archive&); ///< Do not implement
- Archive& operator=(const Archive&); ///< Do not implement
+ Archive() LLVM_DELETED_FUNCTION;
+ Archive(const Archive&) LLVM_DELETED_FUNCTION;
+ Archive& operator=(const Archive&) LLVM_DELETED_FUNCTION;
/// @}
};
diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h
index 65868294403c..840f57e7526d 100644
--- a/include/llvm/Bitcode/BitstreamReader.h
+++ b/include/llvm/Bitcode/BitstreamReader.h
@@ -47,9 +47,9 @@ private:
/// block/record name information in the BlockInfo block. Only llvm-bcanalyzer
/// uses this.
bool IgnoreBlockInfoNames;
-
- BitstreamReader(const BitstreamReader&); // DO NOT IMPLEMENT
- void operator=(const BitstreamReader&); // DO NOT IMPLEMENT
+
+ BitstreamReader(const BitstreamReader&) LLVM_DELETED_FUNCTION;
+ void operator=(const BitstreamReader&) LLVM_DELETED_FUNCTION;
public:
BitstreamReader() : IgnoreBlockInfoNames(true) {
}
@@ -409,7 +409,7 @@ public:
}
/// EnterSubBlock - Having read the ENTER_SUBBLOCK abbrevid, enter
- /// the block, and return true if the block is valid.
+ /// the block, and return true if the block has an error.
bool EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = 0) {
// Save the current block's state on BlockScope.
BlockScope.push_back(Block(CurCodeSize));
diff --git a/include/llvm/Bitcode/BitstreamWriter.h b/include/llvm/Bitcode/BitstreamWriter.h
index 475da133f8a8..dea118f98ed2 100644
--- a/include/llvm/Bitcode/BitstreamWriter.h
+++ b/include/llvm/Bitcode/BitstreamWriter.h
@@ -155,6 +155,7 @@ public:
}
void EmitVBR(uint32_t Val, unsigned NumBits) {
+ assert(NumBits <= 32 && "Too many bits to emit!");
uint32_t Threshold = 1U << (NumBits-1);
// Emit the bits with VBR encoding, NumBits-1 bits at a time.
@@ -167,10 +168,11 @@ public:
}
void EmitVBR64(uint64_t Val, unsigned NumBits) {
+ assert(NumBits <= 32 && "Too many bits to emit!");
if ((uint32_t)Val == Val)
return EmitVBR((uint32_t)Val, NumBits);
- uint64_t Threshold = 1U << (NumBits-1);
+ uint32_t Threshold = 1U << (NumBits-1);
// Emit the bits with VBR encoding, NumBits-1 bits at a time.
while (Val >= Threshold) {
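The new asserts pin NumBits to at most 32, which keeps the 1U << (NumBits-1) threshold well defined. For reference, the chunking scheme itself looks like the standalone sketch below; this is not the class's real emission path, which writes chunks straight to the bit stream.

// Illustrative re-statement of VBR encoding: each chunk carries NumBits-1
// payload bits plus a continuation bit in the top position.
#include <cstdint>
#include <vector>

static std::vector<uint32_t> encodeVBR(uint64_t Val, unsigned NumBits) {
  // Mirrors the constraint asserted above; chunks wider than 32 bits are
  // not supported by this scheme.
  const uint32_t Threshold = 1U << (NumBits - 1);
  std::vector<uint32_t> Chunks;
  while (Val >= Threshold) {
    Chunks.push_back(uint32_t(Val & (Threshold - 1)) | Threshold); // continue bit
    Val >>= NumBits - 1;
  }
  Chunks.push_back(uint32_t(Val));                                 // final chunk
  return Chunks;
}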
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index a8c34cb82995..c1dc190304c2 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -161,11 +161,14 @@ namespace bitc {
CST_CODE_CE_INSERTELT = 15, // CE_INSERTELT: [opval, opval, opval]
CST_CODE_CE_SHUFFLEVEC = 16, // CE_SHUFFLEVEC: [opval, opval, opval]
CST_CODE_CE_CMP = 17, // CE_CMP: [opty, opval, opval, pred]
- CST_CODE_INLINEASM = 18, // INLINEASM: [sideeffect,asmstr,conststr]
+ CST_CODE_INLINEASM_OLD = 18, // INLINEASM: [sideeffect|alignstack,
+ // asmstr,conststr]
CST_CODE_CE_SHUFVEC_EX = 19, // SHUFVEC_EX: [opty, opval, opval, opval]
CST_CODE_CE_INBOUNDS_GEP = 20,// INBOUNDS_GEP: [n x operands]
CST_CODE_BLOCKADDRESS = 21, // CST_CODE_BLOCKADDRESS [fnty, fnval, bb#]
- CST_CODE_DATA = 22 // DATA: [n x elements]
+ CST_CODE_DATA = 22, // DATA: [n x elements]
+ CST_CODE_INLINEASM = 23 // INLINEASM: [sideeffect|alignstack|
+ // asmdialect,asmstr,conststr]
};
/// CastOpcodes - These are values used in the bitcode files to encode which
diff --git a/include/llvm/CallingConv.h b/include/llvm/CallingConv.h
index 4c5ee626709a..053f4eb326f9 100644
--- a/include/llvm/CallingConv.h
+++ b/include/llvm/CallingConv.h
@@ -94,7 +94,29 @@ namespace CallingConv {
/// MBLAZE_INTR - Calling convention used for MBlaze interrupt support
/// routines (i.e. GCC's save_volatiles attribute).
- MBLAZE_SVOL = 74
+ MBLAZE_SVOL = 74,
+
+ /// SPIR_FUNC - Calling convention for SPIR non-kernel device functions.
+ /// No lowering or expansion of arguments.
+ /// Structures are passed as a pointer to a struct with the byval attribute.
+ /// Functions can only call SPIR_FUNC and SPIR_KERNEL functions.
+ /// Functions can only have zero or one return value.
+ /// Variable arguments are not allowed, except for printf.
+ /// How arguments/return values are lowered is not specified.
+ /// Functions are only visible to the devices.
+ SPIR_FUNC = 75,
+
+ /// SPIR_KERNEL - Calling convention for SPIR kernel functions.
+ /// Inherits the restrictions of SPIR_FUNC, except
+ /// Cannot have non-void return values.
+ /// Cannot have variable arguments.
+ /// Can also be called by the host.
+ /// Is externally visible.
+ SPIR_KERNEL = 76,
+
+ /// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins
+ Intel_OCL_BI = 77
+
};
} // End CallingConv namespace
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 170a528a5a22..a92b85939f37 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -17,6 +17,7 @@
#define LLVM_CODEGEN_ASMPRINTER_H
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
@@ -47,7 +48,7 @@ namespace llvm {
class DwarfException;
class Mangler;
class TargetLoweringObjectFile;
- class TargetData;
+ class DataLayout;
class TargetMachine;
/// AsmPrinter - This class is intended to be used as a driving class for all
@@ -130,8 +131,8 @@ namespace llvm {
/// getObjFileLowering - Return information about object file lowering.
const TargetLoweringObjectFile &getObjFileLowering() const;
- /// getTargetData - Return information about data layout.
- const TargetData &getTargetData() const;
+ /// getDataLayout - Return information about data layout.
+ const DataLayout &getDataLayout() const;
/// getCurrentSection() - Return the current section we are emitting to.
const MCSection *getCurrentSection() const;
@@ -460,7 +461,8 @@ namespace llvm {
mutable unsigned SetCounter;
/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
- void EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = 0) const;
+ void EmitInlineAsm(StringRef Str, const MDNode *LocMDNode = 0,
+ InlineAsm::AsmDialect AsmDialect = InlineAsm::AD_ATT) const;
/// EmitInlineAsm - This method formats and emits the specified machine
/// instruction that is an inline asm.
diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index 3afe3095d4f6..436918b1eb33 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/Target/TargetCallingConv.h"
#include "llvm/CallingConv.h"
@@ -288,6 +289,7 @@ public:
StackOffset = ((StackOffset + Align-1) & ~(Align-1));
unsigned Result = StackOffset;
StackOffset += Size;
+ MF.getFrameInfo()->ensureMaxAlignment(Align);
return Result;
}
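The new ensureMaxAlignment call records the largest alignment any argument asked for, so the final frame is aligned at least that much. The rounding expression itself is the usual power-of-two trick, restated below as a tiny sketch.

// Round Offset up to the next multiple of Align (Align must be a power of 2),
// matching the expression used in AllocateStack above.
static unsigned alignTo(unsigned Offset, unsigned Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}
// e.g. alignTo(13, 8) == 16, alignTo(16, 8) == 16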
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
new file mode 100644
index 000000000000..90ee23424498
--- /dev/null
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -0,0 +1,228 @@
+//===-- CommandFlags.h - Shared Codegen Command Line Flags -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains codegen-specific flags that are shared between different
+// command line tools. The tools "llc" and "opt" both use this file to prevent
+// flag duplication.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_COMMAND_LINE_FLAGS_H
+#define LLVM_CODEGEN_COMMAND_LINE_FLAGS_H
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <string>
+using namespace llvm;
+
+cl::opt<std::string>
+MArch("march", cl::desc("Architecture to generate code for (see --version)"));
+
+cl::opt<std::string>
+MCPU("mcpu",
+ cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+ cl::value_desc("cpu-name"),
+ cl::init(""));
+
+cl::list<std::string>
+MAttrs("mattr",
+ cl::CommaSeparated,
+ cl::desc("Target specific attributes (-mattr=help for details)"),
+ cl::value_desc("a1,+a2,-a3,..."));
+
+cl::opt<Reloc::Model>
+RelocModel("relocation-model",
+ cl::desc("Choose relocation model"),
+ cl::init(Reloc::Default),
+ cl::values(
+ clEnumValN(Reloc::Default, "default",
+ "Target default relocation model"),
+ clEnumValN(Reloc::Static, "static",
+ "Non-relocatable code"),
+ clEnumValN(Reloc::PIC_, "pic",
+ "Fully relocatable, position independent code"),
+ clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
+ "Relocatable external references, non-relocatable code"),
+ clEnumValEnd));
+
+cl::opt<llvm::CodeModel::Model>
+CMModel("code-model",
+ cl::desc("Choose code model"),
+ cl::init(CodeModel::Default),
+ cl::values(clEnumValN(CodeModel::Default, "default",
+ "Target default code model"),
+ clEnumValN(CodeModel::Small, "small",
+ "Small code model"),
+ clEnumValN(CodeModel::Kernel, "kernel",
+ "Kernel code model"),
+ clEnumValN(CodeModel::Medium, "medium",
+ "Medium code model"),
+ clEnumValN(CodeModel::Large, "large",
+ "Large code model"),
+ clEnumValEnd));
+
+cl::opt<bool>
+RelaxAll("mc-relax-all",
+ cl::desc("When used with filetype=obj, "
+ "relax all fixups in the emitted object file"));
+
+cl::opt<TargetMachine::CodeGenFileType>
+FileType("filetype", cl::init(TargetMachine::CGFT_AssemblyFile),
+ cl::desc("Choose a file type (not all types are supported by all targets):"),
+ cl::values(
+ clEnumValN(TargetMachine::CGFT_AssemblyFile, "asm",
+ "Emit an assembly ('.s') file"),
+ clEnumValN(TargetMachine::CGFT_ObjectFile, "obj",
+ "Emit a native object ('.o') file"),
+ clEnumValN(TargetMachine::CGFT_Null, "null",
+ "Emit nothing, for performance testing"),
+ clEnumValEnd));
+
+cl::opt<bool> DisableDotLoc("disable-dot-loc", cl::Hidden,
+ cl::desc("Do not use .loc entries"));
+
+cl::opt<bool> DisableCFI("disable-cfi", cl::Hidden,
+ cl::desc("Do not use .cfi_* directives"));
+
+cl::opt<bool> EnableDwarfDirectory("enable-dwarf-directory", cl::Hidden,
+ cl::desc("Use .file directives with an explicit directory."));
+
+cl::opt<bool>
+DisableRedZone("disable-red-zone",
+ cl::desc("Do not emit code that uses the red zone."),
+ cl::init(false));
+
+cl::opt<bool>
+EnableFPMAD("enable-fp-mad",
+ cl::desc("Enable less precise MAD instructions to be generated"),
+ cl::init(false));
+
+cl::opt<bool>
+DisableFPElim("disable-fp-elim",
+ cl::desc("Disable frame pointer elimination optimization"),
+ cl::init(false));
+
+cl::opt<bool>
+DisableFPElimNonLeaf("disable-non-leaf-fp-elim",
+ cl::desc("Disable frame pointer elimination optimization for non-leaf funcs"),
+ cl::init(false));
+
+cl::opt<bool>
+EnableUnsafeFPMath("enable-unsafe-fp-math",
+ cl::desc("Enable optimizations that may decrease FP precision"),
+ cl::init(false));
+
+cl::opt<bool>
+EnableNoInfsFPMath("enable-no-infs-fp-math",
+ cl::desc("Enable FP math optimizations that assume no +-Infs"),
+ cl::init(false));
+
+cl::opt<bool>
+EnableNoNaNsFPMath("enable-no-nans-fp-math",
+ cl::desc("Enable FP math optimizations that assume no NaNs"),
+ cl::init(false));
+
+cl::opt<bool>
+EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math",
+ cl::Hidden,
+ cl::desc("Force codegen to assume rounding mode can change dynamically"),
+ cl::init(false));
+
+cl::opt<bool>
+GenerateSoftFloatCalls("soft-float",
+ cl::desc("Generate software floating point library calls"),
+ cl::init(false));
+
+cl::opt<llvm::FloatABI::ABIType>
+FloatABIForCalls("float-abi",
+ cl::desc("Choose float ABI type"),
+ cl::init(FloatABI::Default),
+ cl::values(
+ clEnumValN(FloatABI::Default, "default",
+ "Target default float ABI type"),
+ clEnumValN(FloatABI::Soft, "soft",
+ "Soft float ABI (implied by -soft-float)"),
+ clEnumValN(FloatABI::Hard, "hard",
+ "Hard float ABI (uses FP registers)"),
+ clEnumValEnd));
+
+cl::opt<llvm::FPOpFusion::FPOpFusionMode>
+FuseFPOps("fp-contract",
+ cl::desc("Enable aggressive formation of fused FP ops"),
+ cl::init(FPOpFusion::Standard),
+ cl::values(
+ clEnumValN(FPOpFusion::Fast, "fast",
+ "Fuse FP ops whenever profitable"),
+ clEnumValN(FPOpFusion::Standard, "on",
+ "Only fuse 'blessed' FP ops."),
+ clEnumValN(FPOpFusion::Strict, "off",
+ "Only fuse FP ops when the result won't be affected."),
+ clEnumValEnd));
+
+cl::opt<bool>
+DontPlaceZerosInBSS("nozero-initialized-in-bss",
+ cl::desc("Don't place zero-initialized symbols into bss section"),
+ cl::init(false));
+
+cl::opt<bool>
+EnableGuaranteedTailCallOpt("tailcallopt",
+ cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
+ cl::init(false));
+
+cl::opt<bool>
+DisableTailCalls("disable-tail-calls",
+ cl::desc("Never emit tail calls"),
+ cl::init(false));
+
+cl::opt<unsigned>
+OverrideStackAlignment("stack-alignment",
+ cl::desc("Override default stack alignment"),
+ cl::init(0));
+
+cl::opt<bool>
+EnableRealignStack("realign-stack",
+ cl::desc("Realign stack if needed"),
+ cl::init(true));
+
+cl::opt<std::string>
+TrapFuncName("trap-func", cl::Hidden,
+ cl::desc("Emit a call to trap function rather than a trap instruction"),
+ cl::init(""));
+
+cl::opt<bool>
+EnablePIE("enable-pie",
+ cl::desc("Assume the creation of a position independent executable."),
+ cl::init(false));
+
+cl::opt<bool>
+SegmentedStacks("segmented-stacks",
+ cl::desc("Use segmented stacks if possible."),
+ cl::init(false));
+
+cl::opt<bool>
+UseInitArray("use-init-array",
+ cl::desc("Use .init_array instead of .ctors."),
+ cl::init(false));
+
+cl::opt<std::string> StopAfter("stop-after",
+ cl::desc("Stop compilation after a specific pass"),
+ cl::value_desc("pass-name"),
+ cl::init(""));
+cl::opt<std::string> StartAfter("start-after",
+ cl::desc("Resume compilation after a specific pass"),
+ cl::value_desc("pass-name"),
+ cl::init(""));
+
+cl::opt<unsigned>
+SSPBufferSize("stack-protector-buffer-size", cl::init(8),
+ cl::desc("Lower bound for a buffer to be considered for "
+ "stack protection"));
+#endif
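These switches exist so that llc-style drivers can translate user-visible flags into TargetOptions before constructing a TargetMachine. A minimal sketch of that translation, assuming the header above is included; the TargetOptions field names used here (LessPreciseFPMADOption, UnsafeFPMath, NoInfsFPMath, NoNaNsFPMath, FloatABIType) are an assumption about this era of the API, not part of the patch:

#include "llvm/Target/TargetOptions.h"

static llvm::TargetOptions targetOptionsFromCodeGenFlags() {
  llvm::TargetOptions Opts;
  Opts.LessPreciseFPMADOption = EnableFPMAD;        // -enable-fp-mad
  Opts.UnsafeFPMath           = EnableUnsafeFPMath; // -enable-unsafe-fp-math
  Opts.NoInfsFPMath           = EnableNoInfsFPMath;
  Opts.NoNaNsFPMath           = EnableNoNaNsFPMath;
  Opts.FloatABIType           = FloatABIForCalls;   // -float-abi=
  return Opts;
}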
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index 7cb96952aa61..7c24e36092b4 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -32,7 +32,7 @@ class MachineFunction;
class MachineInstr;
class MachineFrameInfo;
class MachineRegisterInfo;
-class TargetData;
+class DataLayout;
class TargetInstrInfo;
class TargetLibraryInfo;
class TargetLowering;
@@ -54,7 +54,7 @@ protected:
MachineConstantPool &MCP;
DebugLoc DL;
const TargetMachine &TM;
- const TargetData &TD;
+ const DataLayout &TD;
const TargetInstrInfo &TII;
const TargetLowering &TLI;
const TargetRegisterInfo &TRI;
diff --git a/include/llvm/CodeGen/GCMetadata.h b/include/llvm/CodeGen/GCMetadata.h
index 20e33f74f650..076f6f39fe2c 100644
--- a/include/llvm/CodeGen/GCMetadata.h
+++ b/include/llvm/CodeGen/GCMetadata.h
@@ -122,6 +122,11 @@ namespace llvm {
Roots.push_back(GCRoot(Num, Metadata));
}
+ /// removeStackRoot - Removes a root.
+ roots_iterator removeStackRoot(roots_iterator position) {
+ return Roots.erase(position);
+ }
+
/// addSafePoint - Notes the existence of a safe point. Num is the ID of the
/// label just prior to the safe point (if the code generator is using
/// MachineModuleInfo).
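removeStackRoot() enables dropping roots that a later analysis proves dead; a hedged sketch of the erase-while-iterating pattern it supports (isRootDead() is a hypothetical predicate, not part of this patch):

static bool isRootDead(const llvm::GCRoot &R); // hypothetical helper

static void dropDeadRoots(llvm::GCFunctionInfo &FI) {
  for (llvm::GCFunctionInfo::roots_iterator RI = FI.roots_begin();
       RI != FI.roots_end();) {
    if (isRootDead(*RI))
      RI = FI.removeStackRoot(RI); // erase and get the next valid iterator
    else
      ++RI;
  }
}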
diff --git a/include/llvm/CodeGen/GCMetadataPrinter.h b/include/llvm/CodeGen/GCMetadataPrinter.h
index 17a265300000..4a6b5ac19c36 100644
--- a/include/llvm/CodeGen/GCMetadataPrinter.h
+++ b/include/llvm/CodeGen/GCMetadataPrinter.h
@@ -48,9 +48,10 @@ namespace llvm {
// May only be subclassed.
GCMetadataPrinter();
- // Do not implement.
- GCMetadataPrinter(const GCMetadataPrinter &);
- GCMetadataPrinter &operator=(const GCMetadataPrinter &);
+ private:
+ GCMetadataPrinter(const GCMetadataPrinter &) LLVM_DELETED_FUNCTION;
+ GCMetadataPrinter &
+ operator=(const GCMetadataPrinter &) LLVM_DELETED_FUNCTION;
public:
GCStrategy &getStrategy() { return *S; }
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index f387bd518f17..5d0a3b4c7067 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -637,6 +637,10 @@ namespace ISD {
ATOMIC_LOAD_UMIN,
ATOMIC_LOAD_UMAX,
+ /// This corresponds to the llvm.lifetime.* intrinsics. The first operand
+ /// is the chain and the second operand is the alloca pointer.
+ LIFETIME_START, LIFETIME_END,
+
/// BUILTIN_OP_END - This must be the last enum value in this list.
/// The target-specific pre-isel opcode values start here.
BUILTIN_OP_END
diff --git a/include/llvm/CodeGen/IntrinsicLowering.h b/include/llvm/CodeGen/IntrinsicLowering.h
index 767b66622549..5a3fb4b1a3df 100644
--- a/include/llvm/CodeGen/IntrinsicLowering.h
+++ b/include/llvm/CodeGen/IntrinsicLowering.h
@@ -21,15 +21,15 @@
namespace llvm {
class CallInst;
class Module;
- class TargetData;
+ class DataLayout;
class IntrinsicLowering {
- const TargetData& TD;
+ const DataLayout& TD;
bool Warned;
public:
- explicit IntrinsicLowering(const TargetData &td) :
+ explicit IntrinsicLowering(const DataLayout &td) :
TD(td), Warned(false) {}
/// AddPrototypes - This method, if called, causes all of the prototypes
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index a3ce47c02a1a..185e414ae2cd 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -29,6 +29,7 @@
#include <climits>
namespace llvm {
+ class CoalescerPair;
class LiveIntervals;
class MachineInstr;
class MachineRegisterInfo;
@@ -113,9 +114,6 @@ namespace llvm {
void dump() const;
void print(raw_ostream &os) const;
-
- private:
- LiveRange(); // DO NOT IMPLEMENT
};
template <> struct isPodLike<LiveRange> { static const bool value = true; };
@@ -275,11 +273,6 @@ namespace llvm {
void MergeValueInAsValue(const LiveInterval &RHS,
const VNInfo *RHSValNo, VNInfo *LHSValNo);
- /// Copy - Copy the specified live interval. This copies all the fields
- /// except for the register of the interval.
- void Copy(const LiveInterval &RHS, MachineRegisterInfo *MRI,
- VNInfo::Allocator &VNInfoAllocator);
-
bool empty() const { return ranges.empty(); }
/// beginIndex - Return the lowest numbered slot covered by interval.
@@ -312,12 +305,6 @@ namespace llvm {
return r != end() && r->end == index;
}
- /// killedInRange - Return true if the interval has kills in [Start,End).
- /// Note that the kill point is considered the end of a live range, so it is
- /// not contained in the live range. If a live range ends at End, it won't
- /// be counted as a kill by this method.
- bool killedInRange(SlotIndex Start, SlotIndex End) const;
-
/// getLiveRangeContaining - Return the live range that contains the
/// specified index, or null if there is none.
const LiveRange *getLiveRangeContaining(SlotIndex Idx) const {
@@ -366,6 +353,14 @@ namespace llvm {
return overlapsFrom(other, other.begin());
}
+ /// overlaps - Return true if the two intervals have overlapping segments
+ /// that are not coalescable according to CP.
+ ///
+ /// Overlapping segments where one interval is defined by a coalescable
+ /// copy are allowed.
+ bool overlaps(const LiveInterval &Other, const CoalescerPair &CP,
+ const SlotIndexes&) const;
+
/// overlaps - Return true if the live interval overlaps a range specified
/// by [Start, End).
bool overlaps(SlotIndex Start, SlotIndex End) const;
@@ -469,7 +464,7 @@ namespace llvm {
VNInfo *LHSValNo = 0,
const VNInfo *RHSValNo = 0);
- LiveInterval& operator=(const LiveInterval& rhs); // DO NOT IMPLEMENT
+ LiveInterval& operator=(const LiveInterval& rhs) LLVM_DELETED_FUNCTION;
};
@@ -501,7 +496,9 @@ namespace llvm {
if (I == E)
return;
// Is this an instruction live-in segment?
- if (SlotIndex::isEarlierInstr(I->start, Idx)) {
+ // If Idx is the start index of a basic block, include live-in segments
+ // that start at Idx.getBaseIndex().
+ if (I->start <= Idx.getBaseIndex()) {
EarlyVal = I->valno;
EndPoint = I->end;
// Move to the potentially live-out segment.
@@ -510,6 +507,12 @@ namespace llvm {
if (++I == E)
return;
}
+ // Special case: A PHIDef value can have its def in the middle of a
+ // segment if the value happens to be live out of the layout
+ // predecessor.
+ // Such a value is not live-in.
+ if (EarlyVal->def == Idx.getBaseIndex())
+ EarlyVal = 0;
}
// I now points to the segment that may be live-through, or defined by
// this instr. Ignore segments starting after the current instr.
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index da521dbc535f..b421753dd536 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -65,12 +65,6 @@ namespace llvm {
/// Live interval pointers for all the virtual registers.
IndexedMap<LiveInterval*, VirtReg2IndexFunctor> VirtRegIntervals;
- /// AllocatableRegs - A bit vector of allocatable registers.
- BitVector AllocatableRegs;
-
- /// ReservedRegs - A bit vector of reserved registers.
- BitVector ReservedRegs;
-
/// RegMaskSlots - Sorted list of instructions with register mask operands.
/// Always use the 'r' slot, RegMasks are normal clobbers, not early
/// clobbers.
@@ -123,18 +117,6 @@ namespace llvm {
return VirtRegIntervals.inBounds(Reg) && VirtRegIntervals[Reg];
}
- /// isAllocatable - is the physical register reg allocatable in the current
- /// function?
- bool isAllocatable(unsigned reg) const {
- return AllocatableRegs.test(reg);
- }
-
- /// isReserved - is the physical register reg reserved in the current
- /// function
- bool isReserved(unsigned reg) const {
- return ReservedRegs.test(reg);
- }
-
// Interval creation.
LiveInterval &getOrCreateInterval(unsigned Reg) {
if (!hasInterval(Reg)) {
@@ -165,6 +147,26 @@ namespace llvm {
bool shrinkToUses(LiveInterval *li,
SmallVectorImpl<MachineInstr*> *dead = 0);
+ /// extendToIndices - Extend the live range of LI to reach all points in
+ /// Indices. The points in the Indices array must be jointly dominated by
+ /// existing defs in LI. PHI-defs are added as needed to maintain SSA form.
+ ///
+ /// If a SlotIndex in Indices is the end index of a basic block, LI will be
+ /// extended to be live out of the basic block.
+ ///
+ /// See also LiveRangeCalc::extend().
+ void extendToIndices(LiveInterval *LI, ArrayRef<SlotIndex> Indices);
+
+ /// pruneValue - If an LI value is live at Kill, prune its live range by
+ /// removing any liveness reachable from Kill. Add live range end points to
+ /// EndPoints such that extendToIndices(LI, EndPoints) will reconstruct the
+ /// value's live range.
+ ///
+ /// Calling pruneValue() and extendToIndices() can be used to reconstruct
+ /// SSA form after adding defs to a virtual register.
+ void pruneValue(LiveInterval *LI, SlotIndex Kill,
+ SmallVectorImpl<SlotIndex> *EndPoints);
+
SlotIndexes *getSlotIndexes() const {
return Indexes;
}
@@ -252,21 +254,26 @@ namespace llvm {
/// addKillFlags - Add kill flags to any instruction that kills a virtual
/// register.
- void addKillFlags();
+ void addKillFlags(const VirtRegMap*);
/// handleMove - call this method to notify LiveIntervals that
/// instruction 'mi' has been moved within a basic block. This will update
/// the live intervals for all operands of mi. Moves between basic blocks
/// are not supported.
- void handleMove(MachineInstr* MI);
+ ///
+ /// \param UpdateFlags Update live intervals for nonallocatable physregs.
+ void handleMove(MachineInstr* MI, bool UpdateFlags = false);
/// moveIntoBundle - Update intervals for operands of MI so that they
/// begin/end on the SlotIndex for BundleStart.
///
+ /// \param UpdateFlags Update live intervals for nonallocatable physregs.
+ ///
/// Requires MI and BundleStart to have SlotIndexes, and assumes
/// existing liveness is accurate. BundleStart should be the first
/// instruction in the Bundle.
- void handleMoveIntoBundle(MachineInstr* MI, MachineInstr* BundleStart);
+ void handleMoveIntoBundle(MachineInstr* MI, MachineInstr* BundleStart,
+ bool UpdateFlags = false);
// Register mask functions.
//
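pruneValue() and extendToIndices() are meant to be used as a pair by passes that add new defs to an existing virtual register and then need to restore SSA-form liveness; a sketch of that sequence (NewDefIdx is assumed to be the SlotIndex of the inserted def):

static void repairLivenessAroundNewDef(llvm::LiveIntervals &LIS,
                                       llvm::LiveInterval &LI,
                                       llvm::SlotIndex NewDefIdx) {
  // Remove liveness reachable from the new def, recording where the old
  // value used to be live.
  llvm::SmallVector<llvm::SlotIndex, 8> EndPoints;
  LIS.pruneValue(&LI, NewDefIdx, &EndPoints);

  // ... insert the new def instruction and its VNInfo here ...

  // Re-extend LI to the recorded end points; PHI-defs are inserted as
  // needed to keep the interval in SSA form.
  LIS.extendToIndices(&LI, EndPoints);
}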
diff --git a/include/llvm/CodeGen/LiveVariables.h b/include/llvm/CodeGen/LiveVariables.h
index d4bb409e0605..3bb134b8fb2a 100644
--- a/include/llvm/CodeGen/LiveVariables.h
+++ b/include/llvm/CodeGen/LiveVariables.h
@@ -126,12 +126,6 @@ private:
/// building live intervals.
SparseBitVector<> PHIJoins;
- /// ReservedRegisters - This vector keeps track of which registers
- /// are reserved register which are not allocatable by the target machine.
- /// We can not track liveness for values that are in this set.
- ///
- BitVector ReservedRegisters;
-
private: // Intermediate data structures
MachineFunction *MF;
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index c917bd8b8183..97c39458d93d 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -351,6 +351,8 @@ public:
/// parameter is stored in Weights list and it may be used by
/// MachineBranchProbabilityInfo analysis to calculate branch probability.
///
+ /// Note that duplicate Machine CFG edges are not allowed.
+ ///
void addSuccessor(MachineBasicBlock *succ, uint32_t weight = 0);
/// removeSuccessor - Remove successor from the successors list of this
@@ -545,6 +547,28 @@ public:
return findDebugLoc(MBBI.getInstrIterator());
}
+ /// Possible outcome of a register liveness query to computeRegisterLiveness()
+ enum LivenessQueryResult {
+ LQR_Live, ///< Register is known to be live.
+ LQR_OverlappingLive, ///< Register itself is not live, but some overlapping
+ ///< register is.
+ LQR_Dead, ///< Register is known to be dead.
+ LQR_Unknown ///< Register liveness not decidable from local
+ ///< neighborhood.
+ };
+
+ /// computeRegisterLiveness - Return whether (physical) register \c Reg
+ /// has been <def>ined and not <kill>ed as of just before \c MI.
+ ///
+ /// Search is localised to a neighborhood of
+ /// \c Neighborhood instructions before (searching for defs or kills) and
+ /// Neighborhood instructions after (searching just for defs) MI.
+ ///
+ /// \c Reg must be a physical register.
+ LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI,
+ unsigned Reg, MachineInstr *MI,
+ unsigned Neighborhood=10);
+
// Debugging methods.
void dump() const;
void print(raw_ostream &OS, SlotIndexes* = 0) const;
@@ -572,7 +596,7 @@ private:
/// getSuccWeight - Return weight of the edge from this block to MBB. This
/// method should NOT be called directly, but by using getEdgeWeight method
/// from MachineBranchProbabilityInfo class.
- uint32_t getSuccWeight(const MachineBasicBlock *succ) const;
+ uint32_t getSuccWeight(const_succ_iterator Succ) const;
// Methods used to maintain doubly linked list of blocks...
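computeRegisterLiveness() lets peephole-style code ask whether a physical register is free at a program point without running a full liveness analysis; a small sketch of the intended query (MBB, MI, Reg and TRI are assumed to come from the surrounding pass):

static bool canClobberBefore(llvm::MachineBasicBlock &MBB,
                             llvm::MachineInstr *MI, unsigned Reg,
                             const llvm::TargetRegisterInfo *TRI) {
  // Only a definite LQR_Dead answer allows reusing Reg here; LQR_Unknown
  // means the local neighborhood search could not decide.
  return MBB.computeRegisterLiveness(TRI, Reg, MI) ==
         llvm::MachineBasicBlock::LQR_Dead;
}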
diff --git a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
index af4db7d6bde6..12189ceb7f16 100644
--- a/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
+++ b/include/llvm/CodeGen/MachineBranchProbabilityInfo.h
@@ -16,14 +16,12 @@
#define LLVM_CODEGEN_MACHINEBRANCHPROBABILITYINFO_H
#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Support/BranchProbability.h"
#include <climits>
namespace llvm {
-class raw_ostream;
-class MachineBasicBlock;
-
class MachineBranchProbabilityInfo : public ImmutablePass {
virtual void anchor();
@@ -52,6 +50,11 @@ public:
uint32_t getEdgeWeight(const MachineBasicBlock *Src,
const MachineBasicBlock *Dst) const;
+ // Same thing, but using a const_succ_iterator from Src. This is faster when
+ // the iterator is already available.
+ uint32_t getEdgeWeight(const MachineBasicBlock *Src,
+ MachineBasicBlock::const_succ_iterator Dst) const;
+
// Get sum of the block successors' weights, potentially scaling them to fit
// within 32-bits. If scaling is required, sets Scale based on the necessary
// adjustment. Any edge weights used with the sum should be divided by Scale.
diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h
index d6d65a24defb..8ed215d75bcf 100644
--- a/include/llvm/CodeGen/MachineConstantPool.h
+++ b/include/llvm/CodeGen/MachineConstantPool.h
@@ -25,7 +25,7 @@ namespace llvm {
class Constant;
class FoldingSetNodeID;
-class TargetData;
+class DataLayout;
class TargetMachine;
class Type;
class MachineConstantPool;
@@ -132,14 +132,14 @@ public:
/// address of the function constant pool values.
/// @brief The machine constant pool.
class MachineConstantPool {
- const TargetData *TD; ///< The machine's TargetData.
+ const DataLayout *TD; ///< The machine's DataLayout.
unsigned PoolAlignment; ///< The alignment for the pool.
std::vector<MachineConstantPoolEntry> Constants; ///< The pool of constants.
/// MachineConstantPoolValues that use an existing MachineConstantPoolEntry.
DenseSet<MachineConstantPoolValue*> MachineCPVsSharingEntries;
public:
/// @brief The only constructor.
- explicit MachineConstantPool(const TargetData *td)
+ explicit MachineConstantPool(const DataLayout *td)
: TD(td), PoolAlignment(1) {}
~MachineConstantPool();
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 8b958e437ed3..0e4e132e40d9 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -21,13 +21,15 @@
namespace llvm {
class raw_ostream;
-class TargetData;
+class DataLayout;
class TargetRegisterClass;
class Type;
class MachineFunction;
class MachineBasicBlock;
class TargetFrameLowering;
class BitVector;
+class Value;
+class AllocaInst;
/// The CalleeSavedInfo class tracks the information needed to locate where a
/// callee saved register is in the current frame.
@@ -103,14 +105,18 @@ class MachineFrameInfo {
// protector.
bool MayNeedSP;
+ /// Alloca - If this stack object originates from an Alloca instruction,
+ /// this value records the original IR allocation. Can be NULL.
+ const AllocaInst *Alloca;
+
// PreAllocated - If true, the object was mapped into the local frame
// block and doesn't need additional handling for allocation beyond that.
bool PreAllocated;
StackObject(uint64_t Sz, unsigned Al, int64_t SP, bool IM,
- bool isSS, bool NSP)
+ bool isSS, bool NSP, const AllocaInst *Val)
: SPOffset(SP), Size(Sz), Alignment(Al), isImmutable(IM),
- isSpillSlot(isSS), MayNeedSP(NSP), PreAllocated(false) {}
+ isSpillSlot(isSS), MayNeedSP(NSP), Alloca(Val), PreAllocated(false) {}
};
/// Objects - The list of stack objects allocated...
@@ -362,6 +368,14 @@ public:
ensureMaxAlignment(Align);
}
+ /// getObjectAllocation - Return the underlying Alloca of the specified
+ /// stack object if it exists. Returns 0 if none exists.
+ const AllocaInst* getObjectAllocation(int ObjectIdx) const {
+ assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+ "Invalid Object Idx!");
+ return Objects[ObjectIdx+NumFixedObjects].Alloca;
+ }
+
/// NeedsStackProtector - Returns true if the object may need stack
/// protectors.
bool MayNeedStackProtector(int ObjectIdx) const {
@@ -482,9 +496,10 @@ public:
/// a nonnegative identifier to represent it.
///
int CreateStackObject(uint64_t Size, unsigned Alignment, bool isSS,
- bool MayNeedSP = false) {
+ bool MayNeedSP = false, const AllocaInst *Alloca = 0) {
assert(Size != 0 && "Cannot allocate zero size stack objects!");
- Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP));
+ Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, MayNeedSP,
+ Alloca));
int Index = (int)Objects.size() - NumFixedObjects - 1;
assert(Index >= 0 && "Bad frame index!");
ensureMaxAlignment(Alignment);
@@ -516,7 +531,7 @@ public:
///
int CreateVariableSizedObject(unsigned Alignment) {
HasVarSizedObjects = true;
- Objects.push_back(StackObject(0, Alignment, 0, false, false, true));
+ Objects.push_back(StackObject(0, Alignment, 0, false, false, true, 0));
ensureMaxAlignment(Alignment);
return (int)Objects.size()-NumFixedObjects-1;
}
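Recording the originating AllocaInst lets later passes (the new stack coloring pass, for instance) map frame indices back to IR objects; a sketch of creating and querying such a stack object under this API:

static int createFrameIndexFor(llvm::MachineFrameInfo *MFI,
                               const llvm::AllocaInst *AI,
                               uint64_t Size, unsigned Align) {
  // Remember the IR alloca alongside the new stack object.
  int FI = MFI->CreateStackObject(Size, Align, /*isSS=*/false,
                                  /*MayNeedSP=*/false, AI);
  // Later, e.g. in a coloring pass, the alloca can be recovered:
  const llvm::AllocaInst *Orig = MFI->getObjectAllocation(FI);
  (void)Orig; // Orig == AI for this index
  return FI;
}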
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 062c7508c496..025e18a9dde0 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -127,8 +127,8 @@ class MachineFunction {
/// about the control flow of such functions.
bool ExposesReturnsTwice;
- MachineFunction(const MachineFunction &); // DO NOT IMPLEMENT
- void operator=(const MachineFunction&); // DO NOT IMPLEMENT
+ MachineFunction(const MachineFunction &) LLVM_DELETED_FUNCTION;
+ void operator=(const MachineFunction&) LLVM_DELETED_FUNCTION;
public:
MachineFunction(const Function *Fn, const TargetMachine &TM,
unsigned FunctionNum, MachineModuleInfo &MMI,
@@ -138,15 +138,19 @@ public:
MachineModuleInfo &getMMI() const { return MMI; }
GCModuleInfo *getGMI() const { return GMI; }
MCContext &getContext() const { return Ctx; }
-
+
/// getFunction - Return the LLVM function that this machine code represents
///
const Function *getFunction() const { return Fn; }
+ /// getName - Return the name of the corresponding LLVM function.
+ ///
+ StringRef getName() const;
+
/// getFunctionNumber - Return a unique ID for the current function.
///
unsigned getFunctionNumber() const { return FunctionNumber; }
-
+
/// getTarget - Return the target machine this machine code is compiled with
///
const TargetMachine &getTarget() const { return Target; }
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 27756abf3f54..7eb03a93012d 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -25,6 +25,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Support/DebugLoc.h"
#include <vector>
@@ -81,8 +82,8 @@ private:
MachineBasicBlock *Parent; // Pointer to the owning basic block.
DebugLoc debugLoc; // Source line information.
- MachineInstr(const MachineInstr&); // DO NOT IMPLEMENT
- void operator=(const MachineInstr&); // DO NOT IMPLEMENT
+ MachineInstr(const MachineInstr&) LLVM_DELETED_FUNCTION;
+ void operator=(const MachineInstr&) LLVM_DELETED_FUNCTION;
// Intrusive list support
friend struct ilist_traits<MachineInstr>;
@@ -97,25 +98,10 @@ private:
/// MCID NULL and no operands.
MachineInstr();
- // The next two constructors have DebugLoc and non-DebugLoc versions;
- // over time, the non-DebugLoc versions should be phased out and eventually
- // removed.
-
- /// MachineInstr ctor - This constructor creates a MachineInstr and adds the
- /// implicit operands. It reserves space for the number of operands specified
- /// by the MCInstrDesc. The version with a DebugLoc should be preferred.
- explicit MachineInstr(const MCInstrDesc &MCID, bool NoImp = false);
-
- /// MachineInstr ctor - Work exactly the same as the ctor above, except that
- /// the MachineInstr is created and added to the end of the specified basic
- /// block. The version with a DebugLoc should be preferred.
- MachineInstr(MachineBasicBlock *MBB, const MCInstrDesc &MCID);
-
/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
/// implicit operands. It reserves space for the number of operands specified
/// by the MCInstrDesc. An explicit DebugLoc is supplied.
- explicit MachineInstr(const MCInstrDesc &MCID, const DebugLoc dl,
- bool NoImp = false);
+ MachineInstr(const MCInstrDesc &MCID, const DebugLoc dl, bool NoImp = false);
/// MachineInstr ctor - Work exactly the same as the ctor above, except that
/// the MachineInstr is created and added to the end of the specified basic
@@ -459,6 +445,11 @@ public:
/// Instructions with this flag set are not necessarily simple load
/// instructions, they may load a value and modify it, for example.
bool mayLoad(QueryType Type = AnyInBundle) const {
+ if (isInlineAsm()) {
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_MayLoad)
+ return true;
+ }
return hasProperty(MCID::MayLoad, Type);
}
@@ -468,6 +459,11 @@ public:
/// instructions, they may store a modified value based on their operands, or
/// may not actually modify anything, for example.
bool mayStore(QueryType Type = AnyInBundle) const {
+ if (isInlineAsm()) {
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_MayStore)
+ return true;
+ }
return hasProperty(MCID::MayStore, Type);
}
@@ -610,6 +606,7 @@ public:
bool isImplicitDef() const { return getOpcode()==TargetOpcode::IMPLICIT_DEF; }
bool isInlineAsm() const { return getOpcode() == TargetOpcode::INLINEASM; }
bool isStackAligningInlineAsm() const;
+ InlineAsm::AsmDialect getInlineAsmDialect() const;
bool isInsertSubreg() const {
return getOpcode() == TargetOpcode::INSERT_SUBREG;
}
@@ -782,16 +779,43 @@ public:
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) const;
+ /// tieOperands - Add a tie between the register operands at DefIdx and
+ /// UseIdx. The tie will cause the register allocator to ensure that the two
+ /// operands are assigned the same physical register.
+ ///
+ /// Tied operands are managed automatically for explicit operands in the
+ /// MCInstrDesc. This method is for exceptional cases like inline asm.
+ void tieOperands(unsigned DefIdx, unsigned UseIdx);
+
+ /// findTiedOperandIdx - Given the index of a tied register operand, find the
+ /// operand it is tied to. Defs are tied to uses and vice versa. Returns the
+ /// index of the tied operand which must exist.
+ unsigned findTiedOperandIdx(unsigned OpIdx) const;
+
/// isRegTiedToUseOperand - Given the index of a register def operand,
/// check if the register def is tied to a source operand, due to either
/// two-address elimination or inline assembly constraints. Returns the
/// first tied use operand index by reference if UseOpIdx is not null.
- bool isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx = 0) const;
+ bool isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx = 0) const {
+ const MachineOperand &MO = getOperand(DefOpIdx);
+ if (!MO.isReg() || !MO.isDef() || !MO.isTied())
+ return false;
+ if (UseOpIdx)
+ *UseOpIdx = findTiedOperandIdx(DefOpIdx);
+ return true;
+ }
/// isRegTiedToDefOperand - Return true if the use operand of the specified
/// index is tied to a def operand. It also returns the def operand index by
/// reference if DefOpIdx is not null.
- bool isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx = 0) const;
+ bool isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx = 0) const {
+ const MachineOperand &MO = getOperand(UseOpIdx);
+ if (!MO.isReg() || !MO.isUse() || !MO.isTied())
+ return false;
+ if (DefOpIdx)
+ *DefOpIdx = findTiedOperandIdx(UseOpIdx);
+ return true;
+ }
/// clearKillInfo - Clears kill flags on all operands.
///
@@ -852,11 +876,11 @@ public:
bool isSafeToReMat(const TargetInstrInfo *TII, AliasAnalysis *AA,
unsigned DstReg) const;
- /// hasVolatileMemoryRef - Return true if this instruction may have a
- /// volatile memory reference, or if the information describing the
- /// memory reference is not available. Return false if it is known to
- /// have no volatile memory references.
- bool hasVolatileMemoryRef() const;
+ /// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+ /// or volatile memory reference, or if the information describing the memory
+ /// reference is not available. Return false if it is known to have no
+ /// ordered or volatile memory references.
+ bool hasOrderedMemoryRef() const;
/// isInvariantLoad - Return true if this instruction is loading from a
/// location whose value is invariant across the function. For example,
@@ -935,6 +959,15 @@ private:
/// return null.
MachineRegisterInfo *getRegInfo();
+ /// untieRegOperand - Break any tie involving OpIdx.
+ void untieRegOperand(unsigned OpIdx) {
+ MachineOperand &MO = getOperand(OpIdx);
+ if (MO.isReg() && MO.isTied()) {
+ getOperand(findTiedOperandIdx(OpIdx)).TiedTo = 0;
+ MO.TiedTo = 0;
+ }
+ }
+
/// addImplicitDefUseOperands - Add all implicit def and use operands to
/// this instruction.
void addImplicitDefUseOperands();
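With the new MachineOperand::TiedTo encoding, the tied-operand queries reduce to cheap accessors; a sketch of the typical two-address-style lookup (any MachineInstr with a tied register def works):

static unsigned findTiedUse(const llvm::MachineInstr &MI, unsigned DefIdx) {
  unsigned UseIdx = 0;
  if (MI.isRegTiedToUseOperand(DefIdx, &UseIdx))
    return UseIdx;  // the use operand that must receive the same register
  return ~0u;       // DefIdx is not a tied register def
}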
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index 654361f9d423..770685358aba 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -176,15 +176,24 @@ public:
}
// Add a displacement from an existing MachineOperand with an added offset.
- const MachineInstrBuilder &addDisp(const MachineOperand &Disp,
- int64_t off) const {
+ const MachineInstrBuilder &addDisp(const MachineOperand &Disp, int64_t off,
+ unsigned char TargetFlags = 0) const {
switch (Disp.getType()) {
default:
llvm_unreachable("Unhandled operand type in addDisp()");
case MachineOperand::MO_Immediate:
return addImm(Disp.getImm() + off);
- case MachineOperand::MO_GlobalAddress:
- return addGlobalAddress(Disp.getGlobal(), Disp.getOffset() + off);
+ case MachineOperand::MO_GlobalAddress: {
+ // If caller specifies new TargetFlags then use it, otherwise the
+ // default behavior is to copy the target flags from the existing
+ // MachineOperand. This means if the caller wants to clear the
+ // target flags it needs to do so explicitly.
+ if (TargetFlags)
+ return addGlobalAddress(Disp.getGlobal(), Disp.getOffset() + off,
+ TargetFlags);
+ return addGlobalAddress(Disp.getGlobal(), Disp.getOffset() + off,
+ Disp.getTargetFlags());
+ }
}
}
};
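The new TargetFlags parameter preserves the source operand's target flags by default and only replaces them when a non-zero value is passed; a brief sketch (the override value is target-specific and purely illustrative):

static void shiftDisplacement(llvm::MachineInstrBuilder &MIB,
                              const llvm::MachineOperand &Disp, int64_t Off,
                              unsigned char OverrideFlags /*target-specific*/) {
  if (OverrideFlags)
    MIB.addDisp(Disp, Off, OverrideFlags); // explicit flags win
  else
    MIB.addDisp(Disp, Off);                // keep Disp's existing flags
}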
diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h
index dc5f9a6ec82d..854ba06209cd 100644
--- a/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/include/llvm/CodeGen/MachineInstrBundle.h
@@ -130,9 +130,9 @@ public:
return OpI - InstrI->operands_begin();
}
- /// RegInfo - Information about a virtual register used by a set of operands.
+ /// VirtRegInfo - Information about a virtual register used by a set of operands.
///
- struct RegInfo {
+ struct VirtRegInfo {
/// Reads - One of the operands read the virtual register. This does not
/// include <undef> or <internal> use operands, see MO::readsReg().
bool Reads;
@@ -146,6 +146,32 @@ public:
bool Tied;
};
+ /// PhysRegInfo - Information about a physical register used by a set of
+ /// operands.
+ struct PhysRegInfo {
+ /// Clobbers - Reg or an overlapping register is defined, or a regmask
+ /// clobbers Reg.
+ bool Clobbers;
+
+ /// Defines - Reg or a super-register is defined.
+ bool Defines;
+
+ /// DefinesOverlap - Reg or an overlapping register is defined.
+ bool DefinesOverlap;
+
+ /// Reads - Reg or a super-register is read.
+ bool Reads;
+
+ /// ReadsOverlap - Reg or an overlapping register is read.
+ bool ReadsOverlap;
+
+ /// DefinesDead - All defs of a Reg or a super-register are dead.
+ bool DefinesDead;
+
+ /// There is a kill of Reg or a super-register.
+ bool Kills;
+ };
+
/// analyzeVirtReg - Analyze how the current instruction or bundle uses a
/// virtual register. This function should not be called after operator++(),
/// it expects a fresh iterator.
@@ -154,8 +180,16 @@ public:
/// @param Ops When set, this vector will receive an (MI, OpNum) entry for
/// each operand referring to Reg.
/// @returns A filled-in VirtRegInfo struct.
- RegInfo analyzeVirtReg(unsigned Reg,
+ VirtRegInfo analyzeVirtReg(unsigned Reg,
SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops = 0);
+
+ /// analyzePhysReg - Analyze how the current instruction or bundle uses a
+ /// physical register. This function should not be called after operator++(),
+ /// it expects a fresh iterator.
+ ///
+ /// @param Reg The physical register to analyze.
+ /// @returns A filled-in PhysRegInfo struct.
+ PhysRegInfo analyzePhysReg(unsigned Reg, const TargetRegisterInfo *TRI);
};
/// MIOperands - Iterate over operands of a single instruction.
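analyzePhysReg() summarizes how a whole instruction or bundle touches a physical register; a hedged sketch of querying it through the MIOperands wrapper named just above (the wrapper's exact interface is assumed from this header):

static bool definesOrClobbers(llvm::MachineInstr &MI, unsigned Reg,
                              const llvm::TargetRegisterInfo *TRI) {
  llvm::MIOperands::PhysRegInfo PRI =
      llvm::MIOperands(MI).analyzePhysReg(Reg, TRI);
  return PRI.Defines || PRI.Clobbers;
}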
diff --git a/include/llvm/CodeGen/MachineJumpTableInfo.h b/include/llvm/CodeGen/MachineJumpTableInfo.h
index f7c4e8642d53..928145d279b6 100644
--- a/include/llvm/CodeGen/MachineJumpTableInfo.h
+++ b/include/llvm/CodeGen/MachineJumpTableInfo.h
@@ -26,7 +26,7 @@
namespace llvm {
class MachineBasicBlock;
-class TargetData;
+class DataLayout;
class raw_ostream;
/// MachineJumpTableEntry - One jump table in the jump table info.
@@ -84,9 +84,9 @@ public:
JTEntryKind getEntryKind() const { return EntryKind; }
/// getEntrySize - Return the size of each entry in the jump table.
- unsigned getEntrySize(const TargetData &TD) const;
+ unsigned getEntrySize(const DataLayout &TD) const;
/// getEntryAlignment - Return the alignment of each entry in the jump table.
- unsigned getEntryAlignment(const TargetData &TD) const;
+ unsigned getEntryAlignment(const DataLayout &TD) const;
/// createJumpTableIndex - Create a new jump table.
///
diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h
index 3e204bed15ad..d53f041128ac 100644
--- a/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/include/llvm/CodeGen/MachineLoopInfo.h
@@ -73,8 +73,8 @@ class MachineLoopInfo : public MachineFunctionPass {
LoopInfoBase<MachineBasicBlock, MachineLoop> LI;
friend class LoopBase<MachineBasicBlock, MachineLoop>;
- void operator=(const MachineLoopInfo &); // do not implement
- MachineLoopInfo(const MachineLoopInfo &); // do not implement
+ void operator=(const MachineLoopInfo &) LLVM_DELETED_FUNCTION;
+ MachineLoopInfo(const MachineLoopInfo &) LLVM_DELETED_FUNCTION;
public:
static char ID; // Pass identification, replacement for typeid
diff --git a/include/llvm/CodeGen/MachineMemOperand.h b/include/llvm/CodeGen/MachineMemOperand.h
index 1ac9080b75d5..ddb127120f20 100644
--- a/include/llvm/CodeGen/MachineMemOperand.h
+++ b/include/llvm/CodeGen/MachineMemOperand.h
@@ -151,6 +151,15 @@ public:
bool isNonTemporal() const { return Flags & MONonTemporal; }
bool isInvariant() const { return Flags & MOInvariant; }
+ /// isUnordered - Returns true if this memory operation doesn't have any
+ /// ordering constraints other than normal aliasing. Volatile and atomic
+ /// memory operations can't be reordered.
+ ///
+ /// Currently, we don't model the difference between volatile and atomic
+ /// operations. They should retain their ordering relative to all memory
+ /// operations.
+ bool isUnordered() const { return !isVolatile(); }
+
/// refineAlignment - Update this MachineMemOperand to reflect the alignment
/// of MMO, if it has a greater alignment. This must only be used when the
/// new alignment applies to all users of this MachineMemOperand.
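isUnordered() gives passes a single predicate for "reorderable subject only to aliasing"; a tiny sketch of gating a reordering decision on it (alias checks elided):

static bool mayReorder(const llvm::MachineMemOperand *A,
                       const llvm::MachineMemOperand *B) {
  // Volatile/atomic accesses must keep their program order; anything else
  // is constrained only by aliasing, which this sketch does not check.
  return A->isUnordered() && B->isUnordered();
}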
diff --git a/include/llvm/CodeGen/MachineModuleInfoImpls.h b/include/llvm/CodeGen/MachineModuleInfoImpls.h
index 9401ffd199d4..7afc7eb6b357 100644
--- a/include/llvm/CodeGen/MachineModuleInfoImpls.h
+++ b/include/llvm/CodeGen/MachineModuleInfoImpls.h
@@ -38,7 +38,7 @@ namespace llvm {
/// this GV is external.
DenseMap<MCSymbol*, StubValueTy> HiddenGVStubs;
- virtual void Anchor(); // Out of line virtual method.
+ virtual void anchor(); // Out of line virtual method.
public:
MachineModuleInfoMachO(const MachineModuleInfo &) {}
@@ -76,7 +76,7 @@ namespace llvm {
/// mode.
DenseMap<MCSymbol*, StubValueTy> GVStubs;
- virtual void Anchor(); // Out of line virtual method.
+ virtual void anchor(); // Out of line virtual method.
public:
MachineModuleInfoELF(const MachineModuleInfo &) {}
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 37d42b358382..606833cd4081 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -14,7 +14,6 @@
#ifndef LLVM_CODEGEN_MACHINEOPERAND_H
#define LLVM_CODEGEN_MACHINEOPERAND_H
-#include "llvm/ADT/Hashing.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
@@ -30,6 +29,7 @@ class MachineRegisterInfo;
class MDNode;
class TargetMachine;
class TargetRegisterInfo;
+class hash_code;
class raw_ostream;
class MCSymbol;
@@ -60,12 +60,20 @@ private:
/// union.
unsigned char OpKind; // MachineOperandType
- /// SubReg - Subregister number, only valid for MO_Register. A value of 0
- /// indicates the MO_Register has no subReg.
- unsigned char SubReg;
+ // This union is discriminated by OpKind.
+ union {
+ /// SubReg - Subregister number, only valid for MO_Register. A value of 0
+ /// indicates the MO_Register has no subReg.
+ unsigned char SubReg;
+
+ /// TargetFlags - This is a set of target-specific operand flags.
+ unsigned char TargetFlags;
+ };
- /// TargetFlags - This is a set of target-specific operand flags.
- unsigned char TargetFlags;
+ /// TiedTo - Non-zero when this register operand is tied to another register
+ /// operand. The encoding of this field is described in the block comment
+ /// before MachineInstr::tieOperands().
+ unsigned char TiedTo : 4;
/// IsDef/IsImp/IsKill/IsDead flags - These are only valid for MO_Register
/// operands.
@@ -176,9 +184,17 @@ public:
///
MachineOperandType getType() const { return (MachineOperandType)OpKind; }
- unsigned char getTargetFlags() const { return TargetFlags; }
- void setTargetFlags(unsigned char F) { TargetFlags = F; }
- void addTargetFlag(unsigned char F) { TargetFlags |= F; }
+ unsigned char getTargetFlags() const {
+ return isReg() ? 0 : TargetFlags;
+ }
+ void setTargetFlags(unsigned char F) {
+ assert(!isReg() && "Register operands can't have target flags");
+ TargetFlags = F;
+ }
+ void addTargetFlag(unsigned char F) {
+ assert(!isReg() && "Register operands can't have target flags");
+ TargetFlags |= F;
+ }
/// getParent - Return the instruction that this operand belongs to.
@@ -288,6 +304,11 @@ public:
return IsEarlyClobber;
}
+ bool isTied() const {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ return TiedTo;
+ }
+
bool isDebug() const {
assert(isReg() && "Wrong MachineOperand accessor");
return IsDebug;
@@ -421,7 +442,7 @@ public:
int64_t getOffset() const {
assert((isGlobal() || isSymbol() || isCPI() || isTargetIndex() ||
isBlockAddress()) && "Wrong MachineOperand accessor");
- return (int64_t(Contents.OffsetedInfo.OffsetHi) << 32) |
+ return int64_t(uint64_t(Contents.OffsetedInfo.OffsetHi) << 32) |
SmallContents.OffsetLo;
}
@@ -548,6 +569,7 @@ public:
Op.IsUndef = isUndef;
Op.IsInternalRead = isInternalRead;
Op.IsEarlyClobber = isEarlyClobber;
+ Op.TiedTo = 0;
Op.IsDebug = isDebug;
Op.SmallContents.RegNo = Reg;
Op.Contents.Reg.Prev = 0;
@@ -606,11 +628,11 @@ public:
Op.setTargetFlags(TargetFlags);
return Op;
}
- static MachineOperand CreateBA(const BlockAddress *BA,
+ static MachineOperand CreateBA(const BlockAddress *BA, int64_t Offset,
unsigned char TargetFlags = 0) {
MachineOperand Op(MachineOperand::MO_BlockAddress);
Op.Contents.OffsetedInfo.Val.BA = BA;
- Op.setOffset(0); // Offset is always 0.
+ Op.setOffset(Offset);
Op.setTargetFlags(TargetFlags);
return Op;
}
@@ -665,6 +687,9 @@ inline raw_ostream &operator<<(raw_ostream &OS, const MachineOperand& MO) {
return OS;
}
+ // See friend declaration above. This additional declaration is required in
+ // order to compile LLVM with IBM xlC compiler.
+ hash_code hash_value(const MachineOperand &MO);
} // End llvm namespace
#endif
diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h
new file mode 100644
index 000000000000..a9fc8434abee
--- /dev/null
+++ b/include/llvm/CodeGen/MachinePostDominators.h
@@ -0,0 +1,87 @@
+//=- llvm/CodeGen/MachinePostDominators.h ------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes interfaces to post dominance information for
+// target-specific code.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEPOSTDOMINATORS_H
+#define LLVM_CODEGEN_MACHINEPOSTDOMINATORS_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/DominatorInternals.h"
+
+namespace llvm {
+
+///
+/// MachinePostDominatorTree - an analysis pass that wraps DominatorTreeBase
+/// to compute the post-dominator tree for machine basic blocks.
+///
+struct MachinePostDominatorTree : public MachineFunctionPass {
+private:
+ DominatorTreeBase<MachineBasicBlock> *DT;
+
+public:
+ static char ID;
+
+ MachinePostDominatorTree();
+
+ ~MachinePostDominatorTree();
+
+ FunctionPass *createMachinePostDominatorTreePass();
+
+ const std::vector<MachineBasicBlock *> &getRoots() const {
+ return DT->getRoots();
+ }
+
+ MachineDomTreeNode *getRootNode() const {
+ return DT->getRootNode();
+ }
+
+ MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+ return DT->getNode(BB);
+ }
+
+ MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+ return DT->getNode(BB);
+ }
+
+ bool dominates(MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+ return DT->dominates(A, B);
+ }
+
+ bool dominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+ return DT->dominates(A, B);
+ }
+
+ bool
+ properlyDominates(const MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+ return DT->properlyDominates(A, B);
+ }
+
+ bool
+ properlyDominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+ return DT->properlyDominates(A, B);
+ }
+
+ MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A,
+ MachineBasicBlock *B) {
+ return DT->findNearestCommonDominator(A, B);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual void print(llvm::raw_ostream &OS, const Module *M = 0) const;
+};
+} //end of namespace llvm
+
+#endif
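A client pass would obtain the post-dominator tree through the usual analysis machinery; a minimal, illustrative sketch (pass registration boilerplate omitted):

namespace {
struct UsesMachinePostDom : public llvm::MachineFunctionPass {
  static char ID;
  UsesMachinePostDom() : llvm::MachineFunctionPass(ID) {}

  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const {
    AU.addRequired<llvm::MachinePostDominatorTree>();
    AU.setPreservesAll();
    llvm::MachineFunctionPass::getAnalysisUsage(AU);
  }

  virtual bool runOnMachineFunction(llvm::MachineFunction &MF) {
    llvm::MachinePostDominatorTree &PDT =
        getAnalysis<llvm::MachinePostDominatorTree>();
    // Example query: PDT.dominates(&A, &B) asks whether A post-dominates B.
    (void)PDT;
    return false;
  }
};
char UsesMachinePostDom::ID = 0;
}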
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 42a8aa43d982..4e86363f071a 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -77,16 +77,20 @@ class MachineRegisterInfo {
return MO->Contents.Reg.Next;
}
- /// UsedPhysRegs - This is a bit vector that is computed and set by the
+ /// UsedRegUnits - This is a bit vector that is computed and set by the
/// register allocator, and must be kept up to date by passes that run after
/// register allocation (though most don't modify this). This is used
/// so that the code generator knows which callee save registers to save and
/// for other target specific uses.
- /// This vector only has bits set for registers explicitly used, not their
- /// aliases.
- BitVector UsedPhysRegs;
-
- /// UsedPhysRegMask - Additional used physregs, but including aliases.
+ /// This vector has bits set for register units that are modified in the
+ /// current function. It doesn't include registers clobbered by function
+ /// calls with register mask operands.
+ BitVector UsedRegUnits;
+
+ /// UsedPhysRegMask - Additional used physregs including aliases.
+ /// This bit vector represents all the registers clobbered by function calls.
+ /// It can model things that UsedRegUnits can't, such as function calls that
+ /// clobber ymm7 but preserve the low half in xmm7.
BitVector UsedPhysRegMask;
/// ReservedRegs - This is a bit vector of reserved registers. The target
@@ -95,9 +99,6 @@ class MachineRegisterInfo {
/// started.
BitVector ReservedRegs;
- /// AllocatableRegs - From TRI->getAllocatableSet.
- mutable BitVector AllocatableRegs;
-
/// LiveIns/LiveOuts - Keep track of the physical registers that are
/// livein/liveout of the function. Live in values are typically arguments in
/// registers, live out values are typically return values in registers.
@@ -106,8 +107,8 @@ class MachineRegisterInfo {
std::vector<std::pair<unsigned, unsigned> > LiveIns;
std::vector<unsigned> LiveOuts;
- MachineRegisterInfo(const MachineRegisterInfo&); // DO NOT IMPLEMENT
- void operator=(const MachineRegisterInfo&); // DO NOT IMPLEMENT
+ MachineRegisterInfo(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
+ void operator=(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
public:
explicit MachineRegisterInfo(const TargetRegisterInfo &TRI);
~MachineRegisterInfo();
@@ -360,29 +361,27 @@ public:
//===--------------------------------------------------------------------===//
/// isPhysRegUsed - Return true if the specified register is used in this
- /// function. This only works after register allocation.
+ /// function. Also check for clobbered aliases and registers clobbered by
+ /// function calls with register mask operands.
+ ///
+ /// This only works after register allocation. It is primarily used by
+ /// PrologEpilogInserter to determine which callee-saved registers need
+ /// spilling.
bool isPhysRegUsed(unsigned Reg) const {
- return UsedPhysRegs.test(Reg) || UsedPhysRegMask.test(Reg);
- }
-
- /// isPhysRegOrOverlapUsed - Return true if Reg or any overlapping register
- /// is used in this function.
- bool isPhysRegOrOverlapUsed(unsigned Reg) const {
if (UsedPhysRegMask.test(Reg))
return true;
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- if (UsedPhysRegs.test(*AI))
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ if (UsedRegUnits.test(*Units))
return true;
return false;
}
/// setPhysRegUsed - Mark the specified register used in this function.
/// This should only be called during and after register allocation.
- void setPhysRegUsed(unsigned Reg) { UsedPhysRegs.set(Reg); }
-
- /// addPhysRegsUsed - Mark the specified registers used in this function.
- /// This should only be called during and after register allocation.
- void addPhysRegsUsed(const BitVector &Regs) { UsedPhysRegs |= Regs; }
+ void setPhysRegUsed(unsigned Reg) {
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ UsedRegUnits.set(*Units);
+ }
/// addPhysRegsUsedFromRegMask - Mark any registers not in RegMask as used.
/// This corresponds to the bit mask attached to register mask operands.
@@ -393,8 +392,9 @@ public:
/// setPhysRegUnused - Mark the specified register unused in this function.
/// This should only be called during and after register allocation.
void setPhysRegUnused(unsigned Reg) {
- UsedPhysRegs.reset(Reg);
UsedPhysRegMask.reset(Reg);
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ UsedRegUnits.reset(*Units);
}
@@ -427,6 +427,34 @@ public:
return !reservedRegsFrozen() || ReservedRegs.test(PhysReg);
}
+ /// getReservedRegs - Returns a reference to the frozen set of reserved
+ /// registers. This method should always be preferred to calling
+ /// TRI::getReservedRegs() when possible.
+ const BitVector &getReservedRegs() const {
+ assert(reservedRegsFrozen() &&
+ "Reserved registers haven't been frozen yet. "
+ "Use TRI::getReservedRegs().");
+ return ReservedRegs;
+ }
+
+ /// isReserved - Returns true when PhysReg is a reserved register.
+ ///
+ /// Reserved registers may belong to an allocatable register class, but the
+ /// target has explicitly requested that they are not used.
+ ///
+ bool isReserved(unsigned PhysReg) const {
+ return getReservedRegs().test(PhysReg);
+ }
+
+ /// isAllocatable - Returns true when PhysReg belongs to an allocatable
+ /// register class and it hasn't been reserved.
+ ///
+ /// Allocatable registers may show up in the allocation order of some virtual
+ /// register, so a register allocator needs to track its liveness and
+ /// availability.
+ bool isAllocatable(unsigned PhysReg) const {
+ return TRI->isInAllocatableClass(PhysReg) && !isReserved(PhysReg);
+ }
//===--------------------------------------------------------------------===//
// LiveIn/LiveOut Management
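The register-unit based bookkeeping replaces the old per-register bit vectors, and the reserved/allocatable queries move here from LiveIntervals; a sketch of the kind of post-RA checks they support:

static bool needsCalleeSavedSpill(const llvm::MachineRegisterInfo &MRI,
                                  unsigned Reg) {
  // True if the function modifies Reg, one of its aliases, or clobbers it
  // through a call with a register mask operand.
  return MRI.isPhysRegUsed(Reg);
}

static bool allocatorMayUse(const llvm::MachineRegisterInfo &MRI,
                            unsigned Reg) {
  // isAllocatable() already excludes registers reserved for this function.
  return MRI.isAllocatable(Reg);
}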
diff --git a/include/llvm/CodeGen/MachineSSAUpdater.h b/include/llvm/CodeGen/MachineSSAUpdater.h
index cbb45a71275c..edf93d13bd1d 100644
--- a/include/llvm/CodeGen/MachineSSAUpdater.h
+++ b/include/llvm/CodeGen/MachineSSAUpdater.h
@@ -14,6 +14,8 @@
#ifndef LLVM_CODEGEN_MACHINESSAUPDATER_H
#define LLVM_CODEGEN_MACHINESSAUPDATER_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class MachineBasicBlock;
class MachineFunction;
@@ -106,8 +108,8 @@ private:
void ReplaceRegWith(unsigned OldReg, unsigned NewReg);
unsigned GetValueAtEndOfBlockInternal(MachineBasicBlock *BB);
- void operator=(const MachineSSAUpdater&); // DO NOT IMPLEMENT
- MachineSSAUpdater(const MachineSSAUpdater&); // DO NOT IMPLEMENT
+ void operator=(const MachineSSAUpdater&) LLVM_DELETED_FUNCTION;
+ MachineSSAUpdater(const MachineSSAUpdater&) LLVM_DELETED_FUNCTION;
};
} // End llvm namespace
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 8da2045ad0be..31bd606f9320 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -28,9 +28,15 @@
#define MACHINESCHEDULER_H
#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/Target/TargetInstrInfo.h"
namespace llvm {
+extern cl::opt<bool> ForceTopDown;
+extern cl::opt<bool> ForceBottomUp;
+
class AliasAnalysis;
class LiveIntervals;
class MachineDominatorTree;
@@ -93,6 +99,237 @@ public:
}
};
+class ScheduleDAGMI;
+
+/// MachineSchedStrategy - Interface to the scheduling algorithm used by
+/// ScheduleDAGMI.
+class MachineSchedStrategy {
+public:
+ virtual ~MachineSchedStrategy() {}
+
+ /// Initialize the strategy after building the DAG for a new region.
+ virtual void initialize(ScheduleDAGMI *DAG) = 0;
+
+ /// Notify this strategy that all roots have been released (including those
+ /// that depend on EntrySU or ExitSU).
+ virtual void registerRoots() {}
+
+ /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to
+ /// schedule the node at the top of the unscheduled region. Otherwise it will
+ /// be scheduled at the bottom.
+ virtual SUnit *pickNode(bool &IsTopNode) = 0;
+
+ /// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an
+ /// instruction and updated scheduled/remaining flags in the DAG nodes.
+ virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
+
+ /// When all predecessor dependencies have been resolved, free this node for
+ /// top-down scheduling.
+ virtual void releaseTopNode(SUnit *SU) = 0;
+ /// When all successor dependencies have been resolved, free this node for
+ /// bottom-up scheduling.
+ virtual void releaseBottomNode(SUnit *SU) = 0;
+};
+
+/// ReadyQueue encapsulates a vector of "ready" SUnits with basic convenience
+/// methods for pushing and removing nodes. ReadyQueues are uniquely identified
+/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
+///
+/// This is a convenience class that may be used by implementations of
+/// MachineSchedStrategy.
+class ReadyQueue {
+ unsigned ID;
+ std::string Name;
+ std::vector<SUnit*> Queue;
+
+public:
+ ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
+
+ unsigned getID() const { return ID; }
+
+ StringRef getName() const { return Name; }
+
+ // SU is in this queue if its NodeQueueId is a superset of this ID.
+ bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
+
+ bool empty() const { return Queue.empty(); }
+
+ void clear() { Queue.clear(); }
+
+ unsigned size() const { return Queue.size(); }
+
+ typedef std::vector<SUnit*>::iterator iterator;
+
+ iterator begin() { return Queue.begin(); }
+
+ iterator end() { return Queue.end(); }
+
+ iterator find(SUnit *SU) {
+ return std::find(Queue.begin(), Queue.end(), SU);
+ }
+
+ void push(SUnit *SU) {
+ Queue.push_back(SU);
+ SU->NodeQueueId |= ID;
+ }
+
+ iterator remove(iterator I) {
+ (*I)->NodeQueueId &= ~ID;
+ *I = Queue.back();
+ unsigned idx = I - Queue.begin();
+ Queue.pop_back();
+ return Queue.begin() + idx;
+ }
+
+#ifndef NDEBUG
+ void dump();
+#endif
+};
+
+/// Mutate the DAG as a postpass after normal DAG building.
+class ScheduleDAGMutation {
+public:
+ virtual ~ScheduleDAGMutation() {}
+
+ virtual void apply(ScheduleDAGMI *DAG) = 0;
+};
+
+/// ScheduleDAGMI is an implementation of ScheduleDAGInstrs that schedules
+/// machine instructions while updating LiveIntervals and tracking regpressure.
+class ScheduleDAGMI : public ScheduleDAGInstrs {
+protected:
+ AliasAnalysis *AA;
+ RegisterClassInfo *RegClassInfo;
+ MachineSchedStrategy *SchedImpl;
+
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<ScheduleDAGMutation*> Mutations;
+
+ MachineBasicBlock::iterator LiveRegionEnd;
+
+ /// Register pressure in this region computed by buildSchedGraph.
+ IntervalPressure RegPressure;
+ RegPressureTracker RPTracker;
+
+ /// List of pressure sets that exceed the target's pressure limit before
+ /// scheduling, listed in increasing set ID order. Each pressure set is paired
+ /// with its max pressure in the currently scheduled regions.
+ std::vector<PressureElement> RegionCriticalPSets;
+
+ /// The top of the unscheduled zone.
+ MachineBasicBlock::iterator CurrentTop;
+ IntervalPressure TopPressure;
+ RegPressureTracker TopRPTracker;
+
+ /// The bottom of the unscheduled zone.
+ MachineBasicBlock::iterator CurrentBottom;
+ IntervalPressure BotPressure;
+ RegPressureTracker BotRPTracker;
+
+#ifndef NDEBUG
+ /// The number of instructions scheduled so far. Used to cut off the
+ /// scheduler at the point determined by misched-cutoff.
+ unsigned NumInstrsScheduled;
+#endif
+
+public:
+ ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
+ ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
+ AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
+ RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
+ CurrentBottom(), BotRPTracker(BotPressure) {
+#ifndef NDEBUG
+ NumInstrsScheduled = 0;
+#endif
+ }
+
+ virtual ~ScheduleDAGMI() {
+ delete SchedImpl;
+ }
+
+ /// Add a postprocessing step to the DAG builder.
+ /// Mutations are applied in the order that they are added after normal DAG
+ /// building and before MachineSchedStrategy initialization.
+ void addMutation(ScheduleDAGMutation *Mutation) {
+ Mutations.push_back(Mutation);
+ }
+
+ MachineBasicBlock::iterator top() const { return CurrentTop; }
+ MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
+
+ /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
+ /// region. This covers all instructions in a block, while schedule() may only
+ /// cover a subset.
+ void enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned endcount);
+
+
+ /// Implement ScheduleDAGInstrs interface for scheduling a sequence of
+ /// reorderable instructions.
+ virtual void schedule();
+
+ /// Get current register pressure for the top scheduled instructions.
+ const IntervalPressure &getTopPressure() const { return TopPressure; }
+ const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
+
+ /// Get current register pressure for the bottom scheduled instructions.
+ const IntervalPressure &getBotPressure() const { return BotPressure; }
+ const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
+
+ /// Get register pressure for the entire scheduling region before scheduling.
+ const IntervalPressure &getRegPressure() const { return RegPressure; }
+
+ const std::vector<PressureElement> &getRegionCriticalPSets() const {
+ return RegionCriticalPSets;
+ }
+
+protected:
+ // Top-Level entry points for the schedule() driver...
+
+ /// Call ScheduleDAGInstrs::buildSchedGraph with register pressure tracking
+ /// enabled. This sets up three trackers: RPTracker will cover the entire DAG
+ /// region, while TopRPTracker and BotRPTracker are initialized to the top and
+ /// bottom of the DAG region without covering any unscheduled instruction.
+ void buildDAGWithRegPressure();
+
+ /// Apply each ScheduleDAGMutation step in order. This allows different
+ /// instances of ScheduleDAGMI to perform custom DAG postprocessing.
+ void postprocessDAG();
+
+ /// Identify DAG roots and setup scheduler queues.
+ void initQueues();
+
+ /// Move an instruction and update register pressure.
+ void scheduleMI(SUnit *SU, bool IsTopNode);
+
+ /// Update scheduler DAG and queues after scheduling an instruction.
+ void updateQueues(SUnit *SU, bool IsTopNode);
+
+ /// Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues.
+ void placeDebugValues();
+
+ /// \brief dump the scheduled Sequence.
+ void dumpSchedule() const;
+
+ // Lesser helpers...
+
+ void initRegPressure();
+
+ void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
+
+ void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
+ bool checkSchedLimit();
+
+ void releaseRoots();
+
+ void releaseSucc(SUnit *SU, SDep *SuccEdge);
+ void releaseSuccessors(SUnit *SU);
+ void releasePred(SUnit *SU, SDep *PredEdge);
+ void releasePredecessors(SUnit *SU);
+};
+
} // namespace llvm
#endif
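MachineSchedStrategy is the plug-in point: ScheduleDAGMI owns DAG building, liveness updates and register-pressure tracking, while the strategy only decides which ready node to schedule next. A deliberately trivial strategy sketch (illustrative only, not part of this patch) that schedules bottom-up in whatever order nodes become ready:

class TrivialBottomUpStrategy : public llvm::MachineSchedStrategy {
  llvm::ReadyQueue Bot;
public:
  TrivialBottomUpStrategy() : Bot(/*id=*/1, "TrivialBotQ") {}

  virtual void initialize(llvm::ScheduleDAGMI *DAG) { Bot.clear(); }

  virtual llvm::SUnit *pickNode(bool &IsTopNode) {
    if (Bot.empty())
      return 0;                        // region fully scheduled
    IsTopNode = false;                 // place at the bottom of the zone
    llvm::SUnit *SU = *Bot.begin();
    Bot.remove(Bot.begin());
    return SU;
  }

  virtual void schedNode(llvm::SUnit *SU, bool IsTopNode) {}

  virtual void releaseTopNode(llvm::SUnit *SU) {}    // top-down path unused
  virtual void releaseBottomNode(llvm::SUnit *SU) { Bot.push(SU); }
};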
diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h
index a5d8b0dbd6a7..83c379b48cba 100644
--- a/include/llvm/CodeGen/PBQP/Graph.h
+++ b/include/llvm/CodeGen/PBQP/Graph.h
@@ -19,6 +19,7 @@
#include <list>
#include <map>
+#include <llvm/ADT/ilist.h>
namespace PBQP {
@@ -31,16 +32,16 @@ namespace PBQP {
class NodeEntry;
class EdgeEntry;
- typedef std::list<NodeEntry> NodeList;
- typedef std::list<EdgeEntry> EdgeList;
+ typedef llvm::ilist<NodeEntry> NodeList;
+ typedef llvm::ilist<EdgeEntry> EdgeList;
public:
- typedef NodeList::iterator NodeItr;
- typedef NodeList::const_iterator ConstNodeItr;
+ typedef NodeEntry* NodeItr;
+ typedef const NodeEntry* ConstNodeItr;
- typedef EdgeList::iterator EdgeItr;
- typedef EdgeList::const_iterator ConstEdgeItr;
+ typedef EdgeEntry* EdgeItr;
+ typedef const EdgeEntry* ConstEdgeItr;
private:
@@ -52,12 +53,14 @@ namespace PBQP {
private:
- class NodeEntry {
+ class NodeEntry : public llvm::ilist_node<NodeEntry> {
+ friend struct llvm::ilist_sentinel_traits<NodeEntry>;
private:
Vector costs;
AdjEdgeList adjEdges;
unsigned degree;
void *data;
+ NodeEntry() : costs(0, 0) {}
public:
NodeEntry(const Vector &costs) : costs(costs), degree(0) {}
Vector& getCosts() { return costs; }
@@ -77,12 +80,14 @@ namespace PBQP {
void* getData() { return data; }
};
- class EdgeEntry {
+ class EdgeEntry : public llvm::ilist_node<EdgeEntry> {
+ friend struct llvm::ilist_sentinel_traits<EdgeEntry>;
private:
NodeItr node1, node2;
Matrix costs;
AdjEdgeItr node1AEItr, node2AEItr;
void *data;
+ EdgeEntry() : costs(0, 0, 0) {}
public:
EdgeEntry(NodeItr node1, NodeItr node2, const Matrix &costs)
: node1(node1), node2(node2), costs(costs) {}
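Switching NodeList/EdgeList from std::list to llvm::ilist is what lets NodeItr and EdgeItr become plain, stable node pointers; the private default constructors and the ilist_sentinel_traits friendship exist only so the list's sentinel can be constructed. A self-contained sketch of the same pattern (Entry is a hypothetical type used only for illustration):

    #include "llvm/ADT/ilist.h"
    #include "llvm/ADT/ilist_node.h"

    struct Entry : public llvm::ilist_node<Entry> {
      friend struct llvm::ilist_sentinel_traits<Entry>;
      unsigned Value;
      explicit Entry(unsigned V) : Value(V) {}
    private:
      Entry() : Value(0) {}  // reserved for the sentinel, as in NodeEntry above
    };

    void example() {
      llvm::ilist<Entry> List;       // owns its nodes; deletes them when cleared
      List.push_back(new Entry(1));
      List.push_back(new Entry(2));
      Entry *First = &List.front();  // node addresses stay valid across insert/erase
      (void)First;
    }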
diff --git a/include/llvm/CodeGen/PBQP/HeuristicBase.h b/include/llvm/CodeGen/PBQP/HeuristicBase.h
index 3fee18cc42d9..0c1fcb7eaf78 100644
--- a/include/llvm/CodeGen/PBQP/HeuristicBase.h
+++ b/include/llvm/CodeGen/PBQP/HeuristicBase.h
@@ -113,7 +113,7 @@ namespace PBQP {
}
/// \brief Add the given node to the list of nodes to be optimally reduced.
- /// @return nItr Node iterator to be added.
+ /// @param nItr Node iterator to be added.
///
/// You probably don't want to over-ride this, except perhaps to record
/// statistics before calling this implementation. HeuristicBase relies on
@@ -193,8 +193,9 @@ namespace PBQP {
/// reduce list.
/// @return True if a reduction takes place, false if the heuristic reduce
/// list is empty.
- void heuristicReduce() {
+ bool heuristicReduce() {
llvm_unreachable("Must be implemented in derived class.");
+ return false;
}
/// \brief Prepare a change in the costs on the given edge.
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 07b3b45873ae..7bd576494ef7 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -404,6 +404,10 @@ namespace llvm {
/// inserting cmov instructions.
extern char &EarlyIfConverterID;
+ /// StackColoring - This pass performs stack coloring and merging.
+ /// It merges allocas with disjoint live ranges to reduce the stack size.
+ extern char &StackColoringID;
+
/// IfConverter - This pass performs machine code if conversion.
extern char &IfConverterID;
diff --git a/include/llvm/CodeGen/PseudoSourceValue.h b/include/llvm/CodeGen/PseudoSourceValue.h
index 7dab4f948628..8f52d3bf47d2 100644
--- a/include/llvm/CodeGen/PseudoSourceValue.h
+++ b/include/llvm/CodeGen/PseudoSourceValue.h
@@ -50,7 +50,6 @@ namespace llvm {
/// classof - Methods for support type inquiry through isa, cast, and
/// dyn_cast:
///
- static inline bool classof(const PseudoSourceValue *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == PseudoSourceValueVal ||
V->getValueID() == FixedStackPseudoSourceValueVal;
@@ -90,9 +89,6 @@ namespace llvm {
/// classof - Methods for support type inquiry through isa, cast, and
/// dyn_cast:
///
- static inline bool classof(const FixedStackPseudoSourceValue *) {
- return true;
- }
static inline bool classof(const Value *V) {
return V->getValueID() == FixedStackPseudoSourceValueVal;
}
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index bce3ec739b61..acfc07dd31a2 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -109,8 +109,8 @@ namespace llvm {
/// class to support additional constraints for your architecture.
class PBQPBuilder {
private:
- PBQPBuilder(const PBQPBuilder&) {}
- void operator=(const PBQPBuilder&) {}
+ PBQPBuilder(const PBQPBuilder&) LLVM_DELETED_FUNCTION;
+ void operator=(const PBQPBuilder&) LLVM_DELETED_FUNCTION;
public:
typedef std::set<unsigned> RegSet;
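LLVM_DELETED_FUNCTION comes from llvm/Support/Compiler.h; it conventionally expands to "= delete" when the compiler supports C++11 deleted functions and to nothing otherwise, so the declarations above remain uncopyable either way. A sketch of that convention (the exact feature test in Compiler.h may differ):

    // Approximation of the macro; the real feature test lives in
    // llvm/Support/Compiler.h and may differ in detail.
    #ifndef __has_feature
    #define __has_feature(x) 0
    #endif
    #if __has_feature(cxx_deleted_functions) || defined(__GXX_EXPERIMENTAL_CXX0X__)
    #define LLVM_DELETED_FUNCTION = delete
    #else
    #define LLVM_DELETED_FUNCTION
    #endif

    class NonCopyable {
      NonCopyable(const NonCopyable &) LLVM_DELETED_FUNCTION;
      void operator=(const NonCopyable &) LLVM_DELETED_FUNCTION;
    public:
      NonCopyable() {}
    };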
diff --git a/include/llvm/CodeGen/RegisterClassInfo.h b/include/llvm/CodeGen/RegisterClassInfo.h
index 400e1f48ce54..4467b62f2370 100644
--- a/include/llvm/CodeGen/RegisterClassInfo.h
+++ b/include/llvm/CodeGen/RegisterClassInfo.h
@@ -106,25 +106,6 @@ public:
return CalleeSaved[N-1];
return 0;
}
-
- /// isReserved - Returns true when PhysReg is a reserved register.
- ///
- /// Reserved registers may belong to an allocatable register class, but the
- /// target has explicitly requested that they are not used.
- ///
- bool isReserved(unsigned PhysReg) const {
- return Reserved.test(PhysReg);
- }
-
- /// isAllocatable - Returns true when PhysReg belongs to an allocatable
- /// register class and it hasn't been reserved.
- ///
- /// Allocatable registers may show up in the allocation order of some virtual
- /// register, so a register allocator needs to track its liveness and
- /// availability.
- bool isAllocatable(unsigned PhysReg) const {
- return TRI->isInAllocatableClass(PhysReg) && !isReserved(PhysReg);
- }
};
} // end namespace llvm
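With these helpers gone from RegisterClassInfo, reserved-register queries go through MachineRegisterInfo instead, as the RegisterScavenging hunk further down already does with MRI->isReserved(Reg). A minimal sketch of the replacement query, assuming a MachineFunction MF and a physical register PhysReg are in scope:

    // Sketch only; MF and PhysReg are placeholders for the example.
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!MRI.isReserved(PhysReg)) {
      // PhysReg may be considered for allocation or scavenging.
    }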
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index 2043155bc53f..30326d05df04 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -43,7 +43,7 @@ struct RegisterPressure {
/// class. This is only useful to account for spilling or rematerialization.
void decrease(const TargetRegisterClass *RC, const TargetRegisterInfo *TRI);
- void dump(const TargetRegisterInfo *TRI);
+ void dump(const TargetRegisterInfo *TRI) const;
};
/// RegisterPressure computed within a region of instructions delimited by
@@ -197,6 +197,7 @@ public:
/// This result is complete if either advance() or recede() has returned true,
/// or if closeRegion() was explicitly invoked.
RegisterPressure &getPressure() { return P; }
+ const RegisterPressure &getPressure() const { return P; }
/// Get the register set pressure at the current position, which may be less
/// than the pressure across the traversed region.
diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h
index 3986a8dd7da1..08d316992ec5 100644
--- a/include/llvm/CodeGen/RegisterScavenging.h
+++ b/include/llvm/CodeGen/RegisterScavenging.h
@@ -18,6 +18,7 @@
#define LLVM_CODEGEN_REGISTER_SCAVENGING_H
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/ADT/BitVector.h"
namespace llvm {
@@ -59,10 +60,6 @@ class RegScavenger {
///
BitVector CalleeSavedRegs;
- /// ReservedRegs - A bitvector of reserved registers.
- ///
- BitVector ReservedRegs;
-
/// RegsAvailable - The current state of all the physical registers immediately
/// before MBBI. One bit per physical register. If bit is set that means it's
/// available, unset means the register is currently being used.
@@ -130,12 +127,12 @@ public:
void setUsed(unsigned Reg);
private:
/// isReserved - Returns true if a register is reserved. It is never "unused".
- bool isReserved(unsigned Reg) const { return ReservedRegs.test(Reg); }
+ bool isReserved(unsigned Reg) const { return MRI->isReserved(Reg); }
/// isUsed / isUnused - Test if a register is currently being used.
///
bool isUsed(unsigned Reg) const {
- return !RegsAvailable.test(Reg) || ReservedRegs.test(Reg);
+ return !RegsAvailable.test(Reg) || isReserved(Reg);
}
/// isAliasUsed - Is Reg or an alias currently in use?
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 85ab47beb6b4..7e0ca1478e5f 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -31,6 +31,7 @@ namespace llvm {
class MachineFunction;
class MachineRegisterInfo;
class MachineInstr;
+ struct MCSchedClassDesc;
class TargetRegisterInfo;
class ScheduleDAG;
class SDNode;
@@ -52,6 +53,13 @@ namespace llvm {
Order ///< Any other ordering dependency.
};
+ enum OrderKind {
+ Barrier, ///< An unknown scheduling barrier.
+ MayAliasMem, ///< Nonvolatile load/store instructions that may alias.
+ MustAliasMem, ///< Nonvolatile load/store instructions that must alias.
+ Artificial ///< Arbitrary weak DAG edge (no actual dependence).
+ };
+
private:
/// Dep - A pointer to the depending/depended-on SUnit, and an enum
/// indicating the kind of the dependency.
@@ -65,26 +73,18 @@ namespace llvm {
unsigned Reg;
/// Order - Additional information about Order dependencies.
- struct {
- /// isNormalMemory - True if both sides of the dependence
- /// access memory in non-volatile and fully modeled ways.
- bool isNormalMemory : 1;
-
- /// isMustAlias - True if both sides of the dependence are known to
- /// access the same memory.
- bool isMustAlias : 1;
-
- /// isArtificial - True if this is an artificial dependency, meaning
- /// it is not necessary for program correctness, and may be safely
- /// deleted if necessary.
- bool isArtificial : 1;
- } Order;
+ unsigned OrdKind; // enum OrderKind
} Contents;
/// Latency - The time associated with this edge. Often this is just
/// the value of the Latency field of the predecessor, however advanced
/// models may provide additional information about specific edges.
unsigned Latency;
+ /// Record MinLatency separately from "expected" Latency.
+ ///
+ /// FIXME: this field is not packed on LP64. Convert to 16-bit DAG edge
+ /// latency after introducing saturating truncation.
+ unsigned MinLatency;
public:
/// SDep - Construct a null SDep. This is only for use by container
@@ -93,28 +93,28 @@ namespace llvm {
SDep() : Dep(0, Data) {}
/// SDep - Construct an SDep with the specified values.
- SDep(SUnit *S, Kind kind, unsigned latency = 1, unsigned Reg = 0,
- bool isNormalMemory = false, bool isMustAlias = false,
- bool isArtificial = false)
- : Dep(S, kind), Contents(), Latency(latency) {
+ SDep(SUnit *S, Kind kind, unsigned Reg)
+ : Dep(S, kind), Contents() {
switch (kind) {
+ default:
+ llvm_unreachable("Reg given for non-register dependence!");
case Anti:
case Output:
assert(Reg != 0 &&
"SDep::Anti and SDep::Output must use a non-zero Reg!");
- // fall through
- case Data:
- assert(!isMustAlias && "isMustAlias only applies with SDep::Order!");
- assert(!isArtificial && "isArtificial only applies with SDep::Order!");
Contents.Reg = Reg;
+ Latency = 0;
break;
- case Order:
- assert(Reg == 0 && "Reg given for non-register dependence!");
- Contents.Order.isNormalMemory = isNormalMemory;
- Contents.Order.isMustAlias = isMustAlias;
- Contents.Order.isArtificial = isArtificial;
+ case Data:
+ Contents.Reg = Reg;
+ Latency = 1;
break;
}
+ MinLatency = Latency;
+ }
+ SDep(SUnit *S, OrderKind kind)
+ : Dep(S, Order), Contents(), Latency(0), MinLatency(0) {
+ Contents.OrdKind = kind;
}
/// Return true if the specified SDep is equivalent except for latency.
@@ -126,16 +126,14 @@ namespace llvm {
case Output:
return Contents.Reg == Other.Contents.Reg;
case Order:
- return Contents.Order.isNormalMemory ==
- Other.Contents.Order.isNormalMemory &&
- Contents.Order.isMustAlias == Other.Contents.Order.isMustAlias &&
- Contents.Order.isArtificial == Other.Contents.Order.isArtificial;
+ return Contents.OrdKind == Other.Contents.OrdKind;
}
llvm_unreachable("Invalid dependency kind!");
}
bool operator==(const SDep &Other) const {
- return overlaps(Other) && Latency == Other.Latency;
+ return overlaps(Other)
+ && Latency == Other.Latency && MinLatency == Other.MinLatency;
}
bool operator!=(const SDep &Other) const {
@@ -155,6 +153,18 @@ namespace llvm {
Latency = Lat;
}
+ /// getMinLatency - Return the minimum latency for this edge. Minimum
+ /// latency is used for scheduling groups, while normal (expected) latency
+ /// is for instruction cost and critical path.
+ unsigned getMinLatency() const {
+ return MinLatency;
+ }
+
+ /// setMinLatency - Set the minimum latency for this edge.
+ void setMinLatency(unsigned Lat) {
+ MinLatency = Lat;
+ }
+
//// getSUnit - Return the SUnit to which this edge points.
SUnit *getSUnit() const {
return Dep.getPointer();
@@ -179,20 +189,21 @@ namespace llvm {
/// memory accesses where both sides of the dependence access memory
/// in non-volatile and fully modeled ways.
bool isNormalMemory() const {
- return getKind() == Order && Contents.Order.isNormalMemory;
+ return getKind() == Order && (Contents.OrdKind == MayAliasMem
+ || Contents.OrdKind == MustAliasMem);
}
/// isMustAlias - Test if this is an Order dependence that is marked
/// as "must alias", meaning that the SUnits at either end of the edge
/// have a memory dependence on a known memory location.
bool isMustAlias() const {
- return getKind() == Order && Contents.Order.isMustAlias;
+ return getKind() == Order && Contents.OrdKind == MustAliasMem;
}
/// isArtificial - Test if this is an Order dependence that is marked
/// as "artificial", meaning it isn't necessary for correctness.
bool isArtificial() const {
- return getKind() == Order && Contents.Order.isArtificial;
+ return getKind() == Order && Contents.OrdKind == Artificial;
}
/// isAssignedRegDep - Test if this is a Data dependence that is
@@ -239,6 +250,8 @@ namespace llvm {
// this node was cloned.
// (SD scheduling only)
+ const MCSchedClassDesc *SchedClass; // NULL or resolved SchedClass.
+
// Preds/Succs - The SUnits before/after us in the graph.
SmallVector<SDep, 4> Preds; // All sunit predecessors.
SmallVector<SDep, 4> Succs; // All sunit successors.
@@ -286,7 +299,7 @@ namespace llvm {
/// SUnit - Construct an SUnit for pre-regalloc scheduling to represent
/// an SDNode and any nodes flagged to it.
SUnit(SDNode *node, unsigned nodenum)
- : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum),
+ : Node(node), Instr(0), OrigNode(0), SchedClass(0), NodeNum(nodenum),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
@@ -300,7 +313,7 @@ namespace llvm {
/// SUnit - Construct an SUnit for post-regalloc scheduling to represent
/// a MachineInstr.
SUnit(MachineInstr *instr, unsigned nodenum)
- : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum),
+ : Node(0), Instr(instr), OrigNode(0), SchedClass(0), NodeNum(nodenum),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
@@ -313,7 +326,7 @@ namespace llvm {
/// SUnit - Construct a placeholder SUnit.
SUnit()
- : Node(0), Instr(0), OrigNode(0), NodeNum(~0u),
+ : Node(0), Instr(0), OrigNode(0), SchedClass(0), NodeNum(~0u),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
@@ -555,16 +568,6 @@ namespace llvm {
unsigned VerifyScheduledDAG(bool isBottomUp);
#endif
- protected:
- /// ComputeLatency - Compute node latency.
- ///
- virtual void computeLatency(SUnit *SU) = 0;
-
- /// ForceUnitLatencies - Return true if all scheduling edges should be given
- /// a latency value of one. The default is to return false; schedulers may
- /// override this as needed.
- virtual bool forceUnitLatencies() const { return false; }
-
private:
// Return the MCInstrDesc of this SDNode or NULL.
const MCInstrDesc *getNodeDesc(const SDNode *Node) const;
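Under the reworked SDep above, order edges carry a single OrderKind payload, and register edges take their latencies through setLatency/setMinLatency rather than constructor arguments. A sketch of building edges with the new interface (the SUnit pointers, register, and latency values are placeholders):

    // Data dependence on a register; the constructor sets Latency to 1.
    SDep DataEdge(DefSU, SDep::Data, DefReg);
    DataEdge.setLatency(3);      // expected latency from the machine model
    DataEdge.setMinLatency(1);   // min latency drives scheduling groups
    UseSU->addPred(DataEdge);

    // Order dependences now name their kind instead of setting three bools.
    SDep MemEdge(StoreSU, SDep::MayAliasMem);   // isNormalMemory() == true
    SDep WeakEdge(OtherSU, SDep::Artificial);   // isArtificial()  == true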
diff --git a/include/llvm/CodeGen/ScheduleDAGILP.h b/include/llvm/CodeGen/ScheduleDAGILP.h
new file mode 100644
index 000000000000..1aa405842173
--- /dev/null
+++ b/include/llvm/CodeGen/ScheduleDAGILP.h
@@ -0,0 +1,86 @@
+//===- ScheduleDAGILP.h - ILP metric for ScheduleDAGInstrs ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of an ILP metric for machine level instruction scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_SCHEDULEDAGILP_H
+#define LLVM_CODEGEN_SCHEDULEDAGILP_H
+
+#include "llvm/Support/DataTypes.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+class ScheduleDAGInstrs;
+class SUnit;
+
+/// \brief Represent the ILP of the subDAG rooted at a DAG node.
+struct ILPValue {
+ unsigned InstrCount;
+ unsigned Cycles;
+
+ ILPValue(): InstrCount(0), Cycles(0) {}
+
+ ILPValue(unsigned count, unsigned cycles):
+ InstrCount(count), Cycles(cycles) {}
+
+ bool isValid() const { return Cycles > 0; }
+
+ // Order by the ILP metric's value.
+ bool operator<(ILPValue RHS) const {
+ return (uint64_t)InstrCount * RHS.Cycles
+ < (uint64_t)Cycles * RHS.InstrCount;
+ }
+ bool operator>(ILPValue RHS) const {
+ return RHS < *this;
+ }
+ bool operator<=(ILPValue RHS) const {
+ return (uint64_t)InstrCount * RHS.Cycles
+ <= (uint64_t)Cycles * RHS.InstrCount;
+ }
+ bool operator>=(ILPValue RHS) const {
+ return RHS <= *this;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS) const;
+
+ void dump() const;
+#endif
+};
+
+/// \brief Compute the values of each DAG node for an ILP metric.
+///
+/// This metric assumes that the DAG is a forest of trees with roots at the
+/// bottom of the schedule.
+class ScheduleDAGILP {
+ bool IsBottomUp;
+ std::vector<ILPValue> ILPValues;
+
+public:
+ ScheduleDAGILP(bool IsBU): IsBottomUp(IsBU) {}
+
+ /// \brief Initialize the result data with the size of the DAG.
+ void resize(unsigned NumSUnits);
+
+ /// \brief Compute the ILP metric for the subDAG at this root.
+ void computeILP(const SUnit *Root);
+
+ /// \brief Get the ILP value for a DAG node.
+ ILPValue getILP(const SUnit *SU);
+};
+
+raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val);
+
+} // namespace llvm
+
+#endif
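ILPValue orders subtrees by the ratio InstrCount/Cycles without ever dividing: a/b < c/d is tested as a*d < c*b in 64-bit arithmetic, which avoids rounding and cannot overflow for 32-bit counts. A small self-contained check of that ordering (standalone types, not the header above):

    #include <cassert>
    #include <cstdint>

    struct ILP { unsigned InstrCount, Cycles; };

    // Same cross-multiplied comparison as ILPValue::operator< above.
    static bool lessILP(ILP A, ILP B) {
      return (uint64_t)A.InstrCount * B.Cycles < (uint64_t)B.InstrCount * A.Cycles;
    }

    int main() {
      ILP Narrow = {6, 3};   // ILP = 2.0
      ILP Wide   = {9, 3};   // ILP = 3.0
      assert(lessILP(Narrow, Wide) && !lessILP(Wide, Narrow));
      return 0;
    }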
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 1bde94215a57..4bcd35a834c3 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
@@ -30,72 +31,6 @@ namespace llvm {
class LiveIntervals;
class RegPressureTracker;
- /// LoopDependencies - This class analyzes loop-oriented register
- /// dependencies, which are used to guide scheduling decisions.
- /// For example, loop induction variable increments should be
- /// scheduled as soon as possible after the variable's last use.
- ///
- class LoopDependencies {
- const MachineDominatorTree &MDT;
-
- public:
- typedef std::map<unsigned, std::pair<const MachineOperand *, unsigned> >
- LoopDeps;
- LoopDeps Deps;
-
- LoopDependencies(const MachineDominatorTree &mdt) : MDT(mdt) {}
-
- /// VisitLoop - Clear out any previous state and analyze the given loop.
- ///
- void VisitLoop(const MachineLoop *Loop) {
- assert(Deps.empty() && "stale loop dependencies");
-
- MachineBasicBlock *Header = Loop->getHeader();
- SmallSet<unsigned, 8> LoopLiveIns;
- for (MachineBasicBlock::livein_iterator LI = Header->livein_begin(),
- LE = Header->livein_end(); LI != LE; ++LI)
- LoopLiveIns.insert(*LI);
-
- const MachineDomTreeNode *Node = MDT.getNode(Header);
- const MachineBasicBlock *MBB = Node->getBlock();
- assert(Loop->contains(MBB) &&
- "Loop does not contain header!");
- VisitRegion(Node, MBB, Loop, LoopLiveIns);
- }
-
- private:
- void VisitRegion(const MachineDomTreeNode *Node,
- const MachineBasicBlock *MBB,
- const MachineLoop *Loop,
- const SmallSet<unsigned, 8> &LoopLiveIns) {
- unsigned Count = 0;
- for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
- I != E; ++I) {
- const MachineInstr *MI = I;
- if (MI->isDebugValue())
- continue;
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.isUse())
- continue;
- unsigned MOReg = MO.getReg();
- if (LoopLiveIns.count(MOReg))
- Deps.insert(std::make_pair(MOReg, std::make_pair(&MO, Count)));
- }
- ++Count; // Not every iteration due to dbg_value above.
- }
-
- const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
- for (std::vector<MachineDomTreeNode*>::const_iterator I =
- Children.begin(), E = Children.end(); I != E; ++I) {
- const MachineDomTreeNode *ChildNode = *I;
- MachineBasicBlock *ChildBlock = ChildNode->getBlock();
- if (Loop->contains(ChildBlock))
- VisitRegion(ChildNode, ChildBlock, Loop, LoopLiveIns);
- }
- }
- };
-
/// An individual mapping from virtual register number to SUnit.
struct VReg2SUnit {
unsigned VirtReg;
@@ -108,6 +43,15 @@ namespace llvm {
}
};
+ /// Record a physical register access.
+ /// For non data-dependent uses, OpIdx == -1.
+ struct PhysRegSUOper {
+ SUnit *SU;
+ int OpIdx;
+
+ PhysRegSUOper(SUnit *su, int op): SU(su), OpIdx(op) {}
+ };
+
/// Combine a SparseSet with a 1x1 vector to track physical registers.
/// The SparseSet allows iterating over the (few) live registers for quickly
/// comparing against a regmask or clearing the set.
@@ -116,7 +60,7 @@ namespace llvm {
/// cleared between scheduling regions without freeing unused entries.
class Reg2SUnitsMap {
SparseSet<unsigned> PhysRegSet;
- std::vector<std::vector<SUnit*> > SUnits;
+ std::vector<std::vector<PhysRegSUOper> > SUnits;
public:
typedef SparseSet<unsigned>::const_iterator const_iterator;
@@ -140,7 +84,7 @@ namespace llvm {
/// If this register is mapped, return its existing SUnits vector.
/// Otherwise map the register and return an empty SUnits vector.
- std::vector<SUnit *> &operator[](unsigned Reg) {
+ std::vector<PhysRegSUOper> &operator[](unsigned Reg) {
bool New = PhysRegSet.insert(Reg).second;
assert((!New || SUnits[Reg].empty()) && "stale SUnits vector");
(void)New;
@@ -167,11 +111,13 @@ namespace llvm {
const MachineLoopInfo &MLI;
const MachineDominatorTree &MDT;
const MachineFrameInfo *MFI;
- const InstrItineraryData *InstrItins;
/// Live Intervals provides reaching defs in preRA scheduling.
LiveIntervals *LIS;
+ /// TargetSchedModel provides an interface to the machine model.
+ TargetSchedModel SchedModel;
+
/// isPostRA flag indicates vregs cannot be present.
bool IsPostRA;
@@ -223,10 +169,6 @@ namespace llvm {
/// to minimize construction/destruction.
std::vector<SUnit *> PendingLoads;
- /// LoopRegs - Track which registers are used for loop-carried dependencies.
- ///
- LoopDependencies LoopRegs;
-
/// DbgValues - Remember instruction that precedes DBG_VALUE.
/// These are generated by buildSchedGraph but persist so they can be
/// referenced when emitting the final schedule.
@@ -244,6 +186,16 @@ namespace llvm {
virtual ~ScheduleDAGInstrs() {}
+ /// \brief Get the machine model for instruction scheduling.
+ const TargetSchedModel *getSchedModel() const { return &SchedModel; }
+
+ /// \brief Resolve and cache a resolved scheduling class for an SUnit.
+ const MCSchedClassDesc *getSchedClass(SUnit *SU) const {
+ if (!SU->SchedClass)
+ SU->SchedClass = SchedModel.resolveSchedClass(SU->getInstr());
+ return SU->SchedClass;
+ }
+
/// begin - Return an iterator to the top of the current scheduling region.
MachineBasicBlock::iterator begin() const { return RegionBegin; }
@@ -284,20 +236,6 @@ namespace llvm {
/// used by instructions in the fallthrough block.
void addSchedBarrierDeps();
- /// computeLatency - Compute node latency.
- ///
- virtual void computeLatency(SUnit *SU);
-
- /// computeOperandLatency - Return dependence edge latency using
- /// operand use/def information
- ///
- /// FindMin may be set to get the minimum vs. expected latency. Minimum
- /// latency is used for scheduling groups, while expected latency is for
- /// instruction cost and critical path.
- virtual unsigned computeOperandLatency(SUnit *Def, SUnit *Use,
- const SDep& dep,
- bool FindMin = false) const;
-
/// schedule - Order nodes according to selected style, filling
/// in the Sequence member.
///
@@ -319,7 +257,7 @@ namespace llvm {
protected:
void initSUnits();
- void addPhysRegDataDeps(SUnit *SU, const MachineOperand &MO);
+ void addPhysRegDataDeps(SUnit *SU, unsigned OperIdx);
void addPhysRegDeps(SUnit *SU, unsigned OperIdx);
void addVRegDefDeps(SUnit *SU, unsigned OperIdx);
void addVRegUseDeps(SUnit *SU, unsigned OperIdx);
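Two consumer-facing changes here are worth spelling out: physical register accesses are now recorded as (SUnit, operand index) pairs, with -1 marking a non data-dependent use, and schedulers can resolve an SUnit's MCSchedClassDesc lazily through getSchedClass(). A sketch of both, where Defs/Uses, DAG, SU and OperIdx are assumed names for the example:

    // Recording a physreg access in a Reg2SUnitsMap (Defs/Uses are assumed
    // members of the dependence builder, not shown in this hunk).
    Defs[Reg].push_back(PhysRegSUOper(SU, OperIdx));  // data-dependent def/use
    Uses[Reg].push_back(PhysRegSUOper(SU, -1));       // non data-dependent use

    // Resolving and caching the scheduling class, then querying the model.
    const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
    unsigned MicroOps = DAG->getSchedModel()->getNumMicroOps(SU->getInstr(), SC);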
diff --git a/include/llvm/CodeGen/SchedulerRegistry.h b/include/llvm/CodeGen/SchedulerRegistry.h
index a582b0c40c8b..836b73a15a2f 100644
--- a/include/llvm/CodeGen/SchedulerRegistry.h
+++ b/include/llvm/CodeGen/SchedulerRegistry.h
@@ -102,6 +102,11 @@ ScheduleDAGSDNodes *createVLIWDAGScheduler(SelectionDAGISel *IS,
ScheduleDAGSDNodes *createDefaultScheduler(SelectionDAGISel *IS,
CodeGenOpt::Level OptLevel);
+/// createDAGLinearizer - This creates a "no-scheduling" scheduler which
+/// linearizes the DAG using topological order.
+ScheduleDAGSDNodes *createDAGLinearizer(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel);
+
} // end namespace llvm
#endif
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 1ccfe54d2126..619ee699430d 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -73,8 +73,8 @@ class SDDbgInfo {
SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
DenseMap<const SDNode*, SmallVector<SDDbgValue*, 2> > DbgValMap;
- void operator=(const SDDbgInfo&); // Do not implement.
- SDDbgInfo(const SDDbgInfo&); // Do not implement.
+ void operator=(const SDDbgInfo&) LLVM_DELETED_FUNCTION;
+ SDDbgInfo(const SDDbgInfo&) LLVM_DELETED_FUNCTION;
public:
SDDbgInfo() {}
@@ -222,8 +222,8 @@ private:
DenseSet<SDNode *> &visited,
int level, bool &printed);
- void operator=(const SelectionDAG&); // Do not implement.
- SelectionDAG(const SelectionDAG&); // Do not implement.
+ void operator=(const SelectionDAG&) LLVM_DELETED_FUNCTION;
+ SelectionDAG(const SelectionDAG&) LLVM_DELETED_FUNCTION;
public:
explicit SelectionDAG(const TargetMachine &TM, llvm::CodeGenOpt::Level);
@@ -437,7 +437,13 @@ public:
SDValue getRegisterMask(const uint32_t *RegMask);
SDValue getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label);
SDValue getBlockAddress(const BlockAddress *BA, EVT VT,
- bool isTarget = false, unsigned char TargetFlags = 0);
+ int64_t Offset = 0, bool isTarget = false,
+ unsigned char TargetFlags = 0);
+ SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT,
+ int64_t Offset = 0,
+ unsigned char TargetFlags = 0) {
+ return getBlockAddress(BA, VT, Offset, true, TargetFlags);
+ }
SDValue getCopyToReg(SDValue Chain, DebugLoc dl, unsigned Reg, SDValue N) {
return getNode(ISD::CopyToReg, dl, MVT::Other, Chain,
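Block-address nodes can now carry a byte offset, and getTargetBlockAddress simply forwards with isTarget set. A sketch of a lowering-side call and of reading the offset back, with DAG, BA and PtrVT as placeholders:

    SDValue Addr = DAG.getTargetBlockAddress(BA, PtrVT, /*Offset=*/0,
                                             /*TargetFlags=*/0);
    // The offset is recoverable from the node (see BlockAddressSDNode
    // further down in this patch).
    int64_t Off = cast<BlockAddressSDNode>(Addr)->getOffset();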
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index db361ee9b1bc..362e9afd225a 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -216,8 +216,8 @@ class SDUse {
/// this operand.
SDUse **Prev, *Next;
- SDUse(const SDUse &U); // Do not implement
- void operator=(const SDUse &U); // Do not implement
+ SDUse(const SDUse &U) LLVM_DELETED_FUNCTION;
+ void operator=(const SDUse &U) LLVM_DELETED_FUNCTION;
public:
SDUse() : Val(), User(NULL), Prev(NULL), Next(NULL) {}
@@ -662,9 +662,6 @@ public:
///
void dumprWithDepth(const SelectionDAG *G = 0, unsigned depth = 100) const;
-
- static bool classof(const SDNode *) { return true; }
-
/// Profile - Gather unique data for the node.
///
void Profile(FoldingSetNodeID &ID) const;
@@ -956,7 +953,12 @@ public:
const MachinePointerInfo &getPointerInfo() const {
return MMO->getPointerInfo();
}
-
+
+ /// getAddressSpace - Return the address space for the associated pointer
+ unsigned getAddressSpace() const {
+ return getPointerInfo().getAddrSpace();
+ }
+
/// refineAlignment - Update this MemSDNode's MachineMemOperand information
/// to reflect the alignment of NewMMO, if it has a greater alignment.
/// This must only be used when the new alignment applies to all users of
@@ -971,7 +973,6 @@ public:
}
// Methods to support isa and dyn_cast
- static bool classof(const MemSDNode *) { return true; }
static bool classof(const SDNode *N) {
// For some targets, we lower some target intrinsics to a MemIntrinsicNode
// with either an intrinsic or a target opcode.
@@ -1011,11 +1012,6 @@ class AtomicSDNode : public MemSDNode {
SubclassData |= SynchScope << 12;
assert(getOrdering() == Ordering && "Ordering encoding error!");
assert(getSynchScope() == SynchScope && "Synch-scope encoding error!");
-
- assert((readMem() || getOrdering() <= Monotonic) &&
- "Acquire/Release MachineMemOperand must be a load!");
- assert((writeMem() || getOrdering() <= Monotonic) &&
- "Acquire/Release MachineMemOperand must be a store!");
}
public:
@@ -1061,7 +1057,6 @@ public:
}
// Methods to support isa and dyn_cast
- static bool classof(const AtomicSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::ATOMIC_CMP_SWAP ||
N->getOpcode() == ISD::ATOMIC_SWAP ||
@@ -1093,7 +1088,6 @@ public:
}
// Methods to support isa and dyn_cast
- static bool classof(const MemIntrinsicSDNode *) { return true; }
static bool classof(const SDNode *N) {
// We lower some target intrinsics to their target opcode
// early a node with a target opcode can be of this class
@@ -1148,7 +1142,6 @@ public:
}
static bool isSplatMask(const int *Mask, EVT VT);
- static bool classof(const ShuffleVectorSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::VECTOR_SHUFFLE;
}
@@ -1172,7 +1165,6 @@ public:
bool isNullValue() const { return Value->isNullValue(); }
bool isAllOnesValue() const { return Value->isAllOnesValue(); }
- static bool classof(const ConstantSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::Constant ||
N->getOpcode() == ISD::TargetConstant;
@@ -1207,9 +1199,6 @@ public:
/// have to duplicate its logic everywhere it's called.
bool isExactlyValue(double V) const {
bool ignored;
- // convert is not supported on this type
- if (&Value->getValueAPF().getSemantics() == &APFloat::PPCDoubleDouble)
- return false;
APFloat Tmp(V);
Tmp.convert(Value->getValueAPF().getSemantics(),
APFloat::rmNearestTiesToEven, &ignored);
@@ -1219,7 +1208,6 @@ public:
static bool isValueValidForType(EVT VT, const APFloat& Val);
- static bool classof(const ConstantFPSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::ConstantFP ||
N->getOpcode() == ISD::TargetConstantFP;
@@ -1241,7 +1229,6 @@ public:
// Return the address space this GlobalAddress belongs to.
unsigned getAddressSpace() const;
- static bool classof(const GlobalAddressSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::GlobalAddress ||
N->getOpcode() == ISD::TargetGlobalAddress ||
@@ -1261,7 +1248,6 @@ public:
int getIndex() const { return FI; }
- static bool classof(const FrameIndexSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::FrameIndex ||
N->getOpcode() == ISD::TargetFrameIndex;
@@ -1281,7 +1267,6 @@ public:
int getIndex() const { return JTI; }
unsigned char getTargetFlags() const { return TargetFlags; }
- static bool classof(const JumpTableSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::JumpTable ||
N->getOpcode() == ISD::TargetJumpTable;
@@ -1342,7 +1327,6 @@ public:
Type *getType() const;
- static bool classof(const ConstantPoolSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::ConstantPool ||
N->getOpcode() == ISD::TargetConstantPool;
@@ -1366,7 +1350,6 @@ public:
int getIndex() const { return Index; }
int64_t getOffset() const { return Offset; }
- static bool classof(const TargetIndexSDNode*) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::TargetIndex;
}
@@ -1385,7 +1368,6 @@ public:
MachineBasicBlock *getBasicBlock() const { return MBB; }
- static bool classof(const BasicBlockSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::BasicBlock;
}
@@ -1395,7 +1377,7 @@ public:
/// BUILD_VECTORs.
class BuildVectorSDNode : public SDNode {
// These are constructed as SDNodes and then cast to BuildVectorSDNodes.
- explicit BuildVectorSDNode(); // Do not implement
+ explicit BuildVectorSDNode() LLVM_DELETED_FUNCTION;
public:
/// isConstantSplat - Check if this is a constant splat, and if so, find the
/// smallest element size that splats the vector. If MinSplatBits is
@@ -1410,7 +1392,6 @@ public:
unsigned &SplatBitSize, bool &HasAnyUndefs,
unsigned MinSplatBits = 0, bool isBigEndian = false);
- static inline bool classof(const BuildVectorSDNode *) { return true; }
static inline bool classof(const SDNode *N) {
return N->getOpcode() == ISD::BUILD_VECTOR;
}
@@ -1431,7 +1412,6 @@ public:
/// getValue - return the contained Value.
const Value *getValue() const { return V; }
- static bool classof(const SrcValueSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::SRCVALUE;
}
@@ -1446,7 +1426,6 @@ public:
const MDNode *getMD() const { return MD; }
- static bool classof(const MDNodeSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::MDNODE_SDNODE;
}
@@ -1463,7 +1442,6 @@ public:
unsigned getReg() const { return Reg; }
- static bool classof(const RegisterSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::Register;
}
@@ -1480,7 +1458,6 @@ public:
const uint32_t *getRegMask() const { return RegMask; }
- static bool classof(const RegisterMaskSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::RegisterMask;
}
@@ -1488,18 +1465,19 @@ public:
class BlockAddressSDNode : public SDNode {
const BlockAddress *BA;
+ int64_t Offset;
unsigned char TargetFlags;
friend class SelectionDAG;
BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
- unsigned char Flags)
+ int64_t o, unsigned char Flags)
: SDNode(NodeTy, DebugLoc(), getSDVTList(VT)),
- BA(ba), TargetFlags(Flags) {
+ BA(ba), Offset(o), TargetFlags(Flags) {
}
public:
const BlockAddress *getBlockAddress() const { return BA; }
+ int64_t getOffset() const { return Offset; }
unsigned char getTargetFlags() const { return TargetFlags; }
- static bool classof(const BlockAddressSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::BlockAddress ||
N->getOpcode() == ISD::TargetBlockAddress;
@@ -1517,7 +1495,6 @@ class EHLabelSDNode : public SDNode {
public:
MCSymbol *getLabel() const { return Label; }
- static bool classof(const EHLabelSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::EH_LABEL;
}
@@ -1537,7 +1514,6 @@ public:
const char *getSymbol() const { return Symbol; }
unsigned char getTargetFlags() const { return TargetFlags; }
- static bool classof(const ExternalSymbolSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::ExternalSymbol ||
N->getOpcode() == ISD::TargetExternalSymbol;
@@ -1555,7 +1531,6 @@ public:
ISD::CondCode get() const { return Condition; }
- static bool classof(const CondCodeSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::CONDCODE;
}
@@ -1575,7 +1550,6 @@ class CvtRndSatSDNode : public SDNode {
public:
ISD::CvtCode getCvtCode() const { return CvtCode; }
- static bool classof(const CvtRndSatSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::CONVERT_RNDSAT;
}
@@ -1594,7 +1568,6 @@ public:
EVT getVT() const { return ValueType; }
- static bool classof(const VTSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::VALUETYPE;
}
@@ -1638,7 +1611,6 @@ public:
/// isUnindexed - Return true if this is NOT a pre/post inc/dec load/store.
bool isUnindexed() const { return getAddressingMode() == ISD::UNINDEXED; }
- static bool classof(const LSBaseSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::LOAD ||
N->getOpcode() == ISD::STORE;
@@ -1670,7 +1642,6 @@ public:
const SDValue &getBasePtr() const { return getOperand(1); }
const SDValue &getOffset() const { return getOperand(2); }
- static bool classof(const LoadSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::LOAD;
}
@@ -1701,7 +1672,6 @@ public:
const SDValue &getBasePtr() const { return getOperand(2); }
const SDValue &getOffset() const { return getOperand(3); }
- static bool classof(const StoreSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->getOpcode() == ISD::STORE;
}
@@ -1742,7 +1712,6 @@ public:
MemRefsEnd = NewMemRefsEnd;
}
- static bool classof(const MachineSDNode *) { return true; }
static bool classof(const SDNode *N) {
return N->isMachineOpcode();
}
@@ -1750,10 +1719,10 @@ public:
class SDNodeIterator : public std::iterator<std::forward_iterator_tag,
SDNode, ptrdiff_t> {
- SDNode *Node;
+ const SDNode *Node;
unsigned Operand;
- SDNodeIterator(SDNode *N, unsigned Op) : Node(N), Operand(Op) {}
+ SDNodeIterator(const SDNode *N, unsigned Op) : Node(N), Operand(Op) {}
public:
bool operator==(const SDNodeIterator& x) const {
return Operand == x.Operand;
@@ -1784,8 +1753,8 @@ public:
return Operand - Other.Operand;
}
- static SDNodeIterator begin(SDNode *N) { return SDNodeIterator(N, 0); }
- static SDNodeIterator end (SDNode *N) {
+ static SDNodeIterator begin(const SDNode *N) { return SDNodeIterator(N, 0); }
+ static SDNodeIterator end (const SDNode *N) {
return SDNodeIterator(N, N->getNumOperands());
}
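All of the deleted classof(const Foo *) overloads were the always-true self cases of LLVM-style RTTI; isa<>, cast<> and dyn_cast<> only need the classof(const SDNode *) predicate kept on each class, so call sites are unchanged. For example (N stands for any SDNode pointer):

    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
      SDValue Base = LD->getBasePtr();   // safe: N's opcode was ISD::LOAD
      (void)Base;
    }
    if (isa<ConstantSDNode>(N)) {
      // constant handling ...
    }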
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
new file mode 100644
index 000000000000..88e6105a7de2
--- /dev/null
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -0,0 +1,167 @@
+//===-- llvm/CodeGen/TargetSchedule.h - Sched Machine Model -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a wrapper around MCSchedModel that allows the interface to
+// benefit from information currently only available in TargetInstrInfo.
+// Ideally, the scheduling interface would be fully defined in the MC layer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_TARGETSCHEDMODEL_H
+#define LLVM_TARGET_TARGETSCHEDMODEL_H
+
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+class TargetSubtargetInfo;
+class TargetInstrInfo;
+class MachineInstr;
+
+/// Provide an instruction scheduling machine model to CodeGen passes.
+class TargetSchedModel {
+ // For efficiency, hold a copy of the statically defined MCSchedModel for this
+ // processor.
+ MCSchedModel SchedModel;
+ InstrItineraryData InstrItins;
+ const TargetSubtargetInfo *STI;
+ const TargetInstrInfo *TII;
+
+ SmallVector<unsigned, 16> ResourceFactors;
+ unsigned MicroOpFactor; // Multiply to normalize microops to resource units.
+ unsigned ResourceLCM; // Resource units per cycle. Latency normalization factor.
+public:
+ TargetSchedModel(): STI(0), TII(0) {}
+
+ /// \brief Initialize the machine model for instruction scheduling.
+ ///
+ /// The machine model API keeps a copy of the top-level MCSchedModel table
+ /// indices and may query TargetSubtargetInfo and TargetInstrInfo to resolve
+ /// dynamic properties.
+ void init(const MCSchedModel &sm, const TargetSubtargetInfo *sti,
+ const TargetInstrInfo *tii);
+
+ /// Return the MCSchedClassDesc for this instruction.
+ const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
+
+ /// \brief TargetInstrInfo getter.
+ const TargetInstrInfo *getInstrInfo() const { return TII; }
+
+ /// \brief Return true if this machine model includes an instruction-level
+ /// scheduling model.
+ ///
+ /// This is more detailed than the coarse-grained IssueWidth and default
+ /// latency properties, but separate from the per-cycle itinerary data.
+ bool hasInstrSchedModel() const;
+
+ const MCSchedModel *getMCSchedModel() const { return &SchedModel; }
+
+ /// \brief Return true if this machine model includes cycle-to-cycle itinerary
+ /// data.
+ ///
+ /// This models scheduling at each stage in the processor pipeline.
+ bool hasInstrItineraries() const;
+
+ const InstrItineraryData *getInstrItineraries() const {
+ if (hasInstrItineraries())
+ return &InstrItins;
+ return 0;
+ }
+
+ /// \brief Identify the processor corresponding to the current subtarget.
+ unsigned getProcessorID() const { return SchedModel.getProcessorID(); }
+
+ /// \brief Maximum number of micro-ops that may be scheduled per cycle.
+ unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
+
+ /// \brief Return the number of issue slots required for this MI.
+ unsigned getNumMicroOps(const MachineInstr *MI,
+ const MCSchedClassDesc *SC = 0) const;
+
+ /// \brief Get the number of kinds of resources for this target.
+ unsigned getNumProcResourceKinds() const {
+ return SchedModel.getNumProcResourceKinds();
+ }
+
+ /// \brief Get a processor resource by ID for convenience.
+ const MCProcResourceDesc *getProcResource(unsigned PIdx) const {
+ return SchedModel.getProcResource(PIdx);
+ }
+
+ typedef const MCWriteProcResEntry *ProcResIter;
+
+ /// \brief Get an iterator into the processor resources consumed by this
+ /// scheduling class.
+ ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const {
+ // The subtarget holds a single resource table for all processors.
+ return STI->getWriteProcResBegin(SC);
+ }
+ ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const {
+ return STI->getWriteProcResEnd(SC);
+ }
+
+ /// \brief Multiply the number of units consumed for a resource by this factor
+ /// to normalize it relative to other resources.
+ unsigned getResourceFactor(unsigned ResIdx) const {
+ return ResourceFactors[ResIdx];
+ }
+
+ /// \brief Multiply number of micro-ops by this factor to normalize it
+ /// relative to other resources.
+ unsigned getMicroOpFactor() const {
+ return MicroOpFactor;
+ }
+
+ /// \brief Multiply cycle count by this factor to normalize it relative to
+ /// other resources. This is the number of resource units per cycle.
+ unsigned getLatencyFactor() const {
+ return ResourceLCM;
+ }
+
+ /// \brief Compute operand latency based on the available machine model.
+ ///
+ /// Compute and return the latency of the given data-dependent def and use
+ /// when the operand indices are already known. UseMI may be NULL for an
+ /// unknown user.
+ ///
+ /// FindMin may be set to get the minimum vs. expected latency. Minimum
+ /// latency is used for scheduling groups, while expected latency is for
+ /// instruction cost and critical path.
+ unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *UseMI, unsigned UseOperIdx,
+ bool FindMin) const;
+
+ /// \brief Compute the instruction latency based on the available machine
+ /// model.
+ ///
+ /// Compute and return the expected latency of this instruction independent of
+ /// a particular use. computeOperandLatency is the preferred API, but this is
+ /// occasionally useful to help estimate instruction cost.
+ unsigned computeInstrLatency(const MachineInstr *MI) const;
+
+ /// \brief Output dependency latency of a pair of defs of the same register.
+ ///
+ /// This is typically one cycle.
+ unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx,
+ const MachineInstr *DepMI) const;
+
+private:
+ /// getDefLatency is a helper for computeOperandLatency. Return the
+ /// instruction's latency if operand lookup is not required.
+ /// Otherwise return -1.
+ int getDefLatency(const MachineInstr *DefMI, bool FindMin) const;
+};
+
+} // namespace llvm
+
+#endif
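TargetSchedModel is initialized once from the subtarget's MCSchedModel and then queried per instruction; ScheduleDAGInstrs holds one as a member and exposes it through getSchedModel(). A usage sketch in which SchedModel is already initialized and DefMI, UseMI and the operand indices are placeholders:

    if (SchedModel.hasInstrSchedModel() || SchedModel.hasInstrItineraries()) {
      unsigned Lat = SchedModel.computeOperandLatency(DefMI, /*DefOperIdx=*/0,
                                                      UseMI, /*UseOperIdx=*/1,
                                                      /*FindMin=*/false);
      unsigned MinLat = SchedModel.computeOperandLatency(DefMI, 0, UseMI, 1,
                                                         /*FindMin=*/true);
      unsigned Cost = SchedModel.computeInstrLatency(DefMI);  // use-independent
      (void)Lat; (void)MinLat; (void)Cost;
    }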
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index eb38cd33d167..240199291ae9 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -56,50 +56,56 @@ namespace llvm {
FIRST_FP_VALUETYPE = f16,
LAST_FP_VALUETYPE = ppcf128,
- v2i8 = 13, // 2 x i8
- v4i8 = 14, // 4 x i8
- v8i8 = 15, // 8 x i8
- v16i8 = 16, // 16 x i8
- v32i8 = 17, // 32 x i8
- v2i16 = 18, // 2 x i16
- v4i16 = 19, // 4 x i16
- v8i16 = 20, // 8 x i16
- v16i16 = 21, // 16 x i16
- v2i32 = 22, // 2 x i32
- v4i32 = 23, // 4 x i32
- v8i32 = 24, // 8 x i32
- v16i32 = 25, // 16 x i32
- v1i64 = 26, // 1 x i64
- v2i64 = 27, // 2 x i64
- v4i64 = 28, // 4 x i64
- v8i64 = 29, // 8 x i64
- v16i64 = 30, // 16 x i64
-
- v2f16 = 31, // 2 x f16
- v2f32 = 32, // 2 x f32
- v4f32 = 33, // 4 x f32
- v8f32 = 34, // 8 x f32
- v2f64 = 35, // 2 x f64
- v4f64 = 36, // 4 x f64
-
- FIRST_VECTOR_VALUETYPE = v2i8,
+ v2i1 = 13, // 2 x i1
+ v4i1 = 14, // 4 x i1
+ v8i1 = 15, // 8 x i1
+ v16i1 = 16, // 16 x i1
+ v2i8 = 17, // 2 x i8
+ v4i8 = 18, // 4 x i8
+ v8i8 = 19, // 8 x i8
+ v16i8 = 20, // 16 x i8
+ v32i8 = 21, // 32 x i8
+ v1i16 = 22, // 1 x i16
+ v2i16 = 23, // 2 x i16
+ v4i16 = 24, // 4 x i16
+ v8i16 = 25, // 8 x i16
+ v16i16 = 26, // 16 x i16
+ v1i32 = 27, // 1 x i32
+ v2i32 = 28, // 2 x i32
+ v4i32 = 29, // 4 x i32
+ v8i32 = 30, // 8 x i32
+ v16i32 = 31, // 16 x i32
+ v1i64 = 32, // 1 x i64
+ v2i64 = 33, // 2 x i64
+ v4i64 = 34, // 4 x i64
+ v8i64 = 35, // 8 x i64
+ v16i64 = 36, // 16 x i64
+
+ v2f16 = 37, // 2 x f16
+ v2f32 = 38, // 2 x f32
+ v4f32 = 39, // 4 x f32
+ v8f32 = 40, // 8 x f32
+ v2f64 = 41, // 2 x f64
+ v4f64 = 42, // 4 x f64
+
+ FIRST_VECTOR_VALUETYPE = v2i1,
LAST_VECTOR_VALUETYPE = v4f64,
- FIRST_INTEGER_VECTOR_VALUETYPE = v2i8,
+ FIRST_INTEGER_VECTOR_VALUETYPE = v2i1,
LAST_INTEGER_VECTOR_VALUETYPE = v16i64,
FIRST_FP_VECTOR_VALUETYPE = v2f16,
LAST_FP_VECTOR_VALUETYPE = v4f64,
- x86mmx = 37, // This is an X86 MMX value
+ x86mmx = 43, // This is an X86 MMX value
- Glue = 38, // This glues nodes together during pre-RA sched
+ Glue = 44, // This glues nodes together during pre-RA sched
- isVoid = 39, // This has no value
+ isVoid = 45, // This has no value
- Untyped = 40, // This value takes a register, but has
+ Untyped = 46, // This value takes a register, but has
// unspecified type. The register class
// will be determined by the opcode.
- LAST_VALUETYPE = 41, // This always remains at the end of the list.
+ LAST_VALUETYPE = 47, // This always remains at the end of the list.
// This is the current maximum for LAST_VALUETYPE.
// MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -175,6 +181,18 @@ namespace llvm {
SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
}
+ /// is16BitVector - Return true if this is a 16-bit vector type.
+ bool is16BitVector() const {
+ return (SimpleTy == MVT::v2i8 || SimpleTy == MVT::v1i16 ||
+ SimpleTy == MVT::v16i1);
+ }
+
+ /// is32BitVector - Return true if this is a 32-bit vector type.
+ bool is32BitVector() const {
+ return (SimpleTy == MVT::v4i8 || SimpleTy == MVT::v2i16 ||
+ SimpleTy == MVT::v1i32);
+ }
+
/// is64BitVector - Return true if this is a 64-bit vector type.
bool is64BitVector() const {
return (SimpleTy == MVT::v8i8 || SimpleTy == MVT::v4i16 ||
@@ -233,15 +251,21 @@ namespace llvm {
switch (SimpleTy) {
default:
llvm_unreachable("Not a vector MVT!");
+ case v2i1 :
+ case v4i1 :
+ case v8i1 :
+ case v16i1: return i1;
case v2i8 :
case v4i8 :
case v8i8 :
case v16i8:
case v32i8: return i8;
+ case v1i16:
case v2i16:
case v4i16:
case v8i16:
case v16i16: return i16;
+ case v1i32:
case v2i32:
case v4i32:
case v8i32:
@@ -265,21 +289,25 @@ namespace llvm {
default:
llvm_unreachable("Not a vector MVT!");
case v32i8: return 32;
+ case v16i1:
case v16i8:
case v16i16:
case v16i32:
case v16i64:return 16;
+ case v8i1:
case v8i8 :
case v8i16:
case v8i32:
case v8i64:
case v8f32: return 8;
+ case v4i1:
case v4i8:
case v4i16:
case v4i32:
case v4i64:
case v4f32:
case v4f64: return 4;
+ case v2i1:
case v2i8:
case v2i16:
case v2i32:
@@ -287,6 +315,8 @@ namespace llvm {
case v2f16:
case v2f32:
case v2f64: return 2;
+ case v1i16:
+ case v1i32:
case v1i64: return 1;
}
}
@@ -302,15 +332,21 @@ namespace llvm {
default:
llvm_unreachable("getSizeInBits called on extended MVT.");
case i1 : return 1;
- case i8 : return 8;
+ case v2i1: return 2;
+ case v4i1: return 4;
+ case i8 :
+ case v8i1: return 8;
case i16 :
case f16:
- case v2i8: return 16;
+ case v16i1:
+ case v2i8:
+ case v1i16: return 16;
case f32 :
case i32 :
case v4i8:
case v2i16:
- case v2f16: return 32;
+ case v2f16:
+ case v1i32: return 32;
case x86mmx:
case f64 :
case i64 :
@@ -393,6 +429,12 @@ namespace llvm {
switch (VT.SimpleTy) {
default:
break;
+ case MVT::i1:
+ if (NumElements == 2) return MVT::v2i1;
+ if (NumElements == 4) return MVT::v4i1;
+ if (NumElements == 8) return MVT::v8i1;
+ if (NumElements == 16) return MVT::v16i1;
+ break;
case MVT::i8:
if (NumElements == 2) return MVT::v2i8;
if (NumElements == 4) return MVT::v4i8;
@@ -401,12 +443,14 @@ namespace llvm {
if (NumElements == 32) return MVT::v32i8;
break;
case MVT::i16:
+ if (NumElements == 1) return MVT::v1i16;
if (NumElements == 2) return MVT::v2i16;
if (NumElements == 4) return MVT::v4i16;
if (NumElements == 8) return MVT::v8i16;
if (NumElements == 16) return MVT::v16i16;
break;
case MVT::i32:
+ if (NumElements == 1) return MVT::v1i32;
if (NumElements == 2) return MVT::v2i32;
if (NumElements == 4) return MVT::v4i32;
if (NumElements == 8) return MVT::v8i32;
@@ -529,6 +573,16 @@ namespace llvm {
return isSimple() ? V.isVector() : isExtendedVector();
}
+ /// is16BitVector - Return true if this is a 16-bit vector type.
+ bool is16BitVector() const {
+ return isSimple() ? V.is16BitVector() : isExtended16BitVector();
+ }
+
+ /// is32BitVector - Return true if this is a 32-bit vector type.
+ bool is32BitVector() const {
+ return isSimple() ? V.is32BitVector() : isExtended32BitVector();
+ }
+
/// is64BitVector - Return true if this is a 64-bit vector type.
bool is64BitVector() const {
return isSimple() ? V.is64BitVector() : isExtended64BitVector();
@@ -740,6 +794,8 @@ namespace llvm {
bool isExtendedFloatingPoint() const;
bool isExtendedInteger() const;
bool isExtendedVector() const;
+ bool isExtended16BitVector() const;
+ bool isExtended32BitVector() const;
bool isExtended64BitVector() const;
bool isExtended128BitVector() const;
bool isExtended256BitVector() const;
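The new v2i1/v4i1/v8i1/v16i1, v1i16 and v1i32 simple types are inserted into the middle of the MVT numbering, which is why all of the later enumerators (and the ValueTypes.td entries below) are renumbered. MVT::getVectorVT and the element/size queries pick them up directly; a small self-contained check:

    #include "llvm/CodeGen/ValueTypes.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      assert(MVT::getVectorVT(MVT::i1, 8) == MVT::v8i1);
      assert(MVT(MVT::v8i1).getVectorElementType() == MVT::i1);
      assert(MVT(MVT::v8i1).getVectorNumElements() == 8);
      assert(MVT(MVT::v8i1).getSizeInBits() == 8);
      assert(MVT(MVT::v1i32).is32BitVector());
      return 0;
    }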
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index f4b75bd1b17d..a707f887aaf4 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -33,36 +33,42 @@ def f80 : ValueType<80 , 10>; // 80-bit floating point value
def f128 : ValueType<128, 11>; // 128-bit floating point value
def ppcf128: ValueType<128, 12>; // PPC 128-bit floating point value
-def v2i8 : ValueType<16 , 13>; // 2 x i8 vector value
-def v4i8 : ValueType<32 , 14>; // 4 x i8 vector value
-def v8i8 : ValueType<64 , 15>; // 8 x i8 vector value
-def v16i8 : ValueType<128, 16>; // 16 x i8 vector value
-def v32i8 : ValueType<256, 17>; // 32 x i8 vector value
-def v2i16 : ValueType<32 , 18>; // 2 x i16 vector value
-def v4i16 : ValueType<64 , 19>; // 4 x i16 vector value
-def v8i16 : ValueType<128, 20>; // 8 x i16 vector value
-def v16i16 : ValueType<256, 21>; // 16 x i16 vector value
-def v2i32 : ValueType<64 , 22>; // 2 x i32 vector value
-def v4i32 : ValueType<128, 23>; // 4 x i32 vector value
-def v8i32 : ValueType<256, 24>; // 8 x i32 vector value
-def v16i32 : ValueType<512, 25>; // 16 x i32 vector value
-def v1i64 : ValueType<64 , 26>; // 1 x i64 vector value
-def v2i64 : ValueType<128, 27>; // 2 x i64 vector value
-def v4i64 : ValueType<256, 28>; // 4 x i64 vector value
-def v8i64 : ValueType<512, 29>; // 8 x i64 vector value
-def v16i64 : ValueType<1024,30>; // 16 x i64 vector value
+def v2i1 : ValueType<2 , 13>; // 2 x i1 vector value
+def v4i1 : ValueType<4 , 14>; // 4 x i1 vector value
+def v8i1 : ValueType<8 , 15>; // 8 x i1 vector value
+def v16i1 : ValueType<16, 16>; // 16 x i1 vector value
+def v2i8 : ValueType<16 , 17>; // 2 x i8 vector value
+def v4i8 : ValueType<32 , 18>; // 4 x i8 vector value
+def v8i8 : ValueType<64 , 19>; // 8 x i8 vector value
+def v16i8 : ValueType<128, 20>; // 16 x i8 vector value
+def v32i8 : ValueType<256, 21>; // 32 x i8 vector value
+def v1i16 : ValueType<16 , 22>; // 1 x i16 vector value
+def v2i16 : ValueType<32 , 23>; // 2 x i16 vector value
+def v4i16 : ValueType<64 , 24>; // 4 x i16 vector value
+def v8i16 : ValueType<128, 25>; // 8 x i16 vector value
+def v16i16 : ValueType<256, 26>; // 16 x i16 vector value
+def v1i32 : ValueType<32 , 27>; // 1 x i32 vector value
+def v2i32 : ValueType<64 , 28>; // 2 x i32 vector value
+def v4i32 : ValueType<128, 29>; // 4 x i32 vector value
+def v8i32 : ValueType<256, 30>; // 8 x i32 vector value
+def v16i32 : ValueType<512, 31>; // 16 x i32 vector value
+def v1i64 : ValueType<64 , 32>; // 1 x i64 vector value
+def v2i64 : ValueType<128, 33>; // 2 x i64 vector value
+def v4i64 : ValueType<256, 34>; // 4 x i64 vector value
+def v8i64 : ValueType<512, 35>; // 8 x i64 vector value
+def v16i64 : ValueType<1024,36>; // 16 x i64 vector value
-def v2f16 : ValueType<32 , 31>; // 2 x f16 vector value
-def v2f32 : ValueType<64 , 32>; // 2 x f32 vector value
-def v4f32 : ValueType<128, 33>; // 4 x f32 vector value
-def v8f32 : ValueType<256, 34>; // 8 x f32 vector value
-def v2f64 : ValueType<128, 35>; // 2 x f64 vector value
-def v4f64 : ValueType<256, 36>; // 4 x f64 vector value
+def v2f16 : ValueType<32 , 37>; // 2 x f16 vector value
+def v2f32 : ValueType<64 , 38>; // 2 x f32 vector value
+def v4f32 : ValueType<128, 39>; // 4 x f32 vector value
+def v8f32 : ValueType<256, 40>; // 8 x f32 vector value
+def v2f64 : ValueType<128, 41>; // 2 x f64 vector value
+def v4f64 : ValueType<256, 42>; // 4 x f64 vector value
-def x86mmx : ValueType<64 , 37>; // X86 MMX value
-def FlagVT : ValueType<0 , 38>; // Pre-RA sched glue
-def isVoid : ValueType<0 , 39>; // Produces no value
-def untyped: ValueType<8 , 40>; // Produces an untyped value
+def x86mmx : ValueType<64 , 43>; // X86 MMX value
+def FlagVT : ValueType<0 , 44>; // Pre-RA sched glue
+def isVoid : ValueType<0 , 45>; // Produces no value
+def untyped: ValueType<8 , 46>; // Produces an untyped value
def MetadataVT: ValueType<0, 250>; // Metadata
diff --git a/include/llvm/Config/AsmParsers.def.in b/include/llvm/Config/AsmParsers.def.in
index 041af837541c..d63675351c80 100644
--- a/include/llvm/Config/AsmParsers.def.in
+++ b/include/llvm/Config/AsmParsers.def.in
@@ -1,24 +1,24 @@
-//===- llvm/Config/AsmParsers.def - LLVM Assembly Parsers -------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file enumerates all of the assembly-language parsers
-// supported by this build of LLVM. Clients of this file should define
-// the LLVM_ASM_PARSER macro to be a function-like macro with a
-// single parameter (the name of the target whose assembly can be
-// generated); including this file will then enumerate all of the
-// targets with assembly parsers.
-//
-// The set of targets supported by LLVM is generated at configuration
-// time, at which point this header is generated. Do not modify this
-// header directly.
-//
-//===----------------------------------------------------------------------===//
+/*===- llvm/Config/AsmParsers.def - LLVM Assembly Parsers -------*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file enumerates all of the assembly-language parsers *|
+|* supported by this build of LLVM. Clients of this file should define *|
+|* the LLVM_ASM_PARSER macro to be a function-like macro with a *|
+|* single parameter (the name of the target whose assembly can be *|
+|* generated); including this file will then enumerate all of the *|
+|* targets with assembly parsers. *|
+|* *|
+|* The set of targets supported by LLVM is generated at configuration *|
+|* time, at which point this header is generated. Do not modify this *|
+|* header directly. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
#ifndef LLVM_ASM_PARSER
# error Please define the macro LLVM_ASM_PARSER(TargetName)
diff --git a/include/llvm/Config/AsmPrinters.def.in b/include/llvm/Config/AsmPrinters.def.in
index 9729bd75eb40..f0152a4aa979 100644
--- a/include/llvm/Config/AsmPrinters.def.in
+++ b/include/llvm/Config/AsmPrinters.def.in
@@ -1,24 +1,24 @@
-//===- llvm/Config/AsmPrinters.def - LLVM Assembly Printers -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file enumerates all of the assembly-language printers
-// supported by this build of LLVM. Clients of this file should define
-// the LLVM_ASM_PRINTER macro to be a function-like macro with a
-// single parameter (the name of the target whose assembly can be
-// generated); including this file will then enumerate all of the
-// targets with assembly printers.
-//
-// The set of targets supported by LLVM is generated at configuration
-// time, at which point this header is generated. Do not modify this
-// header directly.
-//
-//===----------------------------------------------------------------------===//
+/*===- llvm/Config/AsmPrinters.def - LLVM Assembly Printers -----*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file enumerates all of the assembly-language printers *|
+|* supported by this build of LLVM. Clients of this file should define *|
+|* the LLVM_ASM_PRINTER macro to be a function-like macro with a *|
+|* single parameter (the name of the target whose assembly can be *|
+|* generated); including this file will then enumerate all of the *|
+|* targets with assembly printers. *|
+|* *|
+|* The set of targets supported by LLVM is generated at configuration *|
+|* time, at which point this header is generated. Do not modify this *|
+|* header directly. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
#ifndef LLVM_ASM_PRINTER
# error Please define the macro LLVM_ASM_PRINTER(TargetName)
diff --git a/include/llvm/Config/Disassemblers.def.in b/include/llvm/Config/Disassemblers.def.in
index 1e6281de9989..d3a9bbdeaeac 100644
--- a/include/llvm/Config/Disassemblers.def.in
+++ b/include/llvm/Config/Disassemblers.def.in
@@ -1,24 +1,24 @@
-//===- llvm/Config/Disassemblers.def - LLVM Assembly Parsers ----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file enumerates all of the assembly-language parsers
-// supported by this build of LLVM. Clients of this file should define
-// the LLVM_DISASSEMBLER macro to be a function-like macro with a
-// single parameter (the name of the target whose assembly can be
-// generated); including this file will then enumerate all of the
-// targets with assembly parsers.
-//
-// The set of targets supported by LLVM is generated at configuration
-// time, at which point this header is generated. Do not modify this
-// header directly.
-//
-//===----------------------------------------------------------------------===//
+/*===- llvm/Config/Disassemblers.def - LLVM Assembly Parsers ----*- C++ -*-===*\
+|* *|
+|* The LLVM Compiler Infrastructure *|
+|* *|
+|* This file is distributed under the University of Illinois Open Source *|
+|* License. See LICENSE.TXT for details. *|
+|* *|
+|*===----------------------------------------------------------------------===*|
+|* *|
+|* This file enumerates all of the assembly-language parsers *|
+|* supported by this build of LLVM. Clients of this file should define *|
+|* the LLVM_DISASSEMBLER macro to be a function-like macro with a *|
+|* single parameter (the name of the target whose assembly can be *|
+|* generated); including this file will then enumerate all of the *|
+|* targets with assembly parsers. *|
+|* *|
+|* The set of targets supported by LLVM is generated at configuration *|
+|* time, at which point this header is generated. Do not modify this *|
+|* header directly. *|
+|* *|
+\*===----------------------------------------------------------------------===*/
#ifndef LLVM_DISASSEMBLER
# error Please define the macro LLVM_DISASSEMBLER(TargetName)
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index b912251239da..ca6412472991 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -1,6 +1,4 @@
-/**************************************
-** Created by Kevin from config.h.in **
-***************************************/
+/* include/llvm/Config/config.h.cmake corresponding to config.h.in. */
#ifndef CONFIG_H
#define CONFIG_H
@@ -17,6 +15,9 @@
/* Default <path> to all compiler invocations for --sysroot=<path>. */
#undef DEFAULT_SYSROOT
+/* Define if you want backtraces on crash */
+#cmakedefine ENABLE_BACKTRACES
+
/* Define if position independent code is enabled */
#cmakedefine ENABLE_PIC
@@ -51,7 +52,7 @@
#cmakedefine HAVE_ASSERT_H ${HAVE_ASSERT_H}
/* Define to 1 if you have the `backtrace' function. */
-#undef HAVE_BACKTRACE
+#cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
/* Define to 1 if you have the `bcopy' function. */
#undef HAVE_BCOPY
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 5a60ba565f00..a4f8af4db028 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -18,6 +18,9 @@
/* Default <path> to all compiler invocations for --sysroot=<path>. */
#undef DEFAULT_SYSROOT
+/* Define if you want backtraces on crash */
+#undef ENABLE_BACKTRACES
+
/* Define if position independent code is enabled */
#undef ENABLE_PIC
diff --git a/include/llvm/Constant.h b/include/llvm/Constant.h
index e0e516d55c9a..0ddd1db6c010 100644
--- a/include/llvm/Constant.h
+++ b/include/llvm/Constant.h
@@ -39,8 +39,8 @@ namespace llvm {
/// don't have to worry about the lifetime of the objects.
/// @brief LLVM Constant Representation
class Constant : public User {
- void operator=(const Constant &); // Do not implement
- Constant(const Constant &); // Do not implement
+ void operator=(const Constant &) LLVM_DELETED_FUNCTION;
+ Constant(const Constant &) LLVM_DELETED_FUNCTION;
virtual void anchor();
protected:
@@ -65,6 +65,9 @@ public:
/// true for things like constant expressions that could divide by zero.
bool canTrap() const;
+ /// isThreadDependent - Return true if the value can vary between threads.
+ bool isThreadDependent() const;
+
/// isConstantUsed - Return true if the constant has users other than constant
/// exprs and other dangling things.
bool isConstantUsed() const;
@@ -108,8 +111,6 @@ public:
virtual void destroyConstant() { llvm_unreachable("Not reached!"); }
//// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const Constant *) { return true; }
- static inline bool classof(const GlobalValue *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() >= ConstantFirstVal &&
V->getValueID() <= ConstantLastVal;
diff --git a/include/llvm/Constants.h b/include/llvm/Constants.h
index fdd53823aa0c..7f94ef464ea4 100644
--- a/include/llvm/Constants.h
+++ b/include/llvm/Constants.h
@@ -49,8 +49,8 @@ struct ConvertConstantType;
/// @brief Class for constant integers.
class ConstantInt : public Constant {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantInt(const ConstantInt &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantInt(const ConstantInt &) LLVM_DELETED_FUNCTION;
ConstantInt(IntegerType *Ty, const APInt& V);
APInt Val;
protected:
@@ -221,7 +221,6 @@ public:
}
/// @brief Methods to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const ConstantInt *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantIntVal;
}
@@ -234,8 +233,8 @@ public:
class ConstantFP : public Constant {
APFloat Val;
virtual void anchor();
- void *operator new(size_t, unsigned);// DO NOT IMPLEMENT
- ConstantFP(const ConstantFP &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantFP(const ConstantFP &) LLVM_DELETED_FUNCTION;
friend class LLVMContextImpl;
protected:
ConstantFP(Type *Ty, const APFloat& V);
@@ -283,15 +282,11 @@ public:
bool isExactlyValue(double V) const {
bool ignored;
- // convert is not supported on this type
- if (&Val.getSemantics() == &APFloat::PPCDoubleDouble)
- return false;
APFloat FV(V);
FV.convert(Val.getSemantics(), APFloat::rmNearestTiesToEven, &ignored);
return isExactlyValue(FV);
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ConstantFP *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantFPVal;
}
@@ -301,8 +296,8 @@ public:
/// ConstantAggregateZero - All zero aggregate value
///
class ConstantAggregateZero : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantAggregateZero(const ConstantAggregateZero &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantAggregateZero(const ConstantAggregateZero &) LLVM_DELETED_FUNCTION;
protected:
explicit ConstantAggregateZero(Type *ty)
: Constant(ty, ConstantAggregateZeroVal, 0, 0) {}
@@ -334,7 +329,6 @@ public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
///
- static bool classof(const ConstantAggregateZero *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantAggregateZeroVal;
}
@@ -346,7 +340,7 @@ public:
///
class ConstantArray : public Constant {
friend struct ConstantArrayCreator<ConstantArray, ArrayType>;
- ConstantArray(const ConstantArray &); // DO NOT IMPLEMENT
+ ConstantArray(const ConstantArray &) LLVM_DELETED_FUNCTION;
protected:
ConstantArray(ArrayType *T, ArrayRef<Constant *> Val);
public:
@@ -367,7 +361,6 @@ public:
virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ConstantArray *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantArrayVal;
}
@@ -385,7 +378,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantArray, Constant)
//
class ConstantStruct : public Constant {
friend struct ConstantArrayCreator<ConstantStruct, StructType>;
- ConstantStruct(const ConstantStruct &); // DO NOT IMPLEMENT
+ ConstantStruct(const ConstantStruct &) LLVM_DELETED_FUNCTION;
protected:
ConstantStruct(StructType *T, ArrayRef<Constant *> Val);
public:
@@ -426,7 +419,6 @@ public:
virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ConstantStruct *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantStructVal;
}
@@ -445,7 +437,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantStruct, Constant)
///
class ConstantVector : public Constant {
friend struct ConstantArrayCreator<ConstantVector, VectorType>;
- ConstantVector(const ConstantVector &); // DO NOT IMPLEMENT
+ ConstantVector(const ConstantVector &) LLVM_DELETED_FUNCTION;
protected:
ConstantVector(VectorType *T, ArrayRef<Constant *> Val);
public:
@@ -474,7 +466,6 @@ public:
virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ConstantVector *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantVectorVal;
}
@@ -491,8 +482,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantVector, Constant)
/// ConstantPointerNull - a constant pointer value that points to null
///
class ConstantPointerNull : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantPointerNull(const ConstantPointerNull &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantPointerNull(const ConstantPointerNull &) LLVM_DELETED_FUNCTION;
protected:
explicit ConstantPointerNull(PointerType *T)
: Constant(reinterpret_cast<Type*>(T),
@@ -517,7 +508,6 @@ public:
}
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ConstantPointerNull *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantPointerNullVal;
}
@@ -543,8 +533,8 @@ class ConstantDataSequential : public Constant {
/// element array of i8, or a 1-element array of i32. They'll both end up in
/// the same StringMap bucket, linked up.
ConstantDataSequential *Next;
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantDataSequential(const ConstantDataSequential &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantDataSequential(const ConstantDataSequential &) LLVM_DELETED_FUNCTION;
protected:
explicit ConstantDataSequential(Type *ty, ValueTy VT, const char *Data)
: Constant(ty, VT, 0, 0), DataElements(Data), Next(0) {}
@@ -639,7 +629,6 @@ public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
///
- static bool classof(const ConstantDataSequential *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantDataArrayVal ||
V->getValueID() == ConstantDataVectorVal;
@@ -655,8 +644,8 @@ private:
/// operands because it stores all of the elements of the constant as densely
/// packed data, instead of as Value*'s.
class ConstantDataArray : public ConstantDataSequential {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantDataArray(const ConstantDataArray &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantDataArray(const ConstantDataArray &) LLVM_DELETED_FUNCTION;
virtual void anchor();
friend class ConstantDataSequential;
explicit ConstantDataArray(Type *ty, const char *Data)
@@ -695,7 +684,6 @@ public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
///
- static bool classof(const ConstantDataArray *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantDataArrayVal;
}
@@ -708,8 +696,8 @@ public:
/// operands because it stores all of the elements of the constant as densely
/// packed data, instead of as Value*'s.
class ConstantDataVector : public ConstantDataSequential {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantDataVector(const ConstantDataVector &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantDataVector(const ConstantDataVector &) LLVM_DELETED_FUNCTION;
virtual void anchor();
friend class ConstantDataSequential;
explicit ConstantDataVector(Type *ty, const char *Data)
@@ -749,7 +737,6 @@ public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
///
- static bool classof(const ConstantDataVector *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == ConstantDataVectorVal;
}
@@ -760,7 +747,7 @@ public:
/// BlockAddress - The address of a basic block.
///
class BlockAddress : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void *operator new(size_t s) { return User::operator new(s, 2); }
BlockAddress(Function *F, BasicBlock *BB);
public:
@@ -781,7 +768,6 @@ public:
virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const BlockAddress *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == BlockAddressVal;
}
@@ -1094,7 +1080,6 @@ public:
virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ConstantExpr *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == ConstantExprVal;
}
@@ -1125,8 +1110,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantExpr, Constant)
/// LangRef.html#undefvalues for details.
///
class UndefValue : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- UndefValue(const UndefValue &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ UndefValue(const UndefValue &) LLVM_DELETED_FUNCTION;
protected:
explicit UndefValue(Type *T) : Constant(T, UndefValueVal, 0, 0) {}
protected:
@@ -1159,7 +1144,6 @@ public:
virtual void destroyConstant();
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const UndefValue *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == UndefValueVal;
}
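
The LLVM_DELETED_FUNCTION macro used throughout these hunks comes from llvm/Support/Compiler.h; the following is only a sketch of the idiom it implements, with an assumed C++11 guard rather than the project's exact definition.

    // Illustrative sketch of the deleted-function idiom; the real macro lives
    // in llvm/Support/Compiler.h and its exact guards may differ.
    #if __cplusplus >= 201103L
    # define LLVM_DELETED_FUNCTION = delete   // C++11: any use is a compile error
    #else
    # define LLVM_DELETED_FUNCTION            // C++03: declared but never defined,
                                              // so misuse fails at link time
    #endif

    class Uncopyable {
      Uncopyable(const Uncopyable &) LLVM_DELETED_FUNCTION;
      void operator=(const Uncopyable &) LLVM_DELETED_FUNCTION;
    public:
      Uncopyable() {}
    };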
diff --git a/include/llvm/DIBuilder.h b/include/llvm/DIBuilder.h
index 2ed48a944e96..2f0780068087 100644
--- a/include/llvm/DIBuilder.h
+++ b/include/llvm/DIBuilder.h
@@ -63,8 +63,8 @@ namespace llvm {
SmallVector<Value *, 4> AllSubprograms;
SmallVector<Value *, 4> AllGVs;
- DIBuilder(const DIBuilder &); // DO NOT IMPLEMENT
- void operator=(const DIBuilder &); // DO NOT IMPLEMENT
+ DIBuilder(const DIBuilder &) LLVM_DELETED_FUNCTION;
+ void operator=(const DIBuilder &) LLVM_DELETED_FUNCTION;
public:
explicit DIBuilder(Module &M);
@@ -179,8 +179,10 @@ namespace llvm {
/// @param Ty Parent type.
/// @param PropertyName Name of the Objective C property associated with
/// this ivar.
- /// @param GetterName Name of the Objective C property getter selector.
- /// @param SetterName Name of the Objective C property setter selector.
+ /// @param PropertyGetterName Name of the Objective C property getter
+ /// selector.
+ /// @param PropertySetterName Name of the Objective C property setter
+ /// selector.
/// @param PropertyAttributes Objective C property attributes.
DIType createObjCIVar(StringRef Name, DIFile File,
unsigned LineNo, uint64_t SizeInBits,
@@ -201,7 +203,7 @@ namespace llvm {
/// @param OffsetInBits Member offset.
/// @param Flags Flags to encode member attribute, e.g. private
/// @param Ty Parent type.
- /// @param Property Property associated with this ivar.
+ /// @param PropertyNode Property associated with this ivar.
DIType createObjCIVar(StringRef Name, DIFile File,
unsigned LineNo, uint64_t SizeInBits,
uint64_t AlignInBits, uint64_t OffsetInBits,
@@ -228,7 +230,7 @@ namespace llvm {
/// @param Scope Scope in which this class is defined.
/// @param Name class name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param OffsetInBits Member offset.
@@ -250,7 +252,7 @@ namespace llvm {
/// @param Scope Scope in which this struct is defined.
/// @param Name Struct name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Flags Flags to encode member attribute, e.g. private
@@ -265,7 +267,7 @@ namespace llvm {
/// @param Scope Scope in which this union is defined.
/// @param Name Union name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Flags Flags to encode member attribute, e.g. private
@@ -325,33 +327,36 @@ namespace llvm {
/// @param Scope Scope in which this enumeration is defined.
/// @param Name Union name.
/// @param File File where this member is defined.
- /// @param LineNo Line number.
+ /// @param LineNumber Line number.
/// @param SizeInBits Member size.
/// @param AlignInBits Member alignment.
/// @param Elements Enumeration elements.
- /// @param Flags Flags (e.g. forward decl)
DIType createEnumerationType(DIDescriptor Scope, StringRef Name,
DIFile File, unsigned LineNumber,
uint64_t SizeInBits, uint64_t AlignInBits,
- DIArray Elements, DIType ClassType,
- unsigned Flags);
+ DIArray Elements, DIType ClassType);
/// createSubroutineType - Create subroutine type.
- /// @param File File in which this subroutine is defined.
- /// @param ParamterTypes An array of subroutine parameter types. This
- /// includes return type at 0th index.
+ /// @param File File in which this subroutine is defined.
+ /// @param ParameterTypes An array of subroutine parameter types. This
+ /// includes return type at 0th index.
DIType createSubroutineType(DIFile File, DIArray ParameterTypes);
/// createArtificialType - Create a new DIType with "artificial" flag set.
DIType createArtificialType(DIType Ty);
+ /// createObjectPointerType - Create a new DIType with the "object pointer"
+ /// flag set.
+ DIType createObjectPointerType(DIType Ty);
+
/// createTemporaryType - Create a temporary forward-declared type.
DIType createTemporaryType();
DIType createTemporaryType(DIFile F);
/// createForwardDecl - Create a temporary forward-declared type.
DIType createForwardDecl(unsigned Tag, StringRef Name, DIDescriptor Scope,
- DIFile F, unsigned Line, unsigned RuntimeLang = 0);
+ DIFile F, unsigned Line, unsigned RuntimeLang = 0,
+ uint64_t SizeInBits = 0, uint64_t AlignInBits = 0);
/// retainType - Retain DIType in a module even if it is not referenced
/// through debug info anchors.
@@ -383,9 +388,9 @@ namespace llvm {
/// createStaticVariable - Create a new descriptor for the specified
/// variable.
- /// @param Conext Variable scope.
+ /// @param Context Variable scope.
/// @param Name Name of the variable.
- /// @param LinakgeName Mangled name of the variable.
+ /// @param LinkageName Mangled name of the variable.
/// @param File File where this variable is defined.
/// @param LineNo Line number.
/// @param Ty Variable Type.
@@ -426,7 +431,7 @@ namespace llvm {
/// DW_TAG_arg_variable.
/// @param Scope Variable scope.
/// @param Name Variable name.
- /// @param File File where this variable is defined.
+ /// @param F File where this variable is defined.
/// @param LineNo Line number.
/// @param Ty Variable Type
/// @param Addr An array of complex address operations.
diff --git a/include/llvm/Target/TargetData.h b/include/llvm/DataLayout.h
index 4f94ab751cb6..24ad05f17f39 100644
--- a/include/llvm/Target/TargetData.h
+++ b/include/llvm/DataLayout.h
@@ -1,4 +1,4 @@
-//===-- llvm/Target/TargetData.h - Data size & alignment info ---*- C++ -*-===//
+//===--------- llvm/DataLayout.h - Data size & alignment info ---*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines target properties related to datatype size/offset/alignment
+// This file defines layout properties related to datatype size/offset/alignment
// information. It uses lazy annotations to cache information about how
// structure types are laid out and used.
//
@@ -17,11 +17,12 @@
//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_TARGET_TARGETDATA_H
-#define LLVM_TARGET_TARGETDATA_H
+#ifndef LLVM_DATALAYOUT_H
+#define LLVM_DATALAYOUT_H
#include "llvm/Pass.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -36,7 +37,7 @@ class LLVMContext;
template<typename T>
class ArrayRef;
-/// Enum used to categorize the alignment types stored by TargetAlignElem
+/// Enum used to categorize the alignment types stored by LayoutAlignElem
enum AlignTypeEnum {
INTEGER_ALIGN = 'i', ///< Integer type alignment
VECTOR_ALIGN = 'v', ///< Vector type alignment
@@ -45,38 +46,55 @@ enum AlignTypeEnum {
STACK_ALIGN = 's' ///< Stack objects alignment
};
-/// Target alignment element.
+/// Layout alignment element.
///
-/// Stores the alignment data associated with a given alignment type (pointer,
-/// integer, vector, float) and type bit width.
+/// Stores the alignment data associated with a given alignment type (integer,
+/// vector, float) and type bit width.
///
/// @note The unusual order of elements in the structure attempts to reduce
/// padding and make the structure slightly more cache friendly.
-struct TargetAlignElem {
- AlignTypeEnum AlignType : 8; ///< Alignment type (AlignTypeEnum)
+struct LayoutAlignElem {
+ unsigned AlignType : 8; ///< Alignment type (AlignTypeEnum)
+ unsigned TypeBitWidth : 24; ///< Type bit width
+ unsigned ABIAlign : 16; ///< ABI alignment for this type/bitw
+ unsigned PrefAlign : 16; ///< Pref. alignment for this type/bitw
+
+ /// Initializer
+ static LayoutAlignElem get(AlignTypeEnum align_type, unsigned abi_align,
+ unsigned pref_align, uint32_t bit_width);
+ /// Equality predicate
+ bool operator==(const LayoutAlignElem &rhs) const;
+};
+
+/// Layout pointer alignment element.
+///
+/// Stores the alignment data associated with a given pointer and address space.
+///
+/// @note The unusual order of elements in the structure attempts to reduce
+/// padding and make the structure slightly more cache friendly.
+struct PointerAlignElem {
unsigned ABIAlign; ///< ABI alignment for this type/bitw
unsigned PrefAlign; ///< Pref. alignment for this type/bitw
uint32_t TypeBitWidth; ///< Type bit width
+ uint32_t AddressSpace; ///< Address space for the pointer type
/// Initializer
- static TargetAlignElem get(AlignTypeEnum align_type, unsigned abi_align,
+ static PointerAlignElem get(uint32_t addr_space, unsigned abi_align,
unsigned pref_align, uint32_t bit_width);
/// Equality predicate
- bool operator==(const TargetAlignElem &rhs) const;
+ bool operator==(const PointerAlignElem &rhs) const;
};
-/// TargetData - This class holds a parsed version of the target data layout
+
+/// DataLayout - This class holds a parsed version of the target data layout
/// string in a module and provides methods for querying it. The target data
/// layout string is specified *by the target* - a frontend generating LLVM IR
/// is required to generate the right target data for the target being codegen'd
/// to. If some measure of portability is desired, an empty string may be
/// specified in the module.
-class TargetData : public ImmutablePass {
+class DataLayout : public ImmutablePass {
private:
bool LittleEndian; ///< Defaults to false
- unsigned PointerMemSize; ///< Pointer size in bytes
- unsigned PointerABIAlign; ///< Pointer ABI alignment
- unsigned PointerPrefAlign; ///< Pointer preferred alignment
unsigned StackNaturalAlign; ///< Stack natural alignment
SmallVector<unsigned char, 8> LegalIntWidths; ///< Legal Integers.
@@ -85,13 +103,18 @@ private:
///
/// @sa init().
/// @note Could support multiple size pointer alignments, e.g., 32-bit
- /// pointers vs. 64-bit pointers by extending TargetAlignment, but for now,
+ /// pointers vs. 64-bit pointers by extending LayoutAlignment, but for now,
/// we don't.
- SmallVector<TargetAlignElem, 16> Alignments;
+ SmallVector<LayoutAlignElem, 16> Alignments;
+ DenseMap<unsigned, PointerAlignElem> Pointers;
/// InvalidAlignmentElem - This member is a signal that a requested alignment
/// type and bit width were not found in the SmallVector.
- static const TargetAlignElem InvalidAlignmentElem;
+ static const LayoutAlignElem InvalidAlignmentElem;
+
+ /// InvalidPointerElem - This member is a signal that a requested pointer
+  /// type and bit width were not found in the DenseMap.
+ static const PointerAlignElem InvalidPointerElem;
// The StructType -> StructLayout map.
mutable void *LayoutMap;
@@ -101,18 +124,31 @@ private:
unsigned pref_align, uint32_t bit_width);
unsigned getAlignmentInfo(AlignTypeEnum align_type, uint32_t bit_width,
bool ABIAlign, Type *Ty) const;
+
+ //! Set/initialize pointer alignments
+ void setPointerAlignment(uint32_t addr_space, unsigned abi_align,
+ unsigned pref_align, uint32_t bit_width);
+
//! Internal helper method that returns requested alignment for type.
unsigned getAlignment(Type *Ty, bool abi_or_pref) const;
/// Valid alignment predicate.
///
- /// Predicate that tests a TargetAlignElem reference returned by get() against
+ /// Predicate that tests a LayoutAlignElem reference returned by get() against
/// InvalidAlignmentElem.
- bool validAlignment(const TargetAlignElem &align) const {
+ bool validAlignment(const LayoutAlignElem &align) const {
return &align != &InvalidAlignmentElem;
}
- /// Initialise a TargetData object with default values, ensure that the
+ /// Valid pointer predicate.
+ ///
+ /// Predicate that tests a PointerAlignElem reference returned by get() against
+ /// InvalidPointerElem.
+ bool validPointer(const PointerAlignElem &align) const {
+ return &align != &InvalidPointerElem;
+ }
+
+ /// Initialise a DataLayout object with default values, ensure that the
/// target data pass is registered.
void init();
@@ -121,43 +157,42 @@ public:
///
/// @note This has to exist, because this is a pass, but it should never be
/// used.
- TargetData();
+ DataLayout();
- /// Constructs a TargetData from a specification string. See init().
- explicit TargetData(StringRef TargetDescription)
+ /// Constructs a DataLayout from a specification string. See init().
+ explicit DataLayout(StringRef LayoutDescription)
: ImmutablePass(ID) {
- std::string errMsg = parseSpecifier(TargetDescription, this);
+ std::string errMsg = parseSpecifier(LayoutDescription, this);
assert(errMsg == "" && "Invalid target data layout string.");
(void)errMsg;
}
/// Parses a target data specification string. Returns an error message
/// if the string is malformed, or the empty string on success. Optionally
- /// initialises a TargetData object if passed a non-null pointer.
- static std::string parseSpecifier(StringRef TargetDescription, TargetData* td = 0);
+ /// initialises a DataLayout object if passed a non-null pointer.
+ static std::string parseSpecifier(StringRef LayoutDescription,
+ DataLayout* td = 0);
/// Initialize target data from properties stored in the module.
- explicit TargetData(const Module *M);
+ explicit DataLayout(const Module *M);
- TargetData(const TargetData &TD) :
+ DataLayout(const DataLayout &TD) :
ImmutablePass(ID),
LittleEndian(TD.isLittleEndian()),
- PointerMemSize(TD.PointerMemSize),
- PointerABIAlign(TD.PointerABIAlign),
- PointerPrefAlign(TD.PointerPrefAlign),
LegalIntWidths(TD.LegalIntWidths),
Alignments(TD.Alignments),
+ Pointers(TD.Pointers),
LayoutMap(0)
{ }
- ~TargetData(); // Not virtual, do not subclass this class
+ ~DataLayout(); // Not virtual, do not subclass this class
- /// Target endianness...
+ /// Layout endianness...
bool isLittleEndian() const { return LittleEndian; }
bool isBigEndian() const { return !LittleEndian; }
/// getStringRepresentation - Return the string representation of the
- /// TargetData. This representation is in the same format accepted by the
+ /// DataLayout. This representation is in the same format accepted by the
/// string constructor above.
std::string getStringRepresentation() const;
@@ -195,15 +230,42 @@ public:
return false;
}
- /// Target pointer alignment
- unsigned getPointerABIAlignment() const { return PointerABIAlign; }
+ /// Layout pointer alignment
+ /// FIXME: The defaults need to be removed once all of
+ /// the backends/clients are updated.
+ unsigned getPointerABIAlignment(unsigned AS = 0) const {
+ DenseMap<unsigned, PointerAlignElem>::const_iterator val = Pointers.find(AS);
+ if (val == Pointers.end()) {
+ val = Pointers.find(0);
+ }
+ return val->second.ABIAlign;
+ }
/// Return target's alignment for stack-based pointers
- unsigned getPointerPrefAlignment() const { return PointerPrefAlign; }
- /// Target pointer size
- unsigned getPointerSize() const { return PointerMemSize; }
- /// Target pointer size, in bits
- unsigned getPointerSizeInBits() const { return 8*PointerMemSize; }
-
+ /// FIXME: The defaults need to be removed once all of
+ /// the backends/clients are updated.
+ unsigned getPointerPrefAlignment(unsigned AS = 0) const {
+ DenseMap<unsigned, PointerAlignElem>::const_iterator val = Pointers.find(AS);
+ if (val == Pointers.end()) {
+ val = Pointers.find(0);
+ }
+ return val->second.PrefAlign;
+ }
+ /// Layout pointer size
+ /// FIXME: The defaults need to be removed once all of
+ /// the backends/clients are updated.
+ unsigned getPointerSize(unsigned AS = 0) const {
+ DenseMap<unsigned, PointerAlignElem>::const_iterator val = Pointers.find(AS);
+ if (val == Pointers.end()) {
+ val = Pointers.find(0);
+ }
+ return val->second.TypeBitWidth;
+ }
+ /// Layout pointer size, in bits
+ /// FIXME: The defaults need to be removed once all of
+ /// the backends/clients are updated.
+ unsigned getPointerSizeInBits(unsigned AS = 0) const {
+ return getPointerSize(AS) * 8;
+ }
/// Size examples:
///
/// Type SizeInBits StoreSizeInBits AllocSizeInBits[*]
@@ -279,10 +341,14 @@ public:
///
unsigned getPreferredTypeAlignmentShift(Type *Ty) const;
- /// getIntPtrType - Return an unsigned integer type that is the same size or
- /// greater to the host pointer size.
- ///
- IntegerType *getIntPtrType(LLVMContext &C) const;
+ /// getIntPtrType - Return an integer type with size at least as big as that
+ /// of a pointer in the given address space.
+ IntegerType *getIntPtrType(LLVMContext &C, unsigned AddressSpace = 0) const;
+
+ /// getIntPtrType - Return an integer (vector of integer) type with size at
+ /// least as big as that of a pointer of the given pointer (vector of pointer)
+ /// type.
+ Type *getIntPtrType(Type *) const;
/// getIndexedOffset - return the offset from the beginning of the type for
/// the specified indices. This is used to implement getelementptr.
@@ -318,7 +384,7 @@ public:
};
/// StructLayout - used to lazily calculate structure layout information for a
-/// target machine, based on the TargetData structure.
+/// target machine, based on the DataLayout structure.
///
class StructLayout {
uint64_t StructSize;
@@ -354,8 +420,8 @@ public:
}
private:
- friend class TargetData; // Only TargetData can create this class
- StructLayout(StructType *ST, const TargetData &TD);
+ friend class DataLayout; // Only DataLayout can create this class
+ StructLayout(StructType *ST, const DataLayout &TD);
};
} // End llvm namespace
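
A minimal usage sketch of the address-space-aware queries added above. The concrete layout string, and the p1:... syntax for describing a second address space, are assumptions for illustration rather than something specified by this patch.

    #include "llvm/DataLayout.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/LLVMContext.h"
    using namespace llvm;

    int main() {
      // Hypothetical spec: 64-bit pointers in address space 0, 32-bit in AS 1.
      DataLayout DL("e-p:64:64:64-p1:32:32:32-i32:32:32-n32:64");
      unsigned DefaultPtrBits = DL.getPointerSizeInBits();    // address space 0
      unsigned SmallPtrBits   = DL.getPointerSizeInBits(1);   // address space 1
      unsigned ABIAlign       = DL.getPointerABIAlignment(1); // per-AS alignment
      LLVMContext Ctx;
      IntegerType *IntPtrTy = DL.getIntPtrType(Ctx, 1);       // integer wide enough
      (void)DefaultPtrBits; (void)SmallPtrBits; (void)ABIAlign; (void)IntPtrTy;
      return 0;
    }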
diff --git a/include/llvm/DebugInfo.h b/include/llvm/DebugInfo.h
index 618220fcb010..dae03ad10095 100644
--- a/include/llvm/DebugInfo.h
+++ b/include/llvm/DebugInfo.h
@@ -60,7 +60,8 @@ namespace llvm {
FlagArtificial = 1 << 6,
FlagExplicit = 1 << 7,
FlagPrototyped = 1 << 8,
- FlagObjcClassComplete = 1 << 9
+ FlagObjcClassComplete = 1 << 9,
+ FlagObjectPointer = 1 << 10
};
protected:
const MDNode *DbgNode;
@@ -80,6 +81,7 @@ namespace llvm {
GlobalVariable *getGlobalVariableField(unsigned Elt) const;
Constant *getConstantField(unsigned Elt) const;
Function *getFunctionField(unsigned Elt) const;
+ void replaceFunctionField(unsigned Elt, Function *F);
public:
explicit DIDescriptor() : DbgNode(0) {}
@@ -287,6 +289,9 @@ namespace llvm {
bool isArtificial() const {
return (getFlags() & FlagArtificial) != 0;
}
+ bool isObjectPointer() const {
+ return (getFlags() & FlagObjectPointer) != 0;
+ }
bool isObjcClassComplete() const {
return (getFlags() & FlagObjcClassComplete) != 0;
}
@@ -558,6 +563,7 @@ namespace llvm {
bool describes(const Function *F);
Function *getFunction() const { return getFunctionField(16); }
+ void replaceFunction(Function *F) { replaceFunctionField(16, F); }
DIArray getTemplateParams() const { return getFieldAs<DIArray>(17); }
DISubprogram getFunctionDeclaration() const {
return getFieldAs<DISubprogram>(18);
@@ -644,6 +650,10 @@ namespace llvm {
return (getUnsignedField(6) & FlagArtificial) != 0;
}
+ bool isObjectPointer() const {
+ return (getUnsignedField(6) & FlagObjectPointer) != 0;
+ }
+
/// getInlinedAt - If this variable is inlined then return inline location.
MDNode *getInlinedAt() const;
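
A small sketch of querying the new object-pointer flag; it assumes a DIVariable that was built with FlagObjectPointer set (for example for an Objective-C 'self' or C++ 'this' parameter).

    #include "llvm/DebugInfo.h"

    // Sketch: variables carrying FlagObjectPointer can now be recognized
    // directly from their descriptor.
    static bool isImplicitObjectArg(const llvm::DIVariable &Var) {
      return Var.isObjectPointer();
    }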
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index cfdeb46889e5..26bd1f627526 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h
@@ -15,6 +15,8 @@
#ifndef LLVM_DEBUGINFO_DICONTEXT_H
#define LLVM_DEBUGINFO_DICONTEXT_H
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
@@ -54,6 +56,23 @@ public:
}
};
+/// DIInliningInfo - a format-neutral container for inlined code description.
+class DIInliningInfo {
+ SmallVector<DILineInfo, 4> Frames;
+ public:
+ DIInliningInfo() {}
+ DILineInfo getFrame(unsigned Index) const {
+ assert(Index < Frames.size());
+ return Frames[Index];
+ }
+ uint32_t getNumberOfFrames() const {
+ return Frames.size();
+ }
+ void addFrame(const DILineInfo &Frame) {
+ Frames.push_back(Frame);
+ }
+};
+
/// DILineInfoSpecifier - controls which fields of DILineInfo container
/// should be filled with data.
class DILineInfoSpecifier {
@@ -71,6 +90,13 @@ public:
}
};
+// In place of applying the relocations to the data we've read from disk, we
+// keep a separate mapping table to the side and consult it at the locations
+// in the DWARF where we expect relocated values. This adds a bit of
+// complexity to the DWARF parsing/extraction, with the benefit of not
+// allocating memory for the entire size of the debug info sections.
+typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;
+
class DIContext {
public:
virtual ~DIContext();
@@ -81,12 +107,16 @@ public:
StringRef abbrevSection,
StringRef aRangeSection = StringRef(),
StringRef lineSection = StringRef(),
- StringRef stringSection = StringRef());
+ StringRef stringSection = StringRef(),
+ StringRef rangeSection = StringRef(),
+ const RelocAddrMap &Map = RelocAddrMap());
virtual void dump(raw_ostream &OS) = 0;
- virtual DILineInfo getLineInfoForAddress(uint64_t address,
- DILineInfoSpecifier specifier = DILineInfoSpecifier()) = 0;
+ virtual DILineInfo getLineInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
+ virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier()) = 0;
};
}
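
A hedged sketch of walking the new inlining information. It assumes a DIContext created over the relevant DWARF sections and that DILineInfo exposes getFileName()/getLine() accessors as in the unchanged parts of this header.

    #include "llvm/DebugInfo/DIContext.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void dumpInlinedFrames(DIContext &Ctx, uint64_t PC) {
      DIInliningInfo II = Ctx.getInliningInfoForAddress(PC);
      for (uint32_t i = 0, e = II.getNumberOfFrames(); i != e; ++i) {
        DILineInfo Frame = II.getFrame(i);   // frame 0 is the innermost inlinee
        outs() << Frame.getFileName() << ':' << Frame.getLine() << '\n';
      }
    }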
diff --git a/include/llvm/DefaultPasses.h b/include/llvm/DefaultPasses.h
index 929569d543d9..9f1ade86aba6 100644
--- a/include/llvm/DefaultPasses.h
+++ b/include/llvm/DefaultPasses.h
@@ -14,7 +14,7 @@
#ifndef LLVM_DEFAULT_PASS_SUPPORT_H
#define LLVM_DEFAULT_PASS_SUPPORT_H
-#include <llvm/PassSupport.h>
+#include "llvm/PassSupport.h"
namespace llvm {
diff --git a/include/llvm/DerivedTypes.h b/include/llvm/DerivedTypes.h
index da5ad27b1f1c..c862c2c8bb20 100644
--- a/include/llvm/DerivedTypes.h
+++ b/include/llvm/DerivedTypes.h
@@ -20,6 +20,7 @@
#include "llvm/Type.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -84,7 +85,6 @@ public:
bool isPowerOf2ByteWidth() const;
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const IntegerType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == IntegerTyID;
}
@@ -94,8 +94,8 @@ public:
/// FunctionType - Class to represent function types
///
class FunctionType : public Type {
- FunctionType(const FunctionType &); // Do not implement
- const FunctionType &operator=(const FunctionType &); // Do not implement
+ FunctionType(const FunctionType &) LLVM_DELETED_FUNCTION;
+ const FunctionType &operator=(const FunctionType &) LLVM_DELETED_FUNCTION;
FunctionType(Type *Result, ArrayRef<Type*> Params, bool IsVarArgs);
public:
@@ -133,7 +133,6 @@ public:
unsigned getNumParams() const { return NumContainedTys - 1; }
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const FunctionType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == FunctionTyID;
}
@@ -156,7 +155,6 @@ public:
bool indexValid(unsigned Idx) const;
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const CompositeType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == ArrayTyID ||
T->getTypeID() == StructTyID ||
@@ -183,12 +181,12 @@ public:
/// Independent of what kind of struct you have, the body of a struct type are
/// laid out in memory consequtively with the elements directly one after the
/// other (if the struct is packed) or (if not packed) with padding between the
-/// elements as defined by TargetData (which is required to match what the code
+/// elements as defined by DataLayout (which is required to match what the code
/// generator for a target expects).
///
class StructType : public CompositeType {
- StructType(const StructType &); // Do not implement
- const StructType &operator=(const StructType &); // Do not implement
+ StructType(const StructType &) LLVM_DELETED_FUNCTION;
+ const StructType &operator=(const StructType &) LLVM_DELETED_FUNCTION;
StructType(LLVMContext &C)
: CompositeType(C, StructTyID), SymbolTableEntry(0) {}
enum {
@@ -292,7 +290,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const StructType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == StructTyID;
}
@@ -308,8 +305,8 @@ public:
///
class SequentialType : public CompositeType {
Type *ContainedType; ///< Storage for the single contained type.
- SequentialType(const SequentialType &); // Do not implement!
- const SequentialType &operator=(const SequentialType &); // Do not implement!
+ SequentialType(const SequentialType &) LLVM_DELETED_FUNCTION;
+ const SequentialType &operator=(const SequentialType &) LLVM_DELETED_FUNCTION;
protected:
SequentialType(TypeID TID, Type *ElType)
@@ -322,7 +319,6 @@ public:
Type *getElementType() const { return ContainedTys[0]; }
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const SequentialType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == ArrayTyID ||
T->getTypeID() == PointerTyID ||
@@ -336,8 +332,8 @@ public:
class ArrayType : public SequentialType {
uint64_t NumElements;
- ArrayType(const ArrayType &); // Do not implement
- const ArrayType &operator=(const ArrayType &); // Do not implement
+ ArrayType(const ArrayType &) LLVM_DELETED_FUNCTION;
+ const ArrayType &operator=(const ArrayType &) LLVM_DELETED_FUNCTION;
ArrayType(Type *ElType, uint64_t NumEl);
public:
/// ArrayType::get - This static method is the primary way to construct an
@@ -352,7 +348,6 @@ public:
uint64_t getNumElements() const { return NumElements; }
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const ArrayType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == ArrayTyID;
}
@@ -363,8 +358,8 @@ public:
class VectorType : public SequentialType {
unsigned NumElements;
- VectorType(const VectorType &); // Do not implement
- const VectorType &operator=(const VectorType &); // Do not implement
+ VectorType(const VectorType &) LLVM_DELETED_FUNCTION;
+ const VectorType &operator=(const VectorType &) LLVM_DELETED_FUNCTION;
VectorType(Type *ElType, unsigned NumEl);
public:
/// VectorType::get - This static method is the primary way to construct an
@@ -419,7 +414,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VectorType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == VectorTyID;
}
@@ -429,8 +423,8 @@ public:
/// PointerType - Class to represent pointers.
///
class PointerType : public SequentialType {
- PointerType(const PointerType &); // Do not implement
- const PointerType &operator=(const PointerType &); // Do not implement
+ PointerType(const PointerType &) LLVM_DELETED_FUNCTION;
+ const PointerType &operator=(const PointerType &) LLVM_DELETED_FUNCTION;
explicit PointerType(Type *ElType, unsigned AddrSpace);
public:
/// PointerType::get - This constructs a pointer to an object of the specified
@@ -451,7 +445,6 @@ public:
inline unsigned getAddressSpace() const { return getSubclassData(); }
// Implement support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const PointerType *) { return true; }
static inline bool classof(const Type *T) {
return T->getTypeID() == PointerTyID;
}
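
The removed classof(const X *) overloads were redundant: isa<> and dyn_cast<> only need the classof(const Type *) form, so existing call sites are unchanged. A minimal example of such a call site:

    #include "llvm/DerivedTypes.h"
    #include "llvm/Support/Casting.h"
    using namespace llvm;

    // isa<>/dyn_cast<> dispatch through classof(const Type *), so dropping the
    // trivial self-overloads above does not change this code.
    static unsigned addressSpaceOf(Type *Ty) {
      if (PointerType *PTy = dyn_cast<PointerType>(Ty))
        return PTy->getAddressSpace();
      return 0;
    }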
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index ae8b68d0241e..8073d8f92c51 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -42,7 +42,7 @@ class JITMemoryManager;
class MachineCodeInfo;
class Module;
class MutexGuard;
-class TargetData;
+class DataLayout;
class Triple;
class Type;
@@ -88,7 +88,7 @@ public:
/// \brief Erase an entry from the mapping table.
///
- /// \returns The address that \arg ToUnmap was happed to.
+  /// \returns The address that \p ToUnmap was mapped to.
void *RemoveMapping(const MutexGuard &, const GlobalValue *ToUnmap);
};
@@ -104,7 +104,7 @@ class ExecutionEngine {
ExecutionEngineState EEState;
/// The target data for the platform for which execution is being performed.
- const TargetData *TD;
+ const DataLayout *TD;
/// Whether lazy JIT compilation is enabled.
bool CompilingLazily;
@@ -123,7 +123,7 @@ protected:
/// optimize for the case where there is only one module.
SmallVector<Module*, 1> Modules;
- void setTargetData(const TargetData *td) { TD = td; }
+ void setDataLayout(const DataLayout *td) { TD = td; }
/// getMemoryforGV - Allocate memory for a global variable.
virtual char *getMemoryForGV(const GlobalVariable *GV);
@@ -213,7 +213,7 @@ public:
//===--------------------------------------------------------------------===//
- const TargetData *getTargetData() const { return TD; }
+ const DataLayout *getDataLayout() const { return TD; }
/// removeModule - Remove a Module from the list of modules. Returns true if
/// M is found.
@@ -244,11 +244,18 @@ public:
/// Map the address of a JIT section as returned from the memory manager
/// to the address in the target process as the running code will see it.
/// This is the address which will be used for relocation resolution.
- virtual void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress) {
+ virtual void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress) {
llvm_unreachable("Re-mapping of section addresses not supported with this "
"EE!");
}
+ // finalizeObject - This method should be called after sections within an
+ // object have been relocated using mapSectionAddress. When this method is
+ // called the MCJIT execution engine will reapply relocations for a loaded
+ // object. This method has no effect for the legacy JIT engine or the
+  // interpreter.
+ virtual void finalizeObject() {}
+
/// runStaticConstructorsDestructors - This method is used to execute all of
/// the static constructors or destructors for a program.
///
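
A sketch of the remote-target flow these two hooks support: map each emitted section to its address in the target process, then ask the engine to re-resolve relocations. EE is assumed to be an MCJIT-backed ExecutionEngine, and the local/remote addresses come from whatever transport the client uses.

    #include "llvm/ExecutionEngine/ExecutionEngine.h"

    static void relocateForTarget(llvm::ExecutionEngine &EE,
                                  const void *LocalSection,
                                  uint64_t RemoteAddr) {
      // Record where the section will live in the target process...
      EE.mapSectionAddress(LocalSection, RemoteAddr);
      // ...then re-apply relocations against the remapped addresses.
      EE.finalizeObject();
    }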
diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index eea603fcee2c..e6586e778c19 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h
@@ -26,6 +26,7 @@ class Function;
class MachineFunction;
class OProfileWrapper;
class IntelJITEventsWrapper;
+class ObjectImage;
/// JITEvent_EmittedFunctionDetails - Helper struct for containing information
/// about a generated machine code function.
@@ -76,6 +77,20 @@ public:
/// matching NotifyFreeingMachineCode call.
virtual void NotifyFreeingMachineCode(void *) {}
+ /// NotifyObjectEmitted - Called after an object has been successfully
+ /// emitted to memory. NotifyFunctionEmitted will not be called for
+ /// individual functions in the object.
+ ///
+ /// ELF-specific information
+ /// The ObjectImage contains the generated object image
+ /// with section headers updated to reflect the address at which sections
+ /// were loaded and with relocations performed in-place on debug sections.
+ virtual void NotifyObjectEmitted(const ObjectImage &Obj) {}
+
+ /// NotifyFreeingObject - Called just before the memory associated with
+ /// a previously emitted object is released.
+ virtual void NotifyFreeingObject(const ObjectImage &Obj) {}
+
#if LLVM_USE_INTEL_JITEVENTS
// Construct an IntelJITEventListener
static JITEventListener *createIntelJITEventListener();
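
A minimal sketch of a listener that only implements the new object-level callbacks; the class name and output are illustrative only.

    #include "llvm/ExecutionEngine/JITEventListener.h"
    #include "llvm/ExecutionEngine/ObjectImage.h"
    #include "llvm/Support/raw_ostream.h"

    class ObjectLoggingListener : public llvm::JITEventListener {
    public:
      virtual void NotifyObjectEmitted(const llvm::ObjectImage &Obj) {
        llvm::errs() << "object emitted: " << Obj.getData().size() << " bytes\n";
      }
      virtual void NotifyFreeingObject(const llvm::ObjectImage &Obj) {
        llvm::errs() << "object about to be freed\n";
      }
    };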
diff --git a/include/llvm/ExecutionEngine/JITMemoryManager.h b/include/llvm/ExecutionEngine/JITMemoryManager.h
index 4c75b6ab970e..90896465018c 100644
--- a/include/llvm/ExecutionEngine/JITMemoryManager.h
+++ b/include/llvm/ExecutionEngine/JITMemoryManager.h
@@ -10,7 +10,9 @@
#ifndef LLVM_EXECUTION_ENGINE_JIT_MEMMANAGER_H
#define LLVM_EXECUTION_ENGINE_JIT_MEMMANAGER_H
+#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Support/DataTypes.h"
+
#include <string>
namespace llvm {
@@ -22,7 +24,7 @@ namespace llvm {
/// memory for the code generated by the JIT. This can be reimplemented by
/// clients that have a strong desire to control how the layout of JIT'd memory
/// works.
-class JITMemoryManager {
+class JITMemoryManager : public RTDyldMemoryManager {
protected:
bool HasGOT;
@@ -47,17 +49,6 @@ public:
/// debugging, and may be turned on by default in debug mode.
virtual void setPoisonMemory(bool poison) = 0;
- /// getPointerToNamedFunction - This method returns the address of the
- /// specified function. As such it is only useful for resolving library
- /// symbols, not code generated symbols.
- ///
- /// If AbortOnFailure is false and no function with the given name is
- /// found, this function silently returns a null pointer. Otherwise,
- /// it prints a message to stderr and aborts.
- ///
- virtual void *getPointerToNamedFunction(const std::string &Name,
- bool AbortOnFailure = true) = 0;
-
//===--------------------------------------------------------------------===//
// Global Offset Table Management
//===--------------------------------------------------------------------===//
@@ -112,22 +103,6 @@ public:
virtual void endFunctionBody(const Function *F, uint8_t *FunctionStart,
uint8_t *FunctionEnd) = 0;
- /// allocateCodeSection - Allocate a memory block of (at least) the given
- /// size suitable for executable code. The SectionID is a unique identifier
- /// assigned by the JIT and passed through to the memory manager for
- /// the instance class to use if it needs to communicate to the JIT about
- /// a given section after the fact.
- virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID) = 0;
-
- /// allocateDataSection - Allocate a memory block of (at least) the given
- /// size suitable for data. The SectionID is a unique identifier
- /// assigned by the JIT and passed through to the memory manager for
- /// the instance class to use if it needs to communicate to the JIT about
- /// a given section after the fact.
- virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID) = 0;
-
/// allocateSpace - Allocate a memory block of the given size. This method
/// cannot be called between calls to startFunctionBody and endFunctionBody.
virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) = 0;
diff --git a/include/llvm/ExecutionEngine/ObjectBuffer.h b/include/llvm/ExecutionEngine/ObjectBuffer.h
new file mode 100644
index 000000000000..a0a77b8ba888
--- /dev/null
+++ b/include/llvm/ExecutionEngine/ObjectBuffer.h
@@ -0,0 +1,80 @@
+//===---- ObjectBuffer.h - Utility class to wrap object image memory -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a wrapper class to hold the memory into which an
+// object will be generated.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
+#define LLVM_EXECUTIONENGINE_OBJECTBUFFER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+/// ObjectBuffer - This class acts as a container for the memory buffer used during
+/// generation and loading of executable objects using MCJIT and RuntimeDyld. The
+/// underlying memory for the object will be owned by the ObjectBuffer instance
+/// throughout its lifetime. The getMemBuffer() method provides a way to create a
+/// MemoryBuffer wrapper object instance to be owned by other classes (such as
+/// ObjectFile) as needed, but the MemoryBuffer instance returned does not own the
+/// actual memory it points to.
+class ObjectBuffer {
+public:
+ ObjectBuffer() {}
+ ObjectBuffer(MemoryBuffer* Buf) : Buffer(Buf) {}
+ virtual ~ObjectBuffer() {}
+
+ /// getMemBuffer - Like MemoryBuffer::getMemBuffer() this function
+ /// returns a pointer to an object that is owned by the caller. However,
+ /// the caller does not take ownership of the underlying memory.
+ MemoryBuffer *getMemBuffer() const {
+ return MemoryBuffer::getMemBuffer(Buffer->getBuffer(), "", false);
+ }
+
+ const char *getBufferStart() const { return Buffer->getBufferStart(); }
+ size_t getBufferSize() const { return Buffer->getBufferSize(); }
+
+protected:
+ // The memory contained in an ObjectBuffer
+ OwningPtr<MemoryBuffer> Buffer;
+};
+
+/// ObjectBufferStream - This class encapsulates the SmallVector and
+/// raw_svector_ostream needed to generate an object using MC code emission
+/// while providing a common ObjectBuffer interface for access to the
+/// memory once the object has been generated.
+class ObjectBufferStream : public ObjectBuffer {
+public:
+ ObjectBufferStream() : OS(SV) {}
+ virtual ~ObjectBufferStream() {}
+
+ raw_ostream &getOStream() { return OS; }
+ void flush()
+ {
+ OS.flush();
+
+ // Make the data accessible via the ObjectBuffer::Buffer
+ Buffer.reset(MemoryBuffer::getMemBuffer(StringRef(SV.data(), SV.size()),
+ "",
+ false));
+ }
+
+protected:
+ SmallVector<char, 4096> SV; // Working buffer into which we JIT.
+ raw_svector_ostream OS; // streaming wrapper
+};
+
+} // namespace llvm
+
+#endif
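
A hedged sketch of how this class is meant to be used with RuntimeDyld: stream an object into an ObjectBufferStream, flush it, and hand it to loadObject. EmitObjectTo is a hypothetical stand-in for whatever drives MC emission, and MemMgr for a concrete RTDyldMemoryManager; real clients keep the returned ObjectImage alive for the life of the loaded code.

    #include "llvm/ExecutionEngine/ObjectBuffer.h"
    #include "llvm/ExecutionEngine/RuntimeDyld.h"

    // Hypothetical helper that writes a relocatable object file into OS.
    void EmitObjectTo(llvm::raw_ostream &OS);

    static void *emitAndLookUpMain(llvm::RTDyldMemoryManager *MemMgr) {
      llvm::ObjectBufferStream *Buffer = new llvm::ObjectBufferStream();
      EmitObjectTo(Buffer->getOStream()); // stream the object into the buffer
      Buffer->flush();                    // expose the bytes via getBufferStart()

      llvm::RuntimeDyld Dyld(MemMgr);
      llvm::ObjectImage *Image = Dyld.loadObject(Buffer);
      if (!Image)                         // on failure, Buffer is already deleted
        return 0;
      // Image now owns Buffer; sections themselves live in MemMgr's allocations.
      return Dyld.getSymbolAddress("main");
    }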
diff --git a/include/llvm/ExecutionEngine/ObjectImage.h b/include/llvm/ExecutionEngine/ObjectImage.h
new file mode 100644
index 000000000000..82549add62e8
--- /dev/null
+++ b/include/llvm/ExecutionEngine/ObjectImage.h
@@ -0,0 +1,61 @@
+//===---- ObjectImage.h - Format independent executable object image -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a file format independent ObjectImage class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_OBJECTIMAGE_H
+#define LLVM_EXECUTIONENGINE_OBJECTIMAGE_H
+
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+
+namespace llvm {
+
+
+/// ObjectImage - A container class that represents an ObjectFile that has been
+/// or is in the process of being loaded into memory for execution.
+class ObjectImage {
+ ObjectImage() LLVM_DELETED_FUNCTION;
+ ObjectImage(const ObjectImage &other) LLVM_DELETED_FUNCTION;
+
+protected:
+ OwningPtr<ObjectBuffer> Buffer;
+
+public:
+ ObjectImage(ObjectBuffer *Input) : Buffer(Input) {}
+ virtual ~ObjectImage() {}
+
+ virtual object::symbol_iterator begin_symbols() const = 0;
+ virtual object::symbol_iterator end_symbols() const = 0;
+
+ virtual object::section_iterator begin_sections() const = 0;
+ virtual object::section_iterator end_sections() const = 0;
+
+ virtual /* Triple::ArchType */ unsigned getArch() const = 0;
+
+ // Subclasses can override these methods to update the image with loaded
+ // addresses for sections and common symbols
+ virtual void updateSectionAddress(const object::SectionRef &Sec,
+ uint64_t Addr) = 0;
+ virtual void updateSymbolAddress(const object::SymbolRef &Sym,
+ uint64_t Addr) = 0;
+
+ virtual StringRef getData() const = 0;
+
+ // Subclasses can override these methods to provide JIT debugging support
+ virtual void registerWithDebugger() = 0;
+ virtual void deregisterWithDebugger() = 0;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_OBJECTIMAGE_H
+
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index a5c9272d3ca6..891f534862f4 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -15,43 +15,55 @@
#define LLVM_RUNTIME_DYLD_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/Support/Memory.h"
namespace llvm {
class RuntimeDyldImpl;
-class MemoryBuffer;
+class ObjectImage;
// RuntimeDyld clients often want to handle the memory management of
-// what gets placed where. For JIT clients, this is an abstraction layer
-// over the JITMemoryManager, which references objects by their source
-// representations in LLVM IR.
+// what gets placed where. For JIT clients, this is the subset of
+// JITMemoryManager required for dynamic loading of binaries.
+//
// FIXME: As the RuntimeDyld fills out, additional routines will be needed
// for the varying types of objects to be allocated.
class RTDyldMemoryManager {
- RTDyldMemoryManager(const RTDyldMemoryManager&); // DO NOT IMPLEMENT
- void operator=(const RTDyldMemoryManager&); // DO NOT IMPLEMENT
+ RTDyldMemoryManager(const RTDyldMemoryManager&) LLVM_DELETED_FUNCTION;
+ void operator=(const RTDyldMemoryManager&) LLVM_DELETED_FUNCTION;
public:
RTDyldMemoryManager() {}
virtual ~RTDyldMemoryManager();
/// allocateCodeSection - Allocate a memory block of (at least) the given
- /// size suitable for executable code.
+ /// size suitable for executable code. The SectionID is a unique identifier
+ /// assigned by the JIT engine, and optionally recorded by the memory manager
+ /// to access a loaded section.
virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID) = 0;
/// allocateDataSection - Allocate a memory block of (at least) the given
- /// size suitable for data.
+ /// size suitable for data. The SectionID is a unique identifier
+ /// assigned by the JIT engine, and optionally recorded by the memory manager
+ /// to access a loaded section.
virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
unsigned SectionID) = 0;
+ /// getPointerToNamedFunction - This method returns the address of the
+ /// specified function. As such it is only useful for resolving library
+ /// symbols, not code generated symbols.
+ ///
+ /// If AbortOnFailure is false and no function with the given name is
+ /// found, this function returns a null pointer. Otherwise, it prints a
+ /// message to stderr and aborts.
virtual void *getPointerToNamedFunction(const std::string &Name,
bool AbortOnFailure = true) = 0;
};
class RuntimeDyld {
- RuntimeDyld(const RuntimeDyld &); // DO NOT IMPLEMENT
- void operator=(const RuntimeDyld &); // DO NOT IMPLEMENT
+ RuntimeDyld(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
+ void operator=(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
// RuntimeDyldImpl is the actual class. RuntimeDyld is just the public
// interface.
@@ -62,17 +74,24 @@ protected:
// Any relocations already associated with the symbol will be re-resolved.
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
public:
- RuntimeDyld(RTDyldMemoryManager*);
+ RuntimeDyld(RTDyldMemoryManager *);
~RuntimeDyld();
- /// Load an in-memory object file into the dynamic linker.
- bool loadObject(MemoryBuffer *InputBuffer);
+ /// loadObject - prepare the object contained in the input buffer for
+ /// execution. Ownership of the input buffer is transferred to the
+ /// ObjectImage instance returned from this function if successful.
+ /// In the case of load failure, the input buffer will be deleted.
+ ObjectImage *loadObject(ObjectBuffer *InputBuffer);
/// Get the address of our local copy of the symbol. This may or may not
/// be the address used for relocation (clients can copy the data around
/// and resolve relocations based on where they put it).
void *getSymbolAddress(StringRef Name);
+ /// Get the address of the target copy of the symbol. This is the address
+ /// used for relocation.
+ uint64_t getSymbolLoadAddress(StringRef Name);
+
/// Resolve the relocations for all symbols we currently know about.
void resolveRelocations();
@@ -80,7 +99,7 @@ public:
/// Map the address of a JIT section as returned from the memory manager
/// to the address in the target process as the running code will see it.
/// This is the address which will be used for relocation resolution.
- void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress);
+ void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress);
StringRef getErrorString();
};
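
A hedged sketch of the new ObjectBuffer/ObjectImage-based loading flow described in this hunk; the helper name and error handling are illustrative, and ownership of the returned ObjectImage is assumed to rest with the caller:

#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Support/ErrorHandling.h"
using namespace llvm;

// Illustrative only: load one in-memory object and resolve a symbol.
void *loadAndResolve(RTDyldMemoryManager *MemMgr, ObjectBuffer *Obj,
                     StringRef Name) {
  RuntimeDyld Dyld(MemMgr);                  // client supplies the memory manager
  ObjectImage *Image = Dyld.loadObject(Obj); // takes ownership of Obj, success or not
  if (!Image)
    report_fatal_error(Dyld.getErrorString());
  Dyld.resolveRelocations();                 // fix up all known relocations
  // Image (and the buffer it now owns) is assumed to outlive the generated
  // code; cleanup is elided here.
  return Dyld.getSymbolAddress(Name);        // local copy; cf. getSymbolLoadAddress
}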
diff --git a/include/llvm/Function.h b/include/llvm/Function.h
index fdd90d1d8faa..e211e9ab52a8 100644
--- a/include/llvm/Function.h
+++ b/include/llvm/Function.h
@@ -109,9 +109,9 @@ private:
BuildLazyArguments();
}
void BuildLazyArguments() const;
-
- Function(const Function&); // DO NOT IMPLEMENT
- void operator=(const Function&); // DO NOT IMPLEMENT
+
+ Function(const Function&) LLVM_DELETED_FUNCTION;
+ void operator=(const Function&) LLVM_DELETED_FUNCTION;
/// Function ctor - If the (optional) Module argument is specified, the
/// function is automatically inserted into the end of the function list for
@@ -168,17 +168,17 @@ public:
///
void setAttributes(const AttrListPtr &attrs) { AttributeList = attrs; }
- /// hasFnAttr - Return true if this function has the given attribute.
- bool hasFnAttr(Attributes N) const {
- // Function Attributes are stored at ~0 index
- return AttributeList.paramHasAttr(~0U, N);
+ /// getFnAttributes - Return the function attributes for querying.
+ ///
+ Attributes getFnAttributes() const {
+ return AttributeList.getFnAttributes();
}
/// addFnAttr - Add function attributes to this function.
///
- void addFnAttr(Attributes N) {
+ void addFnAttr(Attributes::AttrVal N) {
// Function Attributes are stored at ~0 index
- addAttribute(~0U, N);
+ addAttribute(AttrListPtr::FunctionIndex, Attributes::get(getContext(), N));
}
/// removeFnAttr - Remove function attributes from this function.
@@ -195,9 +195,15 @@ public:
void setGC(const char *Str);
void clearGC();
- /// @brief Determine whether the function has the given attribute.
- bool paramHasAttr(unsigned i, Attributes attr) const {
- return AttributeList.paramHasAttr(i, attr);
+
+ /// getRetAttributes - Return the return attributes for querying.
+ Attributes getRetAttributes() const {
+ return AttributeList.getRetAttributes();
+ }
+
+ /// getParamAttributes - Return the parameter attributes for querying.
+ Attributes getParamAttributes(unsigned Idx) const {
+ return AttributeList.getParamAttributes(Idx);
}
/// addAttribute - adds the attribute to the list of attributes.
@@ -213,50 +219,44 @@ public:
/// @brief Determine if the function does not access memory.
bool doesNotAccessMemory() const {
- return hasFnAttr(Attribute::ReadNone);
+ return getFnAttributes().hasAttribute(Attributes::ReadNone);
}
- void setDoesNotAccessMemory(bool DoesNotAccessMemory = true) {
- if (DoesNotAccessMemory) addFnAttr(Attribute::ReadNone);
- else removeFnAttr(Attribute::ReadNone);
+ void setDoesNotAccessMemory() {
+ addFnAttr(Attributes::ReadNone);
}
/// @brief Determine if the function does not access or only reads memory.
bool onlyReadsMemory() const {
- return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
+ return doesNotAccessMemory() ||
+ getFnAttributes().hasAttribute(Attributes::ReadOnly);
}
- void setOnlyReadsMemory(bool OnlyReadsMemory = true) {
- if (OnlyReadsMemory) addFnAttr(Attribute::ReadOnly);
- else removeFnAttr(Attribute::ReadOnly | Attribute::ReadNone);
+ void setOnlyReadsMemory() {
+ addFnAttr(Attributes::ReadOnly);
}
/// @brief Determine if the function cannot return.
bool doesNotReturn() const {
- return hasFnAttr(Attribute::NoReturn);
+ return getFnAttributes().hasAttribute(Attributes::NoReturn);
}
- void setDoesNotReturn(bool DoesNotReturn = true) {
- if (DoesNotReturn) addFnAttr(Attribute::NoReturn);
- else removeFnAttr(Attribute::NoReturn);
+ void setDoesNotReturn() {
+ addFnAttr(Attributes::NoReturn);
}
/// @brief Determine if the function cannot unwind.
bool doesNotThrow() const {
- return hasFnAttr(Attribute::NoUnwind);
+ return getFnAttributes().hasAttribute(Attributes::NoUnwind);
}
- void setDoesNotThrow(bool DoesNotThrow = true) {
- if (DoesNotThrow) addFnAttr(Attribute::NoUnwind);
- else removeFnAttr(Attribute::NoUnwind);
+ void setDoesNotThrow() {
+ addFnAttr(Attributes::NoUnwind);
}
/// @brief True if the ABI mandates (or the user requested) that this
/// function be in an unwind table.
bool hasUWTable() const {
- return hasFnAttr(Attribute::UWTable);
+ return getFnAttributes().hasAttribute(Attributes::UWTable);
}
- void setHasUWTable(bool HasUWTable = true) {
- if (HasUWTable)
- addFnAttr(Attribute::UWTable);
- else
- removeFnAttr(Attribute::UWTable);
+ void setHasUWTable() {
+ addFnAttr(Attributes::UWTable);
}
/// @brief True if this function needs an unwind table.
@@ -267,27 +267,25 @@ public:
/// @brief Determine if the function returns a structure through first
/// pointer argument.
bool hasStructRetAttr() const {
- return paramHasAttr(1, Attribute::StructRet);
+ return getParamAttributes(1).hasAttribute(Attributes::StructRet);
}
/// @brief Determine if the parameter does not alias other parameters.
/// @param n The parameter to check. 1 is the first parameter, 0 is the return
bool doesNotAlias(unsigned n) const {
- return paramHasAttr(n, Attribute::NoAlias);
+ return getParamAttributes(n).hasAttribute(Attributes::NoAlias);
}
- void setDoesNotAlias(unsigned n, bool DoesNotAlias = true) {
- if (DoesNotAlias) addAttribute(n, Attribute::NoAlias);
- else removeAttribute(n, Attribute::NoAlias);
+ void setDoesNotAlias(unsigned n) {
+ addAttribute(n, Attributes::get(getContext(), Attributes::NoAlias));
}
/// @brief Determine if the parameter can be captured.
/// @param n The parameter to check. 1 is the first parameter, 0 is the return
bool doesNotCapture(unsigned n) const {
- return paramHasAttr(n, Attribute::NoCapture);
+ return getParamAttributes(n).hasAttribute(Attributes::NoCapture);
}
- void setDoesNotCapture(unsigned n, bool DoesNotCapture = true) {
- if (DoesNotCapture) addAttribute(n, Attribute::NoCapture);
- else removeAttribute(n, Attribute::NoCapture);
+ void setDoesNotCapture(unsigned n) {
+ addAttribute(n, Attributes::get(getContext(), Attributes::NoCapture));
}
/// copyAttributesFrom - copy all additional attributes (those not needed to
@@ -400,7 +398,6 @@ public:
void viewCFGOnly() const;
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const Function *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == Value::FunctionVal;
}
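
A short sketch of the reworked attribute API used throughout this hunk; the function name, the use of arg_size(), and the particular attributes chosen are illustrative:

#include "llvm/Attributes.h"
#include "llvm/Function.h"
using namespace llvm;

// Illustrative only: queries go through an Attributes bundle; setters are add-only.
void annotate(Function &F) {
  if (!F.getFnAttributes().hasAttribute(Attributes::NoUnwind))
    F.setDoesNotThrow();                  // no bool parameter to clear the bit anymore
  // Parameter attributes: index 1 is the first parameter, 0 the return value.
  if (F.arg_size() >= 1 &&
      F.getParamAttributes(1).hasAttribute(Attributes::ByVal))
    F.setDoesNotCapture(1);
}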
diff --git a/include/llvm/GlobalAlias.h b/include/llvm/GlobalAlias.h
index 164d976588d6..d0f014733fce 100644
--- a/include/llvm/GlobalAlias.h
+++ b/include/llvm/GlobalAlias.h
@@ -28,8 +28,8 @@ template<typename ValueSubClass, typename ItemParentClass>
class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
friend class SymbolTableListTraits<GlobalAlias, Module>;
- void operator=(const GlobalAlias &); // Do not implement
- GlobalAlias(const GlobalAlias &); // Do not implement
+ void operator=(const GlobalAlias &) LLVM_DELETED_FUNCTION;
+ GlobalAlias(const GlobalAlias &) LLVM_DELETED_FUNCTION;
void setParent(Module *parent);
@@ -76,7 +76,6 @@ public:
const GlobalValue *resolveAliasedGlobal(bool stopOnWeak = true) const;
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const GlobalAlias *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == Value::GlobalAliasVal;
}
diff --git a/include/llvm/GlobalValue.h b/include/llvm/GlobalValue.h
index 8b969f3354c3..7f7f74b1e2da 100644
--- a/include/llvm/GlobalValue.h
+++ b/include/llvm/GlobalValue.h
@@ -26,7 +26,7 @@ class PointerType;
class Module;
class GlobalValue : public Constant {
- GlobalValue(const GlobalValue &); // do not implement
+ GlobalValue(const GlobalValue &) LLVM_DELETED_FUNCTION;
public:
/// @brief An enumeration for the kinds of linkage for global values.
enum LinkageTypes {
@@ -34,6 +34,7 @@ public:
AvailableExternallyLinkage, ///< Available for inspection, not emission.
LinkOnceAnyLinkage, ///< Keep one copy of function when linking (inline)
LinkOnceODRLinkage, ///< Same, but only replaced by something equivalent.
+ LinkOnceODRAutoHideLinkage, ///< Like LinkOnceODRLinkage but addr not taken.
WeakAnyLinkage, ///< Keep one copy of named function when linking (weak)
WeakODRLinkage, ///< Same, but only replaced by something equivalent.
AppendingLinkage, ///< Special purpose, only applies to global arrays
@@ -41,8 +42,6 @@ public:
PrivateLinkage, ///< Like Internal, but omit from symbol table.
LinkerPrivateLinkage, ///< Like Private, but linker removes.
LinkerPrivateWeakLinkage, ///< Like LinkerPrivate, but weak.
- LinkerPrivateWeakDefAutoLinkage, ///< Like LinkerPrivateWeak, but possibly
- /// hidden.
DLLImportLinkage, ///< Function to be imported from DLL
DLLExportLinkage, ///< Function to be accessible from DLL.
ExternalWeakLinkage,///< ExternalWeak linkage description.
@@ -123,7 +122,12 @@ public:
return Linkage == AvailableExternallyLinkage;
}
static bool isLinkOnceLinkage(LinkageTypes Linkage) {
- return Linkage == LinkOnceAnyLinkage || Linkage == LinkOnceODRLinkage;
+ return Linkage == LinkOnceAnyLinkage ||
+ Linkage == LinkOnceODRLinkage ||
+ Linkage == LinkOnceODRAutoHideLinkage;
+ }
+ static bool isLinkOnceODRAutoHideLinkage(LinkageTypes Linkage) {
+ return Linkage == LinkOnceODRAutoHideLinkage;
}
static bool isWeakLinkage(LinkageTypes Linkage) {
return Linkage == WeakAnyLinkage || Linkage == WeakODRLinkage;
@@ -143,13 +147,9 @@ public:
static bool isLinkerPrivateWeakLinkage(LinkageTypes Linkage) {
return Linkage == LinkerPrivateWeakLinkage;
}
- static bool isLinkerPrivateWeakDefAutoLinkage(LinkageTypes Linkage) {
- return Linkage == LinkerPrivateWeakDefAutoLinkage;
- }
static bool isLocalLinkage(LinkageTypes Linkage) {
return isInternalLinkage(Linkage) || isPrivateLinkage(Linkage) ||
- isLinkerPrivateLinkage(Linkage) || isLinkerPrivateWeakLinkage(Linkage) ||
- isLinkerPrivateWeakDefAutoLinkage(Linkage);
+ isLinkerPrivateLinkage(Linkage) || isLinkerPrivateWeakLinkage(Linkage);
}
static bool isDLLImportLinkage(LinkageTypes Linkage) {
return Linkage == DLLImportLinkage;
@@ -178,8 +178,7 @@ public:
Linkage == LinkOnceAnyLinkage ||
Linkage == CommonLinkage ||
Linkage == ExternalWeakLinkage ||
- Linkage == LinkerPrivateWeakLinkage ||
- Linkage == LinkerPrivateWeakDefAutoLinkage;
+ Linkage == LinkerPrivateWeakLinkage;
}
/// isWeakForLinker - Whether the definition of this global may be replaced at
@@ -192,10 +191,10 @@ public:
Linkage == WeakODRLinkage ||
Linkage == LinkOnceAnyLinkage ||
Linkage == LinkOnceODRLinkage ||
+ Linkage == LinkOnceODRAutoHideLinkage ||
Linkage == CommonLinkage ||
Linkage == ExternalWeakLinkage ||
- Linkage == LinkerPrivateWeakLinkage ||
- Linkage == LinkerPrivateWeakDefAutoLinkage;
+ Linkage == LinkerPrivateWeakLinkage;
}
bool hasExternalLinkage() const { return isExternalLinkage(Linkage); }
@@ -205,6 +204,9 @@ public:
bool hasLinkOnceLinkage() const {
return isLinkOnceLinkage(Linkage);
}
+ bool hasLinkOnceODRAutoHideLinkage() const {
+ return isLinkOnceODRAutoHideLinkage(Linkage);
+ }
bool hasWeakLinkage() const {
return isWeakLinkage(Linkage);
}
@@ -215,9 +217,6 @@ public:
bool hasLinkerPrivateWeakLinkage() const {
return isLinkerPrivateWeakLinkage(Linkage);
}
- bool hasLinkerPrivateWeakDefAutoLinkage() const {
- return isLinkerPrivateWeakDefAutoLinkage(Linkage);
- }
bool hasLocalLinkage() const { return isLocalLinkage(Linkage); }
bool hasDLLImportLinkage() const { return isDLLImportLinkage(Linkage); }
bool hasDLLExportLinkage() const { return isDLLExportLinkage(Linkage); }
@@ -288,7 +287,6 @@ public:
inline const Module *getParent() const { return Parent; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const GlobalValue *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == Value::FunctionVal ||
V->getValueID() == Value::GlobalVariableVal ||
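
A small sketch of how the new LinkOnceODRAutoHideLinkage relates to the existing linkage predicates; the helper name and the assertion are illustrative:

#include "llvm/GlobalValue.h"
#include <cassert>
using namespace llvm;

// Illustrative only: auto-hide is still link-once and weak-for-linker, like
// LinkOnceODR, but the symbol's address is known not to be taken.
void checkAutoHide(const GlobalValue &GV) {
  if (GV.hasLinkOnceODRAutoHideLinkage()) {
    assert(GlobalValue::isLinkOnceLinkage(GV.getLinkage()) &&
           GlobalValue::isWeakForLinker(GV.getLinkage()));
  }
}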
diff --git a/include/llvm/GlobalVariable.h b/include/llvm/GlobalVariable.h
index 99b7a73b35ac..b9d3f68642f4 100644
--- a/include/llvm/GlobalVariable.h
+++ b/include/llvm/GlobalVariable.h
@@ -34,9 +34,9 @@ template<typename ValueSubClass, typename ItemParentClass>
class GlobalVariable : public GlobalValue, public ilist_node<GlobalVariable> {
friend class SymbolTableListTraits<GlobalVariable, Module>;
- void *operator new(size_t, unsigned); // Do not implement
- void operator=(const GlobalVariable &); // Do not implement
- GlobalVariable(const GlobalVariable &); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ void operator=(const GlobalVariable &) LLVM_DELETED_FUNCTION;
+ GlobalVariable(const GlobalVariable &) LLVM_DELETED_FUNCTION;
void setParent(Module *parent);
@@ -174,7 +174,6 @@ public:
virtual void replaceUsesOfWithOnConstant(Value *From, Value *To, Use *U);
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const GlobalVariable *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == Value::GlobalVariableVal;
}
diff --git a/include/llvm/IRBuilder.h b/include/llvm/IRBuilder.h
index d5b6f47f8a25..f63a16051e30 100644
--- a/include/llvm/IRBuilder.h
+++ b/include/llvm/IRBuilder.h
@@ -17,6 +17,7 @@
#include "llvm/Instructions.h"
#include "llvm/BasicBlock.h"
+#include "llvm/DataLayout.h"
#include "llvm/LLVMContext.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
@@ -266,6 +267,10 @@ public:
return Type::getInt8PtrTy(Context, AddrSpace);
}
+ IntegerType* getIntPtrTy(DataLayout *DL, unsigned AddrSpace = 0) {
+ return DL->getIntPtrType(Context, AddrSpace);
+ }
+
//===--------------------------------------------------------------------===//
// Intrinsic creation methods
//===--------------------------------------------------------------------===//
@@ -285,12 +290,15 @@ public:
/// If the pointers aren't i8*, they will be converted. If a TBAA tag is
/// specified, it will be added to the instruction.
CallInst *CreateMemCpy(Value *Dst, Value *Src, uint64_t Size, unsigned Align,
- bool isVolatile = false, MDNode *TBAATag = 0) {
- return CreateMemCpy(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag);
+ bool isVolatile = false, MDNode *TBAATag = 0,
+ MDNode *TBAAStructTag = 0) {
+ return CreateMemCpy(Dst, Src, getInt64(Size), Align, isVolatile, TBAATag,
+ TBAAStructTag);
}
CallInst *CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
- bool isVolatile = false, MDNode *TBAATag = 0);
+ bool isVolatile = false, MDNode *TBAATag = 0,
+ MDNode *TBAAStructTag = 0);
/// CreateMemMove - Create and insert a memmove between the specified
/// pointers. If the pointers aren't i8*, they will be converted. If a TBAA
@@ -810,6 +818,31 @@ public:
StoreInst *CreateStore(Value *Val, Value *Ptr, bool isVolatile = false) {
return Insert(new StoreInst(Val, Ptr, isVolatile));
}
+ // Provided to resolve 'CreateAlignedLoad(Ptr, Align, "...")' correctly,
+ // instead of converting the string to 'bool' for the isVolatile parameter.
+ LoadInst *CreateAlignedLoad(Value *Ptr, unsigned Align, const char *Name) {
+ LoadInst *LI = CreateLoad(Ptr, Name);
+ LI->setAlignment(Align);
+ return LI;
+ }
+ LoadInst *CreateAlignedLoad(Value *Ptr, unsigned Align,
+ const Twine &Name = "") {
+ LoadInst *LI = CreateLoad(Ptr, Name);
+ LI->setAlignment(Align);
+ return LI;
+ }
+ LoadInst *CreateAlignedLoad(Value *Ptr, unsigned Align, bool isVolatile,
+ const Twine &Name = "") {
+ LoadInst *LI = CreateLoad(Ptr, isVolatile, Name);
+ LI->setAlignment(Align);
+ return LI;
+ }
+ StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align,
+ bool isVolatile = false) {
+ StoreInst *SI = CreateStore(Val, Ptr, isVolatile);
+ SI->setAlignment(Align);
+ return SI;
+ }
FenceInst *CreateFence(AtomicOrdering Ordering,
SynchronizationScope SynchScope = CrossThread) {
return Insert(new FenceInst(Context, Ordering, SynchScope));
@@ -970,6 +1003,30 @@ public:
Value *CreateSExt(Value *V, Type *DestTy, const Twine &Name = "") {
return CreateCast(Instruction::SExt, V, DestTy, Name);
}
+ /// CreateZExtOrTrunc - Create a ZExt or Trunc from the integer value V to
+ /// DestTy. Return the value untouched if the type of V is already DestTy.
+ Value *CreateZExtOrTrunc(Value *V, IntegerType *DestTy,
+ const Twine &Name = "") {
+ assert(isa<IntegerType>(V->getType()) && "Can only zero extend integers!");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ if (IntTy->getBitWidth() < DestTy->getBitWidth())
+ return CreateZExt(V, DestTy, Name);
+ if (IntTy->getBitWidth() > DestTy->getBitWidth())
+ return CreateTrunc(V, DestTy, Name);
+ return V;
+ }
+ /// CreateSExtOrTrunc - Create a SExt or Trunc from the integer value V to
+ /// DestTy. Return the value untouched if the type of V is already DestTy.
+ Value *CreateSExtOrTrunc(Value *V, IntegerType *DestTy,
+ const Twine &Name = "") {
+ assert(isa<IntegerType>(V->getType()) && "Can only sign extend integers!");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ if (IntTy->getBitWidth() < DestTy->getBitWidth())
+ return CreateSExt(V, DestTy, Name);
+ if (IntTy->getBitWidth() > DestTy->getBitWidth())
+ return CreateTrunc(V, DestTy, Name);
+ return V;
+ }
Value *CreateFPToUI(Value *V, Type *DestTy, const Twine &Name = ""){
return CreateCast(Instruction::FPToUI, V, DestTy, Name);
}
@@ -1052,7 +1109,7 @@ public:
private:
// Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a compile time
// error, instead of converting the string to bool for the isSigned parameter.
- Value *CreateIntCast(Value *, Type *, const char *); // DO NOT IMPLEMENT
+ Value *CreateIntCast(Value *, Type *, const char *) LLVM_DELETED_FUNCTION;
public:
Value *CreateFPCast(Value *V, Type *DestTy, const Twine &Name = "") {
if (V->getType() == DestTy)
@@ -1261,13 +1318,13 @@ public:
// Utility creation methods
//===--------------------------------------------------------------------===//
- /// CreateIsNull - Return an i1 value testing if \arg Arg is null.
+ /// CreateIsNull - Return an i1 value testing if \p Arg is null.
Value *CreateIsNull(Value *Arg, const Twine &Name = "") {
return CreateICmpEQ(Arg, Constant::getNullValue(Arg->getType()),
Name);
}
- /// CreateIsNotNull - Return an i1 value testing if \arg Arg is not null.
+ /// CreateIsNotNull - Return an i1 value testing if \p Arg is not null.
Value *CreateIsNotNull(Value *Arg, const Twine &Name = "") {
return CreateICmpNE(Arg, Constant::getNullValue(Arg->getType()),
Name);
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index de97957a84c6..8c164eb91984 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -66,6 +66,7 @@ void initializeAliasDebuggerPass(PassRegistry&);
void initializeAliasSetPrinterPass(PassRegistry&);
void initializeAlwaysInlinerPass(PassRegistry&);
void initializeArgPromotionPass(PassRegistry&);
+void initializeBarrierNoopPass(PassRegistry&);
void initializeBasicAliasAnalysisPass(PassRegistry&);
void initializeBasicCallGraphPass(PassRegistry&);
void initializeBlockExtractorPassPass(PassRegistry&);
@@ -87,6 +88,7 @@ void initializeCodePlacementOptPass(PassRegistry&);
void initializeConstantMergePass(PassRegistry&);
void initializeConstantPropagationPass(PassRegistry&);
void initializeMachineCopyPropagationPass(PassRegistry&);
+void initializeCostModelAnalysisPass(PassRegistry&);
void initializeCorrelatedValuePropagationPass(PassRegistry&);
void initializeDAEPass(PassRegistry&);
void initializeDAHPass(PassRegistry&);
@@ -94,6 +96,7 @@ void initializeDCEPass(PassRegistry&);
void initializeDSEPass(PassRegistry&);
void initializeDeadInstEliminationPass(PassRegistry&);
void initializeDeadMachineInstructionElimPass(PassRegistry&);
+void initializeDependenceAnalysisPass(PassRegistry&);
void initializeDomOnlyPrinterPass(PassRegistry&);
void initializeDomOnlyViewerPass(PassRegistry&);
void initializeDomPrinterPass(PassRegistry&);
@@ -141,10 +144,10 @@ void initializeLiveRegMatrixPass(PassRegistry&);
void initializeLiveStacksPass(PassRegistry&);
void initializeLiveVariablesPass(PassRegistry&);
void initializeLoaderPassPass(PassRegistry&);
+void initializeProfileMetadataLoaderPassPass(PassRegistry&);
void initializePathProfileLoaderPassPass(PassRegistry&);
void initializeLocalStackSlotPassPass(PassRegistry&);
void initializeLoopDeletionPass(PassRegistry&);
-void initializeLoopDependenceAnalysisPass(PassRegistry&);
void initializeLoopExtractorPass(PassRegistry&);
void initializeLoopInfoPass(PassRegistry&);
void initializeLoopInstSimplifyPass(PassRegistry&);
@@ -166,6 +169,7 @@ void initializeMachineBlockPlacementStatsPass(PassRegistry&);
void initializeMachineBranchProbabilityInfoPass(PassRegistry&);
void initializeMachineCSEPass(PassRegistry&);
void initializeMachineDominatorTreePass(PassRegistry&);
+void initializeMachinePostDominatorTreePass(PassRegistry&);
void initializeMachineLICMPass(PassRegistry&);
void initializeMachineLoopInfoPass(PassRegistry&);
void initializeMachineLoopRangesPass(PassRegistry&);
@@ -177,6 +181,7 @@ void initializeMachineVerifierPassPass(PassRegistry&);
void initializeMemCpyOptPass(PassRegistry&);
void initializeMemDepPrinterPass(PassRegistry&);
void initializeMemoryDependenceAnalysisPass(PassRegistry&);
+void initializeMetaRenamerPass(PassRegistry&);
void initializeMergeFunctionsPass(PassRegistry&);
void initializeModuleDebugInfoPrinterPass(PassRegistry&);
void initializeNoAAPass(PassRegistry&);
@@ -219,6 +224,7 @@ void initializeRegionOnlyViewerPass(PassRegistry&);
void initializeRegionPrinterPass(PassRegistry&);
void initializeRegionViewerPass(PassRegistry&);
void initializeSCCPPass(PassRegistry&);
+void initializeSROAPass(PassRegistry&);
void initializeSROA_DTPass(PassRegistry&);
void initializeSROA_SSAUpPass(PassRegistry&);
void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
@@ -231,6 +237,7 @@ void initializeSinkingPass(PassRegistry&);
void initializeSlotIndexesPass(PassRegistry&);
void initializeSpillPlacementPass(PassRegistry&);
void initializeStackProtectorPass(PassRegistry&);
+void initializeStackColoringPass(PassRegistry&);
void initializeStackSlotColoringPass(PassRegistry&);
void initializeStripDeadDebugInfoPass(PassRegistry&);
void initializeStripDeadPrototypesPassPass(PassRegistry&);
@@ -241,7 +248,8 @@ void initializeStrongPHIEliminationPass(PassRegistry&);
void initializeTailCallElimPass(PassRegistry&);
void initializeTailDuplicatePassPass(PassRegistry&);
void initializeTargetPassConfigPass(PassRegistry&);
-void initializeTargetDataPass(PassRegistry&);
+void initializeDataLayoutPass(PassRegistry&);
+void initializeTargetTransformInfoPass(PassRegistry&);
void initializeTargetLibraryInfoPass(PassRegistry&);
void initializeTwoAddressInstructionPassPass(PassRegistry&);
void initializeTypeBasedAliasAnalysisPass(PassRegistry&);
@@ -254,6 +262,7 @@ void initializeVirtRegRewriterPass(PassRegistry&);
void initializeInstSimplifierPass(PassRegistry&);
void initializeUnpackMachineBundlesPass(PassRegistry&);
void initializeFinalizeMachineBundlesPass(PassRegistry&);
+void initializeLoopVectorizePass(PassRegistry&);
void initializeBBVectorizePass(PassRegistry&);
void initializeMachineFunctionPrinterPassPass(PassRegistry&);
}
diff --git a/include/llvm/InlineAsm.h b/include/llvm/InlineAsm.h
index 37aa18bfff73..b5e0fd4effd6 100644
--- a/include/llvm/InlineAsm.h
+++ b/include/llvm/InlineAsm.h
@@ -33,20 +33,28 @@ template<class ConstantClass, class TypeClass, class ValType>
struct ConstantCreator;
class InlineAsm : public Value {
+public:
+ enum AsmDialect {
+ AD_ATT,
+ AD_Intel
+ };
+
+private:
friend struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType>;
friend class ConstantUniqueMap<InlineAsmKeyType, const InlineAsmKeyType&,
PointerType, InlineAsm, false>;
- InlineAsm(const InlineAsm &); // do not implement
- void operator=(const InlineAsm&); // do not implement
+ InlineAsm(const InlineAsm &) LLVM_DELETED_FUNCTION;
+ void operator=(const InlineAsm&) LLVM_DELETED_FUNCTION;
std::string AsmString, Constraints;
bool HasSideEffects;
bool IsAlignStack;
-
+ AsmDialect Dialect;
+
InlineAsm(PointerType *Ty, const std::string &AsmString,
const std::string &Constraints, bool hasSideEffects,
- bool isAlignStack);
+ bool isAlignStack, AsmDialect asmDialect);
virtual ~InlineAsm();
/// When the ConstantUniqueMap merges two types and makes two InlineAsms
@@ -58,11 +66,13 @@ public:
///
static InlineAsm *get(FunctionType *Ty, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
- bool isAlignStack = false);
+ bool isAlignStack = false,
+ AsmDialect asmDialect = AD_ATT);
bool hasSideEffects() const { return HasSideEffects; }
bool isAlignStack() const { return IsAlignStack; }
-
+ AsmDialect getDialect() const { return Dialect; }
+
/// getType - InlineAsm's are always pointers.
///
PointerType *getType() const {
@@ -179,7 +189,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const InlineAsm *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() == Value::InlineAsmVal;
}
@@ -193,17 +202,20 @@ public:
Op_InputChain = 0,
Op_AsmString = 1,
Op_MDNode = 2,
- Op_ExtraInfo = 3, // HasSideEffects, IsAlignStack
+ Op_ExtraInfo = 3, // HasSideEffects, IsAlignStack, AsmDialect.
Op_FirstOperand = 4,
// Fixed operands on an INLINEASM MachineInstr.
MIOp_AsmString = 0,
- MIOp_ExtraInfo = 1, // HasSideEffects, IsAlignStack
+ MIOp_ExtraInfo = 1, // HasSideEffects, IsAlignStack, AsmDialect.
MIOp_FirstOperand = 2,
// Interpretation of the MIOp_ExtraInfo bit field.
Extra_HasSideEffects = 1,
Extra_IsAlignStack = 2,
+ Extra_AsmDialect = 4,
+ Extra_MayLoad = 8,
+ Extra_MayStore = 16,
// Inline asm operands map to multiple SDNode / MachineInstr operands.
// The first operand is an immediate describing the asm operand, the low
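
A minimal sketch of requesting the new Intel asm dialect when constructing an InlineAsm value; the "nop" string, empty constraints, and helper name are illustrative:

#include "llvm/DerivedTypes.h"
#include "llvm/InlineAsm.h"
using namespace llvm;

// Illustrative only: AD_ATT stays the default; pass AD_Intel explicitly.
InlineAsm *intelNop(LLVMContext &Ctx) {
  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), /*isVarArg=*/false);
  return InlineAsm::get(FTy, "nop", "", /*hasSideEffects=*/true,
                        /*isAlignStack=*/false, InlineAsm::AD_Intel);
}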
diff --git a/include/llvm/InstrTypes.h b/include/llvm/InstrTypes.h
index 2529f24fe991..da17f3b80d7b 100644
--- a/include/llvm/InstrTypes.h
+++ b/include/llvm/InstrTypes.h
@@ -73,7 +73,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const TerminatorInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->isTerminator();
}
@@ -88,7 +87,7 @@ public:
//===----------------------------------------------------------------------===//
class UnaryInstruction : public Instruction {
- void *operator new(size_t, unsigned); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
UnaryInstruction(Type *Ty, unsigned iType, Value *V,
@@ -113,7 +112,6 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const UnaryInstruction *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Alloca ||
I->getOpcode() == Instruction::Load ||
@@ -138,14 +136,14 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(UnaryInstruction, Value)
//===----------------------------------------------------------------------===//
class BinaryOperator : public Instruction {
- void *operator new(size_t, unsigned); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
void init(BinaryOps iType);
BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
const Twine &Name, Instruction *InsertBefore);
BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
const Twine &Name, BasicBlock *InsertAtEnd);
- virtual BinaryOperator *clone_impl() const;
+ virtual BinaryOperator *clone_impl() const LLVM_OVERRIDE;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -361,7 +359,6 @@ public:
bool isExact() const;
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const BinaryOperator *) { return true; }
static inline bool classof(const Instruction *I) {
return I->isBinaryOp();
}
@@ -388,7 +385,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BinaryOperator, Value)
/// if (isa<CastInst>(Instr)) { ... }
/// @brief Base class of casting instructions.
class CastInst : public UnaryInstruction {
- virtual void anchor();
+ virtual void anchor() LLVM_OVERRIDE;
protected:
/// @brief Constructor with insert-before-instruction semantics for subclasses
CastInst(Type *Ty, unsigned iType, Value *S,
@@ -563,7 +560,7 @@ public:
/// IntPtrTy argument is used to make accurate determinations for casts
/// involving Integer and Pointer types. They are no-op casts if the integer
/// is the same size as the pointer. However, pointer size varies with
- /// platform. Generally, the result of TargetData::getIntPtrType() should be
+ /// platform. Generally, the result of DataLayout::getIntPtrType() should be
/// passed in. If that's not available, use Type::Int64Ty, which will make
/// the isNoopCast call conservative.
/// @brief Determine if the described cast is a no-op cast.
@@ -581,8 +578,8 @@ public:
/// Determine how a pair of casts can be eliminated, if they can be at all.
/// This is a helper function for both CastInst and ConstantExpr.
- /// @returns 0 if the CastInst pair can't be eliminated
- /// @returns Instruction::CastOps value for a cast that can replace
+ /// @returns 0 if the CastInst pair can't be eliminated, otherwise
+ /// returns Instruction::CastOps value for a cast that can replace
/// the pair, casting SrcTy to DstTy.
/// @brief Determine if a cast pair is eliminable
static unsigned isEliminableCastPair(
@@ -591,7 +588,9 @@ public:
Type *SrcTy, ///< SrcTy of 1st cast
Type *MidTy, ///< DstTy of 1st cast & SrcTy of 2nd cast
Type *DstTy, ///< DstTy of 2nd cast
- Type *IntPtrTy ///< Integer type corresponding to Ptr types, or null
+ Type *SrcIntPtrTy, ///< Integer type corresponding to Ptr SrcTy, or null
+ Type *MidIntPtrTy, ///< Integer type corresponding to Ptr MidTy, or null
+ Type *DstIntPtrTy ///< Integer type corresponding to Ptr DstTy, or null
);
/// @brief Return the opcode of this CastInst
@@ -611,7 +610,6 @@ public:
static bool castIsValid(Instruction::CastOps op, Value *S, Type *DstTy);
/// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const CastInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->isCast();
}
@@ -627,8 +625,8 @@ public:
/// This class is the base class for the comparison instructions.
/// @brief Abstract base class of comparison instructions.
class CmpInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- CmpInst(); // do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ CmpInst() LLVM_DELETED_FUNCTION;
protected:
CmpInst(Type *ty, Instruction::OtherOps op, unsigned short pred,
Value *LHS, Value *RHS, const Twine &Name = "",
@@ -638,7 +636,7 @@ protected:
Value *LHS, Value *RHS, const Twine &Name,
BasicBlock *InsertAtEnd);
- virtual void Anchor() const; // Out of line virtual method.
+ virtual void anchor() LLVM_OVERRIDE; // Out of line virtual method.
public:
/// This enumeration lists the possible predicates for CmpInst subclasses.
/// Values in the range 0-31 are reserved for FCmpInst, while values in the
@@ -816,7 +814,6 @@ public:
static bool isFalseWhenEqual(unsigned short predicate);
/// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const CmpInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ICmp ||
I->getOpcode() == Instruction::FCmp;
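
Removing the self-typed classof overloads in this file (and throughout the patch) does not change how clients cast; isa/dyn_cast now consult only the remaining classof(const Instruction *) overload. A tiny illustrative sketch, with the helper name assumed:

#include "llvm/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

// Illustrative only: dyn_cast still resolves through ICmpInst::classof.
bool isCommutativeICmp(const Instruction *I) {
  if (const ICmpInst *IC = dyn_cast<ICmpInst>(I))
    return IC->isCommutative();  // EQ/NE predicates are the commutative ones
  return false;
}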
diff --git a/include/llvm/Instruction.h b/include/llvm/Instruction.h
index 5512dcc9e6b6..8aa8a56bf825 100644
--- a/include/llvm/Instruction.h
+++ b/include/llvm/Instruction.h
@@ -28,8 +28,8 @@ template<typename ValueSubClass, typename ItemParentClass>
class SymbolTableListTraits;
class Instruction : public User, public ilist_node<Instruction> {
- void operator=(const Instruction &); // Do not implement
- Instruction(const Instruction &); // Do not implement
+ void operator=(const Instruction &) LLVM_DELETED_FUNCTION;
+ Instruction(const Instruction &) LLVM_DELETED_FUNCTION;
BasicBlock *Parent;
DebugLoc DbgLoc; // 'dbg' Metadata cache.
@@ -310,7 +310,6 @@ public:
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const Instruction *) { return true; }
static inline bool classof(const Value *V) {
return V->getValueID() >= Value::InstructionVal;
}
diff --git a/include/llvm/Instructions.h b/include/llvm/Instructions.h
index f5187e683269..69593b48c1f1 100644
--- a/include/llvm/Instructions.h
+++ b/include/llvm/Instructions.h
@@ -112,7 +112,6 @@ public:
bool isStaticAlloca() const;
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const AllocaInst *) { return true; }
static inline bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::Alloca);
}
@@ -226,13 +225,13 @@ public:
const Value *getPointerOperand() const { return getOperand(0); }
static unsigned getPointerOperandIndex() { return 0U; }
+ /// \brief Returns the address space of the pointer operand.
unsigned getPointerAddressSpace() const {
- return cast<PointerType>(getPointerOperand()->getType())->getAddressSpace();
+ return getPointerOperand()->getType()->getPointerAddressSpace();
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const LoadInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Load;
}
@@ -255,7 +254,7 @@ private:
/// StoreInst - an instruction for storing to memory
///
class StoreInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void AssertOK();
protected:
virtual StoreInst *clone_impl() const;
@@ -349,12 +348,12 @@ public:
const Value *getPointerOperand() const { return getOperand(1); }
static unsigned getPointerOperandIndex() { return 1U; }
+ /// \brief Returns the address space of the pointer operand.
unsigned getPointerAddressSpace() const {
- return cast<PointerType>(getPointerOperand()->getType())->getAddressSpace();
+ return getPointerOperand()->getType()->getPointerAddressSpace();
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const StoreInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Store;
}
@@ -382,7 +381,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value)
/// FenceInst - an instruction for ordering other memory operations
///
class FenceInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void Init(AtomicOrdering Ordering, SynchronizationScope SynchScope);
protected:
virtual FenceInst *clone_impl() const;
@@ -426,7 +425,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const FenceInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Fence;
}
@@ -450,7 +448,7 @@ private:
/// there. Returns the value that was loaded.
///
class AtomicCmpXchgInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void Init(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering Ordering, SynchronizationScope SynchScope);
protected:
@@ -521,12 +519,12 @@ public:
Value *getNewValOperand() { return getOperand(2); }
const Value *getNewValOperand() const { return getOperand(2); }
+ /// \brief Returns the address space of the pointer operand.
unsigned getPointerAddressSpace() const {
- return cast<PointerType>(getPointerOperand()->getType())->getAddressSpace();
+ return getPointerOperand()->getType()->getPointerAddressSpace();
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const AtomicCmpXchgInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::AtomicCmpXchg;
}
@@ -557,7 +555,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(AtomicCmpXchgInst, Value)
/// the old value.
///
class AtomicRMWInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
virtual AtomicRMWInst *clone_impl() const;
public:
@@ -665,12 +663,12 @@ public:
Value *getValOperand() { return getOperand(1); }
const Value *getValOperand() const { return getOperand(1); }
+ /// \brief Returns the address space of the pointer operand.
unsigned getPointerAddressSpace() const {
- return cast<PointerType>(getPointerOperand()->getType())->getAddressSpace();
+ return getPointerOperand()->getType()->getPointerAddressSpace();
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const AtomicRMWInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::AtomicRMW;
}
@@ -768,6 +766,13 @@ public:
return reinterpret_cast<PointerType*>(Instruction::getType());
}
+ /// \brief Returns the address space of this instruction's pointer type.
+ unsigned getAddressSpace() const {
+ // Note that this is always the same as the pointer operand's address space
+ // and that is cheaper to compute, so cheat here.
+ return getPointerAddressSpace();
+ }
+
/// getIndexedType - Returns the type of the element that would be loaded with
/// a load instruction with the specified parameters.
///
@@ -778,10 +783,6 @@ public:
static Type *getIndexedType(Type *Ptr, ArrayRef<Constant *> IdxList);
static Type *getIndexedType(Type *Ptr, ArrayRef<uint64_t> IdxList);
- /// getIndexedType - Returns the address space used by the GEP pointer.
- ///
- static unsigned getAddressSpace(Value *Ptr);
-
inline op_iterator idx_begin() { return op_begin()+1; }
inline const_op_iterator idx_begin() const { return op_begin()+1; }
inline op_iterator idx_end() { return op_end(); }
@@ -797,22 +798,23 @@ public:
return 0U; // get index for modifying correct operand.
}
- unsigned getPointerAddressSpace() const {
- return cast<PointerType>(getType())->getAddressSpace();
- }
-
/// getPointerOperandType - Method to return the pointer operand as a
/// PointerType.
Type *getPointerOperandType() const {
return getPointerOperand()->getType();
}
+ /// \brief Returns the address space of the pointer operand.
+ unsigned getPointerAddressSpace() const {
+ return getPointerOperandType()->getPointerAddressSpace();
+ }
+
/// GetGEPReturnType - Returns the pointer type returned by the GEP
/// instruction, which may be a vector of pointers.
static Type *getGEPReturnType(Value *Ptr, ArrayRef<Value *> IdxList) {
Type *PtrTy = PointerType::get(checkGEPType(
getIndexedType(Ptr->getType(), IdxList)),
- getAddressSpace(Ptr));
+ Ptr->getType()->getPointerAddressSpace());
// Vector GEP
if (Ptr->getType()->isVectorTy()) {
unsigned NumElem = cast<VectorType>(Ptr->getType())->getNumElements();
@@ -849,7 +851,6 @@ public:
bool isInBounds() const;
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const GetElementPtrInst *) { return true; }
static inline bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::GetElementPtr);
}
@@ -897,13 +898,13 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(GetElementPtrInst, Value)
/// This instruction compares its operands according to the predicate given
/// to the constructor. It only operates on integers or pointers. The operands
/// must be identical types.
-/// @brief Represent an integer comparison operator.
+/// \brief Represent an integer comparison operator.
class ICmpInst: public CmpInst {
protected:
- /// @brief Clone an identical ICmpInst
+ /// \brief Clone an identical ICmpInst
virtual ICmpInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics.
+ /// \brief Constructor with insert-before-instruction semantics.
ICmpInst(
Instruction *InsertBefore, ///< Where to insert
Predicate pred, ///< The predicate to use for the comparison
@@ -924,7 +925,7 @@ public:
"Invalid operand types for ICmp instruction");
}
- /// @brief Constructor with insert-at-end semantics.
+ /// \brief Constructor with insert-at-end semantics.
ICmpInst(
BasicBlock &InsertAtEnd, ///< Block to insert into.
Predicate pred, ///< The predicate to use for the comparison
@@ -945,7 +946,7 @@ public:
"Invalid operand types for ICmp instruction");
}
- /// @brief Constructor with no-insertion semantics
+ /// \brief Constructor with no-insertion semantics
ICmpInst(
Predicate pred, ///< The predicate to use for the comparison
Value *LHS, ///< The left-hand-side of the expression
@@ -967,25 +968,25 @@ public:
/// For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
/// @returns the predicate that would be the result if the operand were
/// regarded as signed.
- /// @brief Return the signed version of the predicate
+ /// \brief Return the signed version of the predicate
Predicate getSignedPredicate() const {
return getSignedPredicate(getPredicate());
}
/// This is a static version that you can use without an instruction.
- /// @brief Return the signed version of the predicate.
+ /// \brief Return the signed version of the predicate.
static Predicate getSignedPredicate(Predicate pred);
/// For example, EQ->EQ, SLE->ULE, UGT->UGT, etc.
/// @returns the predicate that would be the result if the operand were
/// regarded as unsigned.
- /// @brief Return the unsigned version of the predicate
+ /// \brief Return the unsigned version of the predicate
Predicate getUnsignedPredicate() const {
return getUnsignedPredicate(getPredicate());
}
/// This is a static version that you can use without an instruction.
- /// @brief Return the unsigned version of the predicate.
+ /// \brief Return the unsigned version of the predicate.
static Predicate getUnsignedPredicate(Predicate pred);
/// isEquality - Return true if this predicate is either EQ or NE. This also
@@ -1001,7 +1002,7 @@ public:
}
/// @returns true if the predicate of this ICmpInst is commutative
- /// @brief Determine if this relation is commutative.
+ /// \brief Determine if this relation is commutative.
bool isCommutative() const { return isEquality(); }
/// isRelational - Return true if the predicate is relational (not EQ or NE).
@@ -1017,21 +1018,20 @@ public:
}
/// Initialize a set of values that all satisfy the predicate with C.
- /// @brief Make a ConstantRange for a relation with a constant value.
+ /// \brief Make a ConstantRange for a relation with a constant value.
static ConstantRange makeConstantRange(Predicate pred, const APInt &C);
/// Exchange the two operands to this instruction in such a way that it does
/// not modify the semantics of the instruction. The predicate value may be
/// changed to retain the same result if the predicate is order dependent
/// (e.g. ult).
- /// @brief Swap operands and adjust predicate.
+ /// \brief Swap operands and adjust predicate.
void swapOperands() {
setPredicate(getSwappedPredicate());
Op<0>().swap(Op<1>());
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ICmpInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ICmp;
}
@@ -1048,13 +1048,13 @@ public:
/// This instruction compares its operands according to the predicate given
/// to the constructor. It only operates on floating point values or packed
/// vectors of floating point values. The operands must be identical types.
-/// @brief Represents a floating point comparison operator.
+/// \brief Represents a floating point comparison operator.
class FCmpInst: public CmpInst {
protected:
- /// @brief Clone an identical FCmpInst
+ /// \brief Clone an identical FCmpInst
virtual FCmpInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics.
+ /// \brief Constructor with insert-before-instruction semantics.
FCmpInst(
Instruction *InsertBefore, ///< Where to insert
Predicate pred, ///< The predicate to use for the comparison
@@ -1073,7 +1073,7 @@ public:
"Invalid operand types for FCmp instruction");
}
- /// @brief Constructor with insert-at-end semantics.
+ /// \brief Constructor with insert-at-end semantics.
FCmpInst(
BasicBlock &InsertAtEnd, ///< Block to insert into.
Predicate pred, ///< The predicate to use for the comparison
@@ -1092,7 +1092,7 @@ public:
"Invalid operand types for FCmp instruction");
}
- /// @brief Constructor with no-insertion semantics
+ /// \brief Constructor with no-insertion semantics
FCmpInst(
Predicate pred, ///< The predicate to use for the comparison
Value *LHS, ///< The left-hand-side of the expression
@@ -1110,14 +1110,14 @@ public:
}
/// @returns true if the predicate of this instruction is EQ or NE.
- /// @brief Determine if this is an equality predicate.
+ /// \brief Determine if this is an equality predicate.
bool isEquality() const {
return getPredicate() == FCMP_OEQ || getPredicate() == FCMP_ONE ||
getPredicate() == FCMP_UEQ || getPredicate() == FCMP_UNE;
}
/// @returns true if the predicate of this instruction is commutative.
- /// @brief Determine if this is a commutative predicate.
+ /// \brief Determine if this is a commutative predicate.
bool isCommutative() const {
return isEquality() ||
getPredicate() == FCMP_FALSE ||
@@ -1127,21 +1127,20 @@ public:
}
/// @returns true if the predicate is relational (not EQ or NE).
- /// @brief Determine if this a relational predicate.
+ /// \brief Determine if this is a relational predicate.
bool isRelational() const { return !isEquality(); }
/// Exchange the two operands to this instruction in such a way that it does
/// not modify the semantics of the instruction. The predicate value may be
/// changed to retain the same result if the predicate is order dependent
/// (e.g. ult).
- /// @brief Swap operands and adjust predicate.
+ /// \brief Swap operands and adjust predicate.
void swapOperands() {
setPredicate(getSwappedPredicate());
Op<0>().swap(Op<1>());
}
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const FCmpInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::FCmp;
}
@@ -1163,12 +1162,12 @@ class CallInst : public Instruction {
void init(Value *Func, const Twine &NameStr);
/// Construct a CallInst given a range of arguments.
- /// @brief Construct a CallInst from a range of arguments
+ /// \brief Construct a CallInst from a range of arguments
inline CallInst(Value *Func, ArrayRef<Value *> Args,
const Twine &NameStr, Instruction *InsertBefore);
/// Construct a CallInst given a range of arguments.
- /// @brief Construct a CallInst from a range of arguments
+ /// \brief Construct a CallInst from a range of arguments
inline CallInst(Value *Func, ArrayRef<Value *> Args,
const Twine &NameStr, BasicBlock *InsertAtEnd);
@@ -1267,77 +1266,78 @@ public:
/// removeAttribute - removes the attribute from the list of attributes.
void removeAttribute(unsigned i, Attributes attr);
- /// \brief Return true if this call has the given attribute.
- bool hasFnAttr(Attributes N) const {
- return paramHasAttr(~0, N);
- }
+ /// \brief Determine whether this call has the given attribute.
+ bool hasFnAttr(Attributes::AttrVal A) const;
- /// @brief Determine whether the call or the callee has the given attribute.
- bool paramHasAttr(unsigned i, Attributes attr) const;
+ /// \brief Determine whether the call or the callee has the given attributes.
+ bool paramHasAttr(unsigned i, Attributes::AttrVal A) const;
- /// @brief Extract the alignment for a call or parameter (0=unknown).
+ /// \brief Extract the alignment for a call or parameter (0=unknown).
unsigned getParamAlignment(unsigned i) const {
return AttributeList.getParamAlignment(i);
}
- /// @brief Return true if the call should not be inlined.
- bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
- void setIsNoInline(bool Value = true) {
- if (Value) addAttribute(~0, Attribute::NoInline);
- else removeAttribute(~0, Attribute::NoInline);
+ /// \brief Return true if the call should not be inlined.
+ bool isNoInline() const { return hasFnAttr(Attributes::NoInline); }
+ void setIsNoInline() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::NoInline));
}
- /// @brief Return true if the call can return twice
+ /// \brief Return true if the call can return twice
bool canReturnTwice() const {
- return hasFnAttr(Attribute::ReturnsTwice);
+ return hasFnAttr(Attributes::ReturnsTwice);
}
- void setCanReturnTwice(bool Value = true) {
- if (Value) addAttribute(~0, Attribute::ReturnsTwice);
- else removeAttribute(~0, Attribute::ReturnsTwice);
+ void setCanReturnTwice() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::ReturnsTwice));
}
- /// @brief Determine if the call does not access memory.
+ /// \brief Determine if the call does not access memory.
bool doesNotAccessMemory() const {
- return hasFnAttr(Attribute::ReadNone);
+ return hasFnAttr(Attributes::ReadNone);
}
- void setDoesNotAccessMemory(bool NotAccessMemory = true) {
- if (NotAccessMemory) addAttribute(~0, Attribute::ReadNone);
- else removeAttribute(~0, Attribute::ReadNone);
+ void setDoesNotAccessMemory() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::ReadNone));
}
- /// @brief Determine if the call does not access or only reads memory.
+ /// \brief Determine if the call does not access or only reads memory.
bool onlyReadsMemory() const {
- return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
+ return doesNotAccessMemory() || hasFnAttr(Attributes::ReadOnly);
}
- void setOnlyReadsMemory(bool OnlyReadsMemory = true) {
- if (OnlyReadsMemory) addAttribute(~0, Attribute::ReadOnly);
- else removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone);
+ void setOnlyReadsMemory() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::ReadOnly));
}
- /// @brief Determine if the call cannot return.
- bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
- void setDoesNotReturn(bool DoesNotReturn = true) {
- if (DoesNotReturn) addAttribute(~0, Attribute::NoReturn);
- else removeAttribute(~0, Attribute::NoReturn);
+ /// \brief Determine if the call cannot return.
+ bool doesNotReturn() const { return hasFnAttr(Attributes::NoReturn); }
+ void setDoesNotReturn() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::NoReturn));
}
- /// @brief Determine if the call cannot unwind.
- bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
- void setDoesNotThrow(bool DoesNotThrow = true) {
- if (DoesNotThrow) addAttribute(~0, Attribute::NoUnwind);
- else removeAttribute(~0, Attribute::NoUnwind);
+ /// \brief Determine if the call cannot unwind.
+ bool doesNotThrow() const { return hasFnAttr(Attributes::NoUnwind); }
+ void setDoesNotThrow() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::NoUnwind));
}
- /// @brief Determine if the call returns a structure through first
+ /// \brief Determine if the call returns a structure through first
/// pointer argument.
bool hasStructRetAttr() const {
// Be friendly and also check the callee.
- return paramHasAttr(1, Attribute::StructRet);
+ return paramHasAttr(1, Attributes::StructRet);
}
- /// @brief Determine if any call argument is an aggregate passed by value.
+ /// \brief Determine if any call argument is an aggregate passed by value.
bool hasByValArgument() const {
- return AttributeList.hasAttrSomewhere(Attribute::ByVal);
+ for (unsigned I = 0, E = AttributeList.getNumAttrs(); I != E; ++I)
+ if (AttributeList.getAttributesAtIndex(I).hasAttribute(Attributes::ByVal))
+ return true;
+ return false;
}
/// getCalledFunction - Return the function called, or null if this is an
@@ -1363,7 +1363,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const CallInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Call;
}
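
Call-site attribute queries mirror the Function changes; a one-line sketch of the enum-based form, with the helper name assumed:

#include "llvm/Attributes.h"
#include "llvm/Instructions.h"
using namespace llvm;

// Illustrative only: equivalent to CallInst::onlyReadsMemory().
bool callOnlyReadsMemory(const CallInst &CI) {
  return CI.hasFnAttr(Attributes::ReadNone) || CI.hasFnAttr(Attributes::ReadOnly);
}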
@@ -1469,7 +1468,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SelectInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Select;
}
@@ -1512,7 +1510,6 @@ public:
static unsigned getPointerOperandIndex() { return 0U; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const VAArgInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == VAArg;
}
@@ -1566,7 +1563,6 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ExtractElementInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ExtractElement;
}
@@ -1625,7 +1621,6 @@ public:
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const InsertElementInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::InsertElement;
}
@@ -1706,7 +1701,6 @@ public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ShuffleVectorInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ShuffleVector;
}
@@ -1802,7 +1796,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ExtractValueInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::ExtractValue;
}
@@ -1839,7 +1832,7 @@ ExtractValueInst::ExtractValueInst(Value *Agg,
class InsertValueInst : public Instruction {
SmallVector<unsigned, 4> Indices;
- void *operator new(size_t, unsigned); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
InsertValueInst(const InsertValueInst &IVI);
void init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
const Twine &NameStr);
@@ -1924,7 +1917,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const InsertValueInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::InsertValue;
}
@@ -1970,7 +1962,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value)
// scientist's overactive imagination.
//
class PHINode : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
/// ReservedSpace - The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -2141,7 +2133,6 @@ public:
Value *hasConstantValue() const;
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const PHINode *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::PHI;
}
@@ -2178,7 +2169,7 @@ class LandingPadInst : public Instruction {
public:
enum ClauseType { Catch, Filter };
private:
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
// Allocate space for exactly zero operands.
void *operator new(size_t s) {
return User::operator new(s, 0);
@@ -2249,7 +2240,6 @@ public:
void reserveClauses(unsigned Size) { growOperands(Size); }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const LandingPadInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::LandingPad;
}
@@ -2318,7 +2308,6 @@ public:
unsigned getNumSuccessors() const { return 0; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ReturnInst *) { return true; }
static inline bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::Ret);
}
@@ -2418,7 +2407,6 @@ public:
void swapSuccessors();
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const BranchInst *) { return true; }
static inline bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::Br);
}
@@ -2445,7 +2433,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
/// SwitchInst - Multiway switch
///
class SwitchInst : public TerminatorInst {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
unsigned ReservedSpace;
// Operands format:
// Operand[0] = Value to switch on
@@ -2613,7 +2601,7 @@ public:
}
/// addCase - Add an entry to the switch instruction...
- /// @Deprecated
+ /// @deprecated
/// Note:
/// This action invalidates case_end(). Old case_end() iterator will
/// point to the added case.
@@ -2699,7 +2687,7 @@ public:
}
/// Resolves case value for current case.
- /// @Deprecated
+ /// @deprecated
ConstantIntTy *getCaseValue() {
assert(Index < SI->getNumCases() && "Index out of range for the number of cases.");
IntegersSubsetRef CaseRanges = *SubsetIt;
@@ -2803,7 +2791,7 @@ public:
CaseIt(const ParentTy& Src) : ParentTy(Src) {}
/// Sets the new value for current case.
- /// @Deprecated.
+ /// @deprecated.
void setValue(ConstantInt *V) {
assert(Index < SI->getNumCases() && "Index out of range for the number of cases.");
IntegersSubsetToBB Mapping;
@@ -2829,7 +2817,6 @@ public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SwitchInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Switch;
}
@@ -2857,7 +2844,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
/// IndirectBrInst - Indirect Branch Instruction.
///
class IndirectBrInst : public TerminatorInst {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
unsigned ReservedSpace;
// Operand[0] = Value to switch on
// Operand[1] = Default basic block destination
@@ -2928,7 +2915,6 @@ public:
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IndirectBrInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::IndirectBr;
}
@@ -2963,14 +2949,14 @@ class InvokeInst : public TerminatorInst {
/// Construct an InvokeInst given a range of arguments.
///
- /// @brief Construct an InvokeInst from a range of arguments
+ /// \brief Construct an InvokeInst from a range of arguments
inline InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
ArrayRef<Value *> Args, unsigned Values,
const Twine &NameStr, Instruction *InsertBefore);
/// Construct an InvokeInst given a range of arguments.
///
- /// @brief Construct an InvokeInst from a range of arguments
+ /// \brief Construct an InvokeInst from a range of arguments
inline InvokeInst(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException,
ArrayRef<Value *> Args, unsigned Values,
const Twine &NameStr, BasicBlock *InsertAtEnd);
@@ -3029,68 +3015,69 @@ public:
/// removeAttribute - removes the attribute from the list of attributes.
void removeAttribute(unsigned i, Attributes attr);
- /// \brief Return true if this call has the given attribute.
- bool hasFnAttr(Attributes N) const {
- return paramHasAttr(~0, N);
- }
+ /// \brief Determine whether this call has the given attribute.
+ bool hasFnAttr(Attributes::AttrVal A) const;
- /// @brief Determine whether the call or the callee has the given attribute.
- bool paramHasAttr(unsigned i, Attributes attr) const;
+ /// \brief Determine whether the call or the callee has the given attributes.
+ bool paramHasAttr(unsigned i, Attributes::AttrVal A) const;
- /// @brief Extract the alignment for a call or parameter (0=unknown).
+ /// \brief Extract the alignment for a call or parameter (0=unknown).
unsigned getParamAlignment(unsigned i) const {
return AttributeList.getParamAlignment(i);
}
- /// @brief Return true if the call should not be inlined.
- bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
- void setIsNoInline(bool Value = true) {
- if (Value) addAttribute(~0, Attribute::NoInline);
- else removeAttribute(~0, Attribute::NoInline);
+ /// \brief Return true if the call should not be inlined.
+ bool isNoInline() const { return hasFnAttr(Attributes::NoInline); }
+ void setIsNoInline() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::NoInline));
}
- /// @brief Determine if the call does not access memory.
+ /// \brief Determine if the call does not access memory.
bool doesNotAccessMemory() const {
- return hasFnAttr(Attribute::ReadNone);
+ return hasFnAttr(Attributes::ReadNone);
}
- void setDoesNotAccessMemory(bool NotAccessMemory = true) {
- if (NotAccessMemory) addAttribute(~0, Attribute::ReadNone);
- else removeAttribute(~0, Attribute::ReadNone);
+ void setDoesNotAccessMemory() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::ReadNone));
}
- /// @brief Determine if the call does not access or only reads memory.
+ /// \brief Determine if the call does not access or only reads memory.
bool onlyReadsMemory() const {
- return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
+ return doesNotAccessMemory() || hasFnAttr(Attributes::ReadOnly);
}
- void setOnlyReadsMemory(bool OnlyReadsMemory = true) {
- if (OnlyReadsMemory) addAttribute(~0, Attribute::ReadOnly);
- else removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone);
+ void setOnlyReadsMemory() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::ReadOnly));
}
- /// @brief Determine if the call cannot return.
- bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
- void setDoesNotReturn(bool DoesNotReturn = true) {
- if (DoesNotReturn) addAttribute(~0, Attribute::NoReturn);
- else removeAttribute(~0, Attribute::NoReturn);
+ /// \brief Determine if the call cannot return.
+ bool doesNotReturn() const { return hasFnAttr(Attributes::NoReturn); }
+ void setDoesNotReturn() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::NoReturn));
}
- /// @brief Determine if the call cannot unwind.
- bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
- void setDoesNotThrow(bool DoesNotThrow = true) {
- if (DoesNotThrow) addAttribute(~0, Attribute::NoUnwind);
- else removeAttribute(~0, Attribute::NoUnwind);
+ /// \brief Determine if the call cannot unwind.
+ bool doesNotThrow() const { return hasFnAttr(Attributes::NoUnwind); }
+ void setDoesNotThrow() {
+ addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(getContext(), Attributes::NoUnwind));
}
- /// @brief Determine if the call returns a structure through first
+ /// \brief Determine if the call returns a structure through first
/// pointer argument.
bool hasStructRetAttr() const {
// Be friendly and also check the callee.
- return paramHasAttr(1, Attribute::StructRet);
+ return paramHasAttr(1, Attributes::StructRet);
}
- /// @brief Determine if any call argument is an aggregate passed by value.
+ /// \brief Determine if any call argument is an aggregate passed by value.
bool hasByValArgument() const {
- return AttributeList.hasAttrSomewhere(Attribute::ByVal);
+ for (unsigned I = 0, E = AttributeList.getNumAttrs(); I != E; ++I)
+ if (AttributeList.getAttributesAtIndex(I).hasAttribute(Attributes::ByVal))
+ return true;
+ return false;
}
/// getCalledFunction - Return the function called, or null if this is an
@@ -3141,7 +3128,6 @@ public:
unsigned getNumSuccessors() const { return 2; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const InvokeInst *) { return true; }
static inline bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::Invoke);
}
@@ -3221,7 +3207,6 @@ public:
unsigned getNumSuccessors() const { return 0; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ResumeInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Resume;
}
@@ -3251,7 +3236,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
/// end of the block cannot be reached.
///
class UnreachableInst : public TerminatorInst {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
virtual UnreachableInst *clone_impl() const;
@@ -3266,7 +3251,6 @@ public:
unsigned getNumSuccessors() const { return 0; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const UnreachableInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Unreachable;
}
@@ -3283,14 +3267,14 @@ private:
// TruncInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a truncation of integer types.
+/// \brief This class represents a truncation of integer types.
class TruncInst : public CastInst {
protected:
- /// @brief Clone an identical TruncInst
+ /// \brief Clone an identical TruncInst
virtual TruncInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
TruncInst(
Value *S, ///< The value to be truncated
Type *Ty, ///< The (smaller) type to truncate to
@@ -3298,7 +3282,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
TruncInst(
Value *S, ///< The value to be truncated
Type *Ty, ///< The (smaller) type to truncate to
@@ -3306,8 +3290,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const TruncInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Trunc;
}
@@ -3320,14 +3303,14 @@ public:
// ZExtInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents zero extension of integer types.
+/// \brief This class represents zero extension of integer types.
class ZExtInst : public CastInst {
protected:
- /// @brief Clone an identical ZExtInst
+ /// \brief Clone an identical ZExtInst
virtual ZExtInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
ZExtInst(
Value *S, ///< The value to be zero extended
Type *Ty, ///< The type to zero extend to
@@ -3335,7 +3318,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end semantics.
+ /// \brief Constructor with insert-at-end semantics.
ZExtInst(
Value *S, ///< The value to be zero extended
Type *Ty, ///< The type to zero extend to
@@ -3343,8 +3326,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const ZExtInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == ZExt;
}
@@ -3357,14 +3339,14 @@ public:
// SExtInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a sign extension of integer types.
+/// \brief This class represents a sign extension of integer types.
class SExtInst : public CastInst {
protected:
- /// @brief Clone an identical SExtInst
+ /// \brief Clone an identical SExtInst
virtual SExtInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
SExtInst(
Value *S, ///< The value to be sign extended
Type *Ty, ///< The type to sign extend to
@@ -3372,7 +3354,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
SExtInst(
Value *S, ///< The value to be sign extended
Type *Ty, ///< The type to sign extend to
@@ -3380,8 +3362,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SExtInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == SExt;
}
@@ -3394,14 +3375,14 @@ public:
// FPTruncInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a truncation of floating point types.
+/// \brief This class represents a truncation of floating point types.
class FPTruncInst : public CastInst {
protected:
- /// @brief Clone an identical FPTruncInst
+ /// \brief Clone an identical FPTruncInst
virtual FPTruncInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
FPTruncInst(
Value *S, ///< The value to be truncated
Type *Ty, ///< The type to truncate to
@@ -3409,7 +3390,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
FPTruncInst(
Value *S, ///< The value to be truncated
Type *Ty, ///< The type to truncate to
@@ -3417,8 +3398,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const FPTruncInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == FPTrunc;
}
@@ -3431,14 +3411,14 @@ public:
// FPExtInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents an extension of floating point types.
+/// \brief This class represents an extension of floating point types.
class FPExtInst : public CastInst {
protected:
- /// @brief Clone an identical FPExtInst
+ /// \brief Clone an identical FPExtInst
virtual FPExtInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
FPExtInst(
Value *S, ///< The value to be extended
Type *Ty, ///< The type to extend to
@@ -3446,7 +3426,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
FPExtInst(
Value *S, ///< The value to be extended
Type *Ty, ///< The type to extend to
@@ -3454,8 +3434,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const FPExtInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == FPExt;
}
@@ -3468,14 +3447,14 @@ public:
// UIToFPInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a cast unsigned integer to floating point.
+/// \brief This class represents a cast unsigned integer to floating point.
class UIToFPInst : public CastInst {
protected:
- /// @brief Clone an identical UIToFPInst
+ /// \brief Clone an identical UIToFPInst
virtual UIToFPInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
UIToFPInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3483,7 +3462,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
UIToFPInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3491,8 +3470,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const UIToFPInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == UIToFP;
}
@@ -3505,14 +3483,14 @@ public:
// SIToFPInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a cast from signed integer to floating point.
+/// \brief This class represents a cast from signed integer to floating point.
class SIToFPInst : public CastInst {
protected:
- /// @brief Clone an identical SIToFPInst
+ /// \brief Clone an identical SIToFPInst
virtual SIToFPInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
SIToFPInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3520,7 +3498,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
SIToFPInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3528,8 +3506,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const SIToFPInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == SIToFP;
}
@@ -3542,14 +3519,14 @@ public:
// FPToUIInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a cast from floating point to unsigned integer
+/// \brief This class represents a cast from floating point to unsigned integer
class FPToUIInst : public CastInst {
protected:
- /// @brief Clone an identical FPToUIInst
+ /// \brief Clone an identical FPToUIInst
virtual FPToUIInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
FPToUIInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3557,7 +3534,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
FPToUIInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3565,8 +3542,7 @@ public:
BasicBlock *InsertAtEnd ///< Where to insert the new instruction
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const FPToUIInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == FPToUI;
}
@@ -3579,14 +3555,14 @@ public:
// FPToSIInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a cast from floating point to signed integer.
+/// \brief This class represents a cast from floating point to signed integer.
class FPToSIInst : public CastInst {
protected:
- /// @brief Clone an identical FPToSIInst
+ /// \brief Clone an identical FPToSIInst
virtual FPToSIInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
FPToSIInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3594,7 +3570,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
FPToSIInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3602,8 +3578,7 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const FPToSIInst *) { return true; }
+ /// \brief Methods for support type inquiry through isa, cast, and dyn_cast:
static inline bool classof(const Instruction *I) {
return I->getOpcode() == FPToSI;
}
@@ -3616,10 +3591,10 @@ public:
// IntToPtrInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a cast from an integer to a pointer.
+/// \brief This class represents a cast from an integer to a pointer.
class IntToPtrInst : public CastInst {
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
IntToPtrInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3627,7 +3602,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
IntToPtrInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3635,11 +3610,15 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
- /// @brief Clone an identical IntToPtrInst
+ /// \brief Clone an identical IntToPtrInst
virtual IntToPtrInst *clone_impl() const;
+ /// \brief Returns the address space of this instruction's pointer type.
+ unsigned getAddressSpace() const {
+ return getType()->getPointerAddressSpace();
+ }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntToPtrInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == IntToPtr;
}
@@ -3652,14 +3631,14 @@ public:
// PtrToIntInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a cast from a pointer to an integer
+/// \brief This class represents a cast from a pointer to an integer
class PtrToIntInst : public CastInst {
protected:
- /// @brief Clone an identical PtrToIntInst
+ /// \brief Clone an identical PtrToIntInst
virtual PtrToIntInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
PtrToIntInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3667,7 +3646,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
PtrToIntInst(
Value *S, ///< The value to be converted
Type *Ty, ///< The type to convert to
@@ -3675,8 +3654,19 @@ public:
BasicBlock *InsertAtEnd ///< The block to insert the instruction into
);
+ /// \brief Gets the pointer operand.
+ Value *getPointerOperand() { return getOperand(0); }
+ /// \brief Gets the pointer operand.
+ const Value *getPointerOperand() const { return getOperand(0); }
+ /// \brief Gets the operand index of the pointer operand.
+ static unsigned getPointerOperandIndex() { return 0U; }
+
+ /// \brief Returns the address space of the pointer operand.
+ unsigned getPointerAddressSpace() const {
+ return getPointerOperand()->getType()->getPointerAddressSpace();
+ }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const PtrToIntInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == PtrToInt;
}
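A tiny usage sketch for the two address-space accessors introduced above (IntToPtrInst::getAddressSpace and PtrToIntInst::getPointerAddressSpace); the free functions and parameter names are illustrative.

#include "llvm/Instructions.h"
using namespace llvm;

// Address space of the pointer produced by an inttoptr.
unsigned resultAddrSpace(IntToPtrInst *IP) { return IP->getAddressSpace(); }

// Address space of the pointer consumed by a ptrtoint.
unsigned sourceAddrSpace(PtrToIntInst *PI) { return PI->getPointerAddressSpace(); }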
@@ -3689,14 +3679,14 @@ public:
// BitCastInst Class
//===----------------------------------------------------------------------===//
-/// @brief This class represents a no-op cast from one type to another.
+/// \brief This class represents a no-op cast from one type to another.
class BitCastInst : public CastInst {
protected:
- /// @brief Clone an identical BitCastInst
+ /// \brief Clone an identical BitCastInst
virtual BitCastInst *clone_impl() const;
public:
- /// @brief Constructor with insert-before-instruction semantics
+ /// \brief Constructor with insert-before-instruction semantics
BitCastInst(
Value *S, ///< The value to be cast
Type *Ty, ///< The type to cast to
@@ -3704,7 +3694,7 @@ public:
Instruction *InsertBefore = 0 ///< Where to insert the new instruction
);
- /// @brief Constructor with insert-at-end-of-block semantics
+ /// \brief Constructor with insert-at-end-of-block semantics
BitCastInst(
Value *S, ///< The value to be cast
Type *Ty, ///< The type to cast to
@@ -3713,7 +3703,6 @@ public:
);
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const BitCastInst *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == BitCast;
}
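The deletions repeated through this header drop the redundant one-argument classof(const X *) overloads and replace the old DO NOT IMPLEMENT comments with LLVM_DELETED_FUNCTION. Client code built on isa<>, cast<>, and dyn_cast<> should be unaffected, since the cast machinery dispatches through the remaining classof(const Instruction *); a short sketch with illustrative names:

#include "llvm/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

bool truncatesFromI64(Instruction *I) {
  // dyn_cast<> still resolves through TruncInst::classof(const Instruction *).
  if (TruncInst *TI = dyn_cast<TruncInst>(I))
    return TI->getSrcTy()->isIntegerTy(64);
  return false;
}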
diff --git a/include/llvm/IntrinsicInst.h b/include/llvm/IntrinsicInst.h
index 1cebdd2ee642..9b2afd56e05f 100644
--- a/include/llvm/IntrinsicInst.h
+++ b/include/llvm/IntrinsicInst.h
@@ -34,9 +34,9 @@ namespace llvm {
/// functions. This allows the standard isa/dyn_cast/cast functionality to
/// work with calls to intrinsic functions.
class IntrinsicInst : public CallInst {
- IntrinsicInst(); // DO NOT IMPLEMENT
- IntrinsicInst(const IntrinsicInst&); // DO NOT IMPLEMENT
- void operator=(const IntrinsicInst&); // DO NOT IMPLEMENT
+ IntrinsicInst() LLVM_DELETED_FUNCTION;
+ IntrinsicInst(const IntrinsicInst&) LLVM_DELETED_FUNCTION;
+ void operator=(const IntrinsicInst&) LLVM_DELETED_FUNCTION;
public:
/// getIntrinsicID - Return the intrinsic ID of this intrinsic.
///
@@ -45,7 +45,6 @@ namespace llvm {
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const IntrinsicInst *) { return true; }
static inline bool classof(const CallInst *I) {
if (const Function *CF = I->getCalledFunction())
return CF->getIntrinsicID() != 0;
@@ -62,7 +61,6 @@ namespace llvm {
public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const DbgInfoIntrinsic *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
case Intrinsic::dbg_declare:
@@ -86,7 +84,6 @@ namespace llvm {
MDNode *getVariable() const { return cast<MDNode>(getArgOperand(1)); }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const DbgDeclareInst *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::dbg_declare;
}
@@ -108,7 +105,6 @@ namespace llvm {
MDNode *getVariable() const { return cast<MDNode>(getArgOperand(2)); }
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const DbgValueInst *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::dbg_value;
}
@@ -175,7 +171,6 @@ namespace llvm {
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MemIntrinsic *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
case Intrinsic::memcpy:
@@ -205,7 +200,6 @@ namespace llvm {
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MemSetInst *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memset;
}
@@ -238,7 +232,6 @@ namespace llvm {
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MemTransferInst *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memcpy ||
I->getIntrinsicID() == Intrinsic::memmove;
@@ -254,7 +247,6 @@ namespace llvm {
class MemCpyInst : public MemTransferInst {
public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MemCpyInst *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memcpy;
}
@@ -268,7 +260,6 @@ namespace llvm {
class MemMoveInst : public MemTransferInst {
public:
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MemMoveInst *) { return true; }
static inline bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memmove;
}
@@ -277,6 +268,49 @@ namespace llvm {
}
};
+ /// VAStartInst - This represents the llvm.va_start intrinsic.
+ ///
+ class VAStartInst : public IntrinsicInst {
+ public:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::vastart;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+
+ Value *getArgList() const { return const_cast<Value*>(getArgOperand(0)); }
+ };
+
+ /// VAEndInst - This represents the llvm.va_end intrinsic.
+ ///
+ class VAEndInst : public IntrinsicInst {
+ public:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::vaend;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+
+ Value *getArgList() const { return const_cast<Value*>(getArgOperand(0)); }
+ };
+
+ /// VACopyInst - This represents the llvm.va_copy intrinsic.
+ ///
+ class VACopyInst : public IntrinsicInst {
+ public:
+ static inline bool classof(const IntrinsicInst *I) {
+ return I->getIntrinsicID() == Intrinsic::vacopy;
+ }
+ static inline bool classof(const Value *V) {
+ return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+ }
+
+ Value *getDest() const { return const_cast<Value*>(getArgOperand(0)); }
+ Value *getSrc() const { return const_cast<Value*>(getArgOperand(1)); }
+ };
+
}
#endif
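A small sketch of the new VAStartInst, VAEndInst, and VACopyInst wrappers; like the other IntrinsicInst subclasses they are matched with dyn_cast. The loop and variable names below are illustrative.

#include "llvm/BasicBlock.h"
#include "llvm/IntrinsicInst.h"
using namespace llvm;

// Walk a block and look at the va_list pointers touched by the va_* intrinsics.
void inspectVarArgIntrinsics(BasicBlock &BB) {
  for (BasicBlock::iterator It = BB.begin(), E = BB.end(); It != E; ++It) {
    Instruction *Inst = &*It;
    if (VAStartInst *VAS = dyn_cast<VAStartInst>(Inst)) {
      Value *List = VAS->getArgList();   // pointer passed to llvm.va_start
      (void)List;
    } else if (VACopyInst *VAC = dyn_cast<VACopyInst>(Inst)) {
      Value *Dst = VAC->getDest();       // llvm.va_copy destination va_list
      Value *Src = VAC->getSrc();        // llvm.va_copy source va_list
      (void)Dst; (void)Src;
    }
  }
}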
diff --git a/include/llvm/Intrinsics.h b/include/llvm/Intrinsics.h
index c3503889e702..3108a8e5251c 100644
--- a/include/llvm/Intrinsics.h
+++ b/include/llvm/Intrinsics.h
@@ -50,7 +50,7 @@ namespace Intrinsic {
/// Intrinsic::getType(ID) - Return the function type for an intrinsic.
///
FunctionType *getType(LLVMContext &Context, ID id,
- ArrayRef<Type*> Tys = ArrayRef<Type*>());
+ ArrayRef<Type*> Tys = ArrayRef<Type*>());
/// Intrinsic::isOverloaded(ID) - Returns true if the intrinsic can be
/// overloaded.
@@ -58,7 +58,7 @@ namespace Intrinsic {
/// Intrinsic::getAttributes(ID) - Return the attributes for an intrinsic.
///
- AttrListPtr getAttributes(ID id);
+ AttrListPtr getAttributes(LLVMContext &C, ID id);
/// Intrinsic::getDeclaration(M, ID) - Create or insert an LLVM Function
/// declaration for an intrinsic, and return it.
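Callers of Intrinsic::getAttributes now pass the LLVMContext as an extra first argument. A one-line before/after sketch; M, Ctx, and refreshMemcpyAttrs are illustrative names.

#include "llvm/Attributes.h"
#include "llvm/Intrinsics.h"
#include "llvm/Module.h"
using namespace llvm;

void refreshMemcpyAttrs(Module &M) {
  LLVMContext &Ctx = M.getContext();
  // Before this patch: AttrListPtr AL = Intrinsic::getAttributes(Intrinsic::memcpy);
  AttrListPtr AL = Intrinsic::getAttributes(Ctx, Intrinsic::memcpy);
  (void)AL;  // typically re-applied to the declaration from Intrinsic::getDeclaration
}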
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index d1a0feef1d5a..2e1597fe6f6b 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -121,15 +121,21 @@ def llvm_metadata_ty : LLVMType<MetadataVT>; // !{...}
def llvm_x86mmx_ty : LLVMType<x86mmx>;
def llvm_ptrx86mmx_ty : LLVMPointerType<llvm_x86mmx_ty>; // <1 x i64>*
+def llvm_v2i1_ty : LLVMType<v2i1>; // 2 x i1
+def llvm_v4i1_ty : LLVMType<v4i1>; // 4 x i1
+def llvm_v8i1_ty : LLVMType<v8i1>; // 8 x i1
+def llvm_v16i1_ty : LLVMType<v16i1>; // 16 x i1
def llvm_v2i8_ty : LLVMType<v2i8>; // 2 x i8
def llvm_v4i8_ty : LLVMType<v4i8>; // 4 x i8
def llvm_v8i8_ty : LLVMType<v8i8>; // 8 x i8
def llvm_v16i8_ty : LLVMType<v16i8>; // 16 x i8
def llvm_v32i8_ty : LLVMType<v32i8>; // 32 x i8
+def llvm_v1i16_ty : LLVMType<v1i16>; // 1 x i16
def llvm_v2i16_ty : LLVMType<v2i16>; // 2 x i16
def llvm_v4i16_ty : LLVMType<v4i16>; // 4 x i16
def llvm_v8i16_ty : LLVMType<v8i16>; // 8 x i16
def llvm_v16i16_ty : LLVMType<v16i16>; // 16 x i16
+def llvm_v1i32_ty : LLVMType<v1i32>; // 1 x i32
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
@@ -279,9 +285,9 @@ let Properties = [IntrNoMem] in {
// NOTE: these are internal interfaces.
def int_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
-def int_longjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_longjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;
def int_sigsetjmp : Intrinsic<[llvm_i32_ty] , [llvm_ptr_ty, llvm_i32_ty]>;
-def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>;
+def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;
// Internal interface for object size checking
def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_ptr_ty, llvm_i1_ty],
@@ -339,7 +345,7 @@ let Properties = [IntrNoMem] in {
}
def int_eh_sjlj_functioncontext : Intrinsic<[], [llvm_ptr_ty]>;
def int_eh_sjlj_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty]>;
-def int_eh_sjlj_longjmp : Intrinsic<[], [llvm_ptr_ty]>;
+def int_eh_sjlj_longjmp : Intrinsic<[], [llvm_ptr_ty], [IntrNoReturn]>;
//===---------------- Generic Variable Attribute Intrinsics----------------===//
//
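With IntrNoReturn on the longjmp-style intrinsics, their declarations are expected to carry the noreturn attribute once the intrinsic tables are regenerated; the sketch below assumes that propagation and is not guaranteed by this hunk alone.

#include "llvm/Function.h"
#include "llvm/Intrinsics.h"
#include "llvm/Module.h"
using namespace llvm;

bool sjljLongJmpIsNoReturn(Module &M) {
  Function *LJ = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_longjmp);
  // Assumption: IntrNoReturn is reflected in the generated attribute tables,
  // so the declaration reports itself as noreturn.
  return LJ->doesNotReturn();
}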
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index fa8034e0c2ce..93b1ae1dc887 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -16,147 +16,136 @@
// TLS
let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
- Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
-}
+
+def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
+ Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
//===----------------------------------------------------------------------===//
// Saturating Arithmetic
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
- def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-}
+def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
//===----------------------------------------------------------------------===//
// Load and Store exclusive doubleword
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
- llvm_ptr_ty], [IntrReadWriteArgMem]>;
- def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty],
- [IntrReadArgMem]>;
-}
+def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
+ llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty],
+ [IntrReadArgMem]>;
//===----------------------------------------------------------------------===//
// VFP
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">,
- Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
- def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">,
- Intrinsic<[], [llvm_i32_ty], []>;
- def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
- [IntrNoMem]>;
- def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
- [IntrNoMem]>;
-}
+def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">,
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
+ [IntrNoMem]>;
+def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
+ [IntrNoMem]>;
//===----------------------------------------------------------------------===//
// Coprocessor
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- // Move to coprocessor
- def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
-
- // Move from coprocessor
- def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
-
- // Coprocessor data processing
- def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
-
- // Move from two registers to coprocessor
- def int_arm_mcrr : GCCBuiltin<"__builtin_arm_mcrr">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
-}
+// Move to coprocessor
+def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+// Move from coprocessor
+def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
+
+// Coprocessor data processing
+def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+// Move from two registers to coprocessor
+def int_arm_mcrr : GCCBuiltin<"__builtin_arm_mcrr">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON)
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
-
- // The following classes do not correspond directly to GCC builtins.
- class Neon_1Arg_Intrinsic
- : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- class Neon_1Arg_Narrow_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
- class Neon_2Arg_Intrinsic
- : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem]>;
- class Neon_2Arg_Narrow_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMExtendedElementVectorType<0>,
- LLVMExtendedElementVectorType<0>],
- [IntrNoMem]>;
- class Neon_2Arg_Long_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMTruncatedElementVectorType<0>,
- LLVMTruncatedElementVectorType<0>],
- [IntrNoMem]>;
- class Neon_3Arg_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem]>;
- class Neon_3Arg_Long_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMTruncatedElementVectorType<0>,
- LLVMTruncatedElementVectorType<0>],
- [IntrNoMem]>;
- class Neon_CvtFxToFP_Intrinsic
- : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
- class Neon_CvtFPToFx_Intrinsic
- : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
-
- // The table operands for VTBL and VTBX consist of 1 to 4 v8i8 vectors.
- // Besides the table, VTBL has one other v8i8 argument and VTBX has two.
- // Overall, the classes range from 2 to 6 v8i8 arguments.
- class Neon_Tbl2Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
- class Neon_Tbl3Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
- class Neon_Tbl4Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty],
- [IntrNoMem]>;
- class Neon_Tbl5Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
- llvm_v8i8_ty], [IntrNoMem]>;
- class Neon_Tbl6Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
- llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
-}
+// The following classes do not correspond directly to GCC builtins.
+class Neon_1Arg_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+class Neon_1Arg_Narrow_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
+class Neon_2Arg_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+class Neon_2Arg_Narrow_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMExtendedElementVectorType<0>,
+ LLVMExtendedElementVectorType<0>],
+ [IntrNoMem]>;
+class Neon_2Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMTruncatedElementVectorType<0>,
+ LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+class Neon_3Arg_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+class Neon_3Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMTruncatedElementVectorType<0>,
+ LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+class Neon_CvtFxToFP_Intrinsic
+ : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+class Neon_CvtFPToFx_Intrinsic
+ : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
+
+// The table operands for VTBL and VTBX consist of 1 to 4 v8i8 vectors.
+// Besides the table, VTBL has one other v8i8 argument and VTBX has two.
+// Overall, the classes range from 2 to 6 v8i8 arguments.
+class Neon_Tbl2Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
+class Neon_Tbl3Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
+class Neon_Tbl4Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty],
+ [IntrNoMem]>;
+class Neon_Tbl5Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
+ llvm_v8i8_ty], [IntrNoMem]>;
+class Neon_Tbl6Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
+ llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
// Arithmetic ops
@@ -209,20 +198,18 @@ def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
// Vector Absolute Compare.
-let TargetPrefix = "arm" in {
- def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
- [llvm_v2f32_ty, llvm_v2f32_ty],
- [IntrNoMem]>;
- def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
- [llvm_v2f32_ty, llvm_v2f32_ty],
- [IntrNoMem]>;
- def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
-}
+def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
+ [llvm_v2f32_ty, llvm_v2f32_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
+ [llvm_v2f32_ty, llvm_v2f32_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
// Vector Absolute Differences.
def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
@@ -235,24 +222,20 @@ def int_arm_neon_vpadd : Neon_2Arg_Intrinsic;
// Note: This is different than the other "long" NEON intrinsics because
// the result vector has half as many elements as the source vector.
// The source and destination vector types must be specified separately.
-let TargetPrefix = "arm" in {
- def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
- [IntrNoMem]>;
- def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
- [IntrNoMem]>;
-}
+def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
+ [IntrNoMem]>;
// Vector Pairwise Add and Accumulate Long.
// Note: This is similar to vpaddl but the destination vector also appears
// as the first argument.
-let TargetPrefix = "arm" in {
- def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty],
- [IntrNoMem]>;
- def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty],
- [IntrNoMem]>;
-}
+def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty],
+ [IntrNoMem]>;
// Vector Pairwise Maximum and Minimum.
def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
@@ -364,79 +347,83 @@ def int_arm_neon_vtbx2 : Neon_Tbl4Arg_Intrinsic;
def int_arm_neon_vtbx3 : Neon_Tbl5Arg_Intrinsic;
def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;
-let TargetPrefix = "arm" in {
-
- // De-interleaving vector loads from N-element structures.
- // Source operands are the address and alignment.
- def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
-
- // Vector load N-element structure to one lane.
- // Source operands are: the address, the N input vectors (since only one
- // lane is assigned), the lane number, and the alignment.
- def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
- [llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadArgMem]>;
- def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>],
- [llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>],
- [llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadArgMem]>;
-
- // Interleaving vector stores from N-element structures.
- // Source operands are: the address, the N vectors, and the alignment.
- def int_arm_neon_vst1 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
- def int_arm_neon_vst2 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_arm_neon_vst3 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
- def int_arm_neon_vst4 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
-
- // Vector store N-element structure from one lane.
- // Source operands are: the address, the N vectors, the lane number, and
- // the alignment.
- def int_arm_neon_vst2lane : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
- def int_arm_neon_vst3lane : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_arm_neon_vst4lane : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
-}
+// De-interleaving vector loads from N-element structures.
+// Source operands are the address and alignment.
+def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+
+// Vector load N-element structure to one lane.
+// Source operands are: the address, the N input vectors (since only one
+// lane is assigned), the lane number, and the alignment.
+def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_ptr_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadArgMem]>;
+def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [llvm_ptr_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_ptr_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadArgMem]>;
+
+// Interleaving vector stores from N-element structures.
+// Source operands are: the address, the N vectors, and the alignment.
+def int_arm_neon_vst1 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_arm_neon_vst2 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+def int_arm_neon_vst3 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_arm_neon_vst4 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+
+// Vector store N-element structure from one lane.
+// Source operands are: the address, the N vectors, the lane number, and
+// the alignment.
+def int_arm_neon_vst2lane : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_arm_neon_vst3lane : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+def int_arm_neon_vst4lane : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+
+// Vector bitwise select.
+def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+} // end TargetPrefix
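
These NEON structured load/store intrinsics are overloaded on their vector type, so a frontend reaches them through the usual overloaded-intrinsic machinery. A minimal C++ sketch of emitting one of the de-interleaving loads, assuming an existing Module, IRBuilder<> and pointer value; the helper name is illustrative and the enumerator follows the int_arm_neon_vld2 definition above:

    // C++ sketch: build a call to llvm.arm.neon.vld2.v4i16 (address + alignment).
    #include "llvm/DerivedTypes.h"
    #include "llvm/IRBuilder.h"
    #include "llvm/Intrinsics.h"
    #include "llvm/Module.h"
    using namespace llvm;

    static Value *emitVld2(Module *M, IRBuilder<> &Builder, Value *Ptr) {
      Type *VecTy = VectorType::get(Type::getInt16Ty(M->getContext()), 4);
      // vld2 is overloaded on its result vector type (llvm_anyvector_ty above).
      Function *Vld2 = Intrinsic::getDeclaration(M, Intrinsic::arm_neon_vld2,
                                                 VecTy);
      // Operands are the address and the alignment, per the comment above.
      Value *Align = Builder.getInt32(8);
      // The result is a two-element struct holding the de-interleaved vectors.
      return Builder.CreateCall2(Vld2, Ptr, Align, "vld2");
    }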
diff --git a/include/llvm/IntrinsicsMips.td b/include/llvm/IntrinsicsMips.td
index 4375ac2a7a1b..e40e162a158d 100644
--- a/include/llvm/IntrinsicsMips.td
+++ b/include/llvm/IntrinsicsMips.td
@@ -14,11 +14,15 @@
//===----------------------------------------------------------------------===//
// MIPS DSP data types
def mips_v2q15_ty: LLVMType<v2i16>;
+def mips_v4q7_ty: LLVMType<v4i8>;
def mips_q31_ty: LLVMType<i32>;
let TargetPrefix = "mips" in { // All intrinsics start with "llvm.mips.".
//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+
+//===----------------------------------------------------------------------===//
// Addition/subtraction
def int_mips_addu_qb : GCCBuiltin<"__builtin_mips_addu_qb">,
@@ -261,4 +265,125 @@ def int_mips_lhx: GCCBuiltin<"__builtin_mips_lhx">,
Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
def int_mips_lwx: GCCBuiltin<"__builtin_mips_lwx">,
Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadArgMem]>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+
+def int_mips_absq_s_qb: GCCBuiltin<"__builtin_mips_absq_s_qb">,
+ Intrinsic<[mips_v4q7_ty], [mips_v4q7_ty], []>;
+
+def int_mips_addqh_ph: GCCBuiltin<"__builtin_mips_addqh_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_addqh_r_ph: GCCBuiltin<"__builtin_mips_addqh_r_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_addqh_w: GCCBuiltin<"__builtin_mips_addqh_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_addqh_r_w: GCCBuiltin<"__builtin_mips_addqh_r_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty],
+ [IntrNoMem, Commutative]>;
+
+def int_mips_addu_ph: GCCBuiltin<"__builtin_mips_addu_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+def int_mips_addu_s_ph: GCCBuiltin<"__builtin_mips_addu_s_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+
+def int_mips_adduh_qb: GCCBuiltin<"__builtin_mips_adduh_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem, Commutative]>;
+def int_mips_adduh_r_qb: GCCBuiltin<"__builtin_mips_adduh_r_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty],
+ [IntrNoMem, Commutative]>;
+
+def int_mips_append: GCCBuiltin<"__builtin_mips_append">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+def int_mips_balign: GCCBuiltin<"__builtin_mips_balign">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_cmpgdu_eq_qb: GCCBuiltin<"__builtin_mips_cmpgdu_eq_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgdu_lt_qb: GCCBuiltin<"__builtin_mips_cmpgdu_lt_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+def int_mips_cmpgdu_le_qb: GCCBuiltin<"__builtin_mips_cmpgdu_le_qb">,
+ Intrinsic<[llvm_i32_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [Commutative]>;
+
+def int_mips_dpa_w_ph: GCCBuiltin<"__builtin_mips_dpa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+def int_mips_dps_w_ph: GCCBuiltin<"__builtin_mips_dps_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+
+def int_mips_dpaqx_s_w_ph: GCCBuiltin<"__builtin_mips_dpaqx_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpaqx_sa_w_ph: GCCBuiltin<"__builtin_mips_dpaqx_sa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpax_w_ph: GCCBuiltin<"__builtin_mips_dpax_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+def int_mips_dpsx_w_ph: GCCBuiltin<"__builtin_mips_dpsx_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+def int_mips_dpsqx_s_w_ph: GCCBuiltin<"__builtin_mips_dpsqx_s_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+def int_mips_dpsqx_sa_w_ph: GCCBuiltin<"__builtin_mips_dpsqx_sa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, mips_v2q15_ty, mips_v2q15_ty], []>;
+
+def int_mips_mul_ph: GCCBuiltin<"__builtin_mips_mul_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+def int_mips_mul_s_ph: GCCBuiltin<"__builtin_mips_mul_s_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], [Commutative]>;
+
+def int_mips_mulq_rs_w: GCCBuiltin<"__builtin_mips_mulq_rs_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>;
+def int_mips_mulq_s_ph: GCCBuiltin<"__builtin_mips_mulq_s_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [Commutative]>;
+def int_mips_mulq_s_w: GCCBuiltin<"__builtin_mips_mulq_s_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [Commutative]>;
+def int_mips_mulsa_w_ph: GCCBuiltin<"__builtin_mips_mulsa_w_ph">,
+ Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_v2i16_ty, llvm_v2i16_ty],
+ [IntrNoMem]>;
+
+def int_mips_precr_qb_ph: GCCBuiltin<"__builtin_mips_precr_qb_ph">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
+def int_mips_precr_sra_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_ph_w">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+def int_mips_precr_sra_r_ph_w: GCCBuiltin<"__builtin_mips_precr_sra_r_ph_w">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_prepend: GCCBuiltin<"__builtin_mips_prepend">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+def int_mips_shra_qb: GCCBuiltin<"__builtin_mips_shra_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shra_r_qb: GCCBuiltin<"__builtin_mips_shra_r_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_mips_shrl_ph: GCCBuiltin<"__builtin_mips_shrl_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_i32_ty], [IntrNoMem]>;
+
+def int_mips_subqh_ph: GCCBuiltin<"__builtin_mips_subqh_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_subqh_r_ph: GCCBuiltin<"__builtin_mips_subqh_r_ph">,
+ Intrinsic<[mips_v2q15_ty], [mips_v2q15_ty, mips_v2q15_ty], [IntrNoMem]>;
+def int_mips_subqh_w: GCCBuiltin<"__builtin_mips_subqh_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>;
+def int_mips_subqh_r_w: GCCBuiltin<"__builtin_mips_subqh_r_w">,
+ Intrinsic<[mips_q31_ty], [mips_q31_ty, mips_q31_ty], [IntrNoMem]>;
+
+def int_mips_subu_ph: GCCBuiltin<"__builtin_mips_subu_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
+def int_mips_subu_s_ph: GCCBuiltin<"__builtin_mips_subu_s_ph">,
+ Intrinsic<[llvm_v2i16_ty], [llvm_v2i16_ty, llvm_v2i16_ty], []>;
+
+def int_mips_subuh_qb: GCCBuiltin<"__builtin_mips_subuh_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>;
+def int_mips_subuh_r_qb: GCCBuiltin<"__builtin_mips_subuh_r_qb">,
+ Intrinsic<[llvm_v4i8_ty], [llvm_v4i8_ty, llvm_v4i8_ty], [IntrNoMem]>;
}
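
The "addqh" family above adds two fractional vectors and halves each result lane, with the *_r_* variants rounding before the halving. A hedged scalar reference model for one Q15 lane, not code from this tree, with semantics taken from the DSP Rev 2 ADDQH/ADDQH_R description:

    // C++ sketch: add-and-halve of two Q15 lanes, optional rounding as in *_r_*.
    static short addqhLane(short A, short B, bool Round) {
      int Sum = int(A) + int(B) + (Round ? 1 : 0); // widen; no overflow possible
      return short(Sum >> 1);                      // halve back into Q15 range
    }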
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index e8039f23583a..d2463c0efa14 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -219,7 +219,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse_storeu_ps : GCCBuiltin<"__builtin_ia32_storeups">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4f32_ty], []>;
+ llvm_v4f32_ty], [IntrReadWriteArgMem]>;
}
// Cacheability support ops
@@ -502,13 +502,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_sse2_storeu_pd : GCCBuiltin<"__builtin_ia32_storeupd">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v2f64_ty], []>;
+ llvm_v2f64_ty], [IntrReadWriteArgMem]>;
def int_x86_sse2_storeu_dq : GCCBuiltin<"__builtin_ia32_storedqu">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v16i8_ty], []>;
+ llvm_v16i8_ty], [IntrReadWriteArgMem]>;
def int_x86_sse2_storel_dq : GCCBuiltin<"__builtin_ia32_storelv4si">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4i32_ty], []>;
+ llvm_v4i32_ty], [IntrReadWriteArgMem]>;
}
// Misc.
@@ -1270,19 +1270,19 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_vbroadcast_ss :
GCCBuiltin<"__builtin_ia32_vbroadcastss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcast_sd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastsd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcast_ss_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastss256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_pd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_ps_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_ps256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
}
// SIMD load ops
@@ -1294,41 +1294,45 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// SIMD store ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_storeu_pd_256 : GCCBuiltin<"__builtin_ia32_storeupd256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4f64_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_storeu_ps_256 : GCCBuiltin<"__builtin_ia32_storeups256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v8f32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_storeu_dq_256 : GCCBuiltin<"__builtin_ia32_storedqu256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v32i8_ty], [IntrReadWriteArgMem]>;
}
// Conditional load ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
- Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2f64_ty],
+ [IntrReadArgMem]>;
def int_x86_avx_maskload_ps : GCCBuiltin<"__builtin_ia32_maskloadps">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4f32_ty],
+ [IntrReadArgMem]>;
def int_x86_avx_maskload_pd_256 : GCCBuiltin<"__builtin_ia32_maskloadpd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4f64_ty],
+ [IntrReadArgMem]>;
def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8f32_ty],
+ [IntrReadArgMem]>;
}
// Conditional store ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v2f64_ty, llvm_v2f64_ty], []>;
+ llvm_v2f64_ty, llvm_v2f64_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_maskstore_ps : GCCBuiltin<"__builtin_ia32_maskstoreps">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4f32_ty, llvm_v4f32_ty], []>;
+ llvm_v4f32_ty, llvm_v4f32_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_maskstore_pd_256 :
GCCBuiltin<"__builtin_ia32_maskstorepd256">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v4f64_ty, llvm_v4f64_ty], []>;
+ llvm_v4f64_ty, llvm_v4f64_ty], [IntrReadWriteArgMem]>;
def int_x86_avx_maskstore_ps_256 :
GCCBuiltin<"__builtin_ia32_maskstoreps256">,
Intrinsic<[], [llvm_ptr_ty,
- llvm_v8f32_ty, llvm_v8f32_ty], []>;
+ llvm_v8f32_ty, llvm_v8f32_ty], [IntrReadWriteArgMem]>;
}
//===----------------------------------------------------------------------===//
@@ -1632,7 +1636,7 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_avx2_vbroadcasti128 :
GCCBuiltin<"__builtin_ia32_vbroadcastsi256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx2_pbroadcastb_128 :
GCCBuiltin<"__builtin_ia32_pbroadcastb128">,
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>;
@@ -1685,27 +1689,35 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Conditional load ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">,
- Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty],
+ [IntrReadArgMem]>;
def int_x86_avx2_maskload_q : GCCBuiltin<"__builtin_ia32_maskloadq">,
- Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty],
+ [IntrReadArgMem]>;
def int_x86_avx2_maskload_d_256 : GCCBuiltin<"__builtin_ia32_maskloadd256">,
- Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
+ [IntrReadArgMem]>;
def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty], [IntrReadMem]>;
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
+ [IntrReadArgMem]>;
}
// Conditional store ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_maskstore_d : GCCBuiltin<"__builtin_ia32_maskstored">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+ [IntrReadWriteArgMem]>;
def int_x86_avx2_maskstore_q : GCCBuiltin<"__builtin_ia32_maskstoreq">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty],
+ [IntrReadWriteArgMem]>;
def int_x86_avx2_maskstore_d_256 :
GCCBuiltin<"__builtin_ia32_maskstored256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrReadWriteArgMem]>;
def int_x86_avx2_maskstore_q_256 :
GCCBuiltin<"__builtin_ia32_maskstoreq256">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty], []>;
+ Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty],
+ [IntrReadWriteArgMem]>;
}
// Variable bit shift ops
@@ -2547,3 +2559,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_rdrand_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>;
def int_x86_rdrand_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>;
}
+
+//===----------------------------------------------------------------------===//
+// RTM intrinsics. Transactional Memory support.
+
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_xbegin : GCCBuiltin<"__builtin_ia32_xbegin">,
+ Intrinsic<[llvm_i32_ty], [], []>;
+ def int_x86_xend : GCCBuiltin<"__builtin_ia32_xend">,
+ Intrinsic<[], [], []>;
+ def int_x86_xabort : GCCBuiltin<"__builtin_ia32_xabort">,
+ Intrinsic<[], [llvm_i8_ty], [IntrNoReturn]>;
+}
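
A short usage sketch of the corresponding builtins named above, assuming a toolchain that exposes them and an RTM-capable CPU; the fallback policy and helper name are illustrative:

    // C++ sketch: run a small critical section transactionally, with an abort path.
    static bool tryTransaction(volatile int &Counter) {
      unsigned Status = __builtin_ia32_xbegin();   // llvm.x86.xbegin
      if (Status == 0xFFFFFFFFu) {                 // transaction started
        if (Counter < 0)
          __builtin_ia32_xabort(1);                // llvm.x86.xabort, imm8 abort code
        ++Counter;
        __builtin_ia32_xend();                     // llvm.x86.xend, commit
        return true;
      }
      return false;                                // aborted or RTM unavailable
    }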
diff --git a/include/llvm/LLVMContext.h b/include/llvm/LLVMContext.h
index a8306a9e7617..5903e2e55e1f 100644
--- a/include/llvm/LLVMContext.h
+++ b/include/llvm/LLVMContext.h
@@ -15,6 +15,8 @@
#ifndef LLVM_LLVMCONTEXT_H
#define LLVM_LLVMCONTEXT_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class LLVMContextImpl;
@@ -43,7 +45,8 @@ public:
MD_tbaa = 1, // "tbaa"
MD_prof = 2, // "prof"
MD_fpmath = 3, // "fpmath"
- MD_range = 4 // "range"
+ MD_range = 4, // "range"
+ MD_tbaa_struct = 5 // "tbaa.struct"
};
/// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
@@ -87,9 +90,8 @@ public:
void emitError(const Twine &ErrorStr);
private:
- // DO NOT IMPLEMENT
- LLVMContext(LLVMContext&);
- void operator=(LLVMContext&);
+ LLVMContext(LLVMContext&) LLVM_DELETED_FUNCTION;
+ void operator=(LLVMContext&) LLVM_DELETED_FUNCTION;
/// addModule - Register a module as being instantiated in this context. If
/// the context is deleted, the module will be deleted as well.
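
The new fixed metadata kind above (MD_tbaa_struct) can be attached to an instruction without a string lookup. A minimal sketch, assuming an existing Instruction *I and an MDNode *N describing the copied aggregate; the helper name is illustrative:

    // C++ sketch: tag an instruction with !tbaa.struct metadata by fixed kind ID.
    #include "llvm/Instruction.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Metadata.h"
    using namespace llvm;

    static void addTBAAStruct(Instruction *I, MDNode *N) {
      // MD_tbaa_struct is pre-registered by LLVMContext, so this is equivalent
      // to I->setMetadata(Ctx.getMDKindID("tbaa.struct"), N).
      I->setMetadata(LLVMContext::MD_tbaa_struct, N);
    }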
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 697c94c094b0..806e4b37b73d 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -60,10 +60,12 @@ namespace {
(void) llvm::createCFGSimplificationPass();
(void) llvm::createConstantMergePass();
(void) llvm::createConstantPropagationPass();
+ (void) llvm::createCostModelAnalysisPass();
(void) llvm::createDeadArgEliminationPass();
(void) llvm::createDeadCodeEliminationPass();
(void) llvm::createDeadInstEliminationPass();
(void) llvm::createDeadStoreEliminationPass();
+ (void) llvm::createDependenceAnalysisPass();
(void) llvm::createDomOnlyPrinterPass();
(void) llvm::createDomPrinterPass();
(void) llvm::createDomOnlyViewerPass();
@@ -81,11 +83,10 @@ namespace {
(void) llvm::createIPSCCPPass();
(void) llvm::createIndVarSimplifyPass();
(void) llvm::createInstructionCombiningPass();
- (void) llvm::createInternalizePass(false);
+ (void) llvm::createInternalizePass();
(void) llvm::createLCSSAPass();
(void) llvm::createLICMPass();
(void) llvm::createLazyValueInfoPass();
- (void) llvm::createLoopDependenceAnalysisPass();
(void) llvm::createLoopExtractorPass();
(void) llvm::createLoopSimplifyPass();
(void) llvm::createLoopStrengthReducePass();
@@ -107,6 +108,7 @@ namespace {
(void) llvm::createProfileVerifierPass();
(void) llvm::createPathProfileVerifierPass();
(void) llvm::createProfileLoaderPass();
+ (void) llvm::createProfileMetadataLoaderPass();
(void) llvm::createPathProfileLoaderPass();
(void) llvm::createPromoteMemoryToRegisterPass();
(void) llvm::createDemoteRegisterToMemoryPass();
@@ -140,6 +142,7 @@ namespace {
(void) llvm::createLoopDeletionPass();
(void) llvm::createPostDomTree();
(void) llvm::createInstructionNamerPass();
+ (void) llvm::createMetaRenamerPass();
(void) llvm::createFunctionAttrsPass();
(void) llvm::createMergeFunctionsPass();
(void) llvm::createPrintModulePass(0);
@@ -153,6 +156,7 @@ namespace {
(void) llvm::createCorrelatedValuePropagationPass();
(void) llvm::createMemDepPrinter();
(void) llvm::createInstructionSimplifierPass();
+ (void) llvm::createLoopVectorizePass();
(void) llvm::createBBVectorizePass();
(void)new llvm::IntervalPartition();
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 05e6286b7cc5..72ed1a317c55 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -30,12 +30,13 @@ class raw_ostream;
/// MCAsmBackend - Generic interface to target specific assembler backends.
class MCAsmBackend {
- MCAsmBackend(const MCAsmBackend &); // DO NOT IMPLEMENT
- void operator=(const MCAsmBackend &); // DO NOT IMPLEMENT
+ MCAsmBackend(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmBackend &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCAsmBackend();
unsigned HasReliableSymbolDifference : 1;
+ unsigned HasDataInCodeSupport : 1;
public:
virtual ~MCAsmBackend();
@@ -65,6 +66,12 @@ public:
return HasReliableSymbolDifference;
}
+ /// hasDataInCodeSupport - Check whether this target implements data-in-code
+ /// markers. If not, data region directives will be ignored.
+ bool hasDataInCodeSupport() const {
+ return HasDataInCodeSupport;
+ }
+
/// doesSectionRequireSymbols - Check whether the given section requires that
/// all symbols (even temporaries) have symbol table entries.
virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
@@ -99,7 +106,7 @@ public:
/// @}
- /// applyFixup - Apply the \arg Value for given \arg Fixup into the provided
+  /// applyFixup - Apply the \p Value for the given \p Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
@@ -126,13 +133,20 @@ public:
/// RelaxInstruction - Relax the instruction in the given fragment to the next
/// wider instruction.
///
- /// \param Inst - The instruction to relax, which may be the same as the
+ /// \param Inst The instruction to relax, which may be the same as the
/// output.
- /// \parm Res [output] - On return, the relaxed instruction.
+ /// \param [out] Res On return, the relaxed instruction.
virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const = 0;
/// @}
+ /// getMinimumNopSize - Returns the minimum size of a nop in bytes on this
+ /// target. The assembler will use this to emit excess padding in situations
+ /// where the padding required for simple alignment would be less than the
+ /// minimum nop size.
+ ///
+ virtual unsigned getMinimumNopSize() const { return 1; }
+
/// writeNopData - Write an (optimal) nop sequence of Count bytes to the given
/// output. If the target cannot generate such a sequence, it should return an
/// error.
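
The two hooks added above can be queried together when the assembler lays out alignment padding in code. A minimal sketch under the assumption that a too-small gap is grown by one alignment stride; the helper name and policy are illustrative, not the assembler's actual layout code:

    // C++ sketch: grow an alignment gap that is too small to hold a single nop.
    #include "llvm/MC/MCAsmBackend.h"

    static uint64_t paddingToEmit(const llvm::MCAsmBackend &Backend,
                                  uint64_t Gap, uint64_t Alignment) {
      // getMinimumNopSize() defaults to 1, so most targets are unaffected.
      if (Gap != 0 && Gap < Backend.getMinimumNopSize())
        Gap += Alignment;   // emit excess padding while keeping the result aligned
      return Gap;
    }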
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index 9f5230b9c8fa..97aad71fd955 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -33,7 +33,7 @@ namespace llvm {
}
namespace LCOMM {
- enum LCOMMType { None, NoAlignment, ByteAlignment };
+ enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment };
}
/// MCAsmInfo - This class is intended to be used as a base class for asm
@@ -247,14 +247,14 @@ namespace llvm {
/// .long a - b
bool HasAggressiveSymbolFolding; // Defaults to true.
- /// LCOMMDirectiveType - Describes if the target supports the .lcomm
- /// directive and whether it has an alignment parameter.
- LCOMM::LCOMMType LCOMMDirectiveType; // Defaults to LCOMM::None.
-
- /// COMMDirectiveAlignmentIsInBytes - True is COMMDirective's optional
+  /// COMMDirectiveAlignmentIsInBytes - True if .comm's and .lcomm's optional
/// alignment is to be specified in bytes instead of log2(n).
bool COMMDirectiveAlignmentIsInBytes; // Defaults to true;
+ /// LCOMMDirectiveAlignment - Describes if the .lcomm directive for the
+ /// target supports an alignment argument and how it is interpreted.
+ LCOMM::LCOMMType LCOMMDirectiveAlignmentType; // Defaults to NoAlignment.
+
/// HasDotTypeDotSizeDirective - True if the target has .type and .size
/// directives, this is true for most ELF targets.
bool HasDotTypeDotSizeDirective; // Defaults to true.
@@ -496,13 +496,13 @@ namespace llvm {
bool hasAggressiveSymbolFolding() const {
return HasAggressiveSymbolFolding;
}
- LCOMM::LCOMMType getLCOMMDirectiveType() const {
- return LCOMMDirectiveType;
- }
- bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;}
bool getCOMMDirectiveAlignmentIsInBytes() const {
return COMMDirectiveAlignmentIsInBytes;
}
+ LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const {
+ return LCOMMDirectiveAlignmentType;
+ }
+ bool hasDotTypeDotSizeDirective() const {return HasDotTypeDotSizeDirective;}
bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
bool hasNoDeadStrip() const { return HasNoDeadStrip; }
bool hasSymbolResolver() const { return HasSymbolResolver; }
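
A hedged sketch of how an asm streamer might honor the reworked .lcomm hooks above when printing the directive; the surrounding printer code and variable names are illustrative:

    // C++ sketch: choose the .lcomm spelling based on the new alignment kind.
    #include "llvm/ADT/StringRef.h"
    #include "llvm/MC/MCAsmInfo.h"
    #include "llvm/Support/MathExtras.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void printLCOMM(raw_ostream &OS, const MCAsmInfo &MAI,
                           StringRef Name, uint64_t Size, unsigned ByteAlign) {
      OS << "\t.lcomm\t" << Name << ',' << Size;
      switch (MAI.getLCOMMDirectiveAlignmentType()) {
      case LCOMM::NoAlignment:                       // target takes no alignment
        break;
      case LCOMM::ByteAlignment:                     // alignment given in bytes
        OS << ',' << ByteAlign;
        break;
      case LCOMM::Log2Alignment:                     // alignment given as log2(n)
        OS << ',' << Log2_32(ByteAlign);
        break;
      }
      OS << '\n';
    }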
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index b7b2d663f4cc..5771415c81cc 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -40,8 +40,8 @@ class MCAsmBackend;
class MCFragment : public ilist_node<MCFragment> {
friend class MCAsmLayout;
- MCFragment(const MCFragment&); // DO NOT IMPLEMENT
- void operator=(const MCFragment&); // DO NOT IMPLEMENT
+ MCFragment(const MCFragment&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCFragment&) LLVM_DELETED_FUNCTION;
public:
enum FragmentType {
@@ -99,8 +99,6 @@ public:
unsigned getLayoutOrder() const { return LayoutOrder; }
void setLayoutOrder(unsigned Value) { LayoutOrder = Value; }
- static bool classof(const MCFragment *O) { return true; }
-
void dump();
};
@@ -151,7 +149,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Data;
}
- static bool classof(const MCDataFragment *) { return true; }
};
// FIXME: This current incarnation of MCInstFragment doesn't make much sense, as
@@ -176,7 +173,7 @@ public:
typedef SmallVectorImpl<MCFixup>::iterator fixup_iterator;
public:
- MCInstFragment(MCInst _Inst, MCSectionData *SD = 0)
+ MCInstFragment(const MCInst &_Inst, MCSectionData *SD = 0)
: MCFragment(FT_Inst, SD), Inst(_Inst) {
}
@@ -191,7 +188,7 @@ public:
MCInst &getInst() { return Inst; }
const MCInst &getInst() const { return Inst; }
- void setInst(MCInst Value) { Inst = Value; }
+ void setInst(const MCInst& Value) { Inst = Value; }
/// @}
/// @name Fixup Access
@@ -213,7 +210,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Inst;
}
- static bool classof(const MCInstFragment *) { return true; }
};
class MCAlignFragment : public MCFragment {
@@ -225,7 +221,7 @@ class MCAlignFragment : public MCFragment {
/// Value - Value to use for filling padding bytes.
int64_t Value;
- /// ValueSize - The size of the integer (in bytes) of \arg Value.
+ /// ValueSize - The size of the integer (in bytes) of \p Value.
unsigned ValueSize;
/// MaxBytesToEmit - The maximum number of bytes to emit; if the alignment
@@ -263,7 +259,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Align;
}
- static bool classof(const MCAlignFragment *) { return true; }
};
class MCFillFragment : public MCFragment {
@@ -272,7 +267,7 @@ class MCFillFragment : public MCFragment {
/// Value - Value to use for filling bytes.
int64_t Value;
- /// ValueSize - The size (in bytes) of \arg Value to use when filling, or 0 if
+ /// ValueSize - The size (in bytes) of \p Value to use when filling, or 0 if
/// this is a virtual fill fragment.
unsigned ValueSize;
@@ -302,7 +297,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Fill;
}
- static bool classof(const MCFillFragment *) { return true; }
};
class MCOrgFragment : public MCFragment {
@@ -331,7 +325,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Org;
}
- static bool classof(const MCOrgFragment *) { return true; }
};
class MCLEBFragment : public MCFragment {
@@ -364,7 +357,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_LEB;
}
- static bool classof(const MCLEBFragment *) { return true; }
};
class MCDwarfLineAddrFragment : public MCFragment {
@@ -401,7 +393,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_Dwarf;
}
- static bool classof(const MCDwarfLineAddrFragment *) { return true; }
};
class MCDwarfCallFrameFragment : public MCFragment {
@@ -431,7 +422,6 @@ public:
static bool classof(const MCFragment *F) {
return F->getKind() == MCFragment::FT_DwarfFrame;
}
- static bool classof(const MCDwarfCallFrameFragment *) { return true; }
};
// FIXME: Should this be a separate class, or just merged into MCSection? Since
@@ -440,8 +430,8 @@ public:
class MCSectionData : public ilist_node<MCSectionData> {
friend class MCAsmLayout;
- MCSectionData(const MCSectionData&); // DO NOT IMPLEMENT
- void operator=(const MCSectionData&); // DO NOT IMPLEMENT
+ MCSectionData(const MCSectionData&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCSectionData&) LLVM_DELETED_FUNCTION;
public:
typedef iplist<MCFragment> FragmentListType;
@@ -683,8 +673,8 @@ public:
typedef std::vector<DataRegionData>::iterator data_region_iterator;
private:
- MCAssembler(const MCAssembler&); // DO NOT IMPLEMENT
- void operator=(const MCAssembler&); // DO NOT IMPLEMENT
+ MCAssembler(const MCAssembler&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAssembler&) LLVM_DELETED_FUNCTION;
MCContext &Context;
@@ -738,7 +728,7 @@ private:
/// \param Value [out] On return, the value of the fixup as currently laid
/// out.
/// \return Whether the fixup value was fully resolved. This is true if the
- /// \arg Value result is fixed, otherwise the value may change due to
+ /// \p Value result is fixed, otherwise the value may change due to
/// relocation.
bool evaluateFixup(const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
@@ -775,7 +765,7 @@ private:
public:
/// Compute the effective fragment size assuming it is laid out at the given
- /// \arg SectionAddress and \arg FragmentOffset.
+ /// \p SectionAddress and \p FragmentOffset.
uint64_t computeFragmentSize(const MCAsmLayout &Layout,
const MCFragment &F) const;
@@ -804,7 +794,7 @@ public:
public:
/// Construct a new assembler instance.
///
- /// \arg OS - The stream to output to.
+ /// \param OS The stream to output to.
//
// FIXME: How are we going to parameterize this? Two obvious options are stay
// concrete and require clients to pass in a target like object. The other
@@ -824,7 +814,7 @@ public:
MCObjectWriter &getWriter() const { return Writer; }
/// Finish - Do final processing and write the object to the output stream.
- /// \arg Writer is used for custom object writer (as the MCJIT does),
+ /// \p Writer is used for custom object writer (as the MCJIT does),
/// if not specified it is automatically created from backend.
void Finish();
diff --git a/include/llvm/MC/MCCodeEmitter.h b/include/llvm/MC/MCCodeEmitter.h
index 934ef69ce3fe..057489090293 100644
--- a/include/llvm/MC/MCCodeEmitter.h
+++ b/include/llvm/MC/MCCodeEmitter.h
@@ -10,6 +10,8 @@
#ifndef LLVM_MC_MCCODEEMITTER_H
#define LLVM_MC_MCCODEEMITTER_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class MCFixup;
class MCInst;
@@ -19,16 +21,16 @@ template<typename T> class SmallVectorImpl;
/// MCCodeEmitter - Generic instruction encoding interface.
class MCCodeEmitter {
private:
- MCCodeEmitter(const MCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const MCCodeEmitter &); // DO NOT IMPLEMENT
+ MCCodeEmitter(const MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCCodeEmitter &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCCodeEmitter();
public:
virtual ~MCCodeEmitter();
- /// EncodeInstruction - Encode the given \arg Inst to bytes on the output
- /// stream \arg OS.
+ /// EncodeInstruction - Encode the given \p Inst to bytes on the output
+ /// stream \p OS.
virtual void EncodeInstruction(const MCInst &Inst, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const = 0;
};
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 59545d31a655..5a8830cb66ce 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -40,8 +40,8 @@ namespace llvm {
/// of the sections that it creates.
///
class MCContext {
- MCContext(const MCContext&); // DO NOT IMPLEMENT
- MCContext &operator=(const MCContext&); // DO NOT IMPLEMENT
+ MCContext(const MCContext&) LLVM_DELETED_FUNCTION;
+ MCContext &operator=(const MCContext&) LLVM_DELETED_FUNCTION;
public:
typedef StringMap<MCSymbol*, BumpPtrAllocator&> SymbolTable;
private:
@@ -183,6 +183,7 @@ namespace llvm {
/// LookupSymbol - Get the symbol for \p Name, or null.
MCSymbol *LookupSymbol(StringRef Name) const;
+ MCSymbol *LookupSymbol(const Twine &Name) const;
/// getSymbols - Get a reference for the symbol table for clients that
/// want to, for example, iterate over all symbols. 'const' because we
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index fdb7ab23c09f..8fc437f3e691 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -19,6 +19,7 @@
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Compiler.h"
#include <vector>
namespace llvm {
@@ -48,8 +49,8 @@ namespace llvm {
MCDwarfFile(StringRef name, unsigned dirIndex)
: Name(name), DirIndex(dirIndex) {}
- MCDwarfFile(const MCDwarfFile&); // DO NOT IMPLEMENT
- void operator=(const MCDwarfFile&); // DO NOT IMPLEMENT
+ MCDwarfFile(const MCDwarfFile&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCDwarfFile&) LLVM_DELETED_FUNCTION;
public:
/// getName - Get the base name of this MCDwarfFile.
StringRef getName() const { return Name; }
@@ -58,7 +59,7 @@ namespace llvm {
unsigned getDirIndex() const { return DirIndex; }
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
/// dump - Print the value to stderr.
@@ -177,8 +178,8 @@ namespace llvm {
class MCLineSection {
private:
- MCLineSection(const MCLineSection&); // DO NOT IMPLEMENT
- void operator=(const MCLineSection&); // DO NOT IMPLEMENT
+ MCLineSection(const MCLineSection&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCLineSection&) LLVM_DELETED_FUNCTION;
public:
// Constructor to create an MCLineSection with an empty MCLineEntries
diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h
index abbe188fe88d..38cdc7293ba0 100644
--- a/include/llvm/MC/MCELFObjectWriter.h
+++ b/include/llvm/MC/MCELFObjectWriter.h
@@ -85,6 +85,9 @@ public:
const MCFragment &F,
const MCFixup &Fixup,
bool IsPCRel) const;
+ virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const;
virtual void adjustFixupOffset(const MCFixup &Fixup,
uint64_t &RelocOffset);
@@ -93,9 +96,9 @@ public:
/// @name Accessors
/// @{
- uint8_t getOSABI() { return OSABI; }
- uint16_t getEMachine() { return EMachine; }
- bool hasRelocationAddend() { return HasRelocationAddend; }
+ uint8_t getOSABI() const { return OSABI; }
+ uint16_t getEMachine() const { return EMachine; }
+ bool hasRelocationAddend() const { return HasRelocationAddend; }
bool is64Bit() const { return Is64Bit; }
bool isN64() const { return IsN64; }
/// @}
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index aa62eb2b16c0..00eef270d6c4 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -41,8 +41,8 @@ public:
private:
ExprKind Kind;
- MCExpr(const MCExpr&); // DO NOT IMPLEMENT
- void operator=(const MCExpr&); // DO NOT IMPLEMENT
+ MCExpr(const MCExpr&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCExpr&) LLVM_DELETED_FUNCTION;
bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
const MCAsmLayout *Layout,
@@ -78,11 +78,11 @@ public:
/// values. If not given, then only non-symbolic expressions will be
/// evaluated.
/// @result - True on success.
+ bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout,
+ const SectionAddrMap &Addrs) const;
bool EvaluateAsAbsolute(int64_t &Res) const;
bool EvaluateAsAbsolute(int64_t &Res, const MCAssembler &Asm) const;
bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout) const;
- bool EvaluateAsAbsolute(int64_t &Res, const MCAsmLayout &Layout,
- const SectionAddrMap &Addrs) const;
/// EvaluateAsRelocatable - Try to evaluate the expression to a relocatable
/// value, i.e. an expression of the fixed form (a - b + constant).
@@ -99,8 +99,6 @@ public:
const MCSection *FindAssociatedSection() const;
/// @}
-
- static bool classof(const MCExpr *) { return true; }
};
inline raw_ostream &operator<<(raw_ostream &OS, const MCExpr &E) {
@@ -132,7 +130,6 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Constant;
}
- static bool classof(const MCConstantExpr *) { return true; }
};
/// MCSymbolRefExpr - Represent a reference to a symbol from inside an
@@ -170,8 +167,10 @@ public:
VK_ARM_TPOFF,
VK_ARM_GOTTPOFF,
VK_ARM_TARGET1,
+ VK_ARM_TARGET2,
- VK_PPC_TOC,
+ VK_PPC_TOC, // TOC base
+ VK_PPC_TOC_ENTRY, // TOC entry
VK_PPC_DARWIN_HA16, // ha16(symbol)
VK_PPC_DARWIN_LO16, // lo16(symbol)
VK_PPC_GAS_HA16, // symbol@ha
@@ -247,7 +246,6 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::SymbolRef;
}
- static bool classof(const MCSymbolRefExpr *) { return true; }
};
/// MCUnaryExpr - Unary assembler expressions.
@@ -301,7 +299,6 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Unary;
}
- static bool classof(const MCUnaryExpr *) { return true; }
};
/// MCBinaryExpr - Binary assembler expressions.
@@ -436,7 +433,6 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Binary;
}
- static bool classof(const MCBinaryExpr *) { return true; }
};
/// MCTargetExpr - This is an extension point for target-specific MCExpr
@@ -445,7 +441,7 @@ public:
/// NOTE: All subclasses are required to have trivial destructors because
/// MCExprs are bump pointer allocated and not destructed.
class MCTargetExpr : public MCExpr {
- virtual void Anchor();
+ virtual void anchor();
protected:
MCTargetExpr() : MCExpr(Target) {}
virtual ~MCTargetExpr() {}
@@ -460,7 +456,6 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Target;
}
- static bool classof(const MCTargetExpr *) { return true; }
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
index 397a37d3ce48..e91c6a2e8ee7 100644
--- a/include/llvm/MC/MCInst.h
+++ b/include/llvm/MC/MCInst.h
@@ -182,7 +182,7 @@ public:
void dump() const;
/// \brief Dump the MCInst as prettily as possible using the additional MC
- /// structures, if given. Operators are separated by the \arg Separator
+ /// structures, if given. Operators are separated by the \p Separator
/// string.
void dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI = 0,
const MCInstPrinter *Printer = 0,
diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index 3c4f28be7ca6..3b9420a40389 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h
@@ -33,12 +33,16 @@ protected:
/// The current set of available features.
unsigned AvailableFeatures;
+ /// True if we are printing marked up assembly.
+ bool UseMarkup;
+
/// Utility function for printing annotations.
void printAnnotation(raw_ostream &OS, StringRef Annot);
public:
MCInstPrinter(const MCAsmInfo &mai, const MCInstrInfo &mii,
const MCRegisterInfo &mri)
- : CommentStream(0), MAI(mai), MII(mii), MRI(mri), AvailableFeatures(0) {}
+ : CommentStream(0), MAI(mai), MII(mii), MRI(mri), AvailableFeatures(0),
+ UseMarkup(0) {}
virtual ~MCInstPrinter();
@@ -59,6 +63,13 @@ public:
unsigned getAvailableFeatures() const { return AvailableFeatures; }
void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }
+
+ bool getUseMarkup() const { return UseMarkup; }
+ void setUseMarkup(bool Value) { UseMarkup = Value; }
+
+  /// Utility functions to make adding markups simpler.
+ StringRef markup(StringRef s) const;
+ StringRef markup(StringRef a, StringRef b) const;
};
} // namespace llvm
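
A short sketch of how a target instruction printer could use the new markup helpers; printMarkedUpImm is illustrative, not an existing printer method:

    // C++ sketch: wrap an immediate operand in markup tags when enabled.
    #include "llvm/MC/MCInstPrinter.h"
    #include "llvm/Support/raw_ostream.h"

    static void printMarkedUpImm(const llvm::MCInstPrinter &P,
                                 llvm::raw_ostream &OS, int64_t Imm) {
      // markup() forwards its argument only after setUseMarkup(true), so this
      // prints either "<imm:#42>" or plain "#42".
      OS << P.markup("<imm:") << '#' << Imm << P.markup(">");
    }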
diff --git a/include/llvm/MC/MCInstrDesc.h b/include/llvm/MC/MCInstrDesc.h
index dbf16d870050..02383f8bc658 100644
--- a/include/llvm/MC/MCInstrDesc.h
+++ b/include/llvm/MC/MCInstrDesc.h
@@ -1,4 +1,4 @@
-//===-- llvm/Mc/McInstrDesc.h - Instruction Descriptors -*- C++ -*-===//
+//===-- llvm/MC/MCInstrDesc.h - Instruction Descriptors -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
diff --git a/include/llvm/MC/MCLabel.h b/include/llvm/MC/MCLabel.h
index 727520d4af9d..f531de8b40d9 100644
--- a/include/llvm/MC/MCLabel.h
+++ b/include/llvm/MC/MCLabel.h
@@ -14,6 +14,8 @@
#ifndef LLVM_MC_MCLABEL_H
#define LLVM_MC_MCLABEL_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class MCContext;
class raw_ostream;
@@ -30,8 +32,8 @@ namespace llvm {
MCLabel(unsigned instance)
: Instance(instance) {}
- MCLabel(const MCLabel&); // DO NOT IMPLEMENT
- void operator=(const MCLabel&); // DO NOT IMPLEMENT
+ MCLabel(const MCLabel&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCLabel&) LLVM_DELETED_FUNCTION;
public:
/// getInstance - Get the current instance of this Directional Local Label.
unsigned getInstance() const { return Instance; }
@@ -40,7 +42,7 @@ namespace llvm {
/// Label.
unsigned incInstance() { return ++Instance; }
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
/// dump - Print the value to stderr.
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 949d90700e08..efaabfb9e88b 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -153,8 +153,8 @@ public:
/// WriteSegmentLoadCommand - Write a segment load command.
///
- /// \arg NumSections - The number of sections in this segment.
- /// \arg SectionDataSize - The total size of the sections.
+ /// \param NumSections The number of sections in this segment.
+ /// \param SectionDataSize The total size of the sections.
void WriteSegmentLoadCommand(unsigned NumSections,
uint64_t VMSize,
uint64_t SectionDataStartOffset,
@@ -233,6 +233,8 @@ public:
void computeSectionAddresses(const MCAssembler &Asm,
const MCAsmLayout &Layout);
+ void markAbsoluteVariableSymbols(MCAssembler &Asm,
+ const MCAsmLayout &Layout);
void ExecutePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout);
virtual bool IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 74e2263c731c..23e5513ae35e 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -84,7 +84,8 @@ protected:
/// this is the section to emit them into.
const MCSection *CompactUnwindSection;
- /// DwarfAccelNamesSection, DwarfAccelObjCSection
+ /// DwarfAccelNamesSection, DwarfAccelObjCSection,
+ /// DwarfAccelNamespaceSection, DwarfAccelTypesSection -
/// If we use the DWARF accelerated hash tables then we want to emit these
/// sections.
const MCSection *DwarfAccelNamesSection;
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index a69075ddd002..08b00f1c478e 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -72,6 +72,13 @@ public:
virtual void ChangeSection(const MCSection *Section);
virtual void EmitInstruction(const MCInst &Inst);
virtual void EmitInstToFragment(const MCInst &Inst);
+ virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
+ virtual void EmitValueToAlignment(unsigned ByteAlignment,
+ int64_t Value = 0,
+ unsigned ValueSize = 1,
+ unsigned MaxBytesToEmit = 0);
+ virtual void EmitCodeAlignment(unsigned ByteAlignment,
+ unsigned MaxBytesToEmit = 0);
virtual bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value);
virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta,
const MCSymbol *LastLabel,
@@ -80,6 +87,9 @@ public:
virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label);
virtual void EmitGPRel32Value(const MCExpr *Value);
+ virtual void EmitGPRel64Value(const MCExpr *Value);
+ virtual void EmitFill(uint64_t NumBytes, uint8_t FillValue,
+ unsigned AddrSpace);
virtual void FinishImpl();
/// @}
diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h
index 9591a0094614..14fe75fd4c31 100644
--- a/include/llvm/MC/MCObjectWriter.h
+++ b/include/llvm/MC/MCObjectWriter.h
@@ -11,6 +11,7 @@
#define LLVM_MC_MCOBJECTWRITER_H
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include <cassert>
@@ -35,8 +36,8 @@ class MCValue;
/// The object writer also contains a number of helper methods for writing
/// binary data to the output stream.
class MCObjectWriter {
- MCObjectWriter(const MCObjectWriter &); // DO NOT IMPLEMENT
- void operator=(const MCObjectWriter &); // DO NOT IMPLEMENT
+ MCObjectWriter(const MCObjectWriter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCObjectWriter &) LLVM_DELETED_FUNCTION;
protected:
raw_ostream &OS;
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 9a8735f3e726..e102dfb82c4a 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -31,8 +31,8 @@ class AsmLexer : public MCAsmLexer {
const MemoryBuffer *CurBuf;
bool isAtStartOfLine;
- void operator=(const AsmLexer&); // DO NOT IMPLEMENT
- AsmLexer(const AsmLexer&); // DO NOT IMPLEMENT
+ void operator=(const AsmLexer&) LLVM_DELETED_FUNCTION;
+ AsmLexer(const AsmLexer&) LLVM_DELETED_FUNCTION;
protected:
/// LexToken - Read the next token and return its code.
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 5e29ad49dd3f..0a961d6d0971 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -11,6 +11,7 @@
#define LLVM_MC_MCASMLEXER_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/SMLoc.h"
@@ -39,6 +40,7 @@ public:
// No-value.
EndOfStatement,
Colon,
+ Space,
Plus, Minus, Tilde,
Slash, // '/'
BackSlash, // '\'
@@ -121,10 +123,11 @@ class MCAsmLexer {
SMLoc ErrLoc;
std::string Err;
- MCAsmLexer(const MCAsmLexer &); // DO NOT IMPLEMENT
- void operator=(const MCAsmLexer &); // DO NOT IMPLEMENT
+ MCAsmLexer(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
const char *TokStart;
+ bool SkipSpace;
MCAsmLexer();
@@ -169,11 +172,14 @@ public:
/// getKind - Get the kind of current token.
AsmToken::TokenKind getKind() const { return CurTok.getKind(); }
- /// is - Check if the current token has kind \arg K.
+ /// is - Check if the current token has kind \p K.
bool is(AsmToken::TokenKind K) const { return CurTok.is(K); }
- /// isNot - Check if the current token has kind \arg K.
+ /// isNot - Check if the current token has kind \p K.
bool isNot(AsmToken::TokenKind K) const { return CurTok.isNot(K); }
+
+ /// setSkipSpace - Set whether spaces should be ignored by the lexer
+ void setSkipSpace(bool val) { SkipSpace = val; }
};
} // End llvm namespace
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 793c7097ba14..a71d3c321741 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -20,6 +20,9 @@ class MCAsmLexer;
class MCAsmParserExtension;
class MCContext;
class MCExpr;
+class MCInstPrinter;
+class MCInstrInfo;
+class MCParsedAsmOperand;
class MCStreamer;
class MCTargetAsmParser;
class SMLoc;
@@ -28,6 +31,16 @@ class SourceMgr;
class StringRef;
class Twine;
+/// MCAsmParserSemaCallback - Generic Sema callback for assembly parser.
+class MCAsmParserSemaCallback {
+public:
+ virtual ~MCAsmParserSemaCallback();
+ virtual void *LookupInlineAsmIdentifier(StringRef Name, void *Loc,
+ unsigned &Size) = 0;
+ virtual bool LookupInlineAsmField(StringRef Base, StringRef Member,
+ unsigned &Offset) = 0;
+};
+
/// MCAsmParser - Generic assembler parser interface, for use by target specific
/// assembly parsers.
class MCAsmParser {
@@ -35,8 +48,8 @@ public:
typedef bool (*DirectiveHandler)(MCAsmParserExtension*, StringRef, SMLoc);
private:
- MCAsmParser(const MCAsmParser &); // DO NOT IMPLEMENT
- void operator=(const MCAsmParser &); // DO NOT IMPLEMENT
+ MCAsmParser(const MCAsmParser &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmParser &) LLVM_DELETED_FUNCTION;
MCTargetAsmParser *TargetParser;
@@ -73,15 +86,26 @@ public:
/// Run - Run the parser on the input source buffer.
virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;
- /// Warning - Emit a warning at the location \arg L, with the message \arg
- /// Msg.
+ virtual void setParsingInlineAsm(bool V) = 0;
+ virtual bool isParsingInlineAsm() = 0;
+
+ /// ParseMSInlineAsm - Parse ms-style inline assembly.
+ virtual bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+ unsigned &NumOutputs, unsigned &NumInputs,
+ SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+ SmallVectorImpl<std::string> &Constraints,
+ SmallVectorImpl<std::string> &Clobbers,
+ const MCInstrInfo *MII,
+ const MCInstPrinter *IP,
+ MCAsmParserSemaCallback &SI) = 0;
+
+ /// Warning - Emit a warning at the location \p L, with the message \p Msg.
///
/// \return The return value is true, if warnings are fatal.
virtual bool Warning(SMLoc L, const Twine &Msg,
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) = 0;
- /// Error - Emit an error at the location \arg L, with the message \arg
- /// Msg.
+ /// Error - Emit an error at the location \p L, with the message \p Msg.
///
/// \return The return value is always true, as an idiomatic convenience to
/// clients.
@@ -100,7 +124,7 @@ public:
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
/// ParseIdentifier - Parse an identifier or string (as a quoted identifier)
- /// and set \arg Res to the identifier contents.
+ /// and set \p Res to the identifier contents.
virtual bool ParseIdentifier(StringRef &Res) = 0;
/// \brief Parse up to the end of statement and return the contents from the
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index 4e2aee992877..0918c93bdf3d 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -21,8 +21,8 @@ class Twine;
/// which is implemented by target and object file assembly parser
/// implementations.
class MCAsmParserExtension {
- MCAsmParserExtension(const MCAsmParserExtension &); // DO NOT IMPLEMENT
- void operator=(const MCAsmParserExtension &); // DO NOT IMPLEMENT
+ MCAsmParserExtension(const MCAsmParserExtension &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAsmParserExtension &) LLVM_DELETED_FUNCTION;
MCAsmParser *Parser;
@@ -43,8 +43,8 @@ protected:
public:
virtual ~MCAsmParserExtension();
- /// \brief Initialize the extension for parsing using the given \arg
- /// Parser. The extension should use the AsmParser interfaces to register its
+ /// \brief Initialize the extension for parsing using the given \p Parser.
+ /// The extension should use the AsmParser interfaces to register its
/// parsing routines.
virtual void Initialize(MCAsmParser &Parser);
diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
index 2556e5f27a30..60e7887a5396 100644
--- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h
+++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
@@ -19,15 +19,69 @@ class raw_ostream;
/// base class is used by target-independent clients and is the interface
/// between parsing an asm instruction and recognizing it.
class MCParsedAsmOperand {
+ /// MCOperandNum - The corresponding MCInst operand number. Only valid when
+ /// parsing MS-style inline assembly.
+ unsigned MCOperandNum;
+
+ /// Constraint - The constraint on this operand. Only valid when parsing
+ /// MS-style inline assembly.
+ std::string Constraint;
+
public:
MCParsedAsmOperand() {}
virtual ~MCParsedAsmOperand() {}
+ void setConstraint(StringRef C) { Constraint = C.str(); }
+ StringRef getConstraint() { return Constraint; }
+
+ void setMCOperandNum (unsigned OpNum) { MCOperandNum = OpNum; }
+ unsigned getMCOperandNum() { return MCOperandNum; }
+
+ unsigned getNameLen() {
+ assert (getStartLoc().isValid() && "Invalid StartLoc!");
+ assert (getEndLoc().isValid() && "Invalid EndLoc!");
+ return getEndLoc().getPointer() - getStartLoc().getPointer();
+ }
+
+ StringRef getName() {
+ return StringRef(getStartLoc().getPointer(), getNameLen());
+ }
+
+ /// isToken - Is this a token operand?
+ virtual bool isToken() const = 0;
+ /// isImm - Is this an immediate operand?
+ virtual bool isImm() const = 0;
+ /// isReg - Is this a register operand?
+ virtual bool isReg() const = 0;
+ virtual unsigned getReg() const = 0;
+
+ /// isMem - Is this a memory operand?
+ virtual bool isMem() const = 0;
+ virtual unsigned getMemSize() const { return 0; }
+
/// getStartLoc - Get the location of the first token of this operand.
virtual SMLoc getStartLoc() const = 0;
/// getEndLoc - Get the location of the last token of this operand.
virtual SMLoc getEndLoc() const = 0;
+ /// needAsmRewrite - AsmRewrites happen in both the target-independent and
+ /// target-dependent parsers. The target-independent parser calls this
+ /// function to determine if the target-dependent parser has already taken
+ /// care of the rewrites. Only valid when parsing MS-style inline assembly.
+ virtual bool needAsmRewrite() const { return true; }
+
+ /// isOffsetOf - Do we need to emit code to get the offset of the variable,
+  /// rather than the value of the variable? Only valid when parsing MS-style
+ /// inline assembly.
+ virtual bool isOffsetOf() const { return false; }
+
+ /// getOffsetOfLoc - Get the location of the offset operator.
+ virtual SMLoc getOffsetOfLoc() const { return SMLoc(); }
+
+ /// needSizeDirective - Do we need to emit a sizing directive for this
+ /// operand? Only valid when parsing MS-style inline assembly.
+ virtual bool needSizeDirective() const { return false; }
+
/// print - Print a debug representation of the operand to the given stream.
virtual void print(raw_ostream &OS) const = 0;
/// dump - Print to the debug stream.
diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h
index 46a9d71fff24..f05baeaaf689 100644
--- a/include/llvm/MC/MCRegisterInfo.h
+++ b/include/llvm/MC/MCRegisterInfo.h
@@ -333,6 +333,13 @@ public:
return NumRegs;
}
+ /// getNumSubRegIndices - Return the number of sub-register indices
+ /// understood by the target. Index 0 is reserved for the no-op sub-register,
+ /// while 1 to getNumSubRegIndices() - 1 represent real sub-registers.
+ unsigned getNumSubRegIndices() const {
+ return NumSubRegIndices;
+ }
+
/// getNumRegUnits - Return the number of (native) register units in the
/// target. Register units are numbered from 0 to getNumRegUnits() - 1. They
/// can be accessed through MCRegUnitIterator defined below.
@@ -363,7 +370,7 @@ public:
/// getRegClass - Returns the register class associated with the enumeration
/// value. See class MCOperandInfo.
- const MCRegisterClass getRegClass(unsigned i) const {
+ const MCRegisterClass& getRegClass(unsigned i) const {
assert(i < getNumRegClasses() && "Register Class ID out of range");
return Classes[i];
}
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 3b1cdf1cd2fa..0c71ee513500 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -16,17 +16,111 @@
#define LLVM_MC_MCSCHEDMODEL_H
#include "llvm/Support/DataTypes.h"
+#include <cassert>
namespace llvm {
struct InstrItinerary;
+/// Define a kind of processor resource that will be modeled by the scheduler.
+struct MCProcResourceDesc {
+#ifndef NDEBUG
+ const char *Name;
+#endif
+  unsigned NumUnits; // Number of resources of this kind
+  unsigned SuperIdx; // Index of the resource kind that contains this kind.
+
+ // Buffered resources may be consumed at some indeterminate cycle after
+ // dispatch (e.g. for instructions that may issue out-of-order). Unbuffered
+ // resources always consume their resource some fixed number of cycles after
+ // dispatch (e.g. for instruction interlocking that may stall the pipeline).
+ bool IsBuffered;
+
+ bool operator==(const MCProcResourceDesc &Other) const {
+ return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
+ && IsBuffered == Other.IsBuffered;
+ }
+};
+
+/// Identify one of the processor resource kinds consumed by a particular
+/// scheduling class for the specified number of cycles.
+struct MCWriteProcResEntry {
+ unsigned ProcResourceIdx;
+ unsigned Cycles;
+
+ bool operator==(const MCWriteProcResEntry &Other) const {
+ return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
+ }
+};
+
+/// Specify the latency in cpu cycles for a particular scheduling class and def
+/// index. -1 indicates an invalid latency. Heuristics would typically consider
+/// an instruction with invalid latency to have infinite latency. Also identify
+/// the WriteResources of this def. When the operand expands to a sequence of
+/// writes, this ID is the last write in the sequence.
+struct MCWriteLatencyEntry {
+ int Cycles;
+ unsigned WriteResourceID;
+
+ bool operator==(const MCWriteLatencyEntry &Other) const {
+ return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
+ }
+};
+
+/// Specify the number of cycles allowed after instruction issue before a
+/// particular use operand reads its registers. This effectively reduces the
+/// write's latency. Here we allow negative cycles for corner cases where
+/// latency increases. This rule only applies when the entry's WriteResource
+/// matches the write's WriteResource.
+///
+/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
+/// WriteResourceIdx.
+struct MCReadAdvanceEntry {
+ unsigned UseIdx;
+ unsigned WriteResourceID;
+ int Cycles;
+
+ bool operator==(const MCReadAdvanceEntry &Other) const {
+ return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
+ && Cycles == Other.Cycles;
+ }
+};
+
+/// Summarize the scheduling resources required for an instruction of a
+/// particular scheduling class.
+///
+/// Defined as an aggregate struct for creating tables with initializer lists.
+struct MCSchedClassDesc {
+ static const unsigned short InvalidNumMicroOps = UINT16_MAX;
+ static const unsigned short VariantNumMicroOps = UINT16_MAX - 1;
+
+#ifndef NDEBUG
+ const char* Name;
+#endif
+ unsigned short NumMicroOps;
+ bool BeginGroup;
+ bool EndGroup;
+ unsigned WriteProcResIdx; // First index into WriteProcResTable.
+ unsigned NumWriteProcResEntries;
+ unsigned WriteLatencyIdx; // First index into WriteLatencyTable.
+ unsigned NumWriteLatencyEntries;
+ unsigned ReadAdvanceIdx; // First index into ReadAdvanceTable.
+ unsigned NumReadAdvanceEntries;
+
+ bool isValid() const {
+ return NumMicroOps != InvalidNumMicroOps;
+ }
+ bool isVariant() const {
+ return NumMicroOps == VariantNumMicroOps;
+ }
+};
+
/// Machine model for scheduling, bundling, and heuristics.
///
/// The machine model directly provides basic information about the
/// microarchitecture to the scheduler in the form of properties. It also
-/// optionally refers to scheduler resources tables and itinerary
-/// tables. Scheduler resources tables model the latency and cost for each
+/// optionally refers to scheduler resource tables and itinerary
+/// tables. Scheduler resource tables model the latency and cost for each
/// instruction type. Itinerary tables are an independent mechanism that
/// provides a detailed reservation table describing each cycle of instruction
/// execution. Subtargets may define any or all of the above categories of data
@@ -84,8 +178,11 @@ public:
static const unsigned DefaultMispredictPenalty = 10;
private:
- // TODO: Add a reference to proc resource types and sched resource tables.
-
+ unsigned ProcID;
+ const MCProcResourceDesc *ProcResourceTable;
+ const MCSchedClassDesc *SchedClassTable;
+ unsigned NumProcResourceKinds;
+ unsigned NumSchedClasses;
// Instruction itinerary tables used by InstrItineraryData.
friend class InstrItineraryData;
const InstrItinerary *InstrItineraries;
@@ -100,13 +197,45 @@ public:
LoadLatency(DefaultLoadLatency),
HighLatency(DefaultHighLatency),
MispredictPenalty(DefaultMispredictPenalty),
- InstrItineraries(0) {}
+ ProcID(0), ProcResourceTable(0), SchedClassTable(0),
+ NumProcResourceKinds(0), NumSchedClasses(0),
+ InstrItineraries(0) {
+ (void)NumProcResourceKinds;
+ (void)NumSchedClasses;
+ }
// Table-gen driven ctor.
MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp,
+ unsigned pi, const MCProcResourceDesc *pr,
+ const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
const InstrItinerary *ii):
IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl),
- MispredictPenalty(mp), InstrItineraries(ii){}
+ MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
+ SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc),
+ InstrItineraries(ii) {}
+
+ unsigned getProcessorID() const { return ProcID; }
+
+ /// Does this machine model include instruction-level scheduling?
+ bool hasInstrSchedModel() const { return SchedClassTable; }
+
+ unsigned getNumProcResourceKinds() const {
+ return NumProcResourceKinds;
+ }
+
+ const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
+ assert(hasInstrSchedModel() && "No scheduling machine model");
+
+ assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
+ return &ProcResourceTable[ProcResourceIdx];
+ }
+
+ const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
+ assert(hasInstrSchedModel() && "No scheduling machine model");
+
+ assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
+ return &SchedClassTable[SchedClassIdx];
+ }
};
} // End llvm namespace
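
An illustrative sketch (not from this patch) of how a client might consult the new per-class tables; SM is assumed to be an MCSchedModel produced by TableGen for some subtarget:

#include "llvm/MC/MCSchedule.h"

// Return the micro-op count recorded for a scheduling class, falling back to
// a single micro-op when no instruction-level model is present or the class
// is invalid/variant (variant classes must be resolved by the target first).
static unsigned microOpCount(const llvm::MCSchedModel &SM,
                             unsigned SchedClassIdx) {
  if (!SM.hasInstrSchedModel())
    return 1;
  const llvm::MCSchedClassDesc *SC = SM.getSchedClassDesc(SchedClassIdx);
  if (!SC->isValid() || SC->isVariant())
    return 1;
  return SC->NumMicroOps;
}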
diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h
index 7da6534b6e88..21fdb6bd39b8 100644
--- a/include/llvm/MC/MCSection.h
+++ b/include/llvm/MC/MCSection.h
@@ -15,7 +15,7 @@
#define LLVM_MC_MCSECTION_H
#include "llvm/MC/SectionKind.h"
-#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
class MCAsmInfo;
@@ -33,8 +33,8 @@ namespace llvm {
};
private:
- MCSection(const MCSection&); // DO NOT IMPLEMENT
- void operator=(const MCSection&); // DO NOT IMPLEMENT
+ MCSection(const MCSection&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCSection&) LLVM_DELETED_FUNCTION;
protected:
MCSection(SectionVariant V, SectionKind K) : Variant(V), Kind(K) {}
SectionVariant Variant;
@@ -64,8 +64,6 @@ namespace llvm {
/// isVirtualSection - Check whether this section is "virtual", that is
/// has no actual object file contents.
virtual bool isVirtualSection() const = 0;
-
- static bool classof(const MCSection *) { return true; }
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h
index 7eacde57f48f..b050c0f442b6 100644
--- a/include/llvm/MC/MCSectionCOFF.h
+++ b/include/llvm/MC/MCSectionCOFF.h
@@ -61,7 +61,6 @@ namespace llvm {
static bool classof(const MCSection *S) {
return S->getVariant() == SV_COFF;
}
- static bool classof(const MCSectionCOFF *) { return true; }
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCSectionELF.h b/include/llvm/MC/MCSectionELF.h
index 7321ca83e897..4d54465760d4 100644
--- a/include/llvm/MC/MCSectionELF.h
+++ b/include/llvm/MC/MCSectionELF.h
@@ -76,7 +76,6 @@ public:
static bool classof(const MCSection *S) {
return S->getVariant() == SV_ELF;
}
- static bool classof(const MCSectionELF *) { return true; }
// Return the entry size for sections with fixed-width data.
static unsigned DetermineEntrySize(SectionKind Kind);
diff --git a/include/llvm/MC/MCSectionMachO.h b/include/llvm/MC/MCSectionMachO.h
index 15eb4f4a7685..71ea8f3e901d 100644
--- a/include/llvm/MC/MCSectionMachO.h
+++ b/include/llvm/MC/MCSectionMachO.h
@@ -174,7 +174,6 @@ public:
static bool classof(const MCSection *S) {
return S->getVariant() == SV_MachO;
}
- static bool classof(const MCSectionMachO *) { return true; }
};
} // end namespace llvm
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index e8c3e59fac8a..230d27ef2ef0 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -47,8 +47,8 @@ namespace llvm {
class MCStreamer {
MCContext &Context;
- MCStreamer(const MCStreamer&); // DO NOT IMPLEMENT
- MCStreamer &operator=(const MCStreamer&); // DO NOT IMPLEMENT
+ MCStreamer(const MCStreamer&) LLVM_DELETED_FUNCTION;
+ MCStreamer &operator=(const MCStreamer&) LLVM_DELETED_FUNCTION;
bool EmitEHFrame;
bool EmitDebugFrame;
@@ -342,7 +342,7 @@ namespace llvm {
/// @name Generating Data
/// @{
- /// EmitBytes - Emit the bytes in \arg Data into the output.
+ /// EmitBytes - Emit the bytes in \p Data into the output.
///
/// This is used to implement assembler directives such as .byte, .ascii,
/// etc.
@@ -554,6 +554,11 @@ namespace llvm {
virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
bool isVector);
+ /// PPC-related methods.
+ /// FIXME: Eventually replace it with some "target MC streamer" and move
+ /// these methods there.
+ virtual void EmitTCEntry(const MCSymbol &S);
+
/// FinishImpl - Streamer specific finalization.
virtual void FinishImpl() = 0;
/// Finish - Finish emission of machine code.
@@ -573,17 +578,14 @@ namespace llvm {
/// InstPrint.
///
/// \param CE - If given, a code emitter to use to show the instruction
- /// encoding inline with the assembly. This method takes ownership of \arg CE.
+ /// encoding inline with the assembly. This method takes ownership of \p CE.
///
/// \param TAB - If given, a target asm backend to use to show the fixup
/// information in conjunction with encoding information. This method takes
- /// ownership of \arg TAB.
+ /// ownership of \p TAB.
///
/// \param ShowInst - Whether to show the MCInst representation inline with
/// the assembly.
- ///
- /// \param DecodeLSDA - If true, emit comments that translates the LSDA into a
- /// human readable format. Only usable with CFI.
MCStreamer *createAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
bool isVerboseAsm,
bool useLoc,
@@ -597,7 +599,7 @@ namespace llvm {
/// createMachOStreamer - Create a machine code streamer which will generate
/// Mach-O format object files.
///
- /// Takes ownership of \arg TAB and \arg CE.
+ /// Takes ownership of \p TAB and \p CE.
MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *CE,
bool RelaxAll = false);
@@ -605,7 +607,7 @@ namespace llvm {
/// createWinCOFFStreamer - Create a machine code streamer which will
/// generate Microsoft COFF format object files.
///
- /// Takes ownership of \arg TAB and \arg CE.
+ /// Takes ownership of \p TAB and \p CE.
MCStreamer *createWinCOFFStreamer(MCContext &Ctx,
MCAsmBackend &TAB,
MCCodeEmitter &CE, raw_ostream &OS,
@@ -620,7 +622,7 @@ namespace llvm {
/// createPureStreamer - Create a machine code streamer which will generate
/// "pure" MC object files, for use with MC-JIT and testing tools.
///
- /// Takes ownership of \arg TAB and \arg CE.
+ /// Takes ownership of \p TAB and \p CE.
MCStreamer *createPureStreamer(MCContext &Ctx, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *CE);
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 31d632de60be..69213cd77d92 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -30,7 +30,14 @@ class MCSubtargetInfo {
std::string TargetTriple; // Target triple
const SubtargetFeatureKV *ProcFeatures; // Processor feature list
const SubtargetFeatureKV *ProcDesc; // Processor descriptions
- const SubtargetInfoKV *ProcSchedModel; // Scheduler machine model
+
+ // Scheduler machine model
+ const SubtargetInfoKV *ProcSchedModels;
+ const MCWriteProcResEntry *WriteProcResTable;
+ const MCWriteLatencyEntry *WriteLatencyTable;
+ const MCReadAdvanceEntry *ReadAdvanceTable;
+ const MCSchedModel *CPUSchedModel;
+
const InstrStage *Stages; // Instruction itinerary stages
const unsigned *OperandCycles; // Itinerary operand cycles
const unsigned *ForwardingPaths; // Forwarding paths
@@ -43,6 +50,9 @@ public:
const SubtargetFeatureKV *PF,
const SubtargetFeatureKV *PD,
const SubtargetInfoKV *ProcSched,
+ const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA,
const InstrStage *IS,
const unsigned *OC, const unsigned *FP,
unsigned NF, unsigned NP);
@@ -58,9 +68,9 @@ public:
return FeatureBits;
}
- /// ReInitMCSubtargetInfo - Change CPU (and optionally supplemented with
- /// feature string), recompute and return feature bits.
- uint64_t ReInitMCSubtargetInfo(StringRef CPU, StringRef FS);
+ /// InitMCProcessorInfo - Set or change the CPU (optionally supplemented with
+ /// feature string). Recompute feature bits and scheduling model.
+ void InitMCProcessorInfo(StringRef CPU, StringRef FS);
/// ToggleFeature - Toggle a feature and returns the re-computed feature
/// bits. This version does not change the implied bits.
@@ -72,11 +82,56 @@ public:
/// getSchedModelForCPU - Get the machine model of a CPU.
///
- MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
+ const MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
+
+ /// getSchedModel - Get the machine model for this subtarget's CPU.
+ ///
+ const MCSchedModel *getSchedModel() const { return CPUSchedModel; }
+
+ /// Return an iterator at the first processor resource consumed by the given
+ /// scheduling class.
+ const MCWriteProcResEntry *getWriteProcResBegin(
+ const MCSchedClassDesc *SC) const {
+ return &WriteProcResTable[SC->WriteProcResIdx];
+ }
+ const MCWriteProcResEntry *getWriteProcResEnd(
+ const MCSchedClassDesc *SC) const {
+ return getWriteProcResBegin(SC) + SC->NumWriteProcResEntries;
+ }
+
+ const MCWriteLatencyEntry *getWriteLatencyEntry(const MCSchedClassDesc *SC,
+ unsigned DefIdx) const {
+ assert(DefIdx < SC->NumWriteLatencyEntries &&
+ "MachineModel does not specify a WriteResource for DefIdx");
+
+ return &WriteLatencyTable[SC->WriteLatencyIdx + DefIdx];
+ }
+
+ int getReadAdvanceCycles(const MCSchedClassDesc *SC, unsigned UseIdx,
+ unsigned WriteResID) const {
+ // TODO: The number of read advance entries in a class can be significant
+ // (~50). Consider compressing the WriteID into a dense ID of those that are
+ // used by ReadAdvance and representing them as a bitset.
+ for (const MCReadAdvanceEntry *I = &ReadAdvanceTable[SC->ReadAdvanceIdx],
+ *E = I + SC->NumReadAdvanceEntries; I != E; ++I) {
+ if (I->UseIdx < UseIdx)
+ continue;
+ if (I->UseIdx > UseIdx)
+ break;
+ // Find the first WriteResIdx match, which has the highest cycle count.
+ if (!I->WriteResourceID || I->WriteResourceID == WriteResID) {
+ return I->Cycles;
+ }
+ }
+ return 0;
+ }
/// getInstrItineraryForCPU - Get scheduling itinerary of a CPU.
///
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
+
+ /// Initialize an InstrItineraryData instance.
+ void initInstrItins(InstrItineraryData &InstrItins) const;
};
} // End llvm namespace
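
A hedged sketch (not part of the patch) of deriving a def-to-use latency from the new tables; STI is an initialized MCSubtargetInfo, and for simplicity the def and use are assumed to share the resolved scheduling class SC:

#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCSubtargetInfo.h"

// Latency of operand DefIdx as seen by operand UseIdx: the write latency from
// the table, reduced by any matching ReadAdvance entry. A negative table entry
// marks an invalid (effectively infinite) latency.
static int operandLatency(const llvm::MCSubtargetInfo &STI,
                          const llvm::MCSchedClassDesc *SC,
                          unsigned DefIdx, unsigned UseIdx) {
  const llvm::MCWriteLatencyEntry *WL = STI.getWriteLatencyEntry(SC, DefIdx);
  if (WL->Cycles < 0)
    return -1;
  return WL->Cycles - STI.getReadAdvanceCycles(SC, UseIdx, WL->WriteResourceID);
}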
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 0583ce56820b..fe927555c49b 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -15,6 +15,7 @@
#define LLVM_MC_MCSYMBOL_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
class MCExpr;
@@ -62,8 +63,8 @@ namespace llvm {
: Name(name), Section(0), Value(0),
IsTemporary(isTemporary), IsUsed(false) {}
- MCSymbol(const MCSymbol&); // DO NOT IMPLEMENT
- void operator=(const MCSymbol&); // DO NOT IMPLEMENT
+ MCSymbol(const MCSymbol&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCSymbol&) LLVM_DELETED_FUNCTION;
public:
/// getName - Get the symbol name.
StringRef getName() const { return Name; }
@@ -112,7 +113,7 @@ namespace llvm {
return *Section;
}
- /// setSection - Mark the symbol as defined in the section \arg S.
+ /// setSection - Mark the symbol as defined in the section \p S.
void setSection(const MCSection &S) { Section = &S; }
/// setUndefined - Mark the symbol as undefined.
@@ -132,7 +133,7 @@ namespace llvm {
return Value != 0;
}
- /// getValue() - Get the value for variable symbols.
+ /// getVariableValue() - Get the value for variable symbols.
const MCExpr *getVariableValue() const {
assert(isVariable() && "Invalid accessor!");
IsUsed = true;
@@ -148,7 +149,7 @@ namespace llvm {
/// @}
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
/// dump - Print the value to stderr.
diff --git a/include/llvm/MC/MCTargetAsmLexer.h b/include/llvm/MC/MCTargetAsmLexer.h
index f5c8c09df0ea..b1cc546e1efa 100644
--- a/include/llvm/MC/MCTargetAsmLexer.h
+++ b/include/llvm/MC/MCTargetAsmLexer.h
@@ -24,8 +24,8 @@ class MCTargetAsmLexer {
SMLoc ErrLoc;
std::string Err;
- MCTargetAsmLexer(const MCTargetAsmLexer &); // DO NOT IMPLEMENT
- void operator=(const MCTargetAsmLexer &); // DO NOT IMPLEMENT
+ MCTargetAsmLexer(const MCTargetAsmLexer &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCTargetAsmLexer &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCTargetAsmLexer(const Target &);
@@ -45,7 +45,7 @@ public:
const Target &getTarget() const { return TheTarget; }
- /// InstallLexer - Set the lexer to get tokens from lower-level lexer \arg L.
+ /// InstallLexer - Set the lexer to get tokens from lower-level lexer \p L.
void InstallLexer(MCAsmLexer &L) {
Lexer = &L;
}
@@ -77,10 +77,10 @@ public:
/// getKind - Get the kind of current token.
AsmToken::TokenKind getKind() const { return CurTok.getKind(); }
- /// is - Check if the current token has kind \arg K.
+ /// is - Check if the current token has kind \p K.
bool is(AsmToken::TokenKind K) const { return CurTok.is(K); }
- /// isNot - Check if the current token has kind \arg K.
+ /// isNot - Check if the current token has kind \p K.
bool isNot(AsmToken::TokenKind K) const { return CurTok.isNot(K); }
};
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index 929a2042cac6..483a80b3b595 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -21,11 +21,43 @@ class MCParsedAsmOperand;
class MCInst;
template <typename T> class SmallVectorImpl;
+enum AsmRewriteKind {
+ AOK_DotOperator, // Rewrite a dot operator expression as an immediate.
+ // E.g., [eax].foo.bar -> [eax].8
+ AOK_Emit, // Rewrite _emit as .byte.
+ AOK_Imm, // Rewrite as $$N.
+ AOK_ImmPrefix, // Add $$ before a parsed Imm.
+ AOK_Input, // Rewrite in terms of $N.
+ AOK_Output, // Rewrite in terms of $N.
+ AOK_SizeDirective, // Add a sizing directive (e.g., dword ptr).
+ AOK_Skip // Skip emission (e.g., offset/type operators).
+};
+
+struct AsmRewrite {
+ AsmRewriteKind Kind;
+ SMLoc Loc;
+ unsigned Len;
+ unsigned Val;
+public:
+ AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len = 0, unsigned val = 0)
+ : Kind(kind), Loc(loc), Len(len), Val(val) {}
+};
+
+struct ParseInstructionInfo {
+
+ SmallVectorImpl<AsmRewrite> *AsmRewrites;
+
+ ParseInstructionInfo() : AsmRewrites(0) {}
+ ParseInstructionInfo(SmallVectorImpl<AsmRewrite> *rewrites)
+ : AsmRewrites(rewrites) {}
+
+ ~ParseInstructionInfo() {}
+};
+
/// MCTargetAsmParser - Generic interface to target specific assembly parsers.
class MCTargetAsmParser : public MCAsmParserExtension {
public:
enum MatchResultTy {
- Match_ConversionFail,
Match_InvalidOperand,
Match_MissingFeature,
Match_MnemonicFail,
@@ -34,20 +66,34 @@ public:
};
private:
- MCTargetAsmParser(const MCTargetAsmParser &); // DO NOT IMPLEMENT
- void operator=(const MCTargetAsmParser &); // DO NOT IMPLEMENT
+ MCTargetAsmParser(const MCTargetAsmParser &) LLVM_DELETED_FUNCTION;
+ void operator=(const MCTargetAsmParser &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
MCTargetAsmParser();
/// AvailableFeatures - The current set of available features.
unsigned AvailableFeatures;
+ /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+ bool ParsingInlineAsm;
+
+ /// SemaCallback - The Sema callback implementation. Must be set when parsing
+ /// ms-style inline assembly.
+ MCAsmParserSemaCallback *SemaCallback;
+
public:
virtual ~MCTargetAsmParser();
unsigned getAvailableFeatures() const { return AvailableFeatures; }
void setAvailableFeatures(unsigned Value) { AvailableFeatures = Value; }
+ bool isParsingInlineAsm () { return ParsingInlineAsm; }
+ void setParsingInlineAsm (bool Value) { ParsingInlineAsm = Value; }
+
+ void setSemaCallback(MCAsmParserSemaCallback *Callback) {
+ SemaCallback = Callback;
+ }
+
virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) = 0;
@@ -64,7 +110,8 @@ public:
/// \param Operands [out] - The list of parsed operands, this returns
/// ownership of them to the caller.
/// \return True on failure.
- virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) = 0;
/// ParseDirective - Parse a target specific assembler directive
@@ -79,18 +126,9 @@ public:
/// \param DirectiveID - the identifier token of the directive.
virtual bool ParseDirective(AsmToken DirectiveID) = 0;
- /// MatchInstruction - Recognize a series of operands of a parsed instruction
- /// as an actual MCInst. This returns false on success and returns true on
- /// failure to match.
- ///
- /// On failure, the target parser is responsible for emitting a diagnostic
- /// explaining the match failure.
- virtual bool
- MatchInstruction(SMLoc IDLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- SmallVectorImpl<MCInst> &MCInsts) {
- return true;
- }
+ /// mnemonicIsValid - Return true if the given mnemonic is valid for the
+ /// target; false otherwise.
+ virtual bool mnemonicIsValid(StringRef Mnemonic) = 0;
/// MatchAndEmitInstruction - Recognize a series of operands of a parsed
/// instruction as an actual MCInst and emit it to the specified MCStreamer.
@@ -99,9 +137,10 @@ public:
/// On failure, the target parser is responsible for emitting a diagnostic
/// explaining the match failure.
virtual bool
- MatchAndEmitInstruction(SMLoc IDLoc,
+ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out) = 0;
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) = 0;
/// checkTargetMatchPredicate - Validate the instruction match against
/// any complex target predicates not expressible via match classes.
@@ -109,6 +148,8 @@ public:
return Match_Success;
}
+ virtual void convertToMapAndConstraints(unsigned Kind,
+ const SmallVectorImpl<MCParsedAsmOperand*> &Operands) = 0;
};
} // End llvm namespace
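
A sketch (illustrative only) of how a target parser might record rewrites through the new ParseInstructionInfo hook while parsing MS-style inline assembly; the Loc/Len/ImmVal parameters are hypothetical values the target parser would have computed:

#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/SMLoc.h"

// Replace an "offset"-style operand in the asm string: skip emission of the
// original text and emit the computed value as an immediate instead.
static void recordOffsetRewrite(llvm::ParseInstructionInfo &Info,
                                llvm::SMLoc Loc, unsigned Len,
                                unsigned ImmVal) {
  if (!Info.AsmRewrites)
    return; // Not parsing MS-style inline asm.
  Info.AsmRewrites->push_back(llvm::AsmRewrite(llvm::AOK_Skip, Loc, Len));
  Info.AsmRewrites->push_back(llvm::AsmRewrite(llvm::AOK_Imm, Loc, Len, ImmVal));
}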
diff --git a/include/llvm/MC/MCValue.h b/include/llvm/MC/MCValue.h
index 8352ed183f09..f9af8bcfbf61 100644
--- a/include/llvm/MC/MCValue.h
+++ b/include/llvm/MC/MCValue.h
@@ -46,7 +46,7 @@ public:
/// isAbsolute - Is this an absolute (as opposed to relocatable) value.
bool isAbsolute() const { return !SymA && !SymB; }
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
/// dump - Print the value to stderr.
diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index 507d8827750c..57f0518cbf3a 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@@ -50,7 +50,7 @@ struct SubtargetFeatureKV {
//
struct SubtargetInfoKV {
const char *Key; // K-V key string
- void *Value; // K-V pointer value
+ const void *Value; // K-V pointer value
// Compare routine for std binary search
bool operator<(const SubtargetInfoKV &S) const {
@@ -95,10 +95,6 @@ public:
const SubtargetFeatureKV *FeatureTable,
size_t FeatureTableSize);
- /// Get scheduling itinerary of a CPU.
- void *getItinerary(const StringRef CPU,
- const SubtargetInfoKV *Table, size_t TableSize);
-
/// Print feature string.
void print(raw_ostream &OS) const;
diff --git a/include/llvm/MDBuilder.h b/include/llvm/MDBuilder.h
index 2aa48b0b4724..1867a639236e 100644
--- a/include/llvm/MDBuilder.h
+++ b/include/llvm/MDBuilder.h
@@ -134,6 +134,27 @@ namespace llvm {
}
}
+ struct TBAAStructField {
+ uint64_t Offset;
+ uint64_t Size;
+ MDNode *TBAA;
+ TBAAStructField(uint64_t Offset, uint64_t Size, MDNode *TBAA) :
+ Offset(Offset), Size(Size), TBAA(TBAA) {}
+ };
+
+ /// \brief Return metadata for a tbaa.struct node with the given
+ /// struct field descriptions.
+ MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
+ SmallVector<Value *, 4> Vals(Fields.size() * 3);
+ Type *Int64 = IntegerType::get(Context, 64);
+ for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
+ Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset);
+ Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size);
+ Vals[i * 3 + 2] = Fields[i].TBAA;
+ }
+ return MDNode::get(Context, Vals);
+ }
+
};
} // end namespace llvm
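
A usage sketch (not part of the patch) for the new createTBAAStructNode helper, assuming TBAAStructField is a member of MDBuilder as introduced above and that the per-field TBAA nodes already exist:

#include "llvm/MDBuilder.h"

// Build a !tbaa.struct node describing { i32 at offset 0, i8* at offset 8 }.
static llvm::MDNode *makeTBAAStruct(llvm::LLVMContext &Ctx,
                                    llvm::MDNode *IntTBAA,
                                    llvm::MDNode *PtrTBAA) {
  llvm::MDBuilder MDB(Ctx);
  llvm::MDBuilder::TBAAStructField Fields[] = {
    llvm::MDBuilder::TBAAStructField(0, 4, IntTBAA),
    llvm::MDBuilder::TBAAStructField(8, 8, PtrTBAA),
  };
  return MDB.createTBAAStructNode(Fields);
}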
diff --git a/include/llvm/Metadata.h b/include/llvm/Metadata.h
index b40549bed6bf..0fbbb959888b 100644
--- a/include/llvm/Metadata.h
+++ b/include/llvm/Metadata.h
@@ -37,7 +37,7 @@ template<typename ValueSubClass, typename ItemParentClass>
/// MDString is always unnamed.
class MDString : public Value {
virtual void anchor();
- MDString(const MDString &); // DO NOT IMPLEMENT
+ MDString(const MDString &) LLVM_DELETED_FUNCTION;
explicit MDString(LLVMContext &C);
public:
@@ -59,7 +59,6 @@ public:
iterator end() const { return getName().end(); }
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MDString *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == MDStringVal;
}
@@ -71,8 +70,8 @@ class MDNodeOperand;
//===----------------------------------------------------------------------===//
/// MDNode - a tuple of other values.
class MDNode : public Value, public FoldingSetNode {
- MDNode(const MDNode &); // DO NOT IMPLEMENT
- void operator=(const MDNode &); // DO NOT IMPLEMENT
+ MDNode(const MDNode &) LLVM_DELETED_FUNCTION;
+ void operator=(const MDNode &) LLVM_DELETED_FUNCTION;
friend class MDNodeOperand;
friend class LLVMContextImpl;
friend struct FoldingSetTrait<MDNode>;
@@ -161,7 +160,6 @@ public:
void Profile(FoldingSetNodeID &ID) const;
/// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const MDNode *) { return true; }
static bool classof(const Value *V) {
return V->getValueID() == MDNodeVal;
}
@@ -195,7 +193,7 @@ class NamedMDNode : public ilist_node<NamedMDNode> {
friend struct ilist_traits<NamedMDNode>;
friend class LLVMContextImpl;
friend class Module;
- NamedMDNode(const NamedMDNode &); // DO NOT IMPLEMENT
+ NamedMDNode(const NamedMDNode &) LLVM_DELETED_FUNCTION;
std::string Name;
Module *Parent;
diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h
index 358b27a416cd..f3d824960c2f 100644
--- a/include/llvm/Object/Archive.h
+++ b/include/llvm/Object/Archive.h
@@ -129,7 +129,6 @@ public:
symbol_iterator end_symbols() const;
// Cast methods.
- static inline bool classof(Archive const *v) { return true; }
static inline bool classof(Binary const *v) {
return v->isArchive();
}
diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index befe812a3692..d555de3accc2 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h
@@ -26,8 +26,8 @@ namespace object {
class Binary {
private:
- Binary(); // = delete
- Binary(const Binary &other); // = delete
+ Binary() LLVM_DELETED_FUNCTION;
+ Binary(const Binary &other) LLVM_DELETED_FUNCTION;
unsigned int TypeID;
@@ -64,7 +64,6 @@ public:
// Cast methods.
unsigned int getType() const { return TypeID; }
- static inline bool classof(const Binary *v) { return true; }
// Convenience methods
bool isObject() const {
diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h
index 967420ec9f12..6f42d76ee996 100644
--- a/include/llvm/Object/COFF.h
+++ b/include/llvm/Object/COFF.h
@@ -116,6 +116,7 @@ protected:
virtual error_code getSymbolType(DataRefImpl Symb, SymbolRef::Type &Res) const;
virtual error_code getSymbolSection(DataRefImpl Symb,
section_iterator &Res) const;
+ virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const;
virtual error_code getSectionNext(DataRefImpl Sec, SectionRef &Res) const;
virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const;
@@ -128,6 +129,7 @@ protected:
virtual error_code isSectionBSS(DataRefImpl Sec, bool &Res) const;
virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const;
virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const;
+ virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const;
virtual error_code isSectionRequiredForExecution(DataRefImpl Sec,
bool &Res) const;
virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
@@ -197,7 +199,6 @@ public:
static inline bool classof(const Binary *v) {
return v->isCOFF();
}
- static inline bool classof(const COFFObjectFile *v) { return true; }
};
}
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 7698441fd1cb..466de93a78b2 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -387,11 +387,65 @@ struct Elf_Rel_Impl<target_endianness, false, isRela>
}
};
+template<support::endianness target_endianness, bool is64Bits>
+struct Elf_Ehdr_Impl {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits)
+ unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
+ Elf_Half e_type; // Type of file (see ET_*)
+ Elf_Half e_machine; // Required architecture for this file (see EM_*)
+ Elf_Word e_version; // Must be equal to 1
+ Elf_Addr e_entry; // Address to jump to in order to start program
+ Elf_Off e_phoff; // Program header table's file offset, in bytes
+ Elf_Off e_shoff; // Section header table's file offset, in bytes
+ Elf_Word e_flags; // Processor-specific flags
+ Elf_Half e_ehsize; // Size of ELF header, in bytes
+ Elf_Half e_phentsize;// Size of an entry in the program header table
+ Elf_Half e_phnum; // Number of entries in the program header table
+ Elf_Half e_shentsize;// Size of an entry in the section header table
+ Elf_Half e_shnum; // Number of entries in the section header table
+ Elf_Half e_shstrndx; // Section header table index of section name
+ // string table
+ bool checkMagic() const {
+ return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
+ }
+ unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; }
+ unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
+};
+
+template<support::endianness target_endianness, bool is64Bits>
+struct Elf_Phdr;
+
+template<support::endianness target_endianness>
+struct Elf_Phdr<target_endianness, false> {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, false)
+ Elf_Word p_type; // Type of segment
+ Elf_Off p_offset; // FileOffset where segment is located, in bytes
+ Elf_Addr p_vaddr; // Virtual Address of beginning of segment
+ Elf_Addr p_paddr; // Physical address of beginning of segment (OS-specific)
+ Elf_Word p_filesz; // Num. of bytes in file image of segment (may be zero)
+ Elf_Word p_memsz; // Num. of bytes in mem image of segment (may be zero)
+ Elf_Word p_flags; // Segment flags
+ Elf_Word p_align; // Segment alignment constraint
+};
+
+template<support::endianness target_endianness>
+struct Elf_Phdr<target_endianness, true> {
+ LLVM_ELF_IMPORT_TYPES(target_endianness, true)
+ Elf_Word p_type; // Type of segment
+ Elf_Word p_flags; // Segment flags
+ Elf_Off p_offset; // FileOffset where segment is located, in bytes
+ Elf_Addr p_vaddr; // Virtual Address of beginning of segment
+ Elf_Addr p_paddr; // Physical address of beginning of segment (OS-specific)
+ Elf_Word p_filesz; // Num. of bytes in file image of segment (may be zero)
+ Elf_Word p_memsz; // Num. of bytes in mem image of segment (may be zero)
+ Elf_Word p_align; // Segment alignment constraint
+};
template<support::endianness target_endianness, bool is64Bits>
class ELFObjectFile : public ObjectFile {
LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits)
+ typedef Elf_Ehdr_Impl<target_endianness, is64Bits> Elf_Ehdr;
typedef Elf_Shdr_Impl<target_endianness, is64Bits> Elf_Shdr;
typedef Elf_Sym_Impl<target_endianness, is64Bits> Elf_Sym;
typedef Elf_Dyn_Impl<target_endianness, is64Bits> Elf_Dyn;
@@ -406,28 +460,6 @@ class ELFObjectFile : public ObjectFile {
typedef content_iterator<DynRef> dyn_iterator;
protected:
- struct Elf_Ehdr {
- unsigned char e_ident[ELF::EI_NIDENT]; // ELF Identification bytes
- Elf_Half e_type; // Type of file (see ET_*)
- Elf_Half e_machine; // Required architecture for this file (see EM_*)
- Elf_Word e_version; // Must be equal to 1
- Elf_Addr e_entry; // Address to jump to in order to start program
- Elf_Off e_phoff; // Program header table's file offset, in bytes
- Elf_Off e_shoff; // Section header table's file offset, in bytes
- Elf_Word e_flags; // Processor-specific flags
- Elf_Half e_ehsize; // Size of ELF header, in bytes
- Elf_Half e_phentsize;// Size of an entry in the program header table
- Elf_Half e_phnum; // Number of entries in the program header table
- Elf_Half e_shentsize;// Size of an entry in the section header table
- Elf_Half e_shnum; // Number of entries in the section header table
- Elf_Half e_shstrndx; // Section header table index of section name
- // string table
- bool checkMagic() const {
- return (memcmp(e_ident, ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
- }
- unsigned char getFileClass() const { return e_ident[ELF::EI_CLASS]; }
- unsigned char getDataEncoding() const { return e_ident[ELF::EI_DATA]; }
- };
// This flag is used for classof, to distinguish ELFObjectFile from
// its subclass. If more subclasses will be created, this flag will
// have to become an enum.
@@ -459,6 +491,59 @@ private:
// This is set the first time getLoadName is called.
mutable const char *dt_soname;
+public:
+ /// \brief Iterate over relocations in a .rel or .rela section.
+ template<class RelocT>
+ class ELFRelocationIterator {
+ public:
+ typedef void difference_type;
+ typedef const RelocT value_type;
+ typedef std::forward_iterator_tag iterator_category;
+ typedef value_type &reference;
+ typedef value_type *pointer;
+
+ /// \brief Default construct iterator.
+ ELFRelocationIterator() : Section(0), Current(0) {}
+ ELFRelocationIterator(const Elf_Shdr *Sec, const char *Start)
+ : Section(Sec)
+ , Current(Start) {}
+
+ reference operator *() {
+ assert(Current && "Attempted to dereference an invalid iterator!");
+ return *reinterpret_cast<const RelocT*>(Current);
+ }
+
+ pointer operator ->() {
+ assert(Current && "Attempted to dereference an invalid iterator!");
+ return reinterpret_cast<const RelocT*>(Current);
+ }
+
+ bool operator ==(const ELFRelocationIterator &Other) {
+ return Section == Other.Section && Current == Other.Current;
+ }
+
+ bool operator !=(const ELFRelocationIterator &Other) {
+ return !(*this == Other);
+ }
+
+ ELFRelocationIterator &operator ++() {
+ assert(Current && "Attempted to increment an invalid iterator!");
+ Current += Section->sh_entsize;
+ return *this;
+ }
+
+ ELFRelocationIterator operator ++(int) {
+ ELFRelocationIterator Tmp = *this;
+ ++*this;
+ return Tmp;
+ }
+
+ private:
+ const Elf_Shdr *Section;
+ const char *Current;
+ };
+
+private:
// Records for each version index the corresponding Verdef or Vernaux entry.
// This is filled the first time LoadVersionMap() is called.
class VersionMapEntry : public PointerIntPair<const void*, 1> {
@@ -535,6 +620,7 @@ protected:
virtual error_code getSymbolType(DataRefImpl Symb, SymbolRef::Type &Res) const;
virtual error_code getSymbolSection(DataRefImpl Symb,
section_iterator &Res) const;
+ virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const;
friend class DynRefImpl<target_endianness, is64Bits>;
virtual error_code getDynNext(DataRefImpl DynData, DynRef &Result) const;
@@ -555,6 +641,7 @@ protected:
bool &Res) const;
virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const;
virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const;
+ virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const;
virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
bool &Result) const;
virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const;
@@ -594,6 +681,27 @@ public:
virtual dyn_iterator begin_dynamic_table() const;
virtual dyn_iterator end_dynamic_table() const;
+ typedef ELFRelocationIterator<Elf_Rela> Elf_Rela_Iter;
+ typedef ELFRelocationIterator<Elf_Rel> Elf_Rel_Iter;
+
+ virtual Elf_Rela_Iter beginELFRela(const Elf_Shdr *sec) const {
+ return Elf_Rela_Iter(sec, (const char *)(base() + sec->sh_offset));
+ }
+
+ virtual Elf_Rela_Iter endELFRela(const Elf_Shdr *sec) const {
+ return Elf_Rela_Iter(sec, (const char *)
+ (base() + sec->sh_offset + sec->sh_size));
+ }
+
+ virtual Elf_Rel_Iter beginELFRel(const Elf_Shdr *sec) const {
+ return Elf_Rel_Iter(sec, (const char *)(base() + sec->sh_offset));
+ }
+
+ virtual Elf_Rel_Iter endELFRel(const Elf_Shdr *sec) const {
+ return Elf_Rel_Iter(sec, (const char *)
+ (base() + sec->sh_offset + sec->sh_size));
+ }
+
virtual uint8_t getBytesInAddress() const;
virtual StringRef getFileFormatName() const;
virtual StringRef getObjectType() const { return "ELF"; }
@@ -608,6 +716,7 @@ public:
const Elf_Shdr *getSection(const Elf_Sym *symb) const;
const Elf_Shdr *getElfSection(section_iterator &It) const;
const Elf_Sym *getElfSymbol(symbol_iterator &It) const;
+ const Elf_Sym *getElfSymbol(uint32_t index) const;
// Methods for type inquiry through isa, cast, and dyn_cast
bool isDyldType() const { return isDyldELFObject; }
@@ -615,7 +724,6 @@ public:
return v->getType() == getELFType(target_endianness == support::little,
is64Bits);
}
- static inline bool classof(const ELFObjectFile *v) { return true; }
};
// Iterate through the version definitions, and place each Elf_Verdef
@@ -804,6 +912,16 @@ ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Sym *
+ELFObjectFile<target_endianness, is64Bits>
+ ::getElfSymbol(uint32_t index) const {
+ DataRefImpl SymbolData;
+ SymbolData.d.a = index;
+ SymbolData.d.b = 1;
+ return getSymbol(SymbolData);
+}
+
+template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
::getSymbolFileOffset(DataRefImpl Symb,
uint64_t &Result) const {
@@ -863,7 +981,18 @@ error_code ELFObjectFile<target_endianness, is64Bits>
case ELF::STT_FUNC:
case ELF::STT_OBJECT:
case ELF::STT_NOTYPE:
- Result = symb->st_value + (Section ? Section->sh_addr : 0);
+ bool IsRelocatable;
+ switch(Header->e_type) {
+ case ELF::ET_EXEC:
+ case ELF::ET_DYN:
+ IsRelocatable = false;
+ break;
+ default:
+ IsRelocatable = true;
+ }
+ Result = symb->st_value;
+ if (IsRelocatable && Section != 0)
+ Result += Section->sh_addr;
return object_error::success;
default:
Result = UnknownAddressOrSize;
@@ -1034,6 +1163,16 @@ error_code ELFObjectFile<target_endianness, is64Bits>
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::getSymbolValue(DataRefImpl Symb,
+ uint64_t &Val) const {
+ validateSymbol(Symb);
+ const Elf_Sym *symb = getSymbol(Symb);
+ Val = symb->st_value;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::getSectionNext(DataRefImpl Sec, SectionRef &Result) const {
const uint8_t *sec = reinterpret_cast<const uint8_t *>(Sec.p);
sec += Header->e_shentsize;
@@ -1160,7 +1299,8 @@ error_code ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
-error_code ELFObjectFile<target_endianness, is64Bits>::isSectionZeroInit(DataRefImpl Sec,
+error_code ELFObjectFile<target_endianness, is64Bits>
+ ::isSectionZeroInit(DataRefImpl Sec,
bool &Result) const {
const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
// For ELF, all zero-init sections are virtual (that is, they occupy no space
@@ -1174,6 +1314,18 @@ error_code ELFObjectFile<target_endianness, is64Bits>::isSectionZeroInit(DataRef
template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
+ ::isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Result) const {
+ const Elf_Shdr *sec = reinterpret_cast<const Elf_Shdr *>(Sec.p);
+ if (sec->sh_flags & ELF::SHF_WRITE || sec->sh_flags & ELF::SHF_EXECINSTR)
+ Result = false;
+ else
+ Result = true;
+ return object_error::success;
+}
+
+template<support::endianness target_endianness, bool is64Bits>
+error_code ELFObjectFile<target_endianness, is64Bits>
::sectionContainsSymbol(DataRefImpl Sec,
DataRefImpl Symb,
bool &Result) const {
@@ -1444,6 +1596,143 @@ error_code ELFObjectFile<target_endianness, is64Bits>
res = "Unknown";
}
break;
+ case ELF::EM_ARM:
+ switch (type) {
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_NONE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PC24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ABS5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BREL_ADJ);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_SWI8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_XPC25);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_XPC22);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPMOD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DTPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_TPOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_COPY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GLOB_DAT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP_SLOT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_RELATIVE);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_JUMP24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP24);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_BASE_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_7_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_15_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PCREL_23_15);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SBREL_11_0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_19_12_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SBREL_27_20_CK);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_SBREL31);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_V4BX);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TARGET2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PREL31);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_ABS_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_PREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_ABS_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_PREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP19);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_ALU_PREL_11_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_PC12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ABS32_NOI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_REL32_NOI);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_PC_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ALU_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDR_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDRS_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_LDC_SB_G2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_MOVW_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL_NC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVT_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_MOVW_BREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GOTDESC);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_DESCSEQ);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_CALL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PLT32_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_ABS);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_PREL);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOT_BREL12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTOFF12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GOTRELAX);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTENTRY);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_GNU_VTINHERIT);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP11);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_JUMP8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_GD32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDM32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE32);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LDO12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_LE12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_TLS_IE12GP);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_0);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_1);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_2);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_3);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_4);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_5);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_6);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_7);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_8);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_9);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_10);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_11);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_12);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_13);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_14);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_PRIVATE_15);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_ME_TOO);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ16);
+ LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_ARM_THM_TLS_DESCSEQ32);
+ default:
+ res = "Unknown";
+ }
+ break;
case ELF::EM_HEXAGON:
switch (type) {
LLVM_ELF_SWITCH_RELOC_TYPE_NAME(R_HEX_NONE);
@@ -1574,15 +1863,15 @@ error_code ELFObjectFile<target_endianness, is64Bits>
int64_t addend = 0;
uint16_t symbol_index = 0;
switch (sec->sh_type) {
- default :
+ default:
return object_error::parse_failed;
- case ELF::SHT_REL : {
+ case ELF::SHT_REL: {
type = getRel(Rel)->getType();
symbol_index = getRel(Rel)->getSymbol();
// TODO: Read implicit addend from section data.
break;
}
- case ELF::SHT_RELA : {
+ case ELF::SHT_RELA: {
type = getRela(Rel)->getType();
symbol_index = getRela(Rel)->getSymbol();
addend = getRela(Rel)->r_addend;
@@ -1596,9 +1885,8 @@ error_code ELFObjectFile<target_endianness, is64Bits>
switch (Header->e_machine) {
case ELF::EM_X86_64:
switch (type) {
- case ELF::R_X86_64_32S:
- res = symname;
- break;
+ case ELF::R_X86_64_PC8:
+ case ELF::R_X86_64_PC16:
case ELF::R_X86_64_PC32: {
std::string fmtbuf;
raw_string_ostream fmt(fmtbuf);
@@ -1607,10 +1895,23 @@ error_code ELFObjectFile<target_endianness, is64Bits>
Result.append(fmtbuf.begin(), fmtbuf.end());
}
break;
+ case ELF::R_X86_64_8:
+ case ELF::R_X86_64_16:
+ case ELF::R_X86_64_32:
+ case ELF::R_X86_64_32S:
+ case ELF::R_X86_64_64: {
+ std::string fmtbuf;
+ raw_string_ostream fmt(fmtbuf);
+ fmt << symname << (addend < 0 ? "" : "+") << addend;
+ fmt.flush();
+ Result.append(fmtbuf.begin(), fmtbuf.end());
+ }
+ break;
default:
res = "Unknown";
}
break;
+ case ELF::EM_ARM:
case ELF::EM_HEXAGON:
res = symname;
break;
@@ -2024,6 +2325,8 @@ StringRef ELFObjectFile<target_endianness, is64Bits>
return "ELF64-i386";
case ELF::EM_X86_64:
return "ELF64-x86-64";
+ case ELF::EM_PPC64:
+ return "ELF64-ppc64";
default:
return "ELF64-unknown";
}
@@ -2044,6 +2347,11 @@ unsigned ELFObjectFile<target_endianness, is64Bits>::getArch() const {
return Triple::arm;
case ELF::EM_HEXAGON:
return Triple::hexagon;
+ case ELF::EM_MIPS:
+ return (target_endianness == support::little) ?
+ Triple::mipsel : Triple::mips;
+ case ELF::EM_PPC64:
+ return Triple::ppc64;
default:
return Triple::UnknownArch;
}
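
An illustrative sketch (not part of the patch) of walking a SHT_RELA section with the new typed relocation iterators; ELFObjT is assumed to be some ELFObjectFile instantiation and Sec a section header obtained from it:

#include "llvm/Object/ELF.h"
#include "llvm/Support/raw_ostream.h"

// Print the r_offset of every entry in a .rela section. Elf_Rela_Iter is the
// public iterator typedef added above; sh_entsize drives the stride.
template <class ELFObjT, class ShdrT>
static void dumpRelaOffsets(const ELFObjT &Obj, const ShdrT *Sec) {
  for (typename ELFObjT::Elf_Rela_Iter I = Obj.beginELFRela(Sec),
                                       E = Obj.endELFRela(Sec);
       I != E; ++I)
    llvm::outs() << "rela entry at offset " << I->r_offset << "\n";
}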
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index 0b73f9483164..4e03daab16a3 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -49,7 +49,6 @@ public:
static inline bool classof(const Binary *v) {
return v->isMachO();
}
- static inline bool classof(const MachOObjectFile *v) { return true; }
protected:
virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const;
@@ -62,6 +61,7 @@ protected:
virtual error_code getSymbolType(DataRefImpl Symb, SymbolRef::Type &Res) const;
virtual error_code getSymbolSection(DataRefImpl Symb,
section_iterator &Res) const;
+ virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const;
virtual error_code getSectionNext(DataRefImpl Sec, SectionRef &Res) const;
virtual error_code getSectionName(DataRefImpl Sec, StringRef &Res) const;
@@ -76,6 +76,7 @@ protected:
bool &Res) const;
virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const;
virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const;
+ virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const;
virtual error_code sectionContainsSymbol(DataRefImpl DRI, DataRefImpl S,
bool &Result) const;
virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const;
diff --git a/include/llvm/Object/MachOFormat.h b/include/llvm/Object/MachOFormat.h
index f30d431b69da..c0f700d3c870 100644
--- a/include/llvm/Object/MachOFormat.h
+++ b/include/llvm/Object/MachOFormat.h
@@ -61,7 +61,10 @@ namespace mach {
CSARM_V6 = 6,
CSARM_V5TEJ = 7,
CSARM_XSCALE = 8,
- CSARM_V7 = 9
+ CSARM_V7 = 9,
+ CSARM_V7F = 10,
+ CSARM_V7S = 11,
+ CSARM_V7K = 12
};
/// \brief PowerPC Machine Subtypes.
@@ -273,6 +276,10 @@ namespace macho {
uint16_t Flags;
uint32_t Value;
};
+ // Despite containing a uint64_t, this structure is only 4-byte aligned within
+ // a MachO file.
+#pragma pack(push)
+#pragma pack(4)
struct Symbol64TableEntry {
uint32_t StringIndex;
uint8_t Type;
@@ -280,6 +287,7 @@ namespace macho {
uint16_t Flags;
uint64_t Value;
};
+#pragma pack(pop)
/// @}
/// @name Data-in-code Table Entry
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 2ec656b0124e..1a3120ab8ba3 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -76,13 +76,13 @@ public:
}
};
-inline bool operator ==(const DataRefImpl &a, const DataRefImpl &b) {
+inline bool operator==(const DataRefImpl &a, const DataRefImpl &b) {
// Check bitwise identical. This is the only legal way to compare a union w/o
// knowing which member is in use.
return std::memcmp(&a, &b, sizeof(DataRefImpl)) == 0;
}
-inline bool operator <(const DataRefImpl &a, const DataRefImpl &b) {
+inline bool operator<(const DataRefImpl &a, const DataRefImpl &b) {
// Check bitwise identical. This is the only legal way to compare a union w/o
// knowing which member is in use.
return std::memcmp(&a, &b, sizeof(DataRefImpl)) < 0;
@@ -144,7 +144,7 @@ public:
SectionRef(DataRefImpl SectionP, const ObjectFile *Owner);
bool operator==(const SectionRef &Other) const;
- bool operator <(const SectionRef &Other) const;
+ bool operator<(const SectionRef &Other) const;
error_code getNext(SectionRef &Result) const;
@@ -163,6 +163,7 @@ public:
error_code isRequiredForExecution(bool &Result) const;
error_code isVirtual(bool &Result) const;
error_code isZeroInit(bool &Result) const;
+ error_code isReadOnlyData(bool &Result) const;
error_code containsSymbol(SymbolRef S, bool &Result) const;
@@ -207,11 +208,13 @@ public:
SymbolRef(DataRefImpl SymbolP, const ObjectFile *Owner);
bool operator==(const SymbolRef &Other) const;
- bool operator <(const SymbolRef &Other) const;
+ bool operator<(const SymbolRef &Other) const;
error_code getNext(SymbolRef &Result) const;
error_code getName(StringRef &Result) const;
+ /// Returns the symbol virtual address (i.e. address at which it will be
+ /// mapped).
error_code getAddress(uint64_t &Result) const;
error_code getFileOffset(uint64_t &Result) const;
error_code getSize(uint64_t &Result) const;
@@ -231,6 +234,9 @@ public:
/// end_sections() if it is undefined or is an absolute symbol.
error_code getSection(section_iterator &Result) const;
+ /// @brief Get value of the symbol in the symbol table.
+ error_code getValue(uint64_t &Val) const;
+
DataRefImpl getRawDataRefImpl() const;
};
typedef content_iterator<SymbolRef> symbol_iterator;
@@ -248,7 +254,7 @@ public:
LibraryRef(DataRefImpl LibraryP, const ObjectFile *Owner);
bool operator==(const LibraryRef &Other) const;
- bool operator <(const LibraryRef &Other) const;
+ bool operator<(const LibraryRef &Other) const;
error_code getNext(LibraryRef &Result) const;
@@ -263,11 +269,11 @@ const uint64_t UnknownAddressOrSize = ~0ULL;
/// ObjectFile - This class is the base class for all object file types.
/// Concrete instances of this object are created by createObjectFile, which
-/// figure out which type to create.
+/// figures out which type to create.
class ObjectFile : public Binary {
virtual void anchor();
- ObjectFile(); // = delete
- ObjectFile(const ObjectFile &other); // = delete
+ ObjectFile() LLVM_DELETED_FUNCTION;
+ ObjectFile(const ObjectFile &other) LLVM_DELETED_FUNCTION;
protected:
ObjectFile(unsigned int Type, MemoryBuffer *source, error_code &ec);
@@ -287,8 +293,8 @@ protected:
friend class SymbolRef;
virtual error_code getSymbolNext(DataRefImpl Symb, SymbolRef &Res) const = 0;
virtual error_code getSymbolName(DataRefImpl Symb, StringRef &Res) const = 0;
- virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const =0;
- virtual error_code getSymbolFileOffset(DataRefImpl Symb, uint64_t &Res) const =0;
+ virtual error_code getSymbolAddress(DataRefImpl Symb, uint64_t &Res) const = 0;
+ virtual error_code getSymbolFileOffset(DataRefImpl Symb, uint64_t &Res)const=0;
virtual error_code getSymbolSize(DataRefImpl Symb, uint64_t &Res) const = 0;
virtual error_code getSymbolType(DataRefImpl Symb,
SymbolRef::Type &Res) const = 0;
@@ -297,6 +303,7 @@ protected:
uint32_t &Res) const = 0;
virtual error_code getSymbolSection(DataRefImpl Symb,
section_iterator &Res) const = 0;
+ virtual error_code getSymbolValue(DataRefImpl Symb, uint64_t &Val) const = 0;
// Same as above for SectionRef.
friend class SectionRef;
@@ -314,6 +321,7 @@ protected:
// A section is 'virtual' if its contents aren't present in the object image.
virtual error_code isSectionVirtual(DataRefImpl Sec, bool &Res) const = 0;
virtual error_code isSectionZeroInit(DataRefImpl Sec, bool &Res) const = 0;
+ virtual error_code isSectionReadOnlyData(DataRefImpl Sec, bool &Res) const =0;
virtual error_code sectionContainsSymbol(DataRefImpl Sec, DataRefImpl Symb,
bool &Result) const = 0;
virtual relocation_iterator getSectionRelBegin(DataRefImpl Sec) const = 0;
@@ -384,7 +392,6 @@ public:
static inline bool classof(const Binary *v) {
return v->isObject();
}
- static inline bool classof(const ObjectFile *v) { return true; }
public:
static ObjectFile *createCOFFObjectFile(MemoryBuffer *Object);
@@ -401,7 +408,7 @@ inline bool SymbolRef::operator==(const SymbolRef &Other) const {
return SymbolPimpl == Other.SymbolPimpl;
}
-inline bool SymbolRef::operator <(const SymbolRef &Other) const {
+inline bool SymbolRef::operator<(const SymbolRef &Other) const {
return SymbolPimpl < Other.SymbolPimpl;
}
@@ -441,6 +448,10 @@ inline error_code SymbolRef::getType(SymbolRef::Type &Result) const {
return OwningObject->getSymbolType(SymbolPimpl, Result);
}
+inline error_code SymbolRef::getValue(uint64_t &Val) const {
+ return OwningObject->getSymbolValue(SymbolPimpl, Val);
+}
+
inline DataRefImpl SymbolRef::getRawDataRefImpl() const {
return SymbolPimpl;
}
@@ -456,7 +467,7 @@ inline bool SectionRef::operator==(const SectionRef &Other) const {
return SectionPimpl == Other.SectionPimpl;
}
-inline bool SectionRef::operator <(const SectionRef &Other) const {
+inline bool SectionRef::operator<(const SectionRef &Other) const {
return SectionPimpl < Other.SectionPimpl;
}
@@ -508,6 +519,10 @@ inline error_code SectionRef::isZeroInit(bool &Result) const {
return OwningObject->isSectionZeroInit(SectionPimpl, Result);
}
+inline error_code SectionRef::isReadOnlyData(bool &Result) const {
+ return OwningObject->isSectionReadOnlyData(SectionPimpl, Result);
+}
+
inline error_code SectionRef::containsSymbol(SymbolRef S, bool &Result) const {
return OwningObject->sectionContainsSymbol(SectionPimpl, S.SymbolPimpl,
Result);
@@ -586,7 +601,7 @@ inline bool LibraryRef::operator==(const LibraryRef &Other) const {
return LibraryPimpl == Other.LibraryPimpl;
}
-inline bool LibraryRef::operator <(const LibraryRef &Other) const {
+inline bool LibraryRef::operator<(const LibraryRef &Other) const {
return LibraryPimpl < Other.LibraryPimpl;
}
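The new SymbolRef::getValue and SectionRef::isReadOnlyData entry points are plain forwarding wrappers over the virtuals added above. A minimal sketch of querying the symbol value, assuming the begin_symbols()/end_symbols() iterators and content_iterator::increment(ec) stepping of this era's ObjectFile API:

// Sketch only: print each symbol's raw table value; error handling abbreviated.
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
using namespace llvm::object;

static void dumpSymbolValues(ObjectFile *Obj) {
  error_code EC;
  for (symbol_iterator I = Obj->begin_symbols(), E = Obj->end_symbols();
       I != E; I.increment(EC)) {
    if (EC)
      break;                          // iteration failed
    StringRef Name;
    uint64_t Value;
    if (I->getName(Name) || I->getValue(Value))
      continue;                       // skip symbols we cannot read
    outs() << Name << " = " << Value << "\n";
  }
}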
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
new file mode 100644
index 000000000000..7668bdedb7bb
--- /dev/null
+++ b/include/llvm/Object/RelocVisitor.h
@@ -0,0 +1,131 @@
+//===-- RelocVisitor.h - Visitor for object file relocations -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides a wrapper around all the different types of relocations
+// in different file formats, such that a client can handle them in a unified
+// manner by only implementing a minimal number of functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LLVM_OBJECT_RELOCVISITOR
+#define _LLVM_OBJECT_RELOCVISITOR
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+namespace object {
+
+struct RelocToApply {
+ // The computed value after applying the relevant relocations.
+ int64_t Value;
+
+ // The width of the value; how many bytes to touch when applying the
+ // relocation.
+ char Width;
+ RelocToApply(const RelocToApply &In) : Value(In.Value), Width(In.Width) {}
+ RelocToApply(int64_t Value, char Width) : Value(Value), Width(Width) {}
+ RelocToApply() : Value(0), Width(0) {}
+};
+
+/// @brief Base class for object file relocation visitors.
+class RelocVisitor {
+public:
+ explicit RelocVisitor(llvm::StringRef FileFormat)
+ : FileFormat(FileFormat), HasError(false) {}
+
+ // TODO: Should handle multiple applied relocations via either passing in the
+ // previously computed value or just count paired relocations as a single
+ // visit.
+ RelocToApply visit(uint32_t RelocType, RelocationRef R, uint64_t SecAddr = 0,
+ uint64_t Value = 0) {
+ if (FileFormat == "ELF64-x86-64") {
+ switch (RelocType) {
+ case llvm::ELF::R_X86_64_NONE:
+ return visitELF_X86_64_NONE(R);
+ case llvm::ELF::R_X86_64_64:
+ return visitELF_X86_64_64(R, Value);
+ case llvm::ELF::R_X86_64_PC32:
+ return visitELF_X86_64_PC32(R, Value, SecAddr);
+ case llvm::ELF::R_X86_64_32:
+ return visitELF_X86_64_32(R, Value);
+ case llvm::ELF::R_X86_64_32S:
+ return visitELF_X86_64_32S(R, Value);
+ default:
+ HasError = true;
+ return RelocToApply();
+ }
+ }
+ return RelocToApply();
+ }
+
+ bool error() { return HasError; }
+
+private:
+ llvm::StringRef FileFormat;
+ bool HasError;
+
+ /// Operations
+
+ // Width is the width in bytes of the extend.
+ RelocToApply zeroExtend(RelocToApply r, char Width) {
+ if (Width == r.Width)
+ return r;
+ r.Value &= (1 << ((Width * 8))) - 1;
+ return r;
+ }
+ RelocToApply signExtend(RelocToApply r, char Width) {
+ if (Width == r.Width)
+ return r;
+ bool SignBit = r.Value & (1 << ((Width * 8) - 1));
+ if (SignBit) {
+ r.Value |= ~((1 << (Width * 8)) - 1);
+ } else {
+ r.Value &= (1 << (Width * 8)) - 1;
+ }
+ return r;
+ }
+
+ /// X86-64 ELF
+ RelocToApply visitELF_X86_64_NONE(RelocationRef R) {
+ return RelocToApply(0, 0);
+ }
+ RelocToApply visitELF_X86_64_64(RelocationRef R, uint64_t Value) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ return RelocToApply(Value + Addend, 8);
+ }
+ RelocToApply visitELF_X86_64_PC32(RelocationRef R, uint64_t Value,
+ uint64_t SecAddr) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ uint64_t Address;
+ R.getAddress(Address);
+ return RelocToApply(Value + Addend - Address, 4);
+ }
+ RelocToApply visitELF_X86_64_32(RelocationRef R, uint64_t Value) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
+ return RelocToApply(Res, 4);
+ }
+ RelocToApply visitELF_X86_64_32S(RelocationRef R, uint64_t Value) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ int32_t Res = (Value + Addend) & 0xFFFFFFFF;
+ return RelocToApply(Res, 4);
+ }
+};
+
+}
+}
+#endif
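The visitor is intentionally thin: the caller extracts the relocation type, symbol value, and section address itself and only asks for the value/width pair to apply. A hedged sketch using just the constructor, visit(), and error() defined above:

// Sketch: compute one ELF64-x86-64 relocation; the inputs are assumed to have
// been pulled out of the object file by the caller.
#include "llvm/Object/RelocVisitor.h"
using namespace llvm::object;

static bool computeReloc(uint32_t RelocType, RelocationRef R,
                         uint64_t SymValue, uint64_t SecAddr,
                         RelocToApply &Out) {
  RelocVisitor V("ELF64-x86-64");
  Out = V.visit(RelocType, R, SecAddr, SymValue);
  return !V.error();                  // false if the type was unsupported
}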
diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h
index 1e86980cf303..b326c1135206 100644
--- a/include/llvm/Operator.h
+++ b/include/llvm/Operator.h
@@ -16,6 +16,7 @@
#define LLVM_OPERATOR_H
#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
#include "llvm/Instruction.h"
#include "llvm/Type.h"
@@ -32,9 +33,14 @@ class Operator : public User {
private:
// Do not implement any of these. The Operator class is intended to be used
// as a utility, and is never itself instantiated.
- void *operator new(size_t, unsigned);
- void *operator new(size_t s);
- Operator();
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ void *operator new(size_t s) LLVM_DELETED_FUNCTION;
+ Operator() LLVM_DELETED_FUNCTION;
+
+protected:
+ // NOTE: Cannot use LLVM_DELETED_FUNCTION because it's not legal to delete
+ // an overridden method that's not deleted in the base class. Cannot leave
+ // this unimplemented because that leads to an ODR-violation.
~Operator();
public:
@@ -57,7 +63,6 @@ public:
return Instruction::UserOp1;
}
- static inline bool classof(const Operator *) { return true; }
static inline bool classof(const Instruction *) { return true; }
static inline bool classof(const ConstantExpr *) { return true; }
static inline bool classof(const Value *V) {
@@ -77,8 +82,6 @@ public:
};
private:
- ~OverflowingBinaryOperator(); // do not implement
-
friend class BinaryOperator;
friend class ConstantExpr;
void setHasNoUnsignedWrap(bool B) {
@@ -103,7 +106,6 @@ public:
return (SubclassOptionalData & NoSignedWrap) != 0;
}
- static inline bool classof(const OverflowingBinaryOperator *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::Add ||
I->getOpcode() == Instruction::Sub ||
@@ -131,8 +133,6 @@ public:
};
private:
- ~PossiblyExactOperator(); // do not implement
-
friend class BinaryOperator;
friend class ConstantExpr;
void setIsExact(bool B) {
@@ -167,9 +167,6 @@ public:
/// FPMathOperator - Utility class for floating point operations which can have
/// information about relaxed accuracy requirements attached to them.
class FPMathOperator : public Operator {
-private:
- ~FPMathOperator(); // do not implement
-
public:
/// \brief Get the maximum error permitted by this operation in ULPs. An
@@ -177,7 +174,6 @@ public:
/// default precision.
float getFPAccuracy() const;
- static inline bool classof(const FPMathOperator *) { return true; }
static inline bool classof(const Instruction *I) {
return I->getType()->isFPOrFPVectorTy();
}
@@ -191,11 +187,7 @@ public:
/// opcodes.
template<typename SuperClass, unsigned Opc>
class ConcreteOperator : public SuperClass {
- ~ConcreteOperator(); // DO NOT IMPLEMENT
public:
- static inline bool classof(const ConcreteOperator<SuperClass, Opc> *) {
- return true;
- }
static inline bool classof(const Instruction *I) {
return I->getOpcode() == Opc;
}
@@ -210,45 +202,35 @@ public:
class AddOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {
- ~AddOperator(); // DO NOT IMPLEMENT
};
class SubOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {
- ~SubOperator(); // DO NOT IMPLEMENT
};
class MulOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {
- ~MulOperator(); // DO NOT IMPLEMENT
};
class ShlOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
- ~ShlOperator(); // DO NOT IMPLEMENT
};
-
+
class SDivOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
- ~SDivOperator(); // DO NOT IMPLEMENT
};
class UDivOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {
- ~UDivOperator(); // DO NOT IMPLEMENT
};
class AShrOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {
- ~AShrOperator(); // DO NOT IMPLEMENT
};
class LShrOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
- ~LShrOperator(); // DO NOT IMPLEMENT
};
-
-
-
+
+
+
class GEPOperator
: public ConcreteOperator<Operator, Instruction::GetElementPtr> {
- ~GEPOperator(); // DO NOT IMPLEMENT
-
enum {
IsInBounds = (1 << 0)
};
@@ -288,6 +270,12 @@ public:
return getPointerOperand()->getType();
}
+ /// getPointerAddressSpace - Method to return the address space of the
+ /// pointer operand.
+ unsigned getPointerAddressSpace() const {
+ return cast<PointerType>(getPointerOperandType())->getAddressSpace();
+ }
+
unsigned getNumIndices() const { // Note: always non-negative
return getNumOperands() - 1;
}
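getPointerAddressSpace() spares callers the cast<PointerType> dance when all they need is the address space a GEP indexes into; a small sketch (dyn_cast is the standard LLVM RTTI helper):

// Sketch: address space of a GEP's pointer operand, 0 for non-GEP values.
#include "llvm/Operator.h"
using namespace llvm;

static unsigned gepAddressSpace(const Value *V) {
  if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
    return GEP->getPointerAddressSpace();
  return 0;                           // default address space
}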
diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h
index 888537daa425..cd651db1f1c2 100644
--- a/include/llvm/Pass.h
+++ b/include/llvm/Pass.h
@@ -29,6 +29,7 @@
#ifndef LLVM_PASS_H
#define LLVM_PASS_H
+#include "llvm/Support/Compiler.h"
#include <string>
namespace llvm {
@@ -82,8 +83,8 @@ class Pass {
AnalysisResolver *Resolver; // Used to resolve analysis
const void *PassID;
PassKind Kind;
- void operator=(const Pass&); // DO NOT IMPLEMENT
- Pass(const Pass &); // DO NOT IMPLEMENT
+ void operator=(const Pass&) LLVM_DELETED_FUNCTION;
+ Pass(const Pass &) LLVM_DELETED_FUNCTION;
public:
explicit Pass(PassKind K, char &pid) : Resolver(0), PassID(&pid), Kind(K) { }
diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h
index 5c6a2d7a92f9..d14d73b1b14f 100644
--- a/include/llvm/PassAnalysisSupport.h
+++ b/include/llvm/PassAnalysisSupport.h
@@ -120,7 +120,7 @@ public:
class PMDataManager;
class AnalysisResolver {
private:
- AnalysisResolver(); // DO NOT IMPLEMENT
+ AnalysisResolver() LLVM_DELETED_FUNCTION;
public:
explicit AnalysisResolver(PMDataManager &P) : PM(P) { }
diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h
index c50c2cc184e3..c6ad44f5f4ec 100644
--- a/include/llvm/PassSupport.h
+++ b/include/llvm/PassSupport.h
@@ -126,8 +126,8 @@ public:
}
private:
- void operator=(const PassInfo &); // do not implement
- PassInfo(const PassInfo &); // do not implement
+ void operator=(const PassInfo &) LLVM_DELETED_FUNCTION;
+ PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION;
};
#define CALL_ONCE_INITIALIZATION(function) \
diff --git a/include/llvm/Support/AlignOf.h b/include/llvm/Support/AlignOf.h
index cf7125173ee1..d6b0ab8b3750 100644
--- a/include/llvm/Support/AlignOf.h
+++ b/include/llvm/Support/AlignOf.h
@@ -68,24 +68,20 @@ inline unsigned alignOf() { return AlignOf<T>::Alignment; }
/// integer literal can be used to specify an alignment constraint. Once built
/// up here, we can then begin to indirect between these using normal C++
/// template parameters.
-template <size_t Alignment> struct AlignedCharArrayImpl {};
-template <> struct AlignedCharArrayImpl<0> {
- typedef char type;
-};
+template <size_t Alignment> struct AlignedCharArrayImpl;
+
+// MSVC requires special handling here.
+#ifndef _MSC_VER
+
#if __has_feature(cxx_alignas)
#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
template <> struct AlignedCharArrayImpl<x> { \
- typedef char alignas(x) type; \
+ char alignas(x) aligned; \
}
-#elif defined(__clang__) || defined(__GNUC__)
+#elif defined(__GNUC__) || defined(__IBM_ATTRIBUTES)
#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
template <> struct AlignedCharArrayImpl<x> { \
- typedef char type __attribute__((aligned(x))); \
- }
-#elif defined(_MSC_VER)
-#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
- template <> struct AlignedCharArrayImpl<x> { \
- typedef __declspec(align(x)) char type; \
+ char aligned __attribute__((aligned(x))); \
}
#else
# error No supported align as directive.
@@ -104,9 +100,38 @@ LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(1024);
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2048);
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4096);
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8192);
+
+#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+
+#else // _MSC_VER
+
+// We provide special variations of this template for the most common
+// alignments because __declspec(align(...)) doesn't actually work when it is
+// a member of a by-value function argument in MSVC, even if the alignment
+// request is something reasonably like 8-byte or 16-byte.
+template <> struct AlignedCharArrayImpl<1> { char aligned; };
+template <> struct AlignedCharArrayImpl<2> { short aligned; };
+template <> struct AlignedCharArrayImpl<4> { int aligned; };
+template <> struct AlignedCharArrayImpl<8> { double aligned; };
+
+#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
+ template <> struct AlignedCharArrayImpl<x> { \
+ __declspec(align(x)) char aligned; \
+ }
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(64);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(128);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(512);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(1024);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(2048);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(4096);
+LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8192);
// Any larger and MSVC complains.
#undef LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT
+#endif // _MSC_VER
+
/// \brief This union template exposes a suitably aligned and sized character
/// array member which can hold elements of any of up to four types.
///
@@ -134,17 +159,11 @@ public:
/// constrain the layout of this character array.
char buffer[sizeof(SizerImpl)];
- // Sadly, Clang and GCC both fail to align a character array properly even
- // with an explicit alignment attribute. To work around this, we union
- // the character array that will actually be used with a struct that contains
- // a single aligned character member. Tests seem to indicate that both Clang
- // and GCC will properly register the alignment of a struct containing an
- // aligned member, and this alignment should carry over to the character
- // array in the union.
- struct {
- typename llvm::AlignedCharArrayImpl<AlignOf<AlignerImpl>::Alignment>::type
- nonce_inner_member;
- } nonce_member;
+private:
+ // Tests seem to indicate that both Clang and GCC will properly register the
+ // alignment of a struct containing an aligned member, and this alignment
+ // should carry over to the character array in the union.
+ llvm::AlignedCharArrayImpl<AlignOf<AlignerImpl>::Alignment> nonce_member;
};
} // end namespace llvm
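Whichever branch is taken, each specialization ends up with a single 'aligned' member carrying the requested alignment, which is what the union trick below relies on. A tiny check, assuming only alignOf<>() and AlignedCharArrayImpl<> from this header:

// Sketch: the specializations above guarantee at least the requested alignment.
#include "llvm/Support/AlignOf.h"
#include <cassert>

static void checkAlignedCharArrayImpl() {
  assert(llvm::alignOf<llvm::AlignedCharArrayImpl<16> >() >= 16);
  assert(llvm::alignOf<llvm::AlignedCharArrayImpl<64> >() >= 64);
}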
diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index a2ad24ffead9..a644b133660f 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h
@@ -79,8 +79,8 @@ class MallocSlabAllocator : public SlabAllocator {
public:
MallocSlabAllocator() : Allocator() { }
virtual ~MallocSlabAllocator();
- virtual MemSlab *Allocate(size_t Size);
- virtual void Deallocate(MemSlab *Slab);
+ virtual MemSlab *Allocate(size_t Size) LLVM_OVERRIDE;
+ virtual void Deallocate(MemSlab *Slab) LLVM_OVERRIDE;
};
/// BumpPtrAllocator - This allocator is useful for containers that need
@@ -88,8 +88,8 @@ public:
/// allocating memory, and never deletes it until the entire block is dead. This
/// makes allocation speedy, but must only be used when the trade-off is ok.
class BumpPtrAllocator {
- BumpPtrAllocator(const BumpPtrAllocator &); // do not implement
- void operator=(const BumpPtrAllocator &); // do not implement
+ BumpPtrAllocator(const BumpPtrAllocator &) LLVM_DELETED_FUNCTION;
+ void operator=(const BumpPtrAllocator &) LLVM_DELETED_FUNCTION;
/// SlabSize - Allocate data into slabs of this size unless we get an
/// allocation above SizeThreshold.
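The usual BumpPtrAllocator pattern the comment describes is placement-new into bump-allocated memory that is only reclaimed when the allocator itself dies; a sketch assuming the Allocate(Size, Alignment) member this header exposes:

// Sketch: bump-allocate a node; everything is freed when Alloc is destroyed.
#include "llvm/Support/AlignOf.h"
#include "llvm/Support/Allocator.h"
#include <new>

struct Node { int Key; Node *Next; };

static Node *makeNode(llvm::BumpPtrAllocator &Alloc, int Key) {
  void *Mem = Alloc.Allocate(sizeof(Node), llvm::alignOf<Node>());
  Node *N = new (Mem) Node();
  N->Key = Key;
  N->Next = 0;
  return N;
}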
diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h
index c23bb6a97d2e..ad8d6d41fc4a 100644
--- a/include/llvm/Support/CallSite.h
+++ b/include/llvm/Support/CallSite.h
@@ -81,7 +81,7 @@ public:
InstrTy *operator->() const { return I.getPointer(); }
operator bool() const { return I.getPointer(); }
- /// getCalledValue - Return the pointer to function that is being called...
+ /// getCalledValue - Return the pointer to function that is being called.
///
ValTy *getCalledValue() const {
assert(getInstruction() && "Not a call or invoke instruction!");
@@ -95,7 +95,7 @@ public:
return dyn_cast<FunTy>(getCalledValue());
}
- /// setCalledFunction - Set the callee to the specified value...
+ /// setCalledFunction - Set the callee to the specified value.
///
void setCalledFunction(Value *V) {
assert(getInstruction() && "Not a call or invoke instruction!");
@@ -130,7 +130,7 @@ public:
}
/// arg_iterator - The type of iterator to use when looping over actual
- /// arguments at this call site...
+ /// arguments at this call site.
typedef IterTy arg_iterator;
/// arg_begin/arg_end - Return iterators corresponding to the actual argument
@@ -185,13 +185,13 @@ public:
}
/// \brief Return true if this function has the given attribute.
- bool hasFnAttr(Attributes N) const {
- CALLSITE_DELEGATE_GETTER(hasFnAttr(N));
+ bool hasFnAttr(Attributes::AttrVal A) const {
+ CALLSITE_DELEGATE_GETTER(hasFnAttr(A));
}
- /// paramHasAttr - whether the call or the callee has the given attribute.
- bool paramHasAttr(uint16_t i, Attributes attr) const {
- CALLSITE_DELEGATE_GETTER(paramHasAttr(i, attr));
+ /// \brief Return true if the call or the callee has the given attribute.
+ bool paramHasAttr(unsigned i, Attributes::AttrVal A) const {
+ CALLSITE_DELEGATE_GETTER(paramHasAttr(i, A));
}
/// @brief Extract the alignment for a call or parameter (0=unknown).
@@ -211,32 +211,32 @@ public:
bool doesNotAccessMemory() const {
CALLSITE_DELEGATE_GETTER(doesNotAccessMemory());
}
- void setDoesNotAccessMemory(bool doesNotAccessMemory = true) {
- CALLSITE_DELEGATE_SETTER(setDoesNotAccessMemory(doesNotAccessMemory));
+ void setDoesNotAccessMemory() {
+ CALLSITE_DELEGATE_SETTER(setDoesNotAccessMemory());
}
/// @brief Determine if the call does not access or only reads memory.
bool onlyReadsMemory() const {
CALLSITE_DELEGATE_GETTER(onlyReadsMemory());
}
- void setOnlyReadsMemory(bool onlyReadsMemory = true) {
- CALLSITE_DELEGATE_SETTER(setOnlyReadsMemory(onlyReadsMemory));
+ void setOnlyReadsMemory() {
+ CALLSITE_DELEGATE_SETTER(setOnlyReadsMemory());
}
/// @brief Determine if the call cannot return.
bool doesNotReturn() const {
CALLSITE_DELEGATE_GETTER(doesNotReturn());
}
- void setDoesNotReturn(bool doesNotReturn = true) {
- CALLSITE_DELEGATE_SETTER(setDoesNotReturn(doesNotReturn));
+ void setDoesNotReturn() {
+ CALLSITE_DELEGATE_SETTER(setDoesNotReturn());
}
/// @brief Determine if the call cannot unwind.
bool doesNotThrow() const {
CALLSITE_DELEGATE_GETTER(doesNotThrow());
}
- void setDoesNotThrow(bool doesNotThrow = true) {
- CALLSITE_DELEGATE_SETTER(setDoesNotThrow(doesNotThrow));
+ void setDoesNotThrow() {
+ CALLSITE_DELEGATE_SETTER(setDoesNotThrow());
}
#undef CALLSITE_DELEGATE_GETTER
@@ -244,12 +244,12 @@ public:
/// @brief Determine whether this argument is not captured.
bool doesNotCapture(unsigned ArgNo) const {
- return paramHasAttr(ArgNo + 1, Attribute::NoCapture);
+ return paramHasAttr(ArgNo + 1, Attributes::NoCapture);
}
/// @brief Determine whether this argument is passed by value.
bool isByValArgument(unsigned ArgNo) const {
- return paramHasAttr(ArgNo + 1, Attribute::ByVal);
+ return paramHasAttr(ArgNo + 1, Attributes::ByVal);
}
/// hasArgument - Returns true if this CallSite passes the given Value* as an
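With the move to Attributes::AttrVal, per-argument queries through a call site read as below; doesNotCapture() and isByValArgument() are the convenience wrappers defined just above (argument numbering is 0-based, attribute slots are 1-based):

// Sketch: query argument attributes through a CallSite.
#include "llvm/Support/CallSite.h"
using namespace llvm;

static bool firstArgIsNoCaptureValue(Instruction *I) {
  CallSite CS(I);
  if (!CS)
    return false;                     // not a call or invoke
  return CS.doesNotCapture(0) && !CS.isByValArgument(0);
}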
diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h
index 3aab4367f5bb..0c71882a77b1 100644
--- a/include/llvm/Support/Casting.h
+++ b/include/llvm/Support/Casting.h
@@ -15,6 +15,7 @@
#ifndef LLVM_SUPPORT_CASTING_H
#define LLVM_SUPPORT_CASTING_H
+#include "llvm/Support/type_traits.h"
#include <cassert>
namespace llvm {
@@ -44,13 +45,23 @@ template<typename From> struct simplify_type<const From> {
// The core of the implementation of isa<X> is here; To and From should be
// the names of classes. This template can be specialized to customize the
// implementation of isa<> without rewriting it from scratch.
-template <typename To, typename From>
+template <typename To, typename From, typename Enabler = void>
struct isa_impl {
static inline bool doit(const From &Val) {
return To::classof(&Val);
}
};
+/// \brief Always allow upcasts, and perform no dynamic check for them.
+template <typename To, typename From>
+struct isa_impl<To, From,
+ typename llvm::enable_if_c<
+ llvm::is_base_of<To, From>::value
+ >::type
+ > {
+ static inline bool doit(const From &) { return true; }
+};
+
template <typename To, typename From> struct isa_impl_cl {
static inline bool doit(const From &Val) {
return isa_impl<To, From>::doit(Val);
@@ -65,18 +76,21 @@ template <typename To, typename From> struct isa_impl_cl<To, const From> {
template <typename To, typename From> struct isa_impl_cl<To, From*> {
static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
return isa_impl<To, From>::doit(*Val);
}
};
template <typename To, typename From> struct isa_impl_cl<To, const From*> {
static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
return isa_impl<To, From>::doit(*Val);
}
};
template <typename To, typename From> struct isa_impl_cl<To, const From*const> {
static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
return isa_impl<To, From>::doit(*Val);
}
};
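The practical effect of the new specialization is that an upcast query compiles down to 'return true' (plus the new null assert) instead of calling classof; for example:

// Sketch: isa<> on a statically-known upcast performs no dynamic check.
#include "llvm/Instruction.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

static bool alwaysTrueUpcast(Instruction *I) {
  // Instruction derives from Value, so the is_base_of specialization fires.
  return isa<Value>(I);               // true; asserts if I is null
}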
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index ae1570da9c42..872c57998c4e 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -41,16 +41,14 @@ namespace cl {
// ParseCommandLineOptions - Command line option processing entry point.
//
void ParseCommandLineOptions(int argc, const char * const *argv,
- const char *Overview = 0,
- bool ReadResponseFiles = false);
+ const char *Overview = 0);
//===----------------------------------------------------------------------===//
// ParseEnvironmentOptions - Environment variable option processing alternate
// entry point.
//
void ParseEnvironmentOptions(const char *progName, const char *envvar,
- const char *Overview = 0,
- bool ReadResponseFiles = false);
+ const char *Overview = 0);
///===---------------------------------------------------------------------===//
/// SetVersionPrinter - Override the default (LLVM specific) version printer
@@ -1509,7 +1507,7 @@ class bits : public Option, public bits_storage<DataType, Storage> {
typename ParserClass::parser_data_type();
if (Parser.parse(*this, ArgName, Arg, Val))
return true; // Parse Error!
- addValue(Val);
+ this->addValue(Val);
setPosition(pos);
Positions.push_back(pos);
return false;
@@ -1608,15 +1606,16 @@ public:
class alias : public Option {
Option *AliasFor;
virtual bool handleOccurrence(unsigned pos, StringRef /*ArgName*/,
- StringRef Arg) {
+ StringRef Arg) LLVM_OVERRIDE {
return AliasFor->handleOccurrence(pos, AliasFor->ArgStr, Arg);
}
// Handle printing stuff...
- virtual size_t getOptionWidth() const;
- virtual void printOptionInfo(size_t GlobalWidth) const;
+ virtual size_t getOptionWidth() const LLVM_OVERRIDE;
+ virtual void printOptionInfo(size_t GlobalWidth) const LLVM_OVERRIDE;
// Aliases do not need to print their values.
- virtual void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const {}
+ virtual void printOptionValue(size_t /*GlobalWidth*/,
+ bool /*Force*/) const LLVM_OVERRIDE {}
void done() {
if (!hasArgStr())
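With the ReadResponseFiles flag gone from both entry points, callers pass at most an overview string; a sketch of the common pattern (option names here are illustrative):

// Sketch: declare an option and parse the command line after this change.
#include "llvm/Support/CommandLine.h"
using namespace llvm;

static cl::opt<std::string>
InputFilename(cl::Positional, cl::desc("<input file>"), cl::init("-"));

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv, "example tool\n");
  return 0;
}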
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index 4469ae31de09..7ceeb3212119 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -24,7 +24,7 @@
/// does not imply the existence of any other C++ library features.
#if (__has_feature(cxx_rvalue_references) \
|| defined(__GXX_EXPERIMENTAL_CXX0X__) \
- || _MSC_VER >= 1600)
+ || (defined(_MSC_VER) && _MSC_VER >= 1600))
#define LLVM_USE_RVALUE_REFERENCES 1
#else
#define LLVM_USE_RVALUE_REFERENCES 0
@@ -40,7 +40,7 @@
/// LLVM_DELETED_FUNCTION - Expands to = delete if the compiler supports it.
/// Use to mark functions as uncallable. Member functions with this should
-/// be declared private so that some behaivor is kept in C++03 mode.
+/// be declared private so that some behavior is kept in C++03 mode.
///
/// class DontCopy {
/// private:
@@ -57,6 +57,22 @@
#define LLVM_DELETED_FUNCTION
#endif
+/// LLVM_FINAL - Expands to 'final' if the compiler supports it.
+/// Use to mark classes or virtual methods as final.
+#if (__has_feature(cxx_override_control))
+#define LLVM_FINAL final
+#else
+#define LLVM_FINAL
+#endif
+
+/// LLVM_OVERRIDE - Expands to 'override' if the compiler supports it.
+/// Use to mark virtual methods as overriding a base class method.
+#if (__has_feature(cxx_override_control))
+#define LLVM_OVERRIDE override
+#else
+#define LLVM_OVERRIDE
+#endif
+
/// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked
/// into a shared library, then the class should be private to the library and
/// not accessible from outside it. Can also be used to mark variables and
@@ -106,9 +122,11 @@
#endif
#if (__GNUC__ >= 4)
-#define BUILTIN_EXPECT(EXPR, VALUE) __builtin_expect((EXPR), (VALUE))
+#define LLVM_LIKELY(EXPR) __builtin_expect((bool)(EXPR), true)
+#define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
#else
-#define BUILTIN_EXPECT(EXPR, VALUE) (EXPR)
+#define LLVM_LIKELY(EXPR) (EXPR)
+#define LLVM_UNLIKELY(EXPR) (EXPR)
#endif
@@ -187,4 +205,13 @@
# define LLVM_BUILTIN_UNREACHABLE __builtin_unreachable()
#endif
+// LLVM_BUILTIN_TRAP - On compilers which support it, expands to an expression
+// which causes the program to exit abnormally.
+#if defined(__clang__) || (__GNUC__ > 4) \
+ || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)
+# define LLVM_BUILTIN_TRAP __builtin_trap()
+#else
+# define LLVM_BUILTIN_TRAP *(volatile int*)0x11 = 0
+#endif
+
#endif
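The new macros degrade to nothing on compilers without override control, so they can be sprinkled freely; a sketch combining LLVM_FINAL, LLVM_OVERRIDE, and the LLVM_UNLIKELY replacement for BUILTIN_EXPECT:

// Sketch: portability macros from Compiler.h in use.
#include "llvm/Support/Compiler.h"

struct Shape {
  virtual ~Shape() {}
  virtual int sides() const = 0;
};

struct Triangle LLVM_FINAL : Shape {
  virtual int sides() const LLVM_OVERRIDE { return 3; }
};

static int checkedSides(const Shape *S) {
  if (LLVM_UNLIKELY(S == 0))
    return -1;                        // rare error path
  return S->sides();
}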
diff --git a/include/llvm/Support/DataExtractor.h b/include/llvm/Support/DataExtractor.h
index 506ec96930d9..a3ae78204074 100644
--- a/include/llvm/Support/DataExtractor.h
+++ b/include/llvm/Support/DataExtractor.h
@@ -10,6 +10,7 @@
#ifndef LLVM_SUPPORT_DATAEXTRACTOR_H
#define LLVM_SUPPORT_DATAEXTRACTOR_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
@@ -99,8 +100,8 @@ public:
/// enough bytes to extract this value, the offset will be left
/// unmodified.
///
- /// @param[in] byte_size
- /// The size in byte of the integer to extract.
+ /// @param[in] size
+ /// The size in bytes of the integer to extract.
///
/// @return
/// The sign extended signed integer value that was extracted,
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index f7ae60fef74b..2cd267116cab 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -441,6 +441,7 @@ enum {
R_MICROBLAZE_COPY = 21
};
+// ELF Relocation types for PPC32
enum {
R_PPC_NONE = 0, /* No relocation. */
R_PPC_ADDR32 = 1,
@@ -456,7 +457,23 @@ enum {
R_PPC_REL14 = 11,
R_PPC_REL14_BRTAKEN = 12,
R_PPC_REL14_BRNTAKEN = 13,
- R_PPC_REL32 = 26
+ R_PPC_REL32 = 26,
+ R_PPC_TPREL16_LO = 70,
+ R_PPC_TPREL16_HA = 72
+};
+
+// ELF Relocation types for PPC64
+enum {
+ R_PPC64_ADDR16_LO = 4,
+ R_PPC64_ADDR16_HI = 5,
+ R_PPC64_ADDR14 = 7,
+ R_PPC64_REL24 = 10,
+ R_PPC64_ADDR64 = 38,
+ R_PPC64_ADDR16_HIGHER = 39,
+ R_PPC64_ADDR16_HIGHEST = 41,
+ R_PPC64_TOC16 = 47,
+ R_PPC64_TOC = 51,
+ R_PPC64_TOC16_DS = 63
};
// ARM Specific e_flags
@@ -674,8 +691,36 @@ enum {
R_MIPS_NUM = 218
};
+// Hexagon Specific e_flags
+// Release 5 ABI
+enum {
+ // Object processor version flags, bits[3:0]
+ EF_HEXAGON_MACH_V2 = 0x00000001, // Hexagon V2
+ EF_HEXAGON_MACH_V3 = 0x00000002, // Hexagon V3
+ EF_HEXAGON_MACH_V4 = 0x00000003, // Hexagon V4
+ EF_HEXAGON_MACH_V5 = 0x00000004, // Hexagon V5
+
+ // Highest ISA version flags
+ EF_HEXAGON_ISA_MACH = 0x00000000, // Same as specified in bits[3:0]
+ // of e_flags
+ EF_HEXAGON_ISA_V2 = 0x00000010, // Hexagon V2 ISA
+ EF_HEXAGON_ISA_V3 = 0x00000020, // Hexagon V3 ISA
+ EF_HEXAGON_ISA_V4 = 0x00000030, // Hexagon V4 ISA
+ EF_HEXAGON_ISA_V5 = 0x00000040 // Hexagon V5 ISA
+};
+
+// Hexagon specific Section indexes for common small data
+// Release 5 ABI
+enum {
+ SHN_HEXAGON_SCOMMON = 0xff00, // Other access sizes
+ SHN_HEXAGON_SCOMMON_1 = 0xff01, // Byte-sized access
+ SHN_HEXAGON_SCOMMON_2 = 0xff02, // Half-word-sized access
+ SHN_HEXAGON_SCOMMON_4 = 0xff03, // Word-sized access
+ SHN_HEXAGON_SCOMMON_8 = 0xff04 // Double-word-size access
+};
+
// ELF Relocation types for Hexagon
-// Release 5 ABI - Document: 80-V9418-3 Rev. J
+// Release 5 ABI
enum {
R_HEX_NONE = 0,
R_HEX_B22_PCREL = 1,
@@ -1103,6 +1148,9 @@ enum {
PT_PHDR = 6, // The program header table itself.
PT_TLS = 7, // The thread-local storage template.
PT_LOOS = 0x60000000, // Lowest operating system-specific pt entry type.
+ PT_HIOS = 0x6fffffff, // Highest operating system-specific pt entry type.
+ PT_LOPROC = 0x70000000, // Lowest processor-specific program hdr entry type.
+ PT_HIPROC = 0x7fffffff, // Highest processor-specific program hdr entry type.
// x86-64 program header types.
// These all contain stack unwind tables.
@@ -1113,9 +1161,11 @@ enum {
PT_GNU_STACK = 0x6474e551, // Indicates stack executability.
PT_GNU_RELRO = 0x6474e552, // Read-only after relocation.
- PT_HIOS = 0x6fffffff, // Highest operating system-specific pt entry type.
- PT_LOPROC = 0x70000000, // Lowest processor-specific program hdr entry type.
- PT_HIPROC = 0x7fffffff // Highest processor-specific program hdr entry type.
+ // ARM program header types.
+ PT_ARM_ARCHEXT = 0x70000000, // Platform architecture compatibility information
+ // These all contain stack unwind tables.
+ PT_ARM_EXIDX = 0x70000001,
+ PT_ARM_UNWIND = 0x70000001
};
// Segment flag bits.
diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
index 0f07164eb8ed..bcd35e3c1e1b 100644
--- a/include/llvm/Support/FileOutputBuffer.h
+++ b/include/llvm/Support/FileOutputBuffer.h
@@ -78,10 +78,11 @@ public:
~FileOutputBuffer();
+private:
+ FileOutputBuffer(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
+ FileOutputBuffer &operator=(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
protected:
- FileOutputBuffer(const FileOutputBuffer &); // DO NOT IMPLEMENT
- FileOutputBuffer &operator=(const FileOutputBuffer &); // DO NOT IMPLEMENT
- FileOutputBuffer(uint8_t *Start, uint8_t *End,
+ FileOutputBuffer(uint8_t *Start, uint8_t *End,
StringRef Path, StringRef TempPath);
uint8_t *BufferStart;
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index f4a9aa0e8998..b455b28b819a 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -40,7 +40,7 @@
#include <string>
#include <vector>
-#if HAVE_SYS_STAT_H
+#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
@@ -280,7 +280,7 @@ error_code create_symlink(const Twine &to, const Twine &from);
/// @brief Get the current path.
///
/// @param result Holds the current path on return.
-/// @results errc::success if the current path has been stored in result,
+/// @returns errc::success if the current path has been stored in result,
/// otherwise a platform specific error_code.
error_code current_path(SmallVectorImpl<char> &result);
@@ -289,7 +289,7 @@ error_code current_path(SmallVectorImpl<char> &result);
/// @param path Input path.
/// @param existed Set to true if \a path existed, false if it did not.
/// undefined otherwise.
-/// @results errc::success if path has been removed and existed has been
+/// @returns errc::success if path has been removed and existed has been
/// successfully set, otherwise a platform specific error_code.
error_code remove(const Twine &path, bool &existed);
@@ -298,7 +298,7 @@ error_code remove(const Twine &path, bool &existed);
///
/// @param path Input path.
/// @param num_removed Number of files removed.
-/// @results errc::success if path has been removed and num_removed has been
+/// @returns errc::success if path has been removed and num_removed has been
/// successfully set, otherwise a platform specific error_code.
error_code remove_all(const Twine &path, uint32_t &num_removed);
@@ -323,7 +323,7 @@ error_code resize_file(const Twine &path, uint64_t size);
/// @brief Does file exist?
///
/// @param status A file_status previously returned from stat.
-/// @results True if the file represented by status exists, false if it does
+/// @returns True if the file represented by status exists, false if it does
/// not.
bool exists(file_status status);
@@ -332,7 +332,7 @@ bool exists(file_status status);
/// @param path Input path.
/// @param result Set to true if the file represented by status exists, false if
/// it does not. Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code exists(const Twine &path, bool &result);
@@ -350,7 +350,7 @@ inline bool exists(const Twine &path) {
///
/// assert(status_known(A) || status_known(B));
///
-/// @results True if A and B both represent the same file system entity, false
+/// @returns True if A and B both represent the same file system entity, false
/// otherwise.
bool equivalent(file_status A, file_status B);
@@ -362,7 +362,7 @@ bool equivalent(file_status A, file_status B);
/// @param B Input path B.
/// @param result Set to true if stat(A) and stat(B) have the same device and
/// inode (or equivalent).
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code equivalent(const Twine &A, const Twine &B, bool &result);
@@ -384,7 +384,7 @@ error_code file_size(const Twine &path, uint64_t &result);
/// @brief Does status represent a directory?
///
/// @param status A file_status previously returned from status.
-/// @results status.type() == file_type::directory_file.
+/// @returns status.type() == file_type::directory_file.
bool is_directory(file_status status);
/// @brief Is path a directory?
@@ -392,14 +392,14 @@ bool is_directory(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path is a directory, false if it is not.
/// Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_directory(const Twine &path, bool &result);
/// @brief Does status represent a regular file?
///
/// @param status A file_status previously returned from status.
-/// @results status_known(status) && status.type() == file_type::regular_file.
+/// @returns status_known(status) && status.type() == file_type::regular_file.
bool is_regular_file(file_status status);
/// @brief Is path a regular file?
@@ -407,7 +407,7 @@ bool is_regular_file(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path is a regular file, false if it is not.
/// Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_regular_file(const Twine &path, bool &result);
@@ -415,7 +415,7 @@ error_code is_regular_file(const Twine &path, bool &result);
/// directory, regular file, or symlink?
///
/// @param status A file_status previously returned from status.
-/// @results exists(s) && !is_regular_file(s) && !is_directory(s) &&
+/// @returns exists(s) && !is_regular_file(s) && !is_directory(s) &&
/// !is_symlink(s)
bool is_other(file_status status);
@@ -425,14 +425,14 @@ bool is_other(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path exists, but is not a directory, regular
/// file, or a symlink, false if it does not. Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_other(const Twine &path, bool &result);
/// @brief Does status represent a symlink?
///
/// @param status A file_status previously returned from stat.
-/// @param result status.type() == symlink_file.
+/// @returns status.type() == symlink_file.
bool is_symlink(file_status status);
/// @brief Is path a symlink?
@@ -440,7 +440,7 @@ bool is_symlink(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path is a symlink, false if it is not.
/// Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_symlink(const Twine &path, bool &result);
@@ -448,28 +448,28 @@ error_code is_symlink(const Twine &path, bool &result);
///
/// @param path Input path.
/// @param result Set to the file status.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code status(const Twine &path, file_status &result);
/// @brief Modifies permission bits on a file
///
/// @param path Input path.
-/// @results errc::success if permissions have been changed, otherwise a
+/// @returns errc::success if permissions have been changed, otherwise a
/// platform specific error_code.
error_code permissions(const Twine &path, perms prms);
/// @brief Is status available?
///
-/// @param path Input path.
-/// @results True if status() != status_error.
+/// @param s Input file status.
+/// @returns True if status() != status_error.
bool status_known(file_status s);
/// @brief Is status available?
///
/// @param path Input path.
/// @param result Set to true if status() != status_error.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code status_known(const Twine &path, bool &result);
@@ -486,11 +486,11 @@ error_code status_known(const Twine &path, bool &result);
/// clang-%%-%%-%%-%%-%%.s => /tmp/clang-a0-b1-c2-d3-e4.s
///
/// @param model Name to base unique path off of.
-/// @param result_fs Set to the opened file's file descriptor.
+/// @param result_fd Set to the opened file's file descriptor.
/// @param result_path Set to the opened file's absolute path.
-/// @param makeAbsolute If true and @model is not an absolute path, a temp
+/// @param makeAbsolute If true and \a model is not an absolute path, a temp
/// directory will be prepended.
-/// @results errc::success if result_{fd,path} have been successfully set,
+/// @returns errc::success if result_{fd,path} have been successfully set,
/// otherwise a platform specific error_code.
error_code unique_file(const Twine &model, int &result_fd,
SmallVectorImpl<char> &result_path,
@@ -503,7 +503,7 @@ error_code unique_file(const Twine &model, int &result_fd,
///
/// @param path Input path.
/// @param result Set to the canonicalized version of \a path.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code canonicalize(const Twine &path, SmallVectorImpl<char> &result);
@@ -511,7 +511,7 @@ error_code canonicalize(const Twine &path, SmallVectorImpl<char> &result);
///
/// @param path Input path.
/// @param magic Byte sequence to compare \a path's first len(magic) bytes to.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code has_magic(const Twine &path, const Twine &magic, bool &result);
@@ -522,7 +522,7 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result);
/// @param result Set to the first \a len bytes in the file pointed to by
/// \a path. Or the entire file if file_size(path) < len, in which
/// case result.size() returns the size of the file.
-/// @results errc::success if result has been successfully set,
+/// @returns errc::success if result has been successfully set,
/// errc::value_too_large if len is larger than the file pointed to by
/// \a path, otherwise a platform specific error_code.
error_code get_magic(const Twine &path, uint32_t len,
@@ -535,14 +535,14 @@ file_magic identify_magic(StringRef magic);
///
/// @param path Input path.
/// @param result Set to the type of file, or LLVMFileType::Unknown_FileType.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code identify_magic(const Twine &path, file_magic &result);
/// @brief Get library paths the system linker uses.
///
/// @param result Set to the list of system library paths.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code GetSystemLibraryPaths(SmallVectorImpl<std::string> &result);
@@ -550,7 +550,7 @@ error_code GetSystemLibraryPaths(SmallVectorImpl<std::string> &result);
/// + LLVM_LIB_SEARCH_PATH + LLVM_LIBDIR.
///
/// @param result Set to the list of bitcode library paths.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code GetBitcodeLibraryPaths(SmallVectorImpl<std::string> &result);
@@ -563,7 +563,7 @@ error_code GetBitcodeLibraryPaths(SmallVectorImpl<std::string> &result);
///
/// @param short_name Library name one would give to the system linker.
/// @param result Set to the absolute path \a short_name represents.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code FindLibrary(const Twine &short_name, SmallVectorImpl<char> &result);
@@ -572,7 +572,7 @@ error_code FindLibrary(const Twine &short_name, SmallVectorImpl<char> &result);
/// @param argv0 The program name as it was spelled on the command line.
/// @param MainAddr Address of some symbol in the executable (not in a library).
/// @param result Set to the absolute path of the current executable.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code GetMainExecutable(const char *argv0, void *MainAddr,
SmallVectorImpl<char> &result);
@@ -586,9 +586,9 @@ class mapped_file_region {
public:
enum mapmode {
- readonly, //< May only access map via const_data as read only.
- readwrite, //< May access map via data and modify it. Written to path.
- priv //< May modify via data, but changes are lost on destruction.
+ readonly, ///< May only access map via const_data as read only.
+ readwrite, ///< May access map via data and modify it. Written to path.
+ priv ///< May modify via data, but changes are lost on destruction.
};
private:
@@ -596,7 +596,7 @@ private:
mapmode Mode;
uint64_t Size;
void *Mapping;
-#if LLVM_ON_WIN32
+#ifdef LLVM_ON_WIN32
int FileDescriptor;
void *FileHandle;
void *FileMappingHandle;
@@ -658,13 +658,13 @@ public:
///
/// @param path Path to file to map.
/// @param file_offset Byte offset in file where mapping should begin.
-/// @param size_t Byte length of range of the file to map.
+/// @param size Byte length of range of the file to map.
/// @param map_writable If true, the file will be mapped in r/w such
/// that changes to the mapped buffer will be flushed back
/// to the file. If false, the file will be mapped read-only
/// and the buffer will be read-only.
/// @param result Set to the start address of the mapped buffer.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
bool map_writable, void *&result);
@@ -674,7 +674,7 @@ error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
///
/// @param base Pointer to the start of the buffer.
/// @param size Byte length of the range to unmap.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code unmap_file_pages(void *base, size_t size);
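unique_file() fills in the '%' runs of the model and hands back both the open descriptor and the final path; a sketch with the corrected parameter names:

// Sketch: create a unique temporary file; error handling abbreviated.
#include "llvm/ADT/SmallString.h"
#include "llvm/Support/FileSystem.h"
using namespace llvm;

static bool makeTempFile(int &FD, SmallString<128> &Path) {
  error_code EC = sys::fs::unique_file("example-%%%%%%.tmp", FD, Path);
  return !EC;                         // EC holds the platform-specific error
}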
diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h
index 59812d98f589..aaa54e1090a6 100644
--- a/include/llvm/Support/Format.h
+++ b/include/llvm/Support/Format.h
@@ -170,31 +170,47 @@ public:
}
};
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T>
inline format_object1<T> format(const char *Fmt, const T &Val) {
return format_object1<T>(Fmt, Val);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2>
inline format_object2<T1, T2> format(const char *Fmt, const T1 &Val1,
const T2 &Val2) {
return format_object2<T1, T2>(Fmt, Val1, Val2);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2, typename T3>
inline format_object3<T1, T2, T3> format(const char *Fmt, const T1 &Val1,
const T2 &Val2, const T3 &Val3) {
return format_object3<T1, T2, T3>(Fmt, Val1, Val2, Val3);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2, typename T3, typename T4>
inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
const T2 &Val2, const T3 &Val3,
@@ -202,8 +218,12 @@ inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
return format_object4<T1, T2, T3, T4>(Fmt, Val1, Val2, Val3, Val4);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2, typename T3, typename T4, typename T5>
inline format_object5<T1, T2, T3, T4, T5> format(const char *Fmt,const T1 &Val1,
const T2 &Val2, const T3 &Val3,
diff --git a/include/llvm/Support/FormattedStream.h b/include/llvm/Support/FormattedStream.h
index 58a18851687c..21635dcfb688 100644
--- a/include/llvm/Support/FormattedStream.h
+++ b/include/llvm/Support/FormattedStream.h
@@ -55,14 +55,15 @@ namespace llvm
///
const char *Scanned;
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream,
/// not counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const {
- // This has the same effect as calling TheStream.current_pos(),
- // but that interface is private.
- return TheStream->tell() - TheStream->GetNumBytesInBuffer();
+ virtual uint64_t current_pos() const LLVM_OVERRIDE {
+ // Our current position in the stream is all the contents which have been
+ // written to the underlying stream (*not* the current position of the
+ // underlying stream).
+ return TheStream->tell();
}
/// ComputeColumn - Examine the given output buffer and figure out which
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index 19e1ce89cbd5..e552315f4558 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h
@@ -27,13 +27,15 @@ class GCOVBlock;
class GCOVLines;
class FileInfo;
-enum GCOVFormat {
- InvalidGCOV,
- GCNO_402,
- GCNO_404,
- GCDA_402,
- GCDA_404
-};
+namespace GCOV {
+ enum GCOVFormat {
+ InvalidGCOV,
+ GCNO_402,
+ GCNO_404,
+ GCDA_402,
+ GCDA_404
+ };
+} // end GCOV namespace
/// GCOVBuffer - A wrapper around MemoryBuffer to provide GCOV specific
/// read operations.
@@ -42,20 +44,20 @@ public:
GCOVBuffer(MemoryBuffer *B) : Buffer(B), Cursor(0) {}
/// readGCOVFormat - Read GCOV signature at the beginning of buffer.
- enum GCOVFormat readGCOVFormat() {
+ GCOV::GCOVFormat readGCOVFormat() {
StringRef Magic = Buffer->getBuffer().slice(0, 12);
Cursor = 12;
if (Magic == "oncg*404MVLL")
- return GCNO_404;
+ return GCOV::GCNO_404;
else if (Magic == "oncg*204MVLL")
- return GCNO_402;
+ return GCOV::GCNO_402;
else if (Magic == "adcg*404MVLL")
- return GCDA_404;
+ return GCOV::GCDA_404;
else if (Magic == "adcg*204MVLL")
- return GCDA_402;
+ return GCOV::GCDA_402;
Cursor = 0;
- return InvalidGCOV;
+ return GCOV::InvalidGCOV;
}
/// readFunctionTag - If cursor points to a function tag then increment the
@@ -128,7 +130,7 @@ public:
StringRef Str = Buffer->getBuffer().slice(Cursor, Cursor+4);
assert (Str.empty() == false && "Unexpected memory buffer end!");
Cursor += 4;
- Result = *(uint32_t *)(Str.data());
+ Result = *(const uint32_t *)(Str.data());
return Result;
}
@@ -170,7 +172,7 @@ class GCOVFunction {
public:
GCOVFunction() : Ident(0), LineNumber(0) {}
~GCOVFunction();
- bool read(GCOVBuffer &Buffer, GCOVFormat Format);
+ bool read(GCOVBuffer &Buffer, GCOV::GCOVFormat Format);
void dump();
void collectLineCounts(FileInfo &FI);
private:
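Now that the format constants are scoped, detection code spells them GCOV::...; a sketch using only readGCOVFormat() from above:

// Sketch: classify a buffer as GCOV notes (.gcno) after the namespace move.
#include "llvm/Support/GCOV.h"
#include "llvm/Support/MemoryBuffer.h"
using namespace llvm;

static bool isGCovNotesFile(MemoryBuffer *Buf) {
  GCOVBuffer GB(Buf);
  GCOV::GCOVFormat Format = GB.readGCOVFormat();
  return Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404;
}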
diff --git a/include/llvm/Support/InstVisitor.h b/include/llvm/Support/InstVisitor.h
index 109b3cff85b6..6dfb4dec0e23 100644
--- a/include/llvm/Support/InstVisitor.h
+++ b/include/llvm/Support/InstVisitor.h
@@ -209,6 +209,9 @@ public:
RetTy visitMemMoveInst(MemMoveInst &I) { DELEGATE(MemTransferInst); }
RetTy visitMemTransferInst(MemTransferInst &I) { DELEGATE(MemIntrinsic); }
RetTy visitMemIntrinsic(MemIntrinsic &I) { DELEGATE(IntrinsicInst); }
+ RetTy visitVAStartInst(VAStartInst &I) { DELEGATE(IntrinsicInst); }
+ RetTy visitVAEndInst(VAEndInst &I) { DELEGATE(IntrinsicInst); }
+ RetTy visitVACopyInst(VACopyInst &I) { DELEGATE(IntrinsicInst); }
RetTy visitIntrinsicInst(IntrinsicInst &I) { DELEGATE(CallInst); }
// Call and Invoke are slightly different as they delegate first through
@@ -262,6 +265,9 @@ private:
case Intrinsic::memcpy: DELEGATE(MemCpyInst);
case Intrinsic::memmove: DELEGATE(MemMoveInst);
case Intrinsic::memset: DELEGATE(MemSetInst);
+ case Intrinsic::vastart: DELEGATE(VAStartInst);
+ case Intrinsic::vaend: DELEGATE(VAEndInst);
+ case Intrinsic::vacopy: DELEGATE(VACopyInst);
case Intrinsic::not_intrinsic: break;
}
}
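With the extra delegations, a visitor can now hook the vararg intrinsics directly rather than sorting them out of visitIntrinsicInst; the usual CRTP pattern:

// Sketch: count va_start calls with the new visitor hook.
#include "llvm/Support/InstVisitor.h"
using namespace llvm;

struct VAStartCounter : InstVisitor<VAStartCounter> {
  unsigned Count;
  VAStartCounter() : Count(0) {}
  void visitVAStartInst(VAStartInst &I) { ++Count; }
};

// Usage: VAStartCounter C; C.visit(F);   // F is an llvm::Function&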
diff --git a/include/llvm/Support/IntegersSubset.h b/include/llvm/Support/IntegersSubset.h
index bb9e76925ed5..03039fd6459f 100644
--- a/include/llvm/Support/IntegersSubset.h
+++ b/include/llvm/Support/IntegersSubset.h
@@ -411,8 +411,8 @@ public:
unsigned getSize() const {
APInt sz(((const APInt&)getItem(0).getLow()).getBitWidth(), 0);
for (unsigned i = 0, e = getNumItems(); i != e; ++i) {
- const APInt &Low = getItem(i).getLow();
- const APInt &High = getItem(i).getHigh();
+ const APInt Low = getItem(i).getLow();
+ const APInt High = getItem(i).getHigh();
APInt S = High - Low + 1;
sz += S;
}
@@ -426,8 +426,8 @@ public:
APInt getSingleValue(unsigned idx) const {
APInt sz(((const APInt&)getItem(0).getLow()).getBitWidth(), 0);
for (unsigned i = 0, e = getNumItems(); i != e; ++i) {
- const APInt &Low = getItem(i).getLow();
- const APInt &High = getItem(i).getHigh();
+ const APInt Low = getItem(i).getLow();
+ const APInt High = getItem(i).getHigh();
APInt S = High - Low + 1;
APInt oldSz = sz;
sz += S;
diff --git a/include/llvm/Support/IntegersSubsetMapping.h b/include/llvm/Support/IntegersSubsetMapping.h
index cab18dce159b..7635d5e91221 100644
--- a/include/llvm/Support/IntegersSubsetMapping.h
+++ b/include/llvm/Support/IntegersSubsetMapping.h
@@ -42,6 +42,7 @@ public:
struct RangeEx : public RangeTy {
RangeEx() : Weight(1) {}
RangeEx(const RangeTy &R) : RangeTy(R), Weight(1) {}
+ RangeEx(const RangeTy &R, unsigned W) : RangeTy(R), Weight(W) {}
RangeEx(const IntTy &C) : RangeTy(C), Weight(1) {}
RangeEx(const IntTy &L, const IntTy &H) : RangeTy(L, H), Weight(1) {}
RangeEx(const IntTy &L, const IntTy &H, unsigned W) :
@@ -316,13 +317,13 @@ public:
Items.clear();
const IntTy *Low = &OldItems.begin()->first.getLow();
const IntTy *High = &OldItems.begin()->first.getHigh();
- unsigned Weight = 1;
+ unsigned Weight = OldItems.begin()->first.Weight;
SuccessorClass *Successor = OldItems.begin()->second;
for (CaseItemIt j = OldItems.begin(), i = j++, e = OldItems.end();
j != e; i = j++) {
if (isJoinable(i, j)) {
const IntTy *CurHigh = &j->first.getHigh();
- ++Weight;
+ Weight += j->first.Weight;
if (*CurHigh > *High)
High = CurHigh;
} else {
@@ -330,7 +331,7 @@ public:
add(R, Successor);
Low = &j->first.getLow();
High = &j->first.getHigh();
- Weight = 1;
+ Weight = j->first.Weight;
Successor = j->second;
}
}
@@ -362,10 +363,17 @@ public:
/// Adds all ranges and values from given ranges set to the current
/// mapping.
- void add(const IntegersSubsetTy &CRS, SuccessorClass *S = 0) {
+ void add(const IntegersSubsetTy &CRS, SuccessorClass *S = 0,
+ unsigned Weight = 0) {
+ unsigned ItemWeight = 1;
+ if (Weight)
+ // Weight is associated with CRS, for now we perform a division to
+ // get the weight for each item.
+ ItemWeight = Weight / CRS.getNumItems();
for (unsigned i = 0, e = CRS.getNumItems(); i < e; ++i) {
RangeTy R = CRS.getItem(i);
- add(R, S);
+ RangeEx REx(R, ItemWeight);
+ add(REx, S);
}
}
diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h
index 410edd4dc740..b52e5bc9ad33 100644
--- a/include/llvm/Support/LEB128.h
+++ b/include/llvm/Support/LEB128.h
@@ -15,7 +15,7 @@
#ifndef LLVM_SYSTEM_LEB128_H
#define LLVM_SYSTEM_LEB128_H
-#include <llvm/Support/raw_ostream.h>
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h
index e2fa8ebc56e4..8c4a760291b8 100644
--- a/include/llvm/Support/LockFileManager.h
+++ b/include/llvm/Support/LockFileManager.h
@@ -47,8 +47,8 @@ private:
Optional<std::pair<std::string, int> > Owner;
Optional<error_code> Error;
- LockFileManager(const LockFileManager &);
- LockFileManager &operator=(const LockFileManager &);
+ LockFileManager(const LockFileManager &) LLVM_DELETED_FUNCTION;
+ LockFileManager &operator=(const LockFileManager &) LLVM_DELETED_FUNCTION;
static Optional<std::pair<std::string, int> >
readLockFile(StringRef LockFileName);
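Throughout this patch, hand-written "do not implement" copy constructors and assignment operators become LLVM_DELETED_FUNCTION from Support/Compiler.h. A hedged sketch of the idiom with a stand-in macro (the real definition lives in Compiler.h and may differ in detail):

    // MY_DELETED_FUNCTION stands in for LLVM_DELETED_FUNCTION: on C++11
    // compilers such a macro typically expands to "= delete", and on older
    // compilers to nothing, leaving the member private and unimplemented.
    #if __cplusplus >= 201103L
    #define MY_DELETED_FUNCTION = delete
    #else
    #define MY_DELETED_FUNCTION
    #endif

    class NonCopyable {
      NonCopyable(const NonCopyable &) MY_DELETED_FUNCTION;
      NonCopyable &operator=(const NonCopyable &) MY_DELETED_FUNCTION;
    public:
      NonCopyable() {}
    };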
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 4005161320d6..11f9e63c9bbc 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -431,21 +431,22 @@ inline uint64_t NextPowerOf2(uint64_t A) {
return A + 1;
}
-/// RoundUpToAlignment - Returns the next integer (mod 2**64) that is
-/// greater than or equal to \arg Value and is a multiple of \arg
-/// Align. Align must be non-zero.
+/// Returns the next integer (mod 2**64) that is greater than or equal to
+/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// Examples:
-/// RoundUpToAlignment(5, 8) = 8
-/// RoundUpToAlignment(17, 8) = 24
-/// RoundUpToAlignment(~0LL, 8) = 0
+/// \code
+/// RoundUpToAlignment(5, 8) = 8
+/// RoundUpToAlignment(17, 8) = 24
+/// RoundUpToAlignment(~0LL, 8) = 0
+/// \endcode
inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) {
return ((Value + Align - 1) / Align) * Align;
}
-/// OffsetToAlignment - Return the offset to the next integer (mod 2**64) that
-/// is greater than or equal to \arg Value and is a multiple of \arg
-/// Align. Align must be non-zero.
+/// Returns the offset to the next integer (mod 2**64) that is greater than
+/// or equal to \p Value and is a multiple of \p Align. \p Align must be
+/// non-zero.
inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
return RoundUpToAlignment(Value, Align) - Value;
}
@@ -463,12 +464,24 @@ template <unsigned B> inline int32_t SignExtend32(uint32_t x) {
return int32_t(x << (32 - B)) >> (32 - B);
}
+/// \brief Sign extend number in the bottom B bits of X to a 32-bit int.
+/// Requires 0 < B <= 32.
+inline int32_t SignExtend32(uint32_t X, unsigned B) {
+ return int32_t(X << (32 - B)) >> (32 - B);
+}
+
/// SignExtend64 - Sign extend B-bit number x to 64-bit int.
/// Usage int64_t r = SignExtend64<5>(x);
template <unsigned B> inline int64_t SignExtend64(uint64_t x) {
return int64_t(x << (64 - B)) >> (64 - B);
}
+/// \brief Sign extend number in the bottom B bits of X to a 64-bit int.
+/// Requires 0 < B <= 64.
+inline int64_t SignExtend64(uint64_t X, unsigned B) {
+ return int64_t(X << (64 - B)) >> (64 - B);
+}
+
} // End llvm namespace
#endif
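The hunk above adds runtime-width SignExtend32/SignExtend64 overloads next to the existing template forms and wraps the RoundUpToAlignment examples in a \code block. A small self-contained check of the documented behaviour, restating the two helpers locally rather than pulling in the header:

    #include <cassert>
    #include <cstdint>

    // Local restatements of the declarations shown above; in-tree code would
    // simply include llvm/Support/MathExtras.h.
    inline int32_t SignExtend32(uint32_t X, unsigned B) {
      return int32_t(X << (32 - B)) >> (32 - B);
    }
    inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) {
      return ((Value + Align - 1) / Align) * Align;
    }

    int main() {
      assert(SignExtend32(0x1F, 5) == -1);   // 0x1F is -1 as a 5-bit value
      assert(RoundUpToAlignment(17, 8) == 24);
      return 0;
    }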
diff --git a/include/llvm/Support/Memory.h b/include/llvm/Support/Memory.h
index 37890e7e4af1..025eee7f9f3e 100644
--- a/include/llvm/Support/Memory.h
+++ b/include/llvm/Support/Memory.h
@@ -15,6 +15,7 @@
#define LLVM_SYSTEM_MEMORY_H
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/system_error.h"
#include <string>
namespace llvm {
@@ -43,6 +44,70 @@ namespace sys {
/// @brief An abstraction for memory operations.
class Memory {
public:
+ enum ProtectionFlags {
+ MF_READ = 0x1000000,
+ MF_WRITE = 0x2000000,
+ MF_EXEC = 0x4000000
+ };
+
+ /// This method allocates a block of memory that is suitable for loading
+ /// dynamically generated code (e.g. JIT). An attempt to allocate
+ /// \p NumBytes bytes of virtual memory is made.
+ /// \p NearBlock may point to an existing allocation in which case
+ /// an attempt is made to allocate more memory near the existing block.
+ /// The actual allocated address is not guaranteed to be near the requested
+ /// address.
+ /// \p Flags is used to set the initial protection flags for the block
+ /// of the memory.
+ /// \p EC [out] returns an object describing any error that occurs.
+ ///
+ /// This method may allocate more than the number of bytes requested. The
+ /// actual number of bytes allocated is indicated in the returned
+ /// MemoryBlock.
+ ///
+ /// The start of the allocated block must be aligned with the
+ /// system allocation granularity (64K on Windows, page size on Linux).
+ /// If the address following \p NearBlock is not so aligned, it will be
+ /// rounded up to the next allocation granularity boundary.
+ ///
+ /// \r a non-null MemoryBlock if the function was successful,
+ /// otherwise a null MemoryBlock is returned with \p EC describing the error.
+ ///
+ /// @brief Allocate mapped memory.
+ static MemoryBlock allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned Flags,
+ error_code &EC);
+
+ /// This method releases a block of memory that was allocated with the
+ /// allocateMappedMemory method. It should not be used to release any
+ /// memory block allocated any other way.
+ /// \p Block describes the memory to be released.
+ ///
+ /// \r error_success if the function was successful, or an error_code
+ /// describing the failure if an error occurred.
+ ///
+ /// @brief Release mapped memory.
+ static error_code releaseMappedMemory(MemoryBlock &Block);
+
+ /// This method sets the protection flags for a block of memory to the
+ /// state specified by \p Flags. The behavior is not specified if the
+ /// memory was not allocated using the allocateMappedMemory method.
+ /// \p Block describes the memory block to be protected.
+ /// \p Flags specifies the new protection state to be assigned to the block.
+ /// \p ErrMsg [out] returns a string describing any error that occurred.
+ ///
+ /// If \p Flags is MF_WRITE, the actual behavior varies
+ /// with the operating system (i.e. MF_READ | MF_WRITE on Windows) and the
+ /// target architecture (i.e. MF_WRITE -> MF_READ | MF_WRITE on i386).
+ ///
+ /// \r error_success if the function was successful, or an error_code
+ /// describing the failure if an error occurred.
+ ///
+ /// @brief Set memory protection state.
+ static error_code protectMappedMemory(const MemoryBlock &Block,
+ unsigned Flags);
+
/// This method allocates a block of Read/Write/Execute memory that is
/// suitable for executing dynamically generated code (e.g. JIT). An
/// attempt to allocate \p NumBytes bytes of virtual memory is made.
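The new mapped-memory API reports failures through error_code instead of a string. A hedged usage sketch of the intended allocate / protect / release sequence, using only the signatures declared above:

    #include "llvm/Support/Memory.h"
    using namespace llvm;

    bool emitSomeCode() {
      error_code EC;
      sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(
          4096, /*NearBlock=*/0,
          sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
      if (EC)
        return false;

      // ... copy freshly generated code into MB.base() here ...

      if (sys::Memory::protectMappedMemory(
              MB, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
        return false;
      if (sys::Memory::releaseMappedMemory(MB))
        return false;
      return true;
    }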
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 06816de9716a..1f02907d9f9a 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -15,6 +15,7 @@
#define LLVM_SUPPORT_MEMORYBUFFER_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -36,8 +37,8 @@ class MemoryBuffer {
const char *BufferStart; // Start of the buffer.
const char *BufferEnd; // End of the buffer.
- MemoryBuffer(const MemoryBuffer &); // DO NOT IMPLEMENT
- MemoryBuffer &operator=(const MemoryBuffer &); // DO NOT IMPLEMENT
+ MemoryBuffer(const MemoryBuffer &) LLVM_DELETED_FUNCTION;
+ MemoryBuffer &operator=(const MemoryBuffer &) LLVM_DELETED_FUNCTION;
protected:
MemoryBuffer() {}
void init(const char *BufStart, const char *BufEnd,
diff --git a/include/llvm/Support/Mutex.h b/include/llvm/Support/Mutex.h
index 42ea63060f66..6abc533d28d6 100644
--- a/include/llvm/Support/Mutex.h
+++ b/include/llvm/Support/Mutex.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SYSTEM_MUTEX_H
#define LLVM_SYSTEM_MUTEX_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Threading.h"
#include <cassert>
@@ -75,8 +76,8 @@ namespace llvm
/// @name Do Not Implement
/// @{
private:
- MutexImpl(const MutexImpl & original);
- void operator=(const MutexImpl &);
+ MutexImpl(const MutexImpl &) LLVM_DELETED_FUNCTION;
+ void operator=(const MutexImpl &) LLVM_DELETED_FUNCTION;
/// @}
};
diff --git a/include/llvm/Support/MutexGuard.h b/include/llvm/Support/MutexGuard.h
index cd13bfe6eeb0..6bb162277e2b 100644
--- a/include/llvm/Support/MutexGuard.h
+++ b/include/llvm/Support/MutexGuard.h
@@ -26,8 +26,8 @@ namespace llvm {
/// @brief Guard a section of code with a Mutex.
class MutexGuard {
sys::Mutex &M;
- MutexGuard(const MutexGuard &); // DO NOT IMPLEMENT
- void operator=(const MutexGuard &); // DO NOT IMPLEMENT
+ MutexGuard(const MutexGuard &) LLVM_DELETED_FUNCTION;
+ void operator=(const MutexGuard &) LLVM_DELETED_FUNCTION;
public:
MutexGuard(sys::Mutex &m) : M(m) { M.acquire(); }
~MutexGuard() { M.release(); }
diff --git a/include/llvm/Support/PathV1.h b/include/llvm/Support/PathV1.h
index f4bedf92c441..643ee8c6c1d0 100644
--- a/include/llvm/Support/PathV1.h
+++ b/include/llvm/Support/PathV1.h
@@ -683,8 +683,8 @@ namespace sys {
/// This function returns status information about the file. The type of
/// path (file or directory) is updated to reflect the actual contents
/// of the file system.
- /// @returns 0 on failure, with Error explaining why (if non-zero)
- /// @returns a pointer to a FileStatus structure on success.
+ /// @returns 0 on failure, with Error explaining why (if non-zero),
+ /// otherwise returns a pointer to a FileStatus structure on success.
/// @brief Get file status.
const FileStatus *getFileStatus(
bool forceUpdate = false, ///< Force an update from the file system
diff --git a/include/llvm/Support/PathV2.h b/include/llvm/Support/PathV2.h
index 8d797097a840..ae1a21c7ce58 100644
--- a/include/llvm/Support/PathV2.h
+++ b/include/llvm/Support/PathV2.h
@@ -39,13 +39,14 @@ namespace path {
/// The backwards traversal order is the reverse of forward traversal.
///
/// Iteration examples. Each component is separated by ',':
-/// / => /
-/// /foo => /,foo
-/// foo/ => foo,.
-/// /foo/bar => /,foo,bar
-/// ../ => ..,.
-/// C:\foo\bar => C:,/,foo,bar
-///
+/// @code
+/// / => /
+/// /foo => /,foo
+/// foo/ => foo,.
+/// /foo/bar => /,foo,bar
+/// ../ => ..,.
+/// C:\foo\bar => C:,/,foo,bar
+/// @endcode
class const_iterator {
StringRef Path; ///< The entire path.
StringRef Component; ///< The current component. Not necessarily in Path.
@@ -107,18 +108,22 @@ inline reverse_iterator rend(StringRef path) {
/// @brief Remove the last component from \a path unless it is the root dir.
///
-/// directory/filename.cpp => directory/
-/// directory/ => directory
-/// / => /
+/// @code
+/// directory/filename.cpp => directory/
+/// directory/ => directory
+/// / => /
+/// @endcode
///
/// @param path A path that is modified to not have a file component.
void remove_filename(SmallVectorImpl<char> &path);
/// @brief Replace the file extension of \a path with \a extension.
///
-/// ./filename.cpp => ./filename.extension
-/// ./filename => ./filename.extension
-/// ./ => ./.extension
+/// @code
+/// ./filename.cpp => ./filename.extension
+/// ./filename => ./filename.extension
+/// ./ => ./.extension
+/// @endcode
///
/// @param path A path that has its extension replaced with \a extension.
/// @param extension The extension to be added. It may be empty. It may also
@@ -128,12 +133,14 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension);
/// @brief Append to path.
///
-/// /foo + bar/f => /foo/bar/f
-/// /foo/ + bar/f => /foo/bar/f
-/// foo + bar/f => foo/bar/f
+/// @code
+/// /foo + bar/f => /foo/bar/f
+/// /foo/ + bar/f => /foo/bar/f
+/// foo + bar/f => foo/bar/f
+/// @endcode
///
/// @param path Set to \a path + \a component.
-/// @param component The component to be appended to \a path.
+/// @param a The component to be appended to \a path.
void append(SmallVectorImpl<char> &path, const Twine &a,
const Twine &b = "",
const Twine &c = "",
@@ -141,9 +148,11 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
/// @brief Append to path.
///
-/// /foo + [bar,f] => /foo/bar/f
-/// /foo/ + [bar,f] => /foo/bar/f
-/// foo + [bar,f] => foo/bar/f
+/// @code
+/// /foo + [bar,f] => /foo/bar/f
+/// /foo/ + [bar,f] => /foo/bar/f
+/// foo + [bar,f] => foo/bar/f
+/// @endcode
///
/// @param path Set to \a path + [\a begin, \a end).
/// @param begin Start of components to append.
@@ -169,9 +178,11 @@ void native(const Twine &path, SmallVectorImpl<char> &result);
/// @brief Get root name.
///
-/// //net/hello => //net
-/// c:/hello => c: (on Windows, on other platforms nothing)
-/// /hello => <empty>
+/// @code
+/// //net/hello => //net
+/// c:/hello => c: (on Windows, on other platforms nothing)
+/// /hello => <empty>
+/// @endcode
///
/// @param path Input path.
/// @result The root name of \a path if it has one, otherwise "".
@@ -179,9 +190,11 @@ const StringRef root_name(StringRef path);
/// @brief Get root directory.
///
-/// /goo/hello => /
-/// c:/hello => /
-/// d/file.txt => <empty>
+/// @code
+/// /goo/hello => /
+/// c:/hello => /
+/// d/file.txt => <empty>
+/// @endcode
///
/// @param path Input path.
/// @result The root directory of \a path if it has one, otherwise
@@ -198,9 +211,11 @@ const StringRef root_path(StringRef path);
/// @brief Get relative path.
///
-/// C:\hello\world => hello\world
-/// foo/bar => foo/bar
-/// /foo/bar => foo/bar
+/// @code
+/// C:\hello\world => hello\world
+/// foo/bar => foo/bar
+/// /foo/bar => foo/bar
+/// @endcode
///
/// @param path Input path.
/// @result The path starting after root_path if one exists, otherwise "".
@@ -208,9 +223,11 @@ const StringRef relative_path(StringRef path);
/// @brief Get parent path.
///
-/// / => <empty>
-/// /foo => /
-/// foo/../bar => foo/..
+/// @code
+/// / => <empty>
+/// /foo => /
+/// foo/../bar => foo/..
+/// @endcode
///
/// @param path Input path.
/// @result The parent path of \a path if one exists, otherwise "".
@@ -218,10 +235,12 @@ const StringRef parent_path(StringRef path);
/// @brief Get filename.
///
-/// /foo.txt => foo.txt
-/// . => .
-/// .. => ..
-/// / => /
+/// @code
+/// /foo.txt => foo.txt
+/// . => .
+/// .. => ..
+/// / => /
+/// @endcode
///
/// @param path Input path.
/// @result The filename part of \a path. This is defined as the last component
@@ -234,11 +253,13 @@ const StringRef filename(StringRef path);
/// substring of filename ending at (but not including) the last dot. Otherwise
/// it is filename.
///
-/// /foo/bar.txt => bar
-/// /foo/bar => bar
-/// /foo/.txt => <empty>
-/// /foo/. => .
-/// /foo/.. => ..
+/// @code
+/// /foo/bar.txt => bar
+/// /foo/bar => bar
+/// /foo/.txt => <empty>
+/// /foo/. => .
+/// /foo/.. => ..
+/// @endcode
///
/// @param path Input path.
/// @result The stem of \a path.
@@ -250,9 +271,11 @@ const StringRef stem(StringRef path);
/// substring of filename starting at (and including) the last dot, and ending
/// at the end of \a path. Otherwise "".
///
-/// /foo/bar.txt => .txt
-/// /foo/bar => <empty>
-/// /foo/.txt => .txt
+/// @code
+/// /foo/bar.txt => .txt
+/// /foo/bar => <empty>
+/// /foo/.txt => .txt
+/// @endcode
///
/// @param path Input path.
/// @result The extension of \a path.
@@ -272,7 +295,7 @@ bool is_separator(char value);
/// ignored if the user or system has set the typical environment variable
/// (e.g., TEMP on Windows, TMPDIR on *nix) to specify a temporary directory.
///
-/// @param Result Holds the resulting path name.
+/// @param result Holds the resulting path name.
void system_temp_directory(bool erasedOnReboot, SmallVectorImpl<char> &result);
/// @brief Has root name?
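These PathV2.h hunks only wrap the existing examples in @code/@endcode; the documented behaviour is unchanged. A quick illustration of a few of the decomposition and mutation helpers, assuming the usual llvm::sys::path namespace from this header:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/PathV2.h"
    #include <cassert>

    void pathExamples() {
      using namespace llvm::sys;

      // Mirrors the documented examples: /foo/bar.txt => "bar" / ".txt".
      assert(path::stem("/foo/bar.txt") == "bar");
      assert(path::extension("/foo/bar.txt") == ".txt");
      assert(path::parent_path("/foo") == "/");

      llvm::SmallString<64> P;
      path::append(P, "/foo", "bar", "f");   // => "/foo/bar/f"
      path::replace_extension(P, ".cpp");    // => "/foo/bar/f.cpp"
    }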
diff --git a/include/llvm/Support/PrettyStackTrace.h b/include/llvm/Support/PrettyStackTrace.h
index 9b3ecda50c1e..2122e06d53fe 100644
--- a/include/llvm/Support/PrettyStackTrace.h
+++ b/include/llvm/Support/PrettyStackTrace.h
@@ -16,6 +16,8 @@
#ifndef LLVM_SUPPORT_PRETTYSTACKTRACE_H
#define LLVM_SUPPORT_PRETTYSTACKTRACE_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class raw_ostream;
@@ -32,8 +34,8 @@ namespace llvm {
/// virtual stack trace. This gets dumped out if the program crashes.
class PrettyStackTraceEntry {
const PrettyStackTraceEntry *NextEntry;
- PrettyStackTraceEntry(const PrettyStackTraceEntry &); // DO NOT IMPLEMENT
- void operator=(const PrettyStackTraceEntry&); // DO NOT IMPLEMENT
+ PrettyStackTraceEntry(const PrettyStackTraceEntry &) LLVM_DELETED_FUNCTION;
+ void operator=(const PrettyStackTraceEntry&) LLVM_DELETED_FUNCTION;
public:
PrettyStackTraceEntry();
virtual ~PrettyStackTraceEntry();
@@ -52,7 +54,7 @@ namespace llvm {
const char *Str;
public:
PrettyStackTraceString(const char *str) : Str(str) {}
- virtual void print(raw_ostream &OS) const;
+ virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
};
/// PrettyStackTraceProgram - This object prints a specified program arguments
@@ -63,7 +65,7 @@ namespace llvm {
public:
PrettyStackTraceProgram(int argc, const char * const*argv)
: ArgC(argc), ArgV(argv) {}
- virtual void print(raw_ostream &OS) const;
+ virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
};
} // end namespace llvm
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index a85f23550ec8..7c9a95103158 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -34,8 +34,8 @@ namespace sys {
void *Data_;
// Noncopyable.
- Program(const Program& other);
- Program& operator=(const Program& other);
+ Program(const Program& other) LLVM_DELETED_FUNCTION;
+ Program& operator=(const Program& other) LLVM_DELETED_FUNCTION;
/// @name Methods
/// @{
diff --git a/include/llvm/Support/RWMutex.h b/include/llvm/Support/RWMutex.h
index 0d4cb81de397..935b3075df58 100644
--- a/include/llvm/Support/RWMutex.h
+++ b/include/llvm/Support/RWMutex.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SYSTEM_RWMUTEX_H
#define LLVM_SYSTEM_RWMUTEX_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Threading.h"
#include <cassert>
@@ -75,8 +76,8 @@ namespace llvm
/// @name Do Not Implement
/// @{
private:
- RWMutexImpl(const RWMutexImpl & original);
- void operator=(const RWMutexImpl &);
+ RWMutexImpl(const RWMutexImpl & original) LLVM_DELETED_FUNCTION;
+ void operator=(const RWMutexImpl &) LLVM_DELETED_FUNCTION;
/// @}
};
diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h
index 7648e77bfbb5..ffe09b19b68b 100644
--- a/include/llvm/Support/Regex.h
+++ b/include/llvm/Support/Regex.h
@@ -36,7 +36,7 @@ namespace llvm {
Newline=2
};
- /// Compiles the given POSIX Extended Regular Expression \arg Regex.
+ /// Compiles the given POSIX Extended Regular Expression \p Regex.
/// This implementation supports regexes and matching strings with embedded
/// NUL characters.
Regex(StringRef Regex, unsigned Flags = NoFlags);
@@ -51,17 +51,17 @@ namespace llvm {
/// many entries plus one for the whole regex (as element 0).
unsigned getNumMatches() const;
- /// matches - Match the regex against a given \arg String.
+ /// matches - Match the regex against a given \p String.
///
/// \param Matches - If given, on a successful match this will be filled in
- /// with references to the matched group expressions (inside \arg String),
+ /// with references to the matched group expressions (inside \p String),
/// the first group is always the entire pattern.
///
/// This returns true on a successful match.
bool match(StringRef String, SmallVectorImpl<StringRef> *Matches = 0);
/// sub - Return the result of replacing the first match of the regex in
- /// \arg String with the \arg Repl string. Backreferences like "\0" in the
+ /// \p String with the \p Repl string. Backreferences like "\0" in the
/// replacement string are replaced with the appropriate match substring.
///
/// Note that the replacement string has backslash escaping performed on
diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h
index d0375bedd9f2..29eafb63ca0e 100644
--- a/include/llvm/Support/Registry.h
+++ b/include/llvm/Support/Registry.h
@@ -37,7 +37,7 @@ namespace llvm {
/// is necessary to define an alternate traits class.
template <typename T>
class RegistryTraits {
- RegistryTraits(); // Do not implement.
+ RegistryTraits() LLVM_DELETED_FUNCTION;
public:
typedef SimpleRegistryEntry<T> entry;
@@ -63,7 +63,7 @@ namespace llvm {
class iterator;
private:
- Registry(); // Do not implement.
+ Registry() LLVM_DELETED_FUNCTION;
static void Announce(const entry &E) {
for (listener *Cur = ListenerHead; Cur; Cur = Cur->Next)
@@ -120,6 +120,7 @@ namespace llvm {
/// Abstract base class for registry listeners, which are informed when new
/// entries are added to the registry. Simply subclass and instantiate:
///
+ /// \code
/// class CollectorPrinter : public Registry<Collector>::listener {
/// protected:
/// void registered(const Registry<Collector>::entry &e) {
@@ -131,7 +132,7 @@ namespace llvm {
/// };
///
/// CollectorPrinter Printer;
- ///
+ /// \endcode
class listener {
listener *Prev, *Next;
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 8949a3a908fd..bcf95f2f6e66 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -64,9 +64,9 @@ private:
DiagHandlerTy DiagHandler;
void *DiagContext;
-
- SourceMgr(const SourceMgr&); // DO NOT IMPLEMENT
- void operator=(const SourceMgr&); // DO NOT IMPLEMENT
+
+ SourceMgr(const SourceMgr&) LLVM_DELETED_FUNCTION;
+ void operator=(const SourceMgr&) LLVM_DELETED_FUNCTION;
public:
SourceMgr() : LineNoCache(0), DiagHandler(0), DiagContext(0) {}
~SourceMgr();
@@ -145,7 +145,7 @@ public:
/// GetMessage - Return an SMDiagnostic at the specified location with the
/// specified string.
///
- /// @param Type - If non-null, the kind of message (e.g., "error") which is
+ /// @param Msg If non-null, the kind of message (e.g., "error") which is
/// prefixed to the message.
SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) const;
diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h
index 531dbb216d7a..a2b4bcb9aa08 100644
--- a/include/llvm/Support/StreamableMemoryObject.h
+++ b/include/llvm/Support/StreamableMemoryObject.h
@@ -12,6 +12,7 @@
#define STREAMABLEMEMORYOBJECT_H_
#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/DataStream.h"
#include <vector>
@@ -107,14 +108,15 @@ class StreamableMemoryObject : public MemoryObject {
class StreamingMemoryObject : public StreamableMemoryObject {
public:
StreamingMemoryObject(DataStreamer *streamer);
- virtual uint64_t getBase() const { return 0; }
- virtual uint64_t getExtent() const;
- virtual int readByte(uint64_t address, uint8_t* ptr) const;
+ virtual uint64_t getBase() const LLVM_OVERRIDE { return 0; }
+ virtual uint64_t getExtent() const LLVM_OVERRIDE;
+ virtual int readByte(uint64_t address, uint8_t* ptr) const LLVM_OVERRIDE;
virtual int readBytes(uint64_t address,
uint64_t size,
uint8_t* buf,
- uint64_t* copied) const ;
- virtual const uint8_t *getPointer(uint64_t address, uint64_t size) const {
+ uint64_t* copied) const LLVM_OVERRIDE;
+ virtual const uint8_t *getPointer(uint64_t address,
+ uint64_t size) const LLVM_OVERRIDE {
// This could be fixed by ensuring the bytes are fetched and making a copy,
// requiring that the bitcode size be known, or otherwise ensuring that
// the memory doesn't go away/get reallocated, but it's
@@ -122,8 +124,8 @@ public:
assert(0 && "getPointer in streaming memory objects not allowed");
return NULL;
}
- virtual bool isValidAddress(uint64_t address) const;
- virtual bool isObjectEnd(uint64_t address) const;
+ virtual bool isValidAddress(uint64_t address) const LLVM_OVERRIDE;
+ virtual bool isObjectEnd(uint64_t address) const LLVM_OVERRIDE;
/// Drop s bytes from the front of the stream, pushing the positions of the
/// remaining bytes down by s. This is used to skip past the bitcode header,
@@ -170,8 +172,8 @@ private:
return true;
}
- StreamingMemoryObject(const StreamingMemoryObject&); // DO NOT IMPLEMENT
- void operator=(const StreamingMemoryObject&); // DO NOT IMPLEMENT
+ StreamingMemoryObject(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
+ void operator=(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
};
StreamableMemoryObject *getNonStreamedMemoryObject(
diff --git a/include/llvm/Support/TargetFolder.h b/include/llvm/Support/TargetFolder.h
index c65faa66219e..45f781633656 100644
--- a/include/llvm/Support/TargetFolder.h
+++ b/include/llvm/Support/TargetFolder.h
@@ -26,11 +26,11 @@
namespace llvm {
-class TargetData;
+class DataLayout;
/// TargetFolder - Create constants with target dependent folding.
class TargetFolder {
- const TargetData *TD;
+ const DataLayout *TD;
/// Fold - Fold the constant using target specific information.
Constant *Fold(Constant *C) const {
@@ -41,7 +41,7 @@ class TargetFolder {
}
public:
- explicit TargetFolder(const TargetData *TheTD) : TD(TheTD) {}
+ explicit TargetFolder(const DataLayout *TheTD) : TD(TheTD) {}
//===--------------------------------------------------------------------===//
// Binary Operators
@@ -177,7 +177,14 @@ public:
return Fold(ConstantExpr::getIntegerCast(C, DestTy, isSigned));
}
Constant *CreatePointerCast(Constant *C, Type *DestTy) const {
- return ConstantExpr::getPointerCast(C, DestTy);
+ if (C->getType() == DestTy)
+ return C; // avoid calling Fold
+ return Fold(ConstantExpr::getPointerCast(C, DestTy));
+ }
+ Constant *CreateFPCast(Constant *C, Type *DestTy) const {
+ if (C->getType() == DestTy)
+ return C; // avoid calling Fold
+ return Fold(ConstantExpr::getFPCast(C, DestTy));
}
Constant *CreateBitCast(Constant *C, Type *DestTy) const {
return CreateCast(Instruction::BitCast, C, DestTy);
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index c0be8f130aba..ca58bfb0d73b 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -93,7 +93,9 @@ namespace llvm {
CodeGenOpt::Level OL);
typedef AsmPrinter *(*AsmPrinterCtorTy)(TargetMachine &TM,
MCStreamer &Streamer);
- typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T, StringRef TT);
+ typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T,
+ StringRef TT,
+ StringRef CPU);
typedef MCTargetAsmLexer *(*MCAsmLexerCtorTy)(const Target &T,
const MCRegisterInfo &MRI,
const MCAsmInfo &MAI);
@@ -271,7 +273,7 @@ namespace llvm {
/// createMCAsmInfo - Create a MCAsmInfo implementation for the specified
/// target triple.
///
- /// \arg Triple - This argument is used to determine the target machine
+ /// \param Triple This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
@@ -317,12 +319,12 @@ namespace llvm {
/// createMCSubtargetInfo - Create a MCSubtargetInfo implementation.
///
- /// \arg Triple - This argument is used to determine the target machine
+ /// \param Triple This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
- /// \arg CPU - This specifies the name of the target CPU.
- /// \arg Features - This specifies the string representation of the
+ /// \param CPU This specifies the name of the target CPU.
+ /// \param Features This specifies the string representation of the
/// additional target features.
MCSubtargetInfo *createMCSubtargetInfo(StringRef Triple, StringRef CPU,
StringRef Features) const {
@@ -332,9 +334,9 @@ namespace llvm {
}
/// createTargetMachine - Create a target specific machine implementation
- /// for the specified \arg Triple.
+ /// for the specified \p Triple.
///
- /// \arg Triple - This argument is used to determine the target machine
+ /// \param Triple This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
@@ -351,12 +353,11 @@ namespace llvm {
/// createMCAsmBackend - Create a target specific assembly parser.
///
- /// \arg Triple - The target triple string.
- /// \arg Backend - The target independent assembler object.
- MCAsmBackend *createMCAsmBackend(StringRef Triple) const {
+ /// \param Triple The target triple string.
+ MCAsmBackend *createMCAsmBackend(StringRef Triple, StringRef CPU) const {
if (!MCAsmBackendCtorFn)
return 0;
- return MCAsmBackendCtorFn(*this, Triple);
+ return MCAsmBackendCtorFn(*this, Triple, CPU);
}
/// createMCAsmLexer - Create a target specific assembly lexer.
@@ -370,7 +371,7 @@ namespace llvm {
/// createMCAsmParser - Create a target specific assembly parser.
///
- /// \arg Parser - The target independent parser implementation to use for
+ /// \param Parser The target independent parser implementation to use for
/// parsing and lexing.
MCTargetAsmParser *createMCAsmParser(MCSubtargetInfo &STI,
MCAsmParser &Parser) const {
@@ -416,13 +417,13 @@ namespace llvm {
/// createMCObjectStreamer - Create a target specific MCStreamer.
///
- /// \arg TT - The target triple.
- /// \arg Ctx - The target context.
- /// \arg TAB - The target assembler backend object. Takes ownership.
- /// \arg _OS - The stream object.
- /// \arg _Emitter - The target independent assembler object.Takes ownership.
- /// \arg RelaxAll - Relax all fixups?
- /// \arg NoExecStack - Mark file as not needing a executable stack.
+ /// \param TT The target triple.
+ /// \param Ctx The target context.
+ /// \param TAB The target assembler backend object. Takes ownership.
+ /// \param _OS The stream object.
+ /// \param _Emitter The target independent assembler object. Takes ownership.
+ /// \param RelaxAll Relax all fixups?
+ /// \param NoExecStack Mark file as not needing an executable stack.
MCStreamer *createMCObjectStreamer(StringRef TT, MCContext &Ctx,
MCAsmBackend &TAB,
raw_ostream &_OS,
@@ -1063,8 +1064,9 @@ namespace llvm {
}
private:
- static MCAsmBackend *Allocator(const Target &T, StringRef Triple) {
- return new MCAsmBackendImpl(T, Triple);
+ static MCAsmBackend *Allocator(const Target &T, StringRef Triple,
+ StringRef CPU) {
+ return new MCAsmBackendImpl(T, Triple, CPU);
}
};
diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h
index c0e842c2fe73..9017afb89038 100644
--- a/include/llvm/Support/Threading.h
+++ b/include/llvm/Support/Threading.h
@@ -41,8 +41,8 @@ namespace llvm {
/// before llvm_start_multithreaded().
void llvm_release_global_lock();
- /// llvm_execute_on_thread - Execute the given \arg UserFn on a separate
- /// thread, passing it the provided \arg UserData.
+ /// llvm_execute_on_thread - Execute the given \p UserFn on a separate
+ /// thread, passing it the provided \p UserData.
///
/// This function does not guarantee that the code will actually be executed
/// on a separate thread or honoring the requested stack size, but tries to do
diff --git a/include/llvm/Support/TimeValue.h b/include/llvm/Support/TimeValue.h
index 94f132a05ca7..e780b50c6039 100644
--- a/include/llvm/Support/TimeValue.h
+++ b/include/llvm/Support/TimeValue.h
@@ -153,7 +153,6 @@ namespace sys {
/// Determine if \p this is greater than or equal to \p that.
/// @returns True iff *this >= that.
- /// @brief True if this >= that.
int operator >= (const TimeValue &that) const {
if ( this->seconds_ > that.seconds_ ) {
return 1;
@@ -164,8 +163,7 @@ namespace sys {
}
/// Determines if two TimeValue objects represent the same moment in time.
- /// @brief True iff *this == that.
- /// @brief True if this == that.
+ /// @returns True iff *this == that.
int operator == (const TimeValue &that) const {
return (this->seconds_ == that.seconds_) &&
(this->nanos_ == that.nanos_);
@@ -173,8 +171,7 @@ namespace sys {
/// Determines if two TimeValue objects represent times that are not the
/// same.
- /// @return True iff *this != that.
- /// @brief True if this != that.
+ /// @returns True iff *this != that.
int operator != (const TimeValue &that) const { return !(*this == that); }
/// Adds two TimeValue objects together.
diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h
index 404cb6d6c8b6..a7418827ca32 100644
--- a/include/llvm/Support/Timer.h
+++ b/include/llvm/Support/Timer.h
@@ -15,6 +15,7 @@
#ifndef LLVM_SUPPORT_TIMER_H
#define LLVM_SUPPORT_TIMER_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/ADT/StringRef.h"
#include <cassert>
@@ -130,7 +131,7 @@ private:
///
class TimeRegion {
Timer *T;
- TimeRegion(const TimeRegion &); // DO NOT IMPLEMENT
+ TimeRegion(const TimeRegion &) LLVM_DELETED_FUNCTION;
public:
explicit TimeRegion(Timer &t) : T(&t) {
T->startTimer();
@@ -168,8 +169,8 @@ class TimerGroup {
std::vector<std::pair<TimeRecord, std::string> > TimersToPrint;
TimerGroup **Prev, *Next; // Doubly linked list of TimerGroup's.
- TimerGroup(const TimerGroup &TG); // DO NOT IMPLEMENT
- void operator=(const TimerGroup &TG); // DO NOT IMPLEMENT
+ TimerGroup(const TimerGroup &TG) LLVM_DELETED_FUNCTION;
+ void operator=(const TimerGroup &TG) LLVM_DELETED_FUNCTION;
public:
explicit TimerGroup(StringRef name);
~TimerGroup();
diff --git a/include/llvm/Support/ValueHandle.h b/include/llvm/Support/ValueHandle.h
index 61e21b86ead8..dbcf0fd11d19 100644
--- a/include/llvm/Support/ValueHandle.h
+++ b/include/llvm/Support/ValueHandle.h
@@ -59,8 +59,8 @@ private:
// pair. The 'setValPtrInt' and 'getValPtrInt' methods below give them this
// access.
PointerIntPair<Value*, 2> VP;
-
- explicit ValueHandleBase(const ValueHandleBase&); // DO NOT IMPLEMENT.
+
+ ValueHandleBase(const ValueHandleBase&) LLVM_DELETED_FUNCTION;
public:
explicit ValueHandleBase(HandleBaseKind Kind)
: PrevPair(0, Kind), Next(0), VP(0, 0) {}
diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index 98910eb7578f..12958fa173d0 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@@ -133,7 +133,6 @@ public:
virtual void skip() {}
unsigned int getType() const { return TypeID; }
- static inline bool classof(const Node *) { return true; }
void *operator new ( size_t Size
, BumpPtrAllocator &Alloc
@@ -166,7 +165,6 @@ class NullNode : public Node {
public:
NullNode(OwningPtr<Document> &D) : Node(NK_Null, D, StringRef()) {}
- static inline bool classof(const NullNode *) { return true; }
static inline bool classof(const Node *N) {
return N->getType() == NK_Null;
}
@@ -199,7 +197,6 @@ public:
/// This happens with escaped characters and multi-line literals.
StringRef getValue(SmallVectorImpl<char> &Storage) const;
- static inline bool classof(const ScalarNode *) { return true; }
static inline bool classof(const Node *N) {
return N->getType() == NK_Scalar;
}
@@ -241,12 +238,11 @@ public:
/// @returns The value, or nullptr if failed() == true.
Node *getValue();
- virtual void skip() {
+ virtual void skip() LLVM_OVERRIDE {
getKey()->skip();
getValue()->skip();
}
- static inline bool classof(const KeyValueNode *) { return true; }
static inline bool classof(const Node *N) {
return N->getType() == NK_KeyValue;
}
@@ -358,11 +354,10 @@ public:
iterator end() { return iterator(); }
- virtual void skip() {
+ virtual void skip() LLVM_OVERRIDE {
yaml::skip(*this);
}
- static inline bool classof(const MappingNode *) { return true; }
static inline bool classof(const Node *N) {
return N->getType() == NK_Mapping;
}
@@ -421,11 +416,10 @@ public:
iterator end() { return iterator(); }
- virtual void skip() {
+ virtual void skip() LLVM_OVERRIDE {
yaml::skip(*this);
}
- static inline bool classof(const SequenceNode *) { return true; }
static inline bool classof(const Node *N) {
return N->getType() == NK_Sequence;
}
@@ -450,7 +444,6 @@ public:
StringRef getName() const { return Name; }
Node *getTarget();
- static inline bool classof(const ScalarNode *) { return true; }
static inline bool classof(const Node *N) {
return N->getType() == NK_Alias;
}
diff --git a/include/llvm/Support/circular_raw_ostream.h b/include/llvm/Support/circular_raw_ostream.h
index 2b3c329b5861..2823af33b746 100644
--- a/include/llvm/Support/circular_raw_ostream.h
+++ b/include/llvm/Support/circular_raw_ostream.h
@@ -81,12 +81,12 @@ namespace llvm
Filled = false;
}
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream,
/// not counting the bytes currently in the buffer.
///
- virtual uint64_t current_pos() const {
+ virtual uint64_t current_pos() const LLVM_OVERRIDE {
// This has the same effect as calling TheStream.current_pos(),
// but that interface is private.
return TheStream->tell() - TheStream->GetNumBytesInBuffer();
diff --git a/include/llvm/Support/raw_os_ostream.h b/include/llvm/Support/raw_os_ostream.h
index 4f5d3612da18..4385721e8206 100644
--- a/include/llvm/Support/raw_os_ostream.h
+++ b/include/llvm/Support/raw_os_ostream.h
@@ -24,14 +24,14 @@ namespace llvm {
/// use the underlying stream to detect errors.
class raw_os_ostream : public raw_ostream {
std::ostream &OS;
-
+
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
-
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
+
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const;
-
+ virtual uint64_t current_pos() const LLVM_OVERRIDE;
+
public:
raw_os_ostream(std::ostream &O) : OS(O) {}
~raw_os_ostream();
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index 5de749aeae4e..eab0f2d8057e 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -15,6 +15,7 @@
#define LLVM_SUPPORT_RAW_OSTREAM_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -29,8 +30,8 @@ namespace llvm {
class raw_ostream {
private:
// Do not implement. raw_ostream is noncopyable.
- void operator=(const raw_ostream &);
- raw_ostream(const raw_ostream &);
+ void operator=(const raw_ostream &) LLVM_DELETED_FUNCTION;
+ raw_ostream(const raw_ostream &) LLVM_DELETED_FUNCTION;
/// The buffer is handled in such a way that the buffer is
/// uninitialized, unbuffered, or out of space when OutBufCur >=
@@ -191,10 +192,10 @@ public:
raw_ostream &operator<<(double N);
- /// write_hex - Output \arg N in hexadecimal, without any prefix or padding.
+ /// write_hex - Output \p N in hexadecimal, without any prefix or padding.
raw_ostream &write_hex(unsigned long long N);
- /// write_escaped - Output \arg Str, turning '\\', '\t', '\n', '"', and
+ /// write_escaped - Output \p Str, turning '\\', '\t', '\n', '"', and
/// anything that doesn't satisfy std::isprint into an escape sequence.
raw_ostream &write_escaped(StringRef Str, bool UseHexEscapes = false);
@@ -210,13 +211,19 @@ public:
/// Changes the foreground color of text that will be output from this point
/// forward.
- /// @param colors ANSI color to use, the special SAVEDCOLOR can be used to
+ /// @param Color ANSI color to use, the special SAVEDCOLOR can be used to
/// change only the bold attribute, and keep colors untouched
- /// @param bold bold/brighter text, default false
- /// @param bg if true change the background, default: change foreground
+ /// @param Bold bold/brighter text, default false
+ /// @param BG if true change the background, default: change foreground
/// @returns itself so it can be used within << invocations
- virtual raw_ostream &changeColor(enum Colors, bool = false, bool = false) {
- return *this; }
+ virtual raw_ostream &changeColor(enum Colors Color,
+ bool Bold = false,
+ bool BG = false) {
+ (void)Color;
+ (void)Bold;
+ (void)BG;
+ return *this;
+ }
/// Resets the colors to terminal defaults. Call this when you are done
/// outputting colored text, or before program exit.
@@ -239,15 +246,16 @@ public:
private:
/// write_impl - The is the piece of the class that is implemented
- /// by subclasses. This writes the \args Size bytes starting at
- /// \arg Ptr to the underlying stream.
+ /// by subclasses. This writes the \p Size bytes starting at
+ /// \p Ptr to the underlying stream.
///
/// This function is guaranteed to only be called at a point at which it is
/// safe for the subclass to install a new buffer via SetBuffer.
///
- /// \arg Ptr - The start of the data to be written. For buffered streams this
+ /// \param Ptr The start of the data to be written. For buffered streams this
/// is guaranteed to be the start of the buffer.
- /// \arg Size - The number of bytes to be written.
+ ///
+ /// \param Size The number of bytes to be written.
///
/// \invariant { Size > 0 }
virtual void write_impl(const char *Ptr, size_t Size) = 0;
@@ -314,14 +322,14 @@ class raw_fd_ostream : public raw_ostream {
uint64_t pos;
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const { return pos; }
+ virtual uint64_t current_pos() const LLVM_OVERRIDE { return pos; }
/// preferred_buffer_size - Determine an efficient buffer size.
- virtual size_t preferred_buffer_size() const;
+ virtual size_t preferred_buffer_size() const LLVM_OVERRIDE;
/// error_detected - Set the flag indicating that an output error has
/// been encountered.
@@ -382,14 +390,14 @@ public:
}
virtual raw_ostream &changeColor(enum Colors colors, bool bold=false,
- bool bg=false);
- virtual raw_ostream &resetColor();
+ bool bg=false) LLVM_OVERRIDE;
+ virtual raw_ostream &resetColor() LLVM_OVERRIDE;
- virtual raw_ostream &reverseColor();
+ virtual raw_ostream &reverseColor() LLVM_OVERRIDE;
- virtual bool is_displayed() const;
+ virtual bool is_displayed() const LLVM_OVERRIDE;
- virtual bool has_colors() const;
+ virtual bool has_colors() const LLVM_OVERRIDE;
/// has_error - Return the value of the flag in this raw_fd_ostream indicating
/// whether an output error has been encountered.
@@ -435,11 +443,11 @@ class raw_string_ostream : public raw_ostream {
std::string &OS;
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const { return OS.size(); }
+ virtual uint64_t current_pos() const LLVM_OVERRIDE { return OS.size(); }
public:
explicit raw_string_ostream(std::string &O) : OS(O) {}
~raw_string_ostream();
@@ -459,15 +467,15 @@ class raw_svector_ostream : public raw_ostream {
SmallVectorImpl<char> &OS;
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const;
+ virtual uint64_t current_pos() const LLVM_OVERRIDE;
public:
/// Construct a new raw_svector_ostream.
///
- /// \arg O - The vector to write to; this should generally have at least 128
+ /// \param O The vector to write to; this should generally have at least 128
/// bytes free to avoid any extraneous memory overhead.
explicit raw_svector_ostream(SmallVectorImpl<char> &O);
~raw_svector_ostream();
@@ -485,11 +493,11 @@ public:
/// raw_null_ostream - A raw_ostream that discards all output.
class raw_null_ostream : public raw_ostream {
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t size);
+ virtual void write_impl(const char *Ptr, size_t size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const;
+ virtual uint64_t current_pos() const LLVM_OVERRIDE;
public:
explicit raw_null_ostream() {}
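changeColor now spells out its parameters (Color, Bold, BG) instead of leaving them unnamed; the default base-class implementation is still a no-op. A small sketch of the typical call pattern on a real stream, under the assumption that errs() is the usual stderr stream with colour support where the terminal allows it:

    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void printWarning(StringRef Msg) {
      raw_ostream &OS = errs();
      if (OS.has_colors())
        OS.changeColor(raw_ostream::YELLOW, /*Bold=*/true);
      OS << "warning: " << Msg << '\n';
      if (OS.has_colors())
        OS.resetColor();
    }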
diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h
index af812069b9fe..0d164f688d37 100644
--- a/include/llvm/Support/system_error.h
+++ b/include/llvm/Support/system_error.h
@@ -17,6 +17,8 @@
#ifndef LLVM_SYSTEM_SYSTEM_ERROR_H
#define LLVM_SYSTEM_SYSTEM_ERROR_H
+#include "llvm/Support/Compiler.h"
+
/*
system_error synopsis
@@ -629,8 +631,8 @@ public:
private:
error_category();
- error_category(const error_category&);// = delete;
- error_category& operator=(const error_category&);// = delete;
+ error_category(const error_category&) LLVM_DELETED_FUNCTION;
+ error_category& operator=(const error_category&) LLVM_DELETED_FUNCTION;
public:
virtual const char* name() const = 0;
@@ -651,7 +653,7 @@ public:
class _do_message : public error_category
{
public:
- virtual std::string message(int ev) const;
+ virtual std::string message(int ev) const LLVM_OVERRIDE;
};
const error_category& generic_category();
diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h
index 7b97547be52a..f9306395fce3 100644
--- a/include/llvm/Support/type_traits.h
+++ b/include/llvm/Support/type_traits.h
@@ -54,8 +54,9 @@ struct is_class
// is_class<> metafunction due to Paul Mensonides (leavings@attbi.com). For
// more details:
// http://groups.google.com/groups?hl=en&selm=000001c1cc83%24e154d5e0%247772e50c%40c161550a&rnum=1
- public:
- enum { value = sizeof(char) == sizeof(dont_use::is_class_helper<T>(0)) };
+public:
+ static const bool value =
+ sizeof(char) == sizeof(dont_use::is_class_helper<T>(0));
};
@@ -162,12 +163,11 @@ template <typename T> class is_integral_or_enum {
static UnderlyingT &nonce_instance;
public:
- enum {
+ static const bool
value = (!is_class<UnderlyingT>::value && !is_pointer<UnderlyingT>::value &&
!is_same<UnderlyingT, float>::value &&
!is_same<UnderlyingT, double>::value &&
- sizeof(char) != sizeof(check_int_convertible(nonce_instance)))
- };
+ sizeof(char) != sizeof(check_int_convertible(nonce_instance)));
};
// enable_if_c - Enable/disable a template based on a metafunction
diff --git a/include/llvm/SymbolTableListTraits.h b/include/llvm/SymbolTableListTraits.h
index 91a4eb99ff0d..ec5c88f5c8a7 100644
--- a/include/llvm/SymbolTableListTraits.h
+++ b/include/llvm/SymbolTableListTraits.h
@@ -46,7 +46,6 @@ public:
/// getListOwner - Return the object that owns this list. If this is a list
/// of instructions, it returns the BasicBlock that owns them.
ItemParentClass *getListOwner() {
- typedef iplist<ValueSubClass> ItemParentClass::*Sublist;
size_t Offset(size_t(&((ItemParentClass*)0->*ItemParentClass::
getSublistAccess(static_cast<ValueSubClass*>(0)))));
iplist<ValueSubClass>* Anchor(static_cast<iplist<ValueSubClass>*>(this));
diff --git a/include/llvm/TableGen/Error.h b/include/llvm/TableGen/Error.h
index fd5f805ffc96..2f6b7e625c3d 100644
--- a/include/llvm/TableGen/Error.h
+++ b/include/llvm/TableGen/Error.h
@@ -19,26 +19,17 @@
namespace llvm {
-class TGError {
- SMLoc Loc;
- std::string Message;
-public:
- TGError(SMLoc loc, const std::string &message) : Loc(loc), Message(message) {}
-
- SMLoc getLoc() const { return Loc; }
- const std::string &getMessage() const { return Message; }
-};
-
-void PrintWarning(SMLoc WarningLoc, const Twine &Msg);
+void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg);
void PrintWarning(const char *Loc, const Twine &Msg);
void PrintWarning(const Twine &Msg);
-void PrintWarning(const TGError &Warning);
-void PrintError(SMLoc ErrorLoc, const Twine &Msg);
+void PrintError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg);
void PrintError(const char *Loc, const Twine &Msg);
void PrintError(const Twine &Msg);
-void PrintError(const TGError &Error);
+LLVM_ATTRIBUTE_NORETURN void PrintFatalError(const std::string &Msg);
+LLVM_ATTRIBUTE_NORETURN void PrintFatalError(ArrayRef<SMLoc> ErrorLoc,
+ const std::string &Msg);
extern SourceMgr SrcMgr;
diff --git a/include/llvm/TableGen/Main.h b/include/llvm/TableGen/Main.h
index deaef4a9908a..6b51e20146d7 100644
--- a/include/llvm/TableGen/Main.h
+++ b/include/llvm/TableGen/Main.h
@@ -16,10 +16,13 @@
namespace llvm {
-class TableGenAction;
+class RecordKeeper;
+class raw_ostream;
+/// \brief Perform the action using Records, and write output to OS.
+/// \returns true on error, false otherwise
+typedef bool TableGenMainFn(raw_ostream &OS, RecordKeeper &Records);
-/// Run the table generator, performing the specified Action on parsed records.
-int TableGenMain(char *argv0, TableGenAction &Action);
+int TableGenMain(char *argv0, TableGenMainFn *MainFn);
}
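TableGenMain now takes a plain function pointer instead of a TableGenAction object. A hedged sketch of a minimal tblgen-style driver against the new signature; MyTableGenMain is a made-up backend, not part of this patch:

    #include "llvm/Support/CommandLine.h"
    #include "llvm/TableGen/Main.h"
    #include "llvm/TableGen/Record.h"

    using namespace llvm;

    static bool MyTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
      OS << "// " << Records.getDefs().size() << " defs parsed\n";
      return false;  // false means success under the new convention
    }

    int main(int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      return TableGenMain(argv[0], &MyTableGenMain);
    }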
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index a8256b77357c..319298c13253 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
@@ -66,10 +67,27 @@ class RecordKeeper;
//===----------------------------------------------------------------------===//
class RecTy {
+public:
+ /// \brief Subclass discriminator (for dyn_cast<> et al.)
+ enum RecTyKind {
+ BitRecTyKind,
+ BitsRecTyKind,
+ IntRecTyKind,
+ StringRecTyKind,
+ ListRecTyKind,
+ DagRecTyKind,
+ RecordRecTyKind
+ };
+
+private:
+ RecTyKind Kind;
ListRecTy *ListTy;
virtual void anchor();
+
public:
- RecTy() : ListTy(0) {}
+ RecTyKind getRecTyKind() const { return Kind; }
+
+ RecTy(RecTyKind K) : Kind(K), ListTy(0) {}
virtual ~RecTy() {}
virtual std::string getAsString() const = 0;
@@ -132,8 +150,12 @@ inline raw_ostream &operator<<(raw_ostream &OS, const RecTy &Ty) {
///
class BitRecTy : public RecTy {
static BitRecTy Shared;
- BitRecTy() {}
+ BitRecTy() : RecTy(BitRecTyKind) {}
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == BitRecTyKind;
+ }
+
static BitRecTy *get() { return &Shared; }
virtual Init *convertValue( UnsetInit *UI) { return (Init*)UI; }
@@ -152,9 +174,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "bit"; }
+ virtual std::string getAsString() const { return "bit"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
virtual bool baseClassOf(const BitRecTy *RHS) const { return true; }
@@ -173,8 +195,12 @@ public:
///
class BitsRecTy : public RecTy {
unsigned Size;
- explicit BitsRecTy(unsigned Sz) : Size(Sz) {}
+ explicit BitsRecTy(unsigned Sz) : RecTy(BitsRecTyKind), Size(Sz) {}
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == BitsRecTyKind;
+ }
+
static BitsRecTy *get(unsigned Sz);
unsigned getNumBits() const { return Size; }
@@ -195,9 +221,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const;
+ virtual std::string getAsString() const;
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
virtual bool baseClassOf(const BitRecTy *RHS) const { return Size == 1; }
@@ -217,8 +243,12 @@ public:
///
class IntRecTy : public RecTy {
static IntRecTy Shared;
- IntRecTy() {}
+ IntRecTy() : RecTy(IntRecTyKind) {}
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == IntRecTyKind;
+ }
+
static IntRecTy *get() { return &Shared; }
virtual Init *convertValue( UnsetInit *UI) { return (Init*)UI; }
@@ -237,9 +267,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "int"; }
+ virtual std::string getAsString() const { return "int"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -257,8 +287,12 @@ public:
///
class StringRecTy : public RecTy {
static StringRecTy Shared;
- StringRecTy() {}
+ StringRecTy() : RecTy(StringRecTyKind) {}
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == StringRecTyKind;
+ }
+
static StringRecTy *get() { return &Shared; }
virtual Init *convertValue( UnsetInit *UI) { return (Init*)UI; }
@@ -278,9 +312,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "string"; }
+ virtual std::string getAsString() const { return "string"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -300,9 +334,13 @@ public:
///
class ListRecTy : public RecTy {
RecTy *Ty;
- explicit ListRecTy(RecTy *T) : Ty(T) {}
+ explicit ListRecTy(RecTy *T) : RecTy(ListRecTyKind), Ty(T) {}
friend ListRecTy *RecTy::getListTy();
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == ListRecTyKind;
+ }
+
static ListRecTy *get(RecTy *T) { return T->getListTy(); }
RecTy *getElementType() const { return Ty; }
@@ -322,9 +360,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const;
+ virtual std::string getAsString() const;
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -343,8 +381,12 @@ public:
///
class DagRecTy : public RecTy {
static DagRecTy Shared;
- DagRecTy() {}
+ DagRecTy() : RecTy(DagRecTyKind) {}
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == DagRecTyKind;
+ }
+
static DagRecTy *get() { return &Shared; }
virtual Init *convertValue( UnsetInit *UI) { return (Init*)UI; }
@@ -363,9 +405,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "dag"; }
+ virtual std::string getAsString() const { return "dag"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -384,9 +426,13 @@ public:
///
class RecordRecTy : public RecTy {
Record *Rec;
- explicit RecordRecTy(Record *R) : Rec(R) {}
+ explicit RecordRecTy(Record *R) : RecTy(RecordRecTyKind), Rec(R) {}
friend class Record;
public:
+ static bool classof(const RecTy *RT) {
+ return RT->getRecTyKind() == RecordRecTyKind;
+ }
+
static RecordRecTy *get(Record *R);
Record *getRecord() const { return Rec; }
@@ -407,9 +453,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const;
+ virtual std::string getAsString() const;
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
virtual bool baseClassOf(const BitRecTy *RHS) const { return false; }
@@ -431,12 +477,53 @@ RecTy *resolveTypes(RecTy *T1, RecTy *T2);
//===----------------------------------------------------------------------===//
class Init {
- Init(const Init &); // Do not define.
- Init &operator=(const Init &); // Do not define.
+protected:
+ /// \brief Discriminator enum (for isa<>, dyn_cast<>, et al.)
+ ///
+ /// This enum is laid out by a preorder traversal of the inheritance
+ /// hierarchy, and does not contain an entry for abstract classes, as per
+ /// the recommendation in docs/HowToSetUpLLVMStyleRTTI.rst.
+ ///
+ /// We also explicitly include "first" and "last" values for each
+ /// interior node of the inheritance tree, to make it easier to read the
+ /// corresponding classof().
+ ///
+ /// We could pack these a bit tighter by not having the IK_FirstXXXInit
+ /// and IK_LastXXXInit be their own values, but that would degrade
+ /// readability for really no benefit.
+ enum InitKind {
+ IK_BitInit,
+ IK_BitsInit,
+ IK_FirstTypedInit,
+ IK_DagInit,
+ IK_DefInit,
+ IK_FieldInit,
+ IK_IntInit,
+ IK_ListInit,
+ IK_FirstOpInit,
+ IK_BinOpInit,
+ IK_TernOpInit,
+ IK_UnOpInit,
+ IK_LastOpInit,
+ IK_StringInit,
+ IK_VarInit,
+ IK_VarListElementInit,
+ IK_LastTypedInit,
+ IK_UnsetInit,
+ IK_VarBitInit
+ };
+
+private:
+ const InitKind Kind;
+ Init(const Init &) LLVM_DELETED_FUNCTION;
+ Init &operator=(const Init &) LLVM_DELETED_FUNCTION;
virtual void anchor();
+public:
+ InitKind getKind() const { return Kind; }
+
protected:
- Init(void) {}
+ explicit Init(InitKind K) : Kind(K) {}
public:
virtual ~Init() {}
@@ -509,6 +596,18 @@ public:
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const {
return const_cast<Init *>(this);
}
+
+ /// getBit - This method is used to return the initializer for the specified
+ /// bit.
+ virtual Init *getBit(unsigned Bit) const = 0;
+
+ /// getBitVar - This method is used to retrieve the initializer for a bit
+ /// reference. For non-VarBitInit, it simply returns itself.
+ virtual Init *getBitVar() const { return const_cast<Init*>(this); }
+
+ /// getBitNum - This method is used to retrieve the bit number of a bit
+ /// reference. For non-VarBitInit, it simply returns 0.
+ virtual unsigned getBitNum() const { return 0; }
};
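
As a rough sketch of how the classof() hooks and the new Kind discriminator above are consumed by the casting templates (the helper below is hypothetical; only isa<>/dyn_cast<> from llvm/Support/Casting.h and the classes declared in this header are assumed):

    #include "llvm/Support/Casting.h"
    #include "llvm/TableGen/Record.h"

    // Hypothetical helper: extract an integer from an Init if it is an IntInit.
    // dyn_cast<> dispatches through Init::getKind() via IntInit::classof().
    static bool getAsInt(const llvm::Init *I, int64_t &Result) {
      if (const llvm::IntInit *II = llvm::dyn_cast<llvm::IntInit>(I)) {
        Result = II->getValue();
        return true;
      }
      return false;  // some other Init kind (UnsetInit, StringInit, ...)
    }
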
inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) {
@@ -521,13 +620,17 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) {
class TypedInit : public Init {
RecTy *Ty;
- TypedInit(const TypedInit &Other); // Do not define.
- TypedInit &operator=(const TypedInit &Other); // Do not define.
+ TypedInit(const TypedInit &Other) LLVM_DELETED_FUNCTION;
+ TypedInit &operator=(const TypedInit &Other) LLVM_DELETED_FUNCTION;
protected:
- explicit TypedInit(RecTy *T) : Ty(T) {}
+ explicit TypedInit(InitKind K, RecTy *T) : Init(K), Ty(T) {}
public:
+ static bool classof(const Init *I) {
+ return I->getKind() >= IK_FirstTypedInit &&
+ I->getKind() <= IK_LastTypedInit;
+ }
RecTy *getType() const { return Ty; }
virtual Init *
@@ -541,13 +644,6 @@ public:
///
virtual RecTy *getFieldType(const std::string &FieldName) const;
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const = 0;
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -559,18 +655,25 @@ public:
/// UnsetInit - ? - Represents an uninitialized value
///
class UnsetInit : public Init {
- UnsetInit() : Init() {}
- UnsetInit(const UnsetInit &); // Do not define.
- UnsetInit &operator=(const UnsetInit &Other); // Do not define.
+ UnsetInit() : Init(IK_UnsetInit) {}
+ UnsetInit(const UnsetInit &) LLVM_DELETED_FUNCTION;
+ UnsetInit &operator=(const UnsetInit &Other) LLVM_DELETED_FUNCTION;
virtual void anchor();
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_UnsetInit;
+ }
static UnsetInit *get();
virtual Init *convertInitializerTo(RecTy *Ty) const {
return Ty->convertValue(const_cast<UnsetInit *>(this));
}
+ virtual Init *getBit(unsigned Bit) const {
+ return const_cast<UnsetInit*>(this);
+ }
+
virtual bool isComplete() const { return false; }
virtual std::string getAsString() const { return "?"; }
};
@@ -581,12 +684,15 @@ public:
class BitInit : public Init {
bool Value;
- explicit BitInit(bool V) : Value(V) {}
- BitInit(const BitInit &Other); // Do not define.
- BitInit &operator=(BitInit &Other); // Do not define.
+ explicit BitInit(bool V) : Init(IK_BitInit), Value(V) {}
+ BitInit(const BitInit &Other) LLVM_DELETED_FUNCTION;
+ BitInit &operator=(BitInit &Other) LLVM_DELETED_FUNCTION;
virtual void anchor();
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_BitInit;
+ }
static BitInit *get(bool V);
bool getValue() const { return Value; }
@@ -595,6 +701,11 @@ public:
return Ty->convertValue(const_cast<BitInit *>(this));
}
+ virtual Init *getBit(unsigned Bit) const {
+ assert(Bit < 1 && "Bit index out of range!");
+ return const_cast<BitInit*>(this);
+ }
+
virtual std::string getAsString() const { return Value ? "1" : "0"; }
};
@@ -604,23 +715,22 @@ public:
class BitsInit : public Init, public FoldingSetNode {
std::vector<Init*> Bits;
- BitsInit(ArrayRef<Init *> Range) : Bits(Range.begin(), Range.end()) {}
+ BitsInit(ArrayRef<Init *> Range)
+ : Init(IK_BitsInit), Bits(Range.begin(), Range.end()) {}
- BitsInit(const BitsInit &Other); // Do not define.
- BitsInit &operator=(const BitsInit &Other); // Do not define.
+ BitsInit(const BitsInit &Other) LLVM_DELETED_FUNCTION;
+ BitsInit &operator=(const BitsInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_BitsInit;
+ }
static BitsInit *get(ArrayRef<Init *> Range);
void Profile(FoldingSetNodeID &ID) const;
unsigned getNumBits() const { return Bits.size(); }
- Init *getBit(unsigned Bit) const {
- assert(Bit < Bits.size() && "Bit index out of range!");
- return Bits[Bit];
- }
-
virtual Init *convertInitializerTo(RecTy *Ty) const {
return Ty->convertValue(const_cast<BitsInit *>(this));
}
@@ -640,6 +750,11 @@ public:
virtual std::string getAsString() const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+
+ virtual Init *getBit(unsigned Bit) const {
+ assert(Bit < Bits.size() && "Bit index out of range!");
+ return Bits[Bit];
+ }
};
@@ -648,12 +763,16 @@ public:
class IntInit : public TypedInit {
int64_t Value;
- explicit IntInit(int64_t V) : TypedInit(IntRecTy::get()), Value(V) {}
+ explicit IntInit(int64_t V)
+ : TypedInit(IK_IntInit, IntRecTy::get()), Value(V) {}
- IntInit(const IntInit &Other); // Do not define.
- IntInit &operator=(const IntInit &Other); // Do note define.
+ IntInit(const IntInit &Other) LLVM_DELETED_FUNCTION;
+ IntInit &operator=(const IntInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_IntInit;
+ }
static IntInit *get(int64_t V);
int64_t getValue() const { return Value; }
@@ -666,15 +785,6 @@ public:
virtual std::string getAsString() const;
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- llvm_unreachable("Illegal bit reference off int");
- }
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -682,6 +792,10 @@ public:
unsigned Elt) const {
llvm_unreachable("Illegal element reference off int");
}
+
+ virtual Init *getBit(unsigned Bit) const {
+ return BitInit::get((Value & (1ULL << Bit)) != 0);
+ }
};
@@ -691,13 +805,16 @@ class StringInit : public TypedInit {
std::string Value;
explicit StringInit(const std::string &V)
- : TypedInit(StringRecTy::get()), Value(V) {}
+ : TypedInit(IK_StringInit, StringRecTy::get()), Value(V) {}
- StringInit(const StringInit &Other); // Do not define.
- StringInit &operator=(const StringInit &Other); // Do not define.
+ StringInit(const StringInit &Other) LLVM_DELETED_FUNCTION;
+ StringInit &operator=(const StringInit &Other) LLVM_DELETED_FUNCTION;
virtual void anchor();
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_StringInit;
+ }
static StringInit *get(StringRef);
const std::string &getValue() const { return Value; }
@@ -709,15 +826,6 @@ public:
virtual std::string getAsString() const { return "\"" + Value + "\""; }
virtual std::string getAsUnquotedString() const { return Value; }
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- llvm_unreachable("Illegal bit reference off string");
- }
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -725,6 +833,10 @@ public:
unsigned Elt) const {
llvm_unreachable("Illegal element reference off string");
}
+
+ virtual Init *getBit(unsigned Bit) const {
+ llvm_unreachable("Illegal bit reference off string");
+ }
};
/// ListInit - [AL, AH, CL] - Represent a list of defs
@@ -736,12 +848,16 @@ public:
private:
explicit ListInit(ArrayRef<Init *> Range, RecTy *EltTy)
- : TypedInit(ListRecTy::get(EltTy)), Values(Range.begin(), Range.end()) {}
+ : TypedInit(IK_ListInit, ListRecTy::get(EltTy)),
+ Values(Range.begin(), Range.end()) {}
- ListInit(const ListInit &Other); // Do not define.
- ListInit &operator=(const ListInit &Other); // Do not define.
+ ListInit(const ListInit &Other) LLVM_DELETED_FUNCTION;
+ ListInit &operator=(const ListInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_ListInit;
+ }
static ListInit *get(ArrayRef<Init *> Range, RecTy *EltTy);
void Profile(FoldingSetNodeID &ID) const;
@@ -754,7 +870,8 @@ public:
Record *getElementAsRecord(unsigned i) const;
- Init *convertInitListSlice(const std::vector<unsigned> &Elements) const;
+ virtual Init *
+ convertInitListSlice(const std::vector<unsigned> &Elements) const;
virtual Init *convertInitializerTo(RecTy *Ty) const {
return Ty->convertValue(const_cast<ListInit *>(this));
@@ -777,33 +894,32 @@ public:
inline size_t size () const { return Values.size(); }
inline bool empty() const { return Values.empty(); }
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- llvm_unreachable("Illegal bit reference off list");
- }
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const;
+
+ virtual Init *getBit(unsigned Bit) const {
+ llvm_unreachable("Illegal bit reference off list");
+ }
};
/// OpInit - Base class for operators
///
class OpInit : public TypedInit {
- OpInit(const OpInit &Other); // Do not define.
- OpInit &operator=(OpInit &Other); // Do not define.
+ OpInit(const OpInit &Other) LLVM_DELETED_FUNCTION;
+ OpInit &operator=(OpInit &Other) LLVM_DELETED_FUNCTION;
protected:
- explicit OpInit(RecTy *Type) : TypedInit(Type) {}
+ explicit OpInit(InitKind K, RecTy *Type) : TypedInit(K, Type) {}
public:
+ static bool classof(const Init *I) {
+ return I->getKind() >= IK_FirstOpInit &&
+ I->getKind() <= IK_LastOpInit;
+ }
// Clone - Clone this operator, replacing arguments with the new list
virtual OpInit *clone(std::vector<Init *> &Operands) const = 0;
@@ -818,10 +934,10 @@ public:
return Ty->convertValue(const_cast<OpInit *>(this));
}
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const;
+
+ virtual Init *getBit(unsigned Bit) const;
};
@@ -835,12 +951,15 @@ private:
Init *LHS;
UnOpInit(UnaryOp opc, Init *lhs, RecTy *Type)
- : OpInit(Type), Opc(opc), LHS(lhs) {}
+ : OpInit(IK_UnOpInit, Type), Opc(opc), LHS(lhs) {}
- UnOpInit(const UnOpInit &Other); // Do not define.
- UnOpInit &operator=(const UnOpInit &Other); // Do not define.
+ UnOpInit(const UnOpInit &Other) LLVM_DELETED_FUNCTION;
+ UnOpInit &operator=(const UnOpInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_UnOpInit;
+ }
static UnOpInit *get(UnaryOp opc, Init *lhs, RecTy *Type);
// Clone - Clone this operator, replacing arguments with the new list
@@ -850,8 +969,8 @@ public:
return UnOpInit::get(getOpcode(), *Operands.begin(), getType());
}
- int getNumOperands() const { return 1; }
- Init *getOperand(int i) const {
+ virtual int getNumOperands() const { return 1; }
+ virtual Init *getOperand(int i) const {
assert(i == 0 && "Invalid operand id for unary operator");
return getOperand();
}
@@ -861,7 +980,7 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
+ virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
@@ -878,12 +997,15 @@ private:
Init *LHS, *RHS;
BinOpInit(BinaryOp opc, Init *lhs, Init *rhs, RecTy *Type) :
- OpInit(Type), Opc(opc), LHS(lhs), RHS(rhs) {}
+ OpInit(IK_BinOpInit, Type), Opc(opc), LHS(lhs), RHS(rhs) {}
- BinOpInit(const BinOpInit &Other); // Do not define.
- BinOpInit &operator=(const BinOpInit &Other); // Do not define.
+ BinOpInit(const BinOpInit &Other) LLVM_DELETED_FUNCTION;
+ BinOpInit &operator=(const BinOpInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_BinOpInit;
+ }
static BinOpInit *get(BinaryOp opc, Init *lhs, Init *rhs,
RecTy *Type);
@@ -894,8 +1016,8 @@ public:
return BinOpInit::get(getOpcode(), Operands[0], Operands[1], getType());
}
- int getNumOperands() const { return 2; }
- Init *getOperand(int i) const {
+ virtual int getNumOperands() const { return 2; }
+ virtual Init *getOperand(int i) const {
assert((i == 0 || i == 1) && "Invalid operand id for binary operator");
if (i == 0) {
return getLHS();
@@ -910,7 +1032,7 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
+ virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
@@ -928,12 +1050,15 @@ private:
TernOpInit(TernaryOp opc, Init *lhs, Init *mhs, Init *rhs,
RecTy *Type) :
- OpInit(Type), Opc(opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
+ OpInit(IK_TernOpInit, Type), Opc(opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
- TernOpInit(const TernOpInit &Other); // Do not define.
- TernOpInit &operator=(const TernOpInit &Other); // Do not define.
+ TernOpInit(const TernOpInit &Other) LLVM_DELETED_FUNCTION;
+ TernOpInit &operator=(const TernOpInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_TernOpInit;
+ }
static TernOpInit *get(TernaryOp opc, Init *lhs,
Init *mhs, Init *rhs,
RecTy *Type);
@@ -946,8 +1071,8 @@ public:
getType());
}
- int getNumOperands() const { return 3; }
- Init *getOperand(int i) const {
+ virtual int getNumOperands() const { return 3; }
+ virtual Init *getOperand(int i) const {
assert((i == 0 || i == 1 || i == 2) &&
"Invalid operand id for ternary operator");
if (i == 0) {
@@ -966,7 +1091,7 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
+ virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
virtual bool isComplete() const { return false; }
@@ -982,14 +1107,17 @@ class VarInit : public TypedInit {
Init *VarName;
explicit VarInit(const std::string &VN, RecTy *T)
- : TypedInit(T), VarName(StringInit::get(VN)) {}
+ : TypedInit(IK_VarInit, T), VarName(StringInit::get(VN)) {}
explicit VarInit(Init *VN, RecTy *T)
- : TypedInit(T), VarName(VN) {}
+ : TypedInit(IK_VarInit, T), VarName(VN) {}
- VarInit(const VarInit &Other); // Do not define.
- VarInit &operator=(const VarInit &Other); // Do not define.
+ VarInit(const VarInit &Other) LLVM_DELETED_FUNCTION;
+ VarInit &operator=(const VarInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_VarInit;
+ }
static VarInit *get(const std::string &VN, RecTy *T);
static VarInit *get(Init *VN, RecTy *T);
@@ -1003,8 +1131,6 @@ public:
return getNameInit()->getAsUnquotedString();
}
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
virtual Init *resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const;
@@ -1019,6 +1145,8 @@ public:
///
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+ virtual Init *getBit(unsigned Bit) const;
+
virtual std::string getAsString() const { return getName(); }
};
@@ -1029,27 +1157,37 @@ class VarBitInit : public Init {
TypedInit *TI;
unsigned Bit;
- VarBitInit(TypedInit *T, unsigned B) : TI(T), Bit(B) {
- assert(T->getType() && dynamic_cast<BitsRecTy*>(T->getType()) &&
- ((BitsRecTy*)T->getType())->getNumBits() > B &&
+ VarBitInit(TypedInit *T, unsigned B) : Init(IK_VarBitInit), TI(T), Bit(B) {
+ assert(T->getType() &&
+ (isa<IntRecTy>(T->getType()) ||
+ (isa<BitsRecTy>(T->getType()) &&
+ cast<BitsRecTy>(T->getType())->getNumBits() > B)) &&
"Illegal VarBitInit expression!");
}
- VarBitInit(const VarBitInit &Other); // Do not define.
- VarBitInit &operator=(const VarBitInit &Other); // Do not define.
+ VarBitInit(const VarBitInit &Other) LLVM_DELETED_FUNCTION;
+ VarBitInit &operator=(const VarBitInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_VarBitInit;
+ }
static VarBitInit *get(TypedInit *T, unsigned B);
virtual Init *convertInitializerTo(RecTy *Ty) const {
return Ty->convertValue(const_cast<VarBitInit *>(this));
}
- TypedInit *getVariable() const { return TI; }
- unsigned getBitNum() const { return Bit; }
+ virtual Init *getBitVar() const { return TI; }
+ virtual unsigned getBitNum() const { return Bit; }
virtual std::string getAsString() const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+
+ virtual Init *getBit(unsigned B) const {
+ assert(B < 1 && "Bit index out of range!");
+ return const_cast<VarBitInit*>(this);
+ }
};
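
A rough illustration of how the getBit()/getBitVar()/getBitNum() trio fits together; the function below is hypothetical, and only the interfaces declared in this header plus raw_ostream are assumed:

    #include "llvm/Support/raw_ostream.h"
    #include "llvm/TableGen/Record.h"

    // Hypothetical: describe which variable and bit position bit N of V denotes.
    void describeBit(const llvm::Init *V, unsigned N, llvm::raw_ostream &OS) {
      llvm::Init *B = V->getBit(N);          // a VarBitInit for variable-backed bits
      OS << B->getBitVar()->getAsString()    // the underlying variable, or B itself
         << "{" << B->getBitNum() << "}\n";  // bit index; 0 for non-VarBitInit
    }
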
/// VarListElementInit - List[4] - Represent access to one element of a var or
@@ -1059,18 +1197,20 @@ class VarListElementInit : public TypedInit {
unsigned Element;
VarListElementInit(TypedInit *T, unsigned E)
- : TypedInit(dynamic_cast<ListRecTy*>(T->getType())->getElementType()),
- TI(T), Element(E) {
- assert(T->getType() && dynamic_cast<ListRecTy*>(T->getType()) &&
+ : TypedInit(IK_VarListElementInit,
+ cast<ListRecTy>(T->getType())->getElementType()),
+ TI(T), Element(E) {
+ assert(T->getType() && isa<ListRecTy>(T->getType()) &&
"Illegal VarBitInit expression!");
}
- VarListElementInit(const VarListElementInit &Other); // Do not define.
- VarListElementInit &operator=(const VarListElementInit &Other); // Do
- // not
- // define.
+ VarListElementInit(const VarListElementInit &Other) LLVM_DELETED_FUNCTION;
+ void operator=(const VarListElementInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_VarListElementInit;
+ }
static VarListElementInit *get(TypedInit *T, unsigned E);
virtual Init *convertInitializerTo(RecTy *Ty) const {
@@ -1080,9 +1220,6 @@ public:
TypedInit *getVariable() const { return TI; }
unsigned getElementNum() const { return Element; }
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
-
/// resolveListElementReference - This method is used to implement
/// VarListElementInit::resolveReferences. If the list element is resolvable
/// now, we return the resolved value, otherwise we return null.
@@ -1092,6 +1229,8 @@ public:
virtual std::string getAsString() const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
+
+ virtual Init *getBit(unsigned Bit) const;
};
/// DefInit - AL - Represent a reference to a 'def' in the description
@@ -1099,13 +1238,16 @@ public:
class DefInit : public TypedInit {
Record *Def;
- DefInit(Record *D, RecordRecTy *T) : TypedInit(T), Def(D) {}
+ DefInit(Record *D, RecordRecTy *T) : TypedInit(IK_DefInit, T), Def(D) {}
friend class Record;
- DefInit(const DefInit &Other); // Do not define.
- DefInit &operator=(const DefInit &Other); // Do not define.
+ DefInit(const DefInit &Other) LLVM_DELETED_FUNCTION;
+ DefInit &operator=(const DefInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_DefInit;
+ }
static DefInit *get(Record*);
virtual Init *convertInitializerTo(RecTy *Ty) const {
@@ -1122,12 +1264,7 @@ public:
virtual std::string getAsString() const;
- /// resolveBitReference - This method is used to implement
- /// VarBitInit::resolveReferences. If the bit is able to be resolved, we
- /// simply return the resolved value, otherwise we return null.
- ///
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
+ virtual Init *getBit(unsigned Bit) const {
llvm_unreachable("Illegal bit reference off def");
}
@@ -1148,14 +1285,17 @@ class FieldInit : public TypedInit {
std::string FieldName; // Field we are accessing
FieldInit(Init *R, const std::string &FN)
- : TypedInit(R->getFieldType(FN)), Rec(R), FieldName(FN) {
+ : TypedInit(IK_FieldInit, R->getFieldType(FN)), Rec(R), FieldName(FN) {
assert(getType() && "FieldInit with non-record type!");
}
- FieldInit(const FieldInit &Other); // Do not define.
- FieldInit &operator=(const FieldInit &Other); // Do not define.
+ FieldInit(const FieldInit &Other) LLVM_DELETED_FUNCTION;
+ FieldInit &operator=(const FieldInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_FieldInit;
+ }
static FieldInit *get(Init *R, const std::string &FN);
static FieldInit *get(Init *R, const Init *FN);
@@ -1163,8 +1303,8 @@ public:
return Ty->convertValue(const_cast<FieldInit *>(this));
}
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const;
+ virtual Init *getBit(unsigned Bit) const;
+
virtual Init *resolveListElementReference(Record &R,
const RecordVal *RV,
unsigned Elt) const;
@@ -1189,14 +1329,17 @@ class DagInit : public TypedInit, public FoldingSetNode {
DagInit(Init *V, const std::string &VN,
ArrayRef<Init *> ArgRange,
ArrayRef<std::string> NameRange)
- : TypedInit(DagRecTy::get()), Val(V), ValName(VN),
+ : TypedInit(IK_DagInit, DagRecTy::get()), Val(V), ValName(VN),
Args(ArgRange.begin(), ArgRange.end()),
ArgNames(NameRange.begin(), NameRange.end()) {}
- DagInit(const DagInit &Other); // Do not define.
- DagInit &operator=(const DagInit &Other); // Do not define.
+ DagInit(const DagInit &Other) LLVM_DELETED_FUNCTION;
+ DagInit &operator=(const DagInit &Other) LLVM_DELETED_FUNCTION;
public:
+ static bool classof(const Init *I) {
+ return I->getKind() == IK_DagInit;
+ }
static DagInit *get(Init *V, const std::string &VN,
ArrayRef<Init *> ArgRange,
ArrayRef<std::string> NameRange);
@@ -1243,8 +1386,7 @@ public:
inline size_t name_size () const { return ArgNames.size(); }
inline bool name_empty() const { return ArgNames.empty(); }
- virtual Init *resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
+ virtual Init *getBit(unsigned Bit) const {
llvm_unreachable("Illegal bit reference off dag");
}
@@ -1301,7 +1443,9 @@ class Record {
// Unique record ID.
unsigned ID;
Init *Name;
- SMLoc Loc;
+ // Location where record was instantiated, followed by the locations of
+ // the multiclass prototypes used.
+ SmallVector<SMLoc, 4> Locs;
std::vector<Init *> TemplateArgs;
std::vector<RecordVal> Values;
std::vector<Record*> SuperClasses;
@@ -1317,15 +1461,25 @@ class Record {
public:
// Constructs a record.
- explicit Record(const std::string &N, SMLoc loc, RecordKeeper &records) :
- ID(LastID++), Name(StringInit::get(N)), Loc(loc), TrackedRecords(records),
- TheInit(0) {
+ explicit Record(const std::string &N, ArrayRef<SMLoc> locs,
+ RecordKeeper &records) :
+ ID(LastID++), Name(StringInit::get(N)), Locs(locs.begin(), locs.end()),
+ TrackedRecords(records), TheInit(0) {
init();
}
- explicit Record(Init *N, SMLoc loc, RecordKeeper &records) :
- ID(LastID++), Name(N), Loc(loc), TrackedRecords(records), TheInit(0) {
+ explicit Record(Init *N, ArrayRef<SMLoc> locs, RecordKeeper &records) :
+ ID(LastID++), Name(N), Locs(locs.begin(), locs.end()),
+ TrackedRecords(records), TheInit(0) {
init();
}
+
+ // When copy-constructing a Record, we must still guarantee a globally unique
+ // ID number. All other fields can be copied normally.
+ Record(const Record &O) :
+ ID(LastID++), Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
+ Values(O.Values), SuperClasses(O.SuperClasses),
+ TrackedRecords(O.TrackedRecords), TheInit(O.TheInit) { }
+
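
A small hedged check of the guarantee stated above, i.e. that a copied Record keeps its contents but receives a fresh ID; Original is a hypothetical Record*:

    llvm::Record Clone(*Original);                  // copy-construct
    assert(Clone.getID() != Original->getID());     // IDs stay globally unique
    assert(Clone.getName() == Original->getName()); // other fields are copied as-is
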
~Record() {}
@@ -1345,7 +1499,7 @@ public:
void setName(Init *Name); // Also updates RecordKeeper.
void setName(const std::string &Name); // Also updates RecordKeeper.
- SMLoc getLoc() const { return Loc; }
+ ArrayRef<SMLoc> getLoc() const { return Locs; }
/// get the corresponding DefInit.
DefInit *getDefInit();
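
A hedged sketch of how the widened getLoc() can drive diagnostics that point at both the record and the multiclass prototypes it was instantiated through; R and SrcMgr are hypothetical, and llvm/Support/SourceMgr.h is assumed:

    llvm::ArrayRef<llvm::SMLoc> Locs = R->getLoc();
    for (unsigned i = 0, e = Locs.size(); i != e; ++i)
      SrcMgr.PrintMessage(Locs[i], llvm::SourceMgr::DK_Error,
                          "error in record definition");
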
@@ -1507,6 +1661,12 @@ public:
///
bool getValueAsBit(StringRef FieldName) const;
+ /// getValueAsBitOrUnset - This method looks up the specified field and
+ /// returns its value as a bit. If the field is unset, sets Unset to true and
+ /// returns false.
+ ///
+ bool getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const;
+
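
A hedged usage sketch for the new accessor; the record pointer R, the field name, and the fallback policy are made up for illustration:

    bool Unset = false;
    bool SideEffects = R->getValueAsBitOrUnset("hasSideEffects", Unset);
    if (Unset)
      SideEffects = true;  // conservative default when the field is left as '?'
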
/// getValueAsInt - This method looks up the specified field and returns its
/// value as an int64_t, throwing an exception if the field does not exist or
/// if the value is not the right type.
@@ -1601,6 +1761,16 @@ struct LessRecord {
}
};
+/// LessRecordByID - Sorting predicate to sort record pointers by their
+/// unique ID. If you just need a deterministic order, use this, since it
+/// just compares two `unsigned`; the other sorting predicates require
+/// string manipulation.
+struct LessRecordByID {
+ bool operator()(const Record *LHS, const Record *RHS) const {
+ return LHS->getID() < RHS->getID();
+ }
+};
+
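
A minimal sketch of the intended use of LessRecordByID, assuming a RecordKeeper named Records; getAllDerivedDefinitions() is existing RecordKeeper API:

    #include <algorithm>
    #include <vector>

    std::vector<llvm::Record *> Defs =
        Records.getAllDerivedDefinitions("Instruction");
    // Cheap, deterministic order: compares only the unsigned IDs.
    std::sort(Defs.begin(), Defs.end(), llvm::LessRecordByID());
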
/// LessRecordFieldName - Sorting predicate to sort record pointers by their
/// name field.
///
diff --git a/include/llvm/TableGen/TableGenAction.h b/include/llvm/TableGen/TableGenAction.h
deleted file mode 100644
index 733ae626447c..000000000000
--- a/include/llvm/TableGen/TableGenAction.h
+++ /dev/null
@@ -1,35 +0,0 @@
-//===- llvm/TableGen/TableGenAction.h - defines TableGenAction --*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the TableGenAction base class to be derived from by
-// tblgen tools.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TABLEGEN_TABLEGENACTION_H
-#define LLVM_TABLEGEN_TABLEGENACTION_H
-
-namespace llvm {
-
-class raw_ostream;
-class RecordKeeper;
-
-class TableGenAction {
- virtual void anchor();
-public:
- virtual ~TableGenAction() {}
-
- /// Perform the action using Records, and write output to OS.
- /// @returns true on error, false otherwise
- virtual bool operator()(raw_ostream &OS, RecordKeeper &Records) = 0;
-};
-
-}
-
-#endif
diff --git a/include/llvm/Target/Mangler.h b/include/llvm/Target/Mangler.h
index d5e165e58b91..a50f54a436e9 100644
--- a/include/llvm/Target/Mangler.h
+++ b/include/llvm/Target/Mangler.h
@@ -22,7 +22,7 @@ class GlobalValue;
template <typename T> class SmallVectorImpl;
class MCContext;
class MCSymbol;
-class TargetData;
+class DataLayout;
class Mangler {
public:
@@ -34,7 +34,7 @@ public:
private:
MCContext &Context;
- const TargetData &TD;
+ const DataLayout &TD;
/// AnonGlobalIDs - We need to give global values the same name every time
/// they are mangled. This keeps track of the number we give to anonymous
@@ -47,20 +47,19 @@ private:
unsigned NextAnonGlobalID;
public:
- Mangler(MCContext &context, const TargetData &td)
+ Mangler(MCContext &context, const DataLayout &td)
: Context(context), TD(td), NextAnonGlobalID(1) {}
/// getSymbol - Return the MCSymbol for the specified global value. This
/// symbol is the main label that is the address of the global.
MCSymbol *getSymbol(const GlobalValue *GV);
-
/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
/// and the specified global variable's name. If the global variable doesn't
/// have a name, this fills in a unique name for the global.
void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV,
bool isImplicitlyPrivate);
-
+
/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
/// and the specified name as the global variable name. GVName must not be
/// empty.
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 1816445579ed..12f5c0eb306a 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -343,8 +343,8 @@ class Instruction {
bit isBarrier = 0; // Can control flow fall through this instruction?
bit isCall = 0; // Is this instruction a call instruction?
bit canFoldAsLoad = 0; // Can this be folded as a simple memory operand?
- bit mayLoad = 0; // Is it possible for this inst to read memory?
- bit mayStore = 0; // Is it possible for this inst to write memory?
+ bit mayLoad = ?; // Is it possible for this inst to read memory?
+ bit mayStore = ?; // Is it possible for this inst to write memory?
bit isConvertibleToThreeAddress = 0; // Can this 2-addr instruction promote?
bit isCommutable = 0; // Is this 3 operand instruction commutable?
bit isTerminator = 0; // Is this part of the terminator for a basic block?
@@ -369,7 +369,7 @@ class Instruction {
//
// neverHasSideEffects - Set on an instruction with no pattern if it has no
// side effects.
- bit hasSideEffects = 0;
+ bit hasSideEffects = ?;
bit neverHasSideEffects = 0;
// Is this instruction a "real" instruction (with a distinct machine
@@ -495,7 +495,8 @@ def ptr_rc : PointerLikeRegClass<0>;
/// unknown definition - Mark this operand as being of unknown type, causing
/// it to be resolved by inference in the context it is used.
-def unknown;
+class unknown_class;
+def unknown : unknown_class;
/// AsmOperandClass - Representation for the kinds of operands which the target
/// specific parser can create and the assembly matcher may need to distinguish.
@@ -602,23 +603,31 @@ def f64imm : Operand<f64>;
///
def zero_reg;
+/// OperandWithDefaultOps - This Operand class can be used as the parent class
+/// for an Operand that needs to be initialized with a default value if
+/// no value is supplied in a pattern. This class can be used to simplify the
+/// pattern definitions for instructions that have target specific flags
+/// encoded as immediate operands.
+class OperandWithDefaultOps<ValueType ty, dag defaultops>
+ : Operand<ty> {
+ dag DefaultOps = defaultops;
+}
+
/// PredicateOperand - This can be used to define a predicate operand for an
/// instruction. OpTypes specifies the MIOperandInfo for the operand, and
/// AlwaysVal specifies the value of this predicate when set to "always
/// execute".
class PredicateOperand<ValueType ty, dag OpTypes, dag AlwaysVal>
- : Operand<ty> {
+ : OperandWithDefaultOps<ty, AlwaysVal> {
let MIOperandInfo = OpTypes;
- dag DefaultOps = AlwaysVal;
}
/// OptionalDefOperand - This is used to define a optional definition operand
/// for an instruction. DefaultOps is the register the operand represents if
/// none is supplied, e.g. zero_reg.
class OptionalDefOperand<ValueType ty, dag OpTypes, dag defaultops>
- : Operand<ty> {
+ : OperandWithDefaultOps<ty, defaultops> {
let MIOperandInfo = OpTypes;
- dag DefaultOps = defaultops;
}
@@ -631,6 +640,17 @@ class InstrInfo {
// Sparc manual specifies its instructions in the format [31..0] (big), while
// PowerPC specifies them using the format [0..31] (little).
bit isLittleEndianEncoding = 0;
+
+ // The instruction properties mayLoad, mayStore, and hasSideEffects are unset
+ // by default, and TableGen will infer their value from the instruction
+ // pattern when possible.
+ //
+ // Normally, TableGen will issue an error if it can't infer the value of a
+ // property that hasn't been set explicitly. When guessInstructionProperties
+ // is set, it will guess a safe value instead.
+ //
+ // This option is a temporary migration help. It will go away.
+ bit guessInstructionProperties = 1;
}
// Standard Pseudo Instructions.
@@ -734,6 +754,18 @@ def BUNDLE : Instruction {
let InOperandList = (ins variable_ops);
let AsmString = "BUNDLE";
}
+def LIFETIME_START : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$id);
+ let AsmString = "LIFETIME_START";
+ let neverHasSideEffects = 1;
+}
+def LIFETIME_END : Instruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins i32imm:$id);
+ let AsmString = "LIFETIME_END";
+ let neverHasSideEffects = 1;
+}
}
//===----------------------------------------------------------------------===//
@@ -753,6 +785,10 @@ class AsmParser {
// function of the AsmParser class to call on every matched instruction.
// This can be used to perform target specific instruction post-processing.
string AsmParserInstCleanup = "";
+
+ // ShouldEmitMatchRegisterName - Set to false if the target needs a hand-
+ // written register name matcher.
+ bit ShouldEmitMatchRegisterName = 1;
}
def DefaultAsmParser : AsmParser;
@@ -953,12 +989,64 @@ class Processor<string n, ProcessorItineraries pi, list<SubtargetFeature> f> {
// ProcessorModel allows subtargets to specify the more general
// SchedMachineModel instead of a ProcessorItinerary. Subtargets will
// gradually move to this newer form.
+//
+// Although this class always passes NoItineraries to the Processor
+// class, the SchedMachineModel may still define valid Itineraries.
class ProcessorModel<string n, SchedMachineModel m, list<SubtargetFeature> f>
: Processor<n, NoItineraries, f> {
let SchedModel = m;
}
//===----------------------------------------------------------------------===//
+// InstrMapping - This class is used to create mapping tables to relate
+// instructions with each other based on the values specified in RowFields,
+// ColFields, KeyCol and ValueCols.
+//
+class InstrMapping {
+ // FilterClass - Used to limit search space only to the instructions that
+ // define the relationship modeled by this InstrMapping record.
+ string FilterClass;
+
+ // RowFields - List of fields/attributes that should be same for all the
+ // instructions in a row of the relation table. Think of this as a set of
+ // properties shared by all the instructions related by this relationship
+ // model and is used to categorize instructions into subgroups. For instance,
+ // if we want to define a relation that maps 'Add' instruction to its
+ // predicated forms, we can define RowFields like this:
+ //
+ // let RowFields = BaseOp
+ // All add instructions, predicated or non-predicated, will have to set their
+ // BaseOp to the same value.
+ //
+ // def Add: { let BaseOp = 'ADD'; let predSense = 'nopred' }
+ // def Add_predtrue: { let BaseOp = 'ADD'; let predSense = 'true' }
+ // def Add_predfalse: { let BaseOp = 'ADD'; let predSense = 'false' }
+ list<string> RowFields = [];
+
+ // List of fields/attributes that are same for all the instructions
+ // in a column of the relation table.
+ // Ex: let ColFields = 'predSense' -- It means that the columns are arranged
+ // based on the 'predSense' values. All the instructions in a specific
+ // column have the same value and it is fixed for the column according
+ // to the values set in 'ValueCols'.
+ list<string> ColFields = [];
+
+ // Values for the fields/attributes listed in 'ColFields'.
+ // Ex: let KeyCol = 'nopred' -- It means that the key instruction (instruction
+ // that models this relation) should be non-predicated.
+ // In the example above, 'Add' is the key instruction.
+ list<string> KeyCol = [];
+
+ // List of values for the fields/attributes listed in 'ColFields', one for
+ // each column in the relation table.
+ //
+ // Ex: let ValueCols = [['true'],['false']] -- It adds two columns in the
+ // table. The first column requires all the instructions to have predSense
+ // set to 'true' and the second column requires it to be 'false'.
+ list<list<string> > ValueCols = [];
+}
+
+//===----------------------------------------------------------------------===//
// Pull in the common support for calling conventions.
//
include "llvm/Target/TargetCallingConv.td"
diff --git a/include/llvm/Target/TargetCallingConv.h b/include/llvm/Target/TargetCallingConv.h
index f8cebefb0eae..2160e371bda9 100644
--- a/include/llvm/Target/TargetCallingConv.h
+++ b/include/llvm/Target/TargetCallingConv.h
@@ -113,9 +113,18 @@ namespace ISD {
MVT VT;
bool Used;
+ /// Index of the original Function's argument.
+ unsigned OrigArgIndex;
+
+ /// Offset in bytes of the current input value relative to the beginning of
+ /// the original argument. E.g. if an argument was split into four 32-bit
+ /// registers, we get 4 InputArgs with PartOffsets 0, 4, 8 and 12.
+ unsigned PartOffset;
+
InputArg() : VT(MVT::Other), Used(false) {}
- InputArg(ArgFlagsTy flags, EVT vt, bool used)
- : Flags(flags), Used(used) {
+ InputArg(ArgFlagsTy flags, EVT vt, bool used,
+ unsigned origIdx, unsigned partOffs)
+ : Flags(flags), Used(used), OrigArgIndex(origIdx), PartOffset(partOffs) {
VT = vt.getSimpleVT();
}
};
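
To make OrigArgIndex/PartOffset concrete, a hedged sketch of describing one argument (index 2 of the original Function) that has been split into four 32-bit pieces; Flags and Ins are placeholders for the values a real lowering would already have in LowerFormalArguments:

    for (unsigned Part = 0; Part != 4; ++Part)
      Ins.push_back(ISD::InputArg(Flags, MVT::i32, /*used=*/true,
                                  /*origIdx=*/2, /*partOffs=*/Part * 4));
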
@@ -131,9 +140,19 @@ namespace ISD {
/// IsFixed - Is this a "fixed" value, ie not passed through a vararg "...".
bool IsFixed;
+ /// Index of the original Function's argument.
+ unsigned OrigArgIndex;
+
+ /// Offset in bytes of the current output value relative to the beginning of
+ /// the original argument. E.g. if an argument was split into four 32-bit
+ /// registers, we get 4 OutputArgs with PartOffsets 0, 4, 8 and 12.
+ unsigned PartOffset;
+
OutputArg() : IsFixed(false) {}
- OutputArg(ArgFlagsTy flags, EVT vt, bool isfixed)
- : Flags(flags), IsFixed(isfixed) {
+ OutputArg(ArgFlagsTy flags, EVT vt, bool isfixed,
+ unsigned origIdx, unsigned partOffs)
+ : Flags(flags), IsFixed(isfixed), OrigArgIndex(origIdx),
+ PartOffset(partOffs) {
VT = vt.getSimpleVT();
}
};
diff --git a/include/llvm/Target/TargetELFWriterInfo.h b/include/llvm/Target/TargetELFWriterInfo.h
deleted file mode 100644
index 5e48629cf4d6..000000000000
--- a/include/llvm/Target/TargetELFWriterInfo.h
+++ /dev/null
@@ -1,121 +0,0 @@
-//===-- llvm/Target/TargetELFWriterInfo.h - ELF Writer Info -----*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the TargetELFWriterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_TARGETELFWRITERINFO_H
-#define LLVM_TARGET_TARGETELFWRITERINFO_H
-
-namespace llvm {
-
- //===--------------------------------------------------------------------===//
- // TargetELFWriterInfo
- //===--------------------------------------------------------------------===//
-
- class TargetELFWriterInfo {
- protected:
- // EMachine - This field is the target specific value to emit as the
- // e_machine member of the ELF header.
- unsigned short EMachine;
- bool is64Bit, isLittleEndian;
- public:
-
- // Machine architectures
- enum MachineType {
- EM_NONE = 0, // No machine
- EM_M32 = 1, // AT&T WE 32100
- EM_SPARC = 2, // SPARC
- EM_386 = 3, // Intel 386
- EM_68K = 4, // Motorola 68000
- EM_88K = 5, // Motorola 88000
- EM_486 = 6, // Intel 486 (deprecated)
- EM_860 = 7, // Intel 80860
- EM_MIPS = 8, // MIPS R3000
- EM_PPC = 20, // PowerPC
- EM_ARM = 40, // ARM
- EM_ALPHA = 41, // DEC Alpha
- EM_SPARCV9 = 43, // SPARC V9
- EM_X86_64 = 62, // AMD64
- EM_HEXAGON = 164 // Qualcomm Hexagon
- };
-
- // ELF File classes
- enum {
- ELFCLASS32 = 1, // 32-bit object file
- ELFCLASS64 = 2 // 64-bit object file
- };
-
- // ELF Endianess
- enum {
- ELFDATA2LSB = 1, // Little-endian object file
- ELFDATA2MSB = 2 // Big-endian object file
- };
-
- explicit TargetELFWriterInfo(bool is64Bit_, bool isLittleEndian_);
- virtual ~TargetELFWriterInfo();
-
- unsigned short getEMachine() const { return EMachine; }
- unsigned getEFlags() const { return 0; }
- unsigned getEIClass() const { return is64Bit ? ELFCLASS64 : ELFCLASS32; }
- unsigned getEIData() const {
- return isLittleEndian ? ELFDATA2LSB : ELFDATA2MSB;
- }
-
- /// ELF Header and ELF Section Header Info
- unsigned getHdrSize() const { return is64Bit ? 64 : 52; }
- unsigned getSHdrSize() const { return is64Bit ? 64 : 40; }
-
- /// Symbol Table Info
- unsigned getSymTabEntrySize() const { return is64Bit ? 24 : 16; }
-
- /// getPrefELFAlignment - Returns the preferred alignment for ELF. This
- /// is used to align some sections.
- unsigned getPrefELFAlignment() const { return is64Bit ? 8 : 4; }
-
- /// getRelocationEntrySize - Entry size used in the relocation section
- unsigned getRelocationEntrySize() const {
- return is64Bit ? (hasRelocationAddend() ? 24 : 16)
- : (hasRelocationAddend() ? 12 : 8);
- }
-
- /// getRelocationType - Returns the target specific ELF Relocation type.
- /// 'MachineRelTy' contains the object code independent relocation type
- virtual unsigned getRelocationType(unsigned MachineRelTy) const = 0;
-
- /// hasRelocationAddend - True if the target uses an addend in the
- /// ELF relocation entry.
- virtual bool hasRelocationAddend() const = 0;
-
- /// getDefaultAddendForRelTy - Gets the default addend value for a
- /// relocation entry based on the target ELF relocation type.
- virtual long int getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier = 0) const = 0;
-
- /// getRelTySize - Returns the size of relocatable field in bits
- virtual unsigned getRelocationTySize(unsigned RelTy) const = 0;
-
- /// isPCRelativeRel - True if the relocation type is pc relative
- virtual bool isPCRelativeRel(unsigned RelTy) const = 0;
-
- /// getJumpTableRelocationTy - Returns the machine relocation type used
- /// to reference a jumptable.
- virtual unsigned getAbsoluteLabelMachineRelTy() const = 0;
-
- /// computeRelocation - Some relocatable fields could be relocated
- /// directly, avoiding the relocation symbol emission, compute the
- /// final relocation value for this symbol.
- virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
- unsigned RelTy) const = 0;
- };
-
-} // end llvm namespace
-
-#endif // LLVM_TARGET_TARGETELFWRITERINFO_H
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index da30ab82d6c2..4570813ba6c2 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -45,8 +45,8 @@ template<class T> class SmallVectorImpl;
/// TargetInstrInfo - Interface to description of machine instruction set
///
class TargetInstrInfo : public MCInstrInfo {
- TargetInstrInfo(const TargetInstrInfo &); // DO NOT IMPLEMENT
- void operator=(const TargetInstrInfo &); // DO NOT IMPLEMENT
+ TargetInstrInfo(const TargetInstrInfo &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetInstrInfo &) LLVM_DELETED_FUNCTION;
public:
TargetInstrInfo(int CFSetupOpcode = -1, int CFDestroyOpcode = -1)
: CallFrameSetupOpcode(CFSetupOpcode),
@@ -459,6 +459,13 @@ public:
}
/// copyPhysReg - Emit instructions to copy a pair of physical registers.
+ ///
+ /// This function should support copies within any legal register class as
+ /// well as any cross-class copies created during instruction selection.
+ ///
+ /// The source and destination registers may overlap, which may require a
+ /// careful implementation when multiple copy instructions are required for
+ /// large registers. See for example the ARM target.
virtual void copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
@@ -794,29 +801,6 @@ public:
const MachineInstr *UseMI, unsigned UseIdx,
bool FindMin = false) const;
- /// computeOperandLatency - Compute and return the latency of the given data
- /// dependent def and use. DefMI must be a valid def. UseMI may be NULL for
- /// an unknown use. If the subtarget allows, this may or may not need to call
- /// getOperandLatency().
- ///
- /// FindMin may be set to get the minimum vs. expected latency. Minimum
- /// latency is used for scheduling groups, while expected latency is for
- /// instruction cost and critical path.
- unsigned computeOperandLatency(const InstrItineraryData *ItinData,
- const TargetRegisterInfo *TRI,
- const MachineInstr *DefMI,
- const MachineInstr *UseMI,
- unsigned Reg, bool FindMin) const;
-
- /// getOutputLatency - Compute and return the output dependency latency of a
- /// a given pair of defs which both target the same register. This is usually
- /// one.
- virtual unsigned getOutputLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *DepMI) const {
- return 1;
- }
-
/// getInstrLatency - Compute the instruction latency of a given instruction.
/// If the instruction has higher cost when predicated, it's returned via
/// PredCost.
@@ -831,6 +815,9 @@ public:
unsigned defaultDefLatency(const MCSchedModel *SchedModel,
const MachineInstr *DefMI) const;
+ int computeDefOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, bool FindMin) const;
+
/// isHighLatencyDef - Return true if this opcode has high latency to its
/// result.
virtual bool isHighLatencyDef(int opc) const { return false; }
diff --git a/include/llvm/Target/TargetIntrinsicInfo.h b/include/llvm/Target/TargetIntrinsicInfo.h
index c44b9230c0d8..ce213496935d 100644
--- a/include/llvm/Target/TargetIntrinsicInfo.h
+++ b/include/llvm/Target/TargetIntrinsicInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_TARGET_TARGETINTRINSICINFO_H
#define LLVM_TARGET_TARGETINTRINSICINFO_H
+#include "llvm/Support/Compiler.h"
#include <string>
namespace llvm {
@@ -27,8 +28,8 @@ class Type;
/// TargetIntrinsicInfo - Interface to description of machine instruction set
///
class TargetIntrinsicInfo {
- TargetIntrinsicInfo(const TargetIntrinsicInfo &); // DO NOT IMPLEMENT
- void operator=(const TargetIntrinsicInfo &); // DO NOT IMPLEMENT
+ TargetIntrinsicInfo(const TargetIntrinsicInfo &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetIntrinsicInfo &) LLVM_DELETED_FUNCTION;
public:
TargetIntrinsicInfo();
virtual ~TargetIntrinsicInfo();
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index ea2874f440f7..a2c97d782e29 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -18,6 +18,26 @@ namespace llvm {
namespace LibFunc {
enum Func {
+ /// void operator delete[](void*);
+ ZdaPv,
+ /// void operator delete(void*);
+ ZdlPv,
+ /// void *new[](unsigned int);
+ Znaj,
+ /// void *new[](unsigned int, nothrow);
+ ZnajRKSt9nothrow_t,
+ /// void *new[](unsigned long);
+ Znam,
+ /// void *new[](unsigned long, nothrow);
+ ZnamRKSt9nothrow_t,
+ /// void *new(unsigned int);
+ Znwj,
+ /// void *new(unsigned int, nothrow);
+ ZnwjRKSt9nothrow_t,
+ /// void *new(unsigned long);
+ Znwm,
+ /// void *new(unsigned long, nothrow);
+ ZnwmRKSt9nothrow_t,
/// int __cxa_atexit(void (*f)(void *), void *p, void *d);
cxa_atexit,
/// void __cxa_guard_abort(guard_t *guard);
@@ -33,12 +53,24 @@ namespace llvm {
acos,
/// float acosf(float x);
acosf,
+ /// double acosh(double x);
+ acosh,
+ /// float acoshf(float x);
+ acoshf,
+ /// long double acoshl(long double x);
+ acoshl,
/// long double acosl(long double x);
acosl,
/// double asin(double x);
asin,
/// float asinf(float x);
asinf,
+ /// double asinh(double x);
+ asinh,
+ /// float asinhf(float x);
+ asinhf,
+ /// long double asinhl(long double x);
+ asinhl,
/// long double asinl(long double x);
asinl,
/// double atan(double x);
@@ -51,8 +83,22 @@ namespace llvm {
atan2l,
/// float atanf(float x);
atanf,
+ /// double atanh(double x);
+ atanh,
+ /// float atanhf(float x);
+ atanhf,
+ /// long double atanhl(long double x);
+ atanhl,
/// long double atanl(long double x);
atanl,
+ /// void *calloc(size_t count, size_t size);
+ calloc,
+ /// double cbrt(double x);
+ cbrt,
+ /// float cbrtf(float x);
+ cbrtf,
+ /// long double cbrtl(long double x);
+ cbrtl,
/// double ceil(double x);
ceil,
/// float ceilf(float x);
@@ -79,6 +125,12 @@ namespace llvm {
cosl,
/// double exp(double x);
exp,
+ /// double exp10(double x);
+ exp10,
+ /// float exp10f(float x);
+ exp10f,
+ /// long double exp10l(long double x);
+ exp10l,
/// double exp2(double x);
exp2,
/// float exp2f(float x);
@@ -119,6 +171,8 @@ namespace llvm {
fputc,
/// int fputs(const char *s, FILE *stream);
fputs,
+ /// void free(void *ptr);
+ free,
/// size_t fwrite(const void *ptr, size_t size, size_t nitems,
/// FILE *stream);
fwrite,
@@ -144,10 +198,18 @@ namespace llvm {
log2f,
/// long double log2l(long double x);
log2l,
+ /// double logb(double x);
+ logb,
+ /// float logbf(float x);
+ logbf,
+ /// long double logbl(long double x);
+ logbl,
/// float logf(float x);
logf,
/// long double logl(long double x);
logl,
+ /// void *malloc(size_t size);
+ malloc,
/// void *memchr(const void *s, int c, size_t n);
memchr,
/// int memcmp(const void *s1, const void *s2, size_t n);
@@ -166,6 +228,8 @@ namespace llvm {
nearbyintf,
/// long double nearbyintl(long double x);
nearbyintl,
+ /// int posix_memalign(void **memptr, size_t alignment, size_t size);
+ posix_memalign,
/// double pow(double x, double y);
pow,
/// float powf(float x, float y);
@@ -176,6 +240,10 @@ namespace llvm {
putchar,
/// int puts(const char *s);
puts,
+ /// void *realloc(void *ptr, size_t size);
+ realloc,
+ /// void *reallocf(void *ptr, size_t size);
+ reallocf,
/// double rint(double x);
rint,
/// float rintf(float x);
@@ -208,12 +276,20 @@ namespace llvm {
sqrtf,
/// long double sqrtl(long double x);
sqrtl,
+ /// char *stpcpy(char *s1, const char *s2);
+ stpcpy,
/// char *strcat(char *s1, const char *s2);
strcat,
/// char *strchr(const char *s, int c);
strchr,
+ /// int strcmp(const char *s1, const char *s2);
+ strcmp,
/// char *strcpy(char *s1, const char *s2);
strcpy,
+ /// size_t strcspn(const char *s1, const char *s2);
+ strcspn,
+ /// char *strdup(const char *s1);
+ strdup,
/// size_t strlen(const char *s);
strlen,
/// char *strncat(char *s1, const char *s2, size_t n);
@@ -222,8 +298,33 @@ namespace llvm {
strncmp,
/// char *strncpy(char *s1, const char *s2, size_t n);
strncpy,
+ /// char *strndup(const char *s1, size_t n);
+ strndup,
/// size_t strnlen(const char *s, size_t maxlen);
strnlen,
+ /// char *strpbrk(const char *s1, const char *s2);
+ strpbrk,
+ /// char *strrchr(const char *s, int c);
+ strrchr,
+ /// size_t strspn(const char *s1, const char *s2);
+ strspn,
+ /// char *strstr(const char *s1, const char *s2);
+ strstr,
+ /// double strtod(const char *nptr, char **endptr);
+ strtod,
+ /// float strtof(const char *nptr, char **endptr);
+ strtof,
+ /// long int strtol(const char *nptr, char **endptr, int base);
+ strtol,
+ /// long double strtold(const char *nptr, char **endptr);
+ strtold,
+ /// long long int strtoll(const char *nptr, char **endptr, int base);
+ strtoll,
+ /// unsigned long int strtoul(const char *nptr, char **endptr, int base);
+ strtoul,
+ /// unsigned long long int strtoull(const char *nptr, char **endptr,
+ /// int base);
+ strtoull,
/// double tan(double x);
tan,
/// float tanf(float x);
@@ -242,6 +343,8 @@ namespace llvm {
truncf,
/// long double truncl(long double x);
truncl,
+ /// void *valloc(size_t size);
+ valloc,
NumLibFuncs
};
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index acf0419510e9..580a30fcd2d8 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -22,9 +22,11 @@
#ifndef LLVM_TARGET_TARGETLOWERING_H
#define LLVM_TARGET_TARGETLOWERING_H
+#include "llvm/AddressingMode.h"
#include "llvm/CallingConv.h"
#include "llvm/InlineAsm.h"
#include "llvm/Attributes.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/CallSite.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
@@ -49,7 +51,7 @@ namespace llvm {
class MCContext;
class MCExpr;
template<typename T> class SmallVectorImpl;
- class TargetData;
+ class DataLayout;
class TargetRegisterClass;
class TargetLibraryInfo;
class TargetLoweringObjectFile;
@@ -76,8 +78,8 @@ namespace llvm {
/// target-specific constructs to SelectionDAG operators.
///
class TargetLowering {
- TargetLowering(const TargetLowering&); // DO NOT IMPLEMENT
- void operator=(const TargetLowering&); // DO NOT IMPLEMENT
+ TargetLowering(const TargetLowering&) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetLowering&) LLVM_DELETED_FUNCTION;
public:
/// LegalizeAction - This enum indicates whether operations are valid for a
/// target, and if not, what action should be used to make them valid.
@@ -101,12 +103,24 @@ public:
TypeWidenVector // This vector should be widened into a larger vector.
};
+ /// LegalizeKind holds the legalization kind that needs to happen to EVT
+ /// in order to type-legalize it.
+ typedef std::pair<LegalizeTypeAction, EVT> LegalizeKind;
+
enum BooleanContent { // How the target represents true/false values.
UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage.
ZeroOrOneBooleanContent, // All bits zero except for bit 0.
ZeroOrNegativeOneBooleanContent // All bits equal to bit 0.
};
+ enum SelectSupportKind {
+ ScalarValSelect, // The target supports scalar selects (ex: cmov).
+ ScalarCondVectorVal, // The target supports selects with a scalar condition
+ // and vector values (ex: cmov).
+ VectorMaskSelect // The target supports vector selects with a vector
+ // mask (ex: x86 blends).
+ };
+
static ISD::NodeType getExtendForContent(BooleanContent Content) {
switch (Content) {
case UndefinedBooleanContent:
@@ -128,22 +142,37 @@ public:
virtual ~TargetLowering();
const TargetMachine &getTargetMachine() const { return TM; }
- const TargetData *getTargetData() const { return TD; }
+ const DataLayout *getDataLayout() const { return TD; }
const TargetLoweringObjectFile &getObjFileLowering() const { return TLOF; }
bool isBigEndian() const { return !IsLittleEndian; }
bool isLittleEndian() const { return IsLittleEndian; }
- MVT getPointerTy() const { return PointerTy; }
+ // Return the pointer type for the given address space, defaults to
+ // the pointer type from the data layout.
+ // FIXME: The default needs to be removed once all the code is updated.
+ virtual MVT getPointerTy(uint32_t AS = 0) const { return PointerTy; }
virtual MVT getShiftAmountTy(EVT LHSTy) const;
/// isSelectExpensive - Return true if the select operation is expensive for
/// this target.
bool isSelectExpensive() const { return SelectIsExpensive; }
+ virtual bool isSelectSupported(SelectSupportKind kind) const { return true; }
+
/// isIntDivCheap() - Return true if integer divide is usually cheaper than
/// a sequence of several shifts, adds, and multiplies for this target.
bool isIntDivCheap() const { return IntDivIsCheap; }
+ /// isSlowDivBypassed - Returns true if target has indicated at least one
+ /// type should be bypassed.
+ bool isSlowDivBypassed() const { return !BypassSlowDivWidths.empty(); }
+
+ /// getBypassSlowDivWidths - Returns the map of slow bit widths for division
+ /// or remainder with the corresponding fast widths.
+ const DenseMap<unsigned int, unsigned int> &getBypassSlowDivWidths() const {
+ return BypassSlowDivWidths;
+ }
+
/// isPow2DivCheap() - Return true if pow2 div is cheaper than a chain of
/// srl/add/sra.
bool isPow2DivCheap() const { return Pow2DivIsCheap; }
@@ -382,6 +411,13 @@ public:
getOperationAction(Op, VT) == Custom);
}
+ /// isOperationExpand - Return true if the specified operation is illegal on
+ /// this target or unlikely to be made legal with custom lowering. This is
+ /// used to help guide high-level lowering decisions.
+ bool isOperationExpand(unsigned Op, EVT VT) const {
+ return (!isTypeLegal(VT) || getOperationAction(Op, VT) == Expand);
+ }
+
/// isOperationLegal - Return true if the specified operation is legal on this
/// target.
bool isOperationLegal(unsigned Op, EVT VT) const {
@@ -475,8 +511,12 @@ public:
assert((unsigned)CC < array_lengthof(CondCodeActions) &&
(unsigned)VT.getSimpleVT().SimpleTy < sizeof(CondCodeActions[0])*4 &&
"Table isn't big enough!");
+ /// The lower 5 bits of SimpleTy select the Nth 2-bit field within the 64-bit
+ /// value, and the upper 27 bits index into the second dimension of the
+ /// array to select which 64-bit value to use.
LegalizeAction Action = (LegalizeAction)
- ((CondCodeActions[CC] >> (2*VT.getSimpleVT().SimpleTy)) & 3);
+ ((CondCodeActions[CC][VT.getSimpleVT().SimpleTy >> 5]
+ >> (2*(VT.getSimpleVT().SimpleTy & 0x1F))) & 3);
assert(Action != Promote && "Can't promote condition code!");
return Action;
}
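
A small worked example of the indexing scheme described in the comment above, using SimpleTy == 37 (any value works the same way):

    unsigned SimpleTy = 37;
    unsigned Word  = SimpleTy >> 5;          // 37 / 32 == 1: second uint64_t
    unsigned Shift = 2 * (SimpleTy & 0x1F);  // 2 * 5  == 10: offset within it
    // Action == (CondCodeActions[CC][Word] >> Shift) & 3
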
@@ -533,6 +573,7 @@ public:
}
return EVT::getEVT(Ty, AllowUnknown);
}
+
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
@@ -686,6 +727,12 @@ public:
return SupportJumpTables;
}
+ /// getMinimumJumpTableEntries - Return the minimum number of case blocks
+ /// required to emit a jump table rather than an if sequence.
+ int getMinimumJumpTableEntries() const {
+ return MinimumJumpTableEntries;
+ }
+
/// getStackPointerRegisterToSaveRestore - If a physical register, this
/// specifies the register that llvm.savestack/llvm.restorestack should save
/// and restore.
@@ -1006,6 +1053,12 @@ protected:
SupportJumpTables = Val;
}
+ /// setMinimumJumpTableEntries - Indicate the minimum number of case blocks
+ /// needed to generate a jump table rather than an if sequence.
+ void setMinimumJumpTableEntries(int Val) {
+ MinimumJumpTableEntries = Val;
+ }
+
/// setStackPointerRegisterToSaveRestore - If set to a physical register, this
/// specifies the register that llvm.savestack/llvm.restorestack should save
/// and restore.
@@ -1045,6 +1098,11 @@ protected:
/// of instructions not containing an integer divide.
void setIntDivIsCheap(bool isCheap = true) { IntDivIsCheap = isCheap; }
+ /// addBypassSlowDiv - Tells the code generator which bitwidths to bypass.
+ void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth) {
+ BypassSlowDivWidths[SlowBitWidth] = FastBitWidth;
+ }
+
/// setPow2DivIsCheap - Tells the code generator that it shouldn't generate
/// srl/add/sra for a signed divide by power of two, and let the target handle
/// it.
@@ -1127,8 +1185,13 @@ protected:
assert(VT < MVT::LAST_VALUETYPE &&
(unsigned)CC < array_lengthof(CondCodeActions) &&
"Table isn't big enough!");
- CondCodeActions[(unsigned)CC] &= ~(uint64_t(3UL) << VT.SimpleTy*2);
- CondCodeActions[(unsigned)CC] |= (uint64_t)Action << VT.SimpleTy*2;
+ /// The lower 5 bits of SimpleTy select the Nth 2-bit field within a 64-bit
+ /// value, and the remaining upper bits index into the second dimension of
+ /// the array to select which 64-bit value to use.
+ CondCodeActions[(unsigned)CC][VT.SimpleTy >> 5]
+ &= ~(uint64_t(3UL) << (VT.SimpleTy & 0x1F)*2);
+ CondCodeActions[(unsigned)CC][VT.SimpleTy >> 5]
+ |= (uint64_t)Action << (VT.SimpleTy & 0x1F)*2;
}
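
The packing scheme used by get/setCondCodeAction is easier to see with the indexing written out. The following is a small, self-contained sketch (not LLVM code; the table size and the Action enum are illustrative stand-ins) showing how the lower five bits of the type index pick a 2-bit field and the upper bits pick the 64-bit word:

#include <cassert>
#include <cstdint>
#include <cstdio>

enum Action { Legal = 0, Promote = 1, Expand = 2, Custom = 3 };

static const unsigned NumTypes = 200;           // stand-in for MVT::LAST_VALUETYPE
static uint64_t Table[(NumTypes / 32) + 1];     // one 64-bit word per 32 types

void setAction(unsigned Ty, Action A) {
  assert(Ty < NumTypes);
  unsigned Word = Ty >> 5;                      // upper bits pick the 64-bit word
  unsigned Shift = 2 * (Ty & 0x1F);             // lower 5 bits pick the 2-bit field
  Table[Word] &= ~(uint64_t(3) << Shift);       // clear the old 2-bit entry
  Table[Word] |= uint64_t(A) << Shift;          // store the new action
}

Action getAction(unsigned Ty) {
  assert(Ty < NumTypes);
  return Action((Table[Ty >> 5] >> (2 * (Ty & 0x1F))) & 3);
}

int main() {
  setAction(70, Expand);                        // index 70: word 2, bit offset 12
  std::printf("%d\n", getAction(70));           // prints 2 (Expand)
}
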
/// AddPromotedToType - If Opc/OrigVT is specified as being promoted, the
@@ -1201,7 +1264,7 @@ protected:
public:
//===--------------------------------------------------------------------===//
// Lowering methods - These methods must be implemented by targets so that
- // the SelectionDAGLowering code knows how to lower these.
+ // the SelectionDAGBuilder code knows how to lower these.
//
/// LowerFormalArguments - This hook must be implemented to lower the
@@ -1271,9 +1334,9 @@ public:
FunctionType *FTy, bool isTailCall, SDValue callee,
ArgListTy &args, SelectionDAG &dag, DebugLoc dl,
ImmutableCallSite &cs)
- : Chain(chain), RetTy(retTy), RetSExt(cs.paramHasAttr(0, Attribute::SExt)),
- RetZExt(cs.paramHasAttr(0, Attribute::ZExt)), IsVarArg(FTy->isVarArg()),
- IsInReg(cs.paramHasAttr(0, Attribute::InReg)),
+ : Chain(chain), RetTy(retTy), RetSExt(cs.paramHasAttr(0, Attributes::SExt)),
+ RetZExt(cs.paramHasAttr(0, Attributes::ZExt)), IsVarArg(FTy->isVarArg()),
+ IsInReg(cs.paramHasAttr(0, Attributes::InReg)),
DoesNotReturn(cs.doesNotReturn()),
IsReturnValueUsed(!cs.getInstruction()->use_empty()),
IsTailCall(isTailCall), NumFixedArgs(FTy->getNumParams()),
@@ -1314,7 +1377,7 @@ public:
}
/// HandleByVal - Target-specific cleanup for formal ByVal parameters.
- virtual void HandleByVal(CCState *, unsigned &) const {}
+ virtual void HandleByVal(CCState *, unsigned &, unsigned) const {}
/// CanLowerReturn - This hook should be implemented to check whether the
/// return values described by the Outs array can fit into the return
@@ -1584,22 +1647,6 @@ public:
// Addressing mode description hooks (used by LSR etc).
//
- /// AddrMode - This represents an addressing mode of:
- /// BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
- /// If BaseGV is null, there is no BaseGV.
- /// If BaseOffs is zero, there is no base offset.
- /// If HasBaseReg is false, there is no base register.
- /// If Scale is zero, there is no ScaleReg. Scale of 1 indicates a reg with
- /// no scale.
- ///
- struct AddrMode {
- GlobalValue *BaseGV;
- int64_t BaseOffs;
- bool HasBaseReg;
- int64_t Scale;
- AddrMode() : BaseGV(0), BaseOffs(0), HasBaseReg(false), Scale(0) {}
- };
-
/// GetAddrModeArguments - CodeGenPrepare sinks address calculations into the
/// same BB as Load/Store instructions reading the address. This allows as
/// much computation as possible to be done in the address mode for that
@@ -1741,10 +1788,11 @@ public:
private:
const TargetMachine &TM;
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLoweringObjectFile &TLOF;
- /// PointerTy - The type to use for pointers, usually i32 or i64.
+ /// PointerTy - The type to use for pointers for the default address space,
+ /// usually i32 or i64.
///
MVT PointerTy;
@@ -1762,6 +1810,12 @@ private:
/// set to true unconditionally.
bool IntDivIsCheap;
+ /// BypassSlowDivWidths - Tells the code generator to bypass slow divide or
+ /// remainder instructions. For example, a map entry from 32 to 8 tells the
+ /// code generator to bypass 32-bit integer div/rem with an 8-bit unsigned
+ /// integer div/rem when the operands are positive and less than 256.
+ DenseMap <unsigned int, unsigned int> BypassSlowDivWidths;
+
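
For illustration, here is a minimal stand-alone sketch of how this map is meant to be used. It substitutes std::map for llvm::DenseMap so it compiles on its own, and the configureTarget/isSlowDivBypassed functions are hypothetical stand-ins for addBypassSlowDiv() and isSlowDivBypassed():

#include <cstdio>
#include <map>

// Key: slow bit width, value: fast bit width to try first.
std::map<unsigned, unsigned> BypassSlowDivWidths;

// What a (hypothetical) target constructor might do: bypass 32-bit div/rem
// with an 8-bit unsigned divide when both operands fit in 8 bits.
void configureTarget() { BypassSlowDivWidths[32] = 8; }

// What the codegen-prepare-style transform conceptually checks.
bool isSlowDivBypassed() { return !BypassSlowDivWidths.empty(); }

int main() {
  configureTarget();
  for (std::map<unsigned, unsigned>::const_iterator I = BypassSlowDivWidths.begin(),
       E = BypassSlowDivWidths.end(); I != E; ++I)
    std::printf("bypass %u-bit div with %u-bit div\n", I->first, I->second);
  return isSlowDivBypassed() ? 0 : 1;
}
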
/// Pow2DivIsCheap - Tells the code generator that it shouldn't generate
/// srl/add/sra for a signed divide by power of two, and let the target handle
/// it.
@@ -1784,6 +1838,9 @@ private:
/// If it's not true, then each jumptable must be lowered into if-then-else's.
bool SupportJumpTables;
+ /// MinimumJumpTableEntries - Number of blocks threshold to use jump tables.
+ int MinimumJumpTableEntries;
+
/// BooleanContents - Information about the contents of the high-bits in
/// boolean values held in a type wider than i1. See getBooleanContents.
BooleanContent BooleanContents;
@@ -1901,12 +1958,14 @@ private:
/// CondCodeActions - For each condition code (ISD::CondCode) keep a
/// LegalizeAction that indicates how instruction selection should
/// deal with the condition code.
- uint64_t CondCodeActions[ISD::SETCC_INVALID];
+ /// Because each CC action takes up 2 bits, we need to have the array size
+ /// be large enough to fit all of the value types. This can be done by
+ /// dividing the MVT::LAST_VALUETYPE by 32 and adding one.
+ uint64_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE / 32) + 1];
ValueTypeActionImpl ValueTypeActions;
- typedef std::pair<LegalizeTypeAction, EVT> LegalizeKind;
-
+public:
LegalizeKind
getTypeConversion(LLVMContext &Context, EVT VT) const {
// If this is a simple type, use the ComputeRegisterProp mechanism.
@@ -1921,6 +1980,9 @@ private:
ValueTypeActions.getTypeAction(NVT.getSimpleVT()) != TypePromoteInteger)
&& "Promote may not follow Expand or Promote");
+ if (LA == TypeSplitVector)
+ NVT = EVT::getVectorVT(Context, VT.getVectorElementType(),
+ VT.getVectorNumElements() / 2);
return LegalizeKind(LA, NVT);
}
@@ -2023,6 +2085,7 @@ private:
return LegalizeKind(TypeSplitVector, NVT);
}
+private:
std::vector<std::pair<EVT, const TargetRegisterClass*> > AvailableRegClasses;
/// TargetDAGCombineArray - Targets can specify ISD nodes that they would
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index d631f58aab74..13a6fe37d7a9 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -33,10 +33,11 @@ namespace llvm {
class TargetLoweringObjectFile : public MCObjectFileInfo {
MCContext *Ctx;
-
- TargetLoweringObjectFile(const TargetLoweringObjectFile&); // DO NOT IMPLEMENT
- void operator=(const TargetLoweringObjectFile&); // DO NOT IMPLEMENT
-
+
+ TargetLoweringObjectFile(
+ const TargetLoweringObjectFile&) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetLoweringObjectFile&) LLVM_DELETED_FUNCTION;
+
public:
MCContext &getContext() const { return *Ctx; }
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index e4bf32bd86c8..50066473b552 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -17,6 +17,8 @@
#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Target/TargetTransformImpl.h"
#include "llvm/ADT/StringRef.h"
#include <cassert>
#include <string>
@@ -31,8 +33,7 @@ class MCCodeGenInfo;
class MCContext;
class PassManagerBase;
class Target;
-class TargetData;
-class TargetELFWriterInfo;
+class DataLayout;
class TargetFrameLowering;
class TargetInstrInfo;
class TargetIntrinsicInfo;
@@ -52,8 +53,8 @@ class raw_ostream;
/// through this interface.
///
class TargetMachine {
- TargetMachine(const TargetMachine &); // DO NOT IMPLEMENT
- void operator=(const TargetMachine &); // DO NOT IMPLEMENT
+ TargetMachine(const TargetMachine &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetMachine &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
TargetMachine(const Target &T, StringRef TargetTriple,
StringRef CPU, StringRef FS, const TargetOptions &Options);
@@ -106,7 +107,11 @@ public:
virtual const TargetFrameLowering *getFrameLowering() const { return 0; }
virtual const TargetLowering *getTargetLowering() const { return 0; }
virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const{ return 0; }
- virtual const TargetData *getTargetData() const { return 0; }
+ virtual const DataLayout *getDataLayout() const { return 0; }
+ virtual const ScalarTargetTransformInfo*
+ getScalarTargetTransformInfo() const { return 0; }
+ virtual const VectorTargetTransformInfo*
+ getVectorTargetTransformInfo() const { return 0; }
/// getMCAsmInfo - Return target specific asm information.
///
@@ -142,11 +147,6 @@ public:
return 0;
}
- /// getELFWriterInfo - If this target supports an ELF writer, return
- /// information for it, otherwise return null.
- ///
- virtual const TargetELFWriterInfo *getELFWriterInfo() const { return 0; }
-
/// hasMCRelaxAll - Check whether all machine code instructions should be
/// relaxed.
bool hasMCRelaxAll() const { return MCRelaxAll; }
diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h
index f0b181e345b7..516e0706b897 100644
--- a/include/llvm/Target/TargetOpcodes.h
+++ b/include/llvm/Target/TargetOpcodes.h
@@ -87,7 +87,11 @@ namespace TargetOpcode {
/// BUNDLE - This instruction represents an instruction bundle. Instructions
/// which immediately follow a BUNDLE instruction which are marked with
/// 'InsideBundle' flag are inside the bundle.
- BUNDLE
+ BUNDLE = 14,
+
+ /// Lifetime markers.
+ LIFETIME_START = 15,
+ LIFETIME_END = 16
};
} // end namespace TargetOpcode
} // end namespace llvm
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index d1a07d1480b4..68ca5678369a 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -155,6 +155,10 @@ namespace llvm {
/// automatically realigned, if needed.
unsigned RealignStack : 1;
+ /// SSPBufferSize - The minimum size of buffers that will receive stack
+ /// smashing protection when -fstack-protector is used.
+ unsigned SSPBufferSize;
+
/// EnableFastISel - This flag enables fast-path instruction selection
/// which trades away generated code quality in favor of reducing
/// compile time.
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index df4d900e4c8e..afa2ee27443a 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -221,13 +221,17 @@ public:
private:
const TargetRegisterInfoDesc *InfoDesc; // Extra desc array for codegen
const char *const *SubRegIndexNames; // Names of subreg indexes.
+ // Pointer to array of lane masks, one per sub-reg index.
+ const unsigned *SubRegIndexLaneMasks;
+
regclass_iterator RegClassBegin, RegClassEnd; // List of regclasses
protected:
TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
regclass_iterator RegClassBegin,
regclass_iterator RegClassEnd,
- const char *const *subregindexnames);
+ const char *const *SRINames,
+ const unsigned *SRILaneMasks);
virtual ~TargetRegisterInfo();
public:
@@ -327,10 +331,36 @@ public:
/// getSubRegIndexName - Return the human-readable symbolic target-specific
/// name for the specified SubRegIndex.
const char *getSubRegIndexName(unsigned SubIdx) const {
- assert(SubIdx && "This is not a subregister index");
+ assert(SubIdx && SubIdx < getNumSubRegIndices() &&
+ "This is not a subregister index");
return SubRegIndexNames[SubIdx-1];
}
+ /// getSubRegIndexLaneMask - Return a bitmask representing the parts of a
+ /// register that are covered by SubIdx.
+ ///
+ /// Lane masks for sub-register indices are similar to register units for
+ /// physical registers. The individual bits in a lane mask can't be assigned
+ /// any specific meaning. They can be used to check if two sub-register
+ /// indices overlap.
+ ///
+ /// If the target has a register such that:
+ ///
+ /// getSubReg(Reg, A) overlaps getSubReg(Reg, B)
+ ///
+ /// then:
+ ///
+ /// getSubRegIndexLaneMask(A) & getSubRegIndexLaneMask(B) != 0
+ ///
+ /// The converse is not necessarily true. If two lane masks have a common
+ /// bit, the corresponding sub-registers may not overlap, but it can be
+ /// assumed that they usually will.
+ unsigned getSubRegIndexLaneMask(unsigned SubIdx) const {
+ // SubIdx == 0 is allowed, it has the lane mask ~0u.
+ assert(SubIdx < getNumSubRegIndices() && "This is not a subregister index");
+ return SubRegIndexLaneMasks[SubIdx];
+ }
+
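
A short stand-alone sketch of the overlap test implied by the comment above; the mask values here are invented for illustration, whereas real ones are generated by TableGen per sub-register index:

#include <cstdio>

unsigned getSubRegIndexLaneMask(unsigned SubIdx) {
  // Index 0 means the whole register (mask ~0u); the rest are made up.
  static const unsigned Masks[] = { ~0u, 0x1, 0x2, 0x3 };
  return Masks[SubIdx];
}

// Two sub-register indices may overlap only if their lane masks share a bit.
bool mayOverlap(unsigned A, unsigned B) {
  return (getSubRegIndexLaneMask(A) & getSubRegIndexLaneMask(B)) != 0;
}

int main() {
  std::printf("%d\n", mayOverlap(1, 2)); // 0: disjoint lanes, cannot overlap
  std::printf("%d\n", mayOverlap(1, 3)); // 1: shared lane bit, may overlap
}
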
/// regsOverlap - Returns true if the two registers are equal or alias each
/// other. The registers may be virtual register.
bool regsOverlap(unsigned regA, unsigned regB) const {
@@ -416,18 +446,6 @@ public:
return MCRegisterInfo::getMatchingSuperReg(Reg, SubIdx, RC->MC);
}
- /// canCombineSubRegIndices - Given a register class and a list of
- /// subregister indices, return true if it's possible to combine the
- /// subregister indices into one that corresponds to a larger
- /// subregister. Return the new subregister index by reference. Note the
- /// new index may be zero if the given subregisters can be combined to
- /// form the whole register.
- virtual bool canCombineSubRegIndices(const TargetRegisterClass *RC,
- SmallVectorImpl<unsigned> &SubIndices,
- unsigned &NewSubIdx) const {
- return 0;
- }
-
/// getMatchingSuperRegClass - Return a subclass of the specified register
/// class A so that each register in it has a sub-register of the
/// specified sub-register index which is in the specified register class B.
@@ -458,6 +476,8 @@ public:
/// composeSubRegIndices - Return the subregister index you get from composing
/// two subregister indices.
///
+ /// The special null sub-register index composes as the identity.
+ ///
/// If R:a:b is the same register as R:c, then composeSubRegIndices(a, b)
/// returns c. Note that composeSubRegIndices does not tell you about illegal
/// compositions. If R does not have a subreg a, or R:a does not have a subreg
@@ -467,11 +487,19 @@ public:
/// ssub_0:S0 - ssub_3:S3 subregs.
/// If you compose subreg indices dsub_1, ssub_0 you get ssub_2.
///
- virtual unsigned composeSubRegIndices(unsigned a, unsigned b) const {
- // This default implementation is correct for most targets.
- return b;
+ unsigned composeSubRegIndices(unsigned a, unsigned b) const {
+ if (!a) return b;
+ if (!b) return a;
+ return composeSubRegIndicesImpl(a, b);
}
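
The identity behaviour of the null index can be shown with a small stand-alone sketch; the composeImpl table below is invented and merely mirrors the dsub_1/ssub_0 example from the comment above:

#include <cstdio>

unsigned composeImpl(unsigned A, unsigned B) {
  // Pretend index 2 (dsub_1) composed with index 3 (ssub_0) yields index 5
  // (ssub_2), echoing the QPR/DPR/SPR example in the comment.
  if (A == 2 && B == 3) return 5;
  return B; // fallback for the sketch only
}

unsigned composeSubRegIndices(unsigned A, unsigned B) {
  if (!A) return B;   // the null index composes as the identity
  if (!B) return A;
  return composeImpl(A, B);
}

int main() {
  std::printf("%u\n", composeSubRegIndices(0, 3)); // 3: identity on the left
  std::printf("%u\n", composeSubRegIndices(2, 3)); // 5: target-specific result
}
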
+protected:
+ /// Overridden by TableGen in targets that have sub-registers.
+ virtual unsigned composeSubRegIndicesImpl(unsigned, unsigned) const {
+ llvm_unreachable("Target has no sub-registers");
+ }
+
+public:
/// getCommonSuperRegClass - Find a common super-register class if it exists.
///
/// Find a register class, SuperRC and two sub-register indices, PreA and
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 4dc488dbaece..0da82fdd8971 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -10,25 +10,77 @@
// This file defines the target-independent scheduling interfaces which should
// be implemented by each target which is using TableGen based scheduling.
//
+// The SchedMachineModel is defined by subtargets for three categories of data:
+// 1. Basic properties for a coarse-grained instruction cost model.
+// 2. Scheduler Read/Write resources for a simple per-opcode cost model.
+// 3. Instruction itineraries for detailed reservation tables.
+//
+// (1) Basic properties are defined by the SchedMachineModel
+// class. Target hooks allow subtargets to associate opcodes with
+// those properties.
+//
+// (2) A per-operand machine model can be implemented in any
+// combination of the following ways:
+//
+// A. Associate per-operand SchedReadWrite types with Instructions by
+// modifying the Instruction definition to inherit from Sched. For
+// each subtarget, define WriteRes and ReadAdvance to associate
+// processor resources and latency with each SchedReadWrite type.
+//
+// B. In each instruction definition, name an ItineraryClass. For each
+// subtarget, define ItinRW entries to map ItineraryClass to
+// per-operand SchedReadWrite types. Unlike method A, these types may
+// be subtarget specific and can be directly associated with resources
+// by defining SchedWriteRes and SchedReadAdvance.
+//
+// C. In the subtarget, map SchedReadWrite types to specific
+// opcodes. This overrides any SchedReadWrite types or
+// ItineraryClasses defined by the Instruction. As in method B, the
+// subtarget can directly associate resources with SchedReadWrite
+// types by defining SchedWriteRes and SchedReadAdvance.
+//
+// D. In either the target or subtarget, define SchedWriteVariant or
+// SchedReadVariant to map one SchedReadWrite type onto another
+// sequence of SchedReadWrite types. This allows dynamic selection of
+// an instruction's machine model via custom C++ code. It also allows
+// a machine-independent SchedReadWrite type to map to a sequence of
+// machine-dependent types.
+//
+// (3) A per-pipeline-stage machine model can be implemented by providing
+// Itineraries in addition to mapping instructions to ItineraryClasses.
//===----------------------------------------------------------------------===//
+// Include legacy support for instruction itineraries.
include "llvm/Target/TargetItinerary.td"
-// The SchedMachineModel is defined by subtargets for three categories of data:
-// 1) Basic properties for coarse grained instruction cost model.
-// 2) Scheduler Read/Write resources for simple per-opcode cost model.
-// 3) Instruction itineraties for detailed reservation tables.
+class Instruction; // Forward def
+
+// DAG operator that interprets the DAG args as Instruction defs.
+def instrs;
+
+// DAG operator that interprets each DAG arg as a regex pattern for
+// matching Instruction opcode names.
+// The regex must match the beginning of the opcode (as in Python re.match).
+// To avoid matching prefixes, append '$' to the pattern.
+def instregex;
+
+// Define the SchedMachineModel and provide basic properties for
+// coarse grained instruction cost model. Default values for the
+// properties are defined in MCSchedModel. A value of "-1" in the
+// target description's SchedMachineModel indicates that the property
+// is not overridden by the target.
//
-// Default values for basic properties are defined in MCSchedModel. "-1"
-// indicates that the property is not overriden by the target description.
+// Target hooks allow subtargets to associate LoadLatency and
+// HighLatency with groups of opcodes.
class SchedMachineModel {
- int IssueWidth = -1; // Max instructions that may be scheduled per cycle.
+ int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle.
  int MinLatency = -1; // Determines which instructions are allowed in a group.
                       // (-1): in-order, (0): out-of-order, (1): in-order with variable latencies.
int LoadLatency = -1; // Cycles for loads to access the cache.
int HighLatency = -1; // Approximation of cycles for "high latency" ops.
int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
+ // Per-cycle resources tables.
ProcessorItineraries Itineraries = NoItineraries;
bit NoModel = 0; // Special tag to indicate missing machine model.
@@ -38,4 +90,276 @@ def NoSchedModel : SchedMachineModel {
let NoModel = 1;
}
-// TODO: Define classes for processor and scheduler resources.
+// Define a kind of processor resource that may be common across
+// similar subtargets.
+class ProcResourceKind;
+
+// Define a number of interchangeable processor resources. NumUnits
+// determines the throughput of instructions that require the resource.
+//
+// An optional Super resource may be given to model these resources as
+// a subset of the more general super resources. Using one of these
+// resources implies using one of the super resources.
+//
+// ProcResourceUnits normally model a few buffered resources within an
+// out-of-order engine that the compiler attempts to conserve.
+// Buffered resources may be held for multiple clock cycles, but the
+// scheduler does not pin them to a particular clock cycle relative to
+// instruction dispatch. Setting Buffered=0 changes this to an
+// in-order resource. In this case, the scheduler counts down from the
+// cycle that the instruction issues in-order, forcing an interlock
+// with subsequent instructions that require the same resource until
+// the number of ResourceCycles specified in WriteRes expires.
+//
+// SchedModel ties these units to a processor for any stand-alone defs
+// of this class. Instances of subclass ProcResource will be automatically
+// attached to a processor, so SchedModel is not needed.
+class ProcResourceUnits<ProcResourceKind kind, int num> {
+ ProcResourceKind Kind = kind;
+ int NumUnits = num;
+ ProcResourceKind Super = ?;
+ bit Buffered = 1;
+ SchedMachineModel SchedModel = ?;
+}
+
+// EponymousProcResourceKind helps implement ProcResourceUnits by
+// allowing a ProcResourceUnits definition to reference itself. It
+// should not be referenced anywhere else.
+def EponymousProcResourceKind : ProcResourceKind;
+
+// Subtargets typically define processor resource kind and number of
+// units in one place.
+class ProcResource<int num> : ProcResourceKind,
+ ProcResourceUnits<EponymousProcResourceKind, num>;
+
+// A target architecture may define SchedReadWrite types and associate
+// them with instruction operands.
+class SchedReadWrite;
+
+// List the per-operand types that map to the machine model of an
+// instruction. One SchedWrite type must be listed for each explicit
+// def operand in order. Additional SchedWrite types may optionally be
+// listed for implicit def operands. SchedRead types may optionally
+// be listed for use operands in order. The order of defs relative to
+// uses is insignificant. This way, the same SchedReadWrite list may
+// be used for multiple forms of an operation. For example, a
+// two-address instruction could have two tied operands or a single
+// operand that both reads and writes a reg. In both cases we have a
+// single SchedWrite and single SchedRead in any order.
+class Sched<list<SchedReadWrite> schedrw> {
+ list<SchedReadWrite> SchedRW = schedrw;
+}
+
+// Define a scheduler resource associated with a def operand.
+class SchedWrite : SchedReadWrite;
+def NoWrite : SchedWrite;
+
+// Define a scheduler resource associated with a use operand.
+class SchedRead : SchedReadWrite;
+
+// Define a SchedWrite that is modeled as a sequence of other
+// SchedWrites with additive latency. This allows a single operand to
+// be mapped to the resources composed from a set of previously defined
+// SchedWrites.
+//
+// If the final write in this sequence is a SchedWriteVariant marked
+// Variadic, then the list of prior writes are distributed across all
+// operands after resolving the predicate for the final write.
+//
+// SchedModel silences warnings but is ignored.
+class WriteSequence<list<SchedWrite> writes, int rep = 1> : SchedWrite {
+ list<SchedWrite> Writes = writes;
+ int Repeat = rep;
+ SchedMachineModel SchedModel = ?;
+}
+
+// Define values common to WriteRes and SchedWriteRes.
+//
+// SchedModel ties these resources to a processor.
+class ProcWriteResources<list<ProcResourceKind> resources> {
+ list<ProcResourceKind> ProcResources = resources;
+ list<int> ResourceCycles = [];
+ int Latency = 1;
+ int NumMicroOps = 1;
+ bit BeginGroup = 0;
+ bit EndGroup = 0;
+ // Allow a processor to mark some scheduling classes as unsupported
+ // for stronger verification.
+ bit Unsupported = 0;
+ SchedMachineModel SchedModel = ?;
+}
+
+// Define the resources and latency of a SchedWrite. This will be used
+// directly by targets that have no itinerary classes. In this case,
+// SchedWrite is defined by the target, while WriteResources is
+// defined by the subtarget, and maps the SchedWrite to processor
+// resources.
+//
+// If a target already has itinerary classes, SchedWriteResources can
+// be used instead to define subtarget specific SchedWrites and map
+// them to processor resources in one place. Then ItinRW can map
+// itinerary classes to the subtarget's SchedWrites.
+//
+// ProcResources indicates the set of resources consumed by the write.
+// Optionally, ResourceCycles indicates the number of cycles the
+// resource is consumed. Each ResourceCycles item is paired with the
+// ProcResource item at the same position in its list. Since
+// ResourceCycles are rarely specialized, the list may be
+// incomplete. By default, resources are consumed for a single cycle,
+// regardless of latency, which models a fully pipelined processing
+// unit. A value of 0 for ResourceCycles means that the resource must
+// be available but is not consumed, which is only relevant for
+// unbuffered resources.
+//
+// By default, each SchedWrite takes one micro-op, which is counted
+// against the processor's IssueWidth limit. If an instruction can
+// write multiple registers with a single micro-op, the subtarget
+// should define one of the writes to be zero micro-ops. If a
+// subtarget requires multiple micro-ops to write a single result, it
+// should either override the write's NumMicroOps to be greater than 1
+// or require additional writes. Extra writes can be required either
+// by defining a WriteSequence, or simply listing extra writes in the
+// instruction's list of writes beyond the number of "def"
+// operands. The scheduler assumes that all micro-ops must be
+// dispatched in the same cycle. These micro-ops may be required to
+// begin or end the current dispatch group.
+class WriteRes<SchedWrite write, list<ProcResourceKind> resources>
+ : ProcWriteResources<resources> {
+ SchedWrite WriteType = write;
+}
+
+// Directly name a set of WriteResources defining a new SchedWrite
+// type at the same time. This class is unaware of its SchedModel so
+// must be referenced by InstRW or ItinRW.
+class SchedWriteRes<list<ProcResourceKind> resources> : SchedWrite,
+ ProcWriteResources<resources>;
+
+// Define values common to ReadAdvance and SchedReadAdvance.
+//
+// SchedModel ties these resources to a processor.
+class ProcReadAdvance<int cycles, list<SchedWrite> writes = []> {
+ int Cycles = cycles;
+ list<SchedWrite> ValidWrites = writes;
+ // Allow a processor to mark some scheduling classes as unsupported
+ // for stronger verification.
+ bit Unsupported = 0;
+ SchedMachineModel SchedModel = ?;
+}
+
+// A processor may define a ReadAdvance associated with a SchedRead
+// to reduce latency of a prior write by N cycles. A negative advance
+// effectively increases latency, which may be used for cross-domain
+// stalls.
+//
+// A ReadAdvance may be associated with a list of SchedWrites
+// to implement pipeline bypass. The Writes list may be empty to
+// indicate operands that are always read this number of Cycles later
+// than a normal register read, allowing the read's parent instruction
+// to issue earlier relative to the writer.
+class ReadAdvance<SchedRead read, int cycles, list<SchedWrite> writes = []>
+ : ProcReadAdvance<cycles, writes> {
+ SchedRead ReadType = read;
+}
+
+// Directly associate a new SchedRead type with a delay and optional
+// pipeline bypass. For use with InstRW or ItinRW.
+class SchedReadAdvance<int cycles, list<SchedWrite> writes = []> : SchedRead,
+ ProcReadAdvance<cycles, writes>;
+
+// Define SchedRead defaults. Reads seldom need special treatment.
+def ReadDefault : SchedRead;
+def NoReadAdvance : SchedReadAdvance<0>;
+
+// Define shared code that will be in the same scope as all
+// SchedPredicates. Available variables are:
+// (const MachineInstr *MI, const TargetSchedModel *SchedModel)
+class PredicateProlog<code c> {
+ code Code = c;
+}
+
+// Define a predicate to determine which SchedVariant applies to a
+// particular MachineInstr. The code snippet is used as an
+// if-statement's expression. Available variables are MI, SchedModel,
+// and anything defined in a PredicateProlog.
+//
+// SchedModel silences warnings but is ignored.
+class SchedPredicate<code pred> {
+ SchedMachineModel SchedModel = ?;
+ code Predicate = pred;
+}
+def NoSchedPred : SchedPredicate<[{true}]>;
+
+// Associate a predicate with a list of SchedReadWrites. By default,
+// the selected SchedReadWrites are still associated with a single
+// operand and assumed to execute sequentially with additive
+// latency. However, if the parent SchedWriteVariant or
+// SchedReadVariant is marked "Variadic", then each Selected
+// SchedReadWrite is mapped in place to the instruction's variadic
+// operands. In this case, latency is not additive. If the current Variant
+// is already part of a Sequence, then that entire chain leading up to
+// the Variant is distributed over the variadic operands.
+class SchedVar<SchedPredicate pred, list<SchedReadWrite> selected> {
+ SchedPredicate Predicate = pred;
+ list<SchedReadWrite> Selected = selected;
+}
+
+// SchedModel silences warnings but is ignored.
+class SchedVariant<list<SchedVar> variants> {
+ list<SchedVar> Variants = variants;
+ bit Variadic = 0;
+ SchedMachineModel SchedModel = ?;
+}
+
+// A SchedWriteVariant is a single SchedWrite type that maps to a list
+// of SchedWrite types under the conditions defined by its predicates.
+//
+// A Variadic write is expanded to cover multiple "def" operands. The
+// SchedVariant's Expansion list is then interpreted as one write
+// per-operand instead of the usual sequential writes feeding a single
+// operand.
+class SchedWriteVariant<list<SchedVar> variants> : SchedWrite,
+ SchedVariant<variants> {
+}
+
+// A SchedReadVariant is a single SchedRead type that maps to a list
+// of SchedRead types under the conditions defined by its predicates.
+//
+// A Variadic read is expanded to cover multiple "readsReg" operands as
+// explained above.
+class SchedReadVariant<list<SchedVar> variants> : SchedRead,
+ SchedVariant<variants> {
+}
+
+// Map a set of opcodes to a list of SchedReadWrite types. This allows
+// the subtarget to easily override specific operations.
+//
+// SchedModel ties this opcode mapping to a processor.
+class InstRW<list<SchedReadWrite> rw, dag instrlist> {
+ list<SchedReadWrite> OperandReadWrites = rw;
+ dag Instrs = instrlist;
+ SchedMachineModel SchedModel = ?;
+}
+
+// Map a set of itinerary classes to SchedReadWrite resources. This is
+// used to bootstrap a target (e.g. ARM) when itineraries already
+// exist and changing InstrInfo is undesirable.
+//
+// SchedModel ties this ItineraryClass mapping to a processor.
+class ItinRW<list<SchedReadWrite> rw, list<InstrItinClass> iic> {
+ list<InstrItinClass> MatchedItinClasses = iic;
+ list<SchedReadWrite> OperandReadWrites = rw;
+ SchedMachineModel SchedModel = ?;
+}
+
+// Alias a target-defined SchedReadWrite to a processor specific
+// SchedReadWrite. This allows a subtarget to easily map a
+// SchedReadWrite type onto a WriteSequence, SchedWriteVariant, or
+// SchedReadVariant.
+//
+// SchedModel will usually be provided by surrounding let statement
+// and ties this SchedAlias mapping to a processor.
+class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
+ SchedReadWrite MatchRW = match;
+ SchedReadWrite AliasRW = alias;
+ SchedMachineModel SchedModel = ?;
+}
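
To make the new classes concrete, here is a hedged sketch of how a subtarget description might wire them together using method C from the header comment; every name in it (MyModel, MyALU, WriteALU, ADDrr) is illustrative rather than taken from an actual target:

// Target-level: declare a write type that instructions can reference.
def WriteALU : SchedWrite;

// Subtarget-level: a machine model, its resources, and per-opcode overrides.
def MyModel : SchedMachineModel {
  let IssueWidth = 2;
  let LoadLatency = 3;
  let MispredictPenalty = 10;
}

let SchedModel = MyModel in {
  // Two interchangeable ALU pipes.
  def MyALU : ProcResource<2>;

  // Map the write type to a 1-cycle use of an ALU pipe on this subtarget.
  def : WriteRes<WriteALU, [MyALU]> { let Latency = 1; }

  // Method C: directly override a specific opcode's per-operand model.
  def : InstRW<[WriteALU], (instrs ADDrr)>;
}
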
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 3f81c06bc0b6..83bd7874df76 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -445,9 +445,9 @@ def atomic_load_umin : SDNode<"ISD::ATOMIC_LOAD_UMIN", SDTAtomic2,
def atomic_load_umax : SDNode<"ISD::ATOMIC_LOAD_UMAX", SDTAtomic2,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def atomic_load : SDNode<"ISD::ATOMIC_LOAD", SDTAtomicLoad,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def atomic_store : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
// Do not use ld, st directly. Use load, extload, sextload, zextload, store,
// and truncst (see below).
diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index c9ca7223b5f5..96793bc036e7 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@@ -20,7 +20,7 @@
namespace llvm {
-class TargetData;
+class DataLayout;
class TargetMachine;
//===----------------------------------------------------------------------===//
@@ -28,13 +28,13 @@ class TargetMachine;
/// SelectionDAG lowering and instruction selection process.
///
class TargetSelectionDAGInfo {
- TargetSelectionDAGInfo(const TargetSelectionDAGInfo &); // DO NOT IMPLEMENT
- void operator=(const TargetSelectionDAGInfo &); // DO NOT IMPLEMENT
+ TargetSelectionDAGInfo(const TargetSelectionDAGInfo &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetSelectionDAGInfo &) LLVM_DELETED_FUNCTION;
- const TargetData *TD;
+ const DataLayout *TD;
protected:
- const TargetData *getTargetData() const { return TD; }
+ const DataLayout *getDataLayout() const { return TD; }
public:
explicit TargetSelectionDAGInfo(const TargetMachine &TM);
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index fc23b2c6b58d..6db96d980b5e 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -19,9 +19,11 @@
namespace llvm {
+class MachineInstr;
class SDep;
class SUnit;
class TargetRegisterClass;
+class TargetSchedModel;
template <typename T> class SmallVectorImpl;
//===----------------------------------------------------------------------===//
@@ -31,8 +33,8 @@ template <typename T> class SmallVectorImpl;
/// be exposed through a TargetSubtargetInfo-derived class.
///
class TargetSubtargetInfo : public MCSubtargetInfo {
- TargetSubtargetInfo(const TargetSubtargetInfo&); // DO NOT IMPLEMENT
- void operator=(const TargetSubtargetInfo&); // DO NOT IMPLEMENT
+ TargetSubtargetInfo(const TargetSubtargetInfo&) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetSubtargetInfo&) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses...
TargetSubtargetInfo();
public:
@@ -43,23 +45,26 @@ public:
virtual ~TargetSubtargetInfo();
- /// getSpecialAddressLatency - For targets where it is beneficial to
- /// backschedule instructions that compute addresses, return a value
- /// indicating the number of scheduling cycles of backscheduling that
- /// should be attempted.
- virtual unsigned getSpecialAddressLatency() const { return 0; }
+ /// Resolve a SchedClass at runtime, where SchedClass identifies an
+ /// MCSchedClassDesc with the isVariant property. This may return the ID of
+ /// another variant SchedClass, but repeated invocation must quickly terminate
+ /// in a nonvariant SchedClass.
+ virtual unsigned resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,
+ const TargetSchedModel* SchedModel) const {
+ return 0;
+ }
// enablePostRAScheduler - If the target can benefit from post-regalloc
// scheduling and the specified optimization level meets the requirement
// return true to enable post-register-allocation scheduling. In
// CriticalPathRCs return any register classes that should only be broken
- // if on the critical path.
+ // if on the critical path.
virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const;
// adjustSchedDependency - Perform target specific adjustments to
// the latency of a schedule dependency.
- virtual void adjustSchedDependency(SUnit *def, SUnit *use,
+ virtual void adjustSchedDependency(SUnit *def, SUnit *use,
SDep& dep) const { }
};
diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
new file mode 100644
index 000000000000..7ea2396076dc
--- /dev/null
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -0,0 +1,98 @@
+//=- llvm/Target/TargetTransformImpl.h - Target Transform Info ---*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the target-specific implementations of the
+// TargetTransform interfaces.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_TARGET_TRANSFORMATION_IMPL_H
+#define LLVM_TARGET_TARGET_TRANSFORMATION_IMPL_H
+
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+
+namespace llvm {
+
+class TargetLowering;
+
+/// ScalarTargetTransformImpl - This is a default implementation of the
+/// ScalarTargetTransformInfo interface. Different targets can implement
+/// this interface differently.
+class ScalarTargetTransformImpl : public ScalarTargetTransformInfo {
+private:
+ const TargetLowering *TLI;
+
+public:
+ /// Ctor
+ explicit ScalarTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {}
+
+ virtual bool isLegalAddImmediate(int64_t imm) const;
+
+ virtual bool isLegalICmpImmediate(int64_t imm) const;
+
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+
+ virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
+
+ virtual bool isTypeLegal(Type *Ty) const;
+
+ virtual unsigned getJumpBufAlignment() const;
+
+ virtual unsigned getJumpBufSize() const;
+
+ virtual bool shouldBuildLookupTables() const;
+};
+
+class VectorTargetTransformImpl : public VectorTargetTransformInfo {
+protected:
+ const TargetLowering *TLI;
+
+ /// Estimate the cost of type-legalization and the legalized type.
+ std::pair<unsigned, MVT> getTypeLegalizationCost(Type *Ty) const;
+
+ /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+ /// are set if the result needs to be inserted and/or extracted from vectors.
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+ // Get the ISD node that corresponds to the Instruction class opcode.
+ int InstructionOpcodeToISD(unsigned Opcode) const;
+
+public:
+ explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {}
+
+ virtual ~VectorTargetTransformImpl() {}
+
+ virtual unsigned getInstrCost(unsigned Opcode, Type *Ty1, Type *Ty2) const;
+
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+
+ virtual unsigned getBroadcastCost(Type *Tp) const;
+
+ virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const;
+
+ virtual unsigned getCFInstrCost(unsigned Opcode) const;
+
+ virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const;
+
+ virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const;
+
+ virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const;
+
+ virtual unsigned getNumberOfParts(Type *Tp) const;
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h
new file mode 100644
index 000000000000..94db49044332
--- /dev/null
+++ b/include/llvm/TargetTransformInfo.h
@@ -0,0 +1,204 @@
+//===- llvm/TargetTransformInfo.h -------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass exposes codegen information to IR-level passes. Every
+// transformation that uses codegen information is broken into three parts:
+// 1. The IR-level analysis pass.
+// 2. The IR-level transformation interface which provides the needed
+// information.
+// 3. Codegen-level implementation which uses target-specific hooks.
+//
+// This file defines #2, which is the interface that IR-level transformations
+// use for querying the codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_TARGET_TRANSFORM_INTERFACE
+#define LLVM_TRANSFORMS_TARGET_TRANSFORM_INTERFACE
+
+#include "llvm/Pass.h"
+#include "llvm/AddressingMode.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Type.h"
+
+namespace llvm {
+
+class ScalarTargetTransformInfo;
+class VectorTargetTransformInfo;
+
+/// TargetTransformInfo - This pass provides access to the codegen
+/// interfaces that are needed for IR-level transformations.
+class TargetTransformInfo : public ImmutablePass {
+private:
+ const ScalarTargetTransformInfo *STTI;
+ const VectorTargetTransformInfo *VTTI;
+public:
+ /// Default ctor.
+ ///
+ /// @note This has to exist, because this is a pass, but it should never be
+ /// used.
+ TargetTransformInfo();
+
+ TargetTransformInfo(const ScalarTargetTransformInfo* S,
+ const VectorTargetTransformInfo *V)
+ : ImmutablePass(ID), STTI(S), VTTI(V) {
+ initializeTargetTransformInfoPass(*PassRegistry::getPassRegistry());
+ }
+
+ TargetTransformInfo(const TargetTransformInfo &T) :
+ ImmutablePass(ID), STTI(T.STTI), VTTI(T.VTTI) { }
+
+ const ScalarTargetTransformInfo* getScalarTargetTransformInfo() const {
+ return STTI;
+ }
+ const VectorTargetTransformInfo* getVectorTargetTransformInfo() const {
+ return VTTI;
+ }
+
+ /// Pass identification, replacement for typeid.
+ static char ID;
+};
+
+// ---------------------------------------------------------------------------//
+// The classes below are inherited and implemented by target-specific classes
+// in the codegen.
+// ---------------------------------------------------------------------------//
+
+/// ScalarTargetTransformInfo - This interface is used by IR-level passes
+/// that need target-dependent information for generic scalar transformations.
+/// LSR and LowerInvoke use this interface.
+class ScalarTargetTransformInfo {
+public:
+ virtual ~ScalarTargetTransformInfo() {}
+
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can add
+ /// a register with the immediate without having to materialize the
+ /// immediate into a register.
+ virtual bool isLegalAddImmediate(int64_t) const {
+ return false;
+ }
+ /// isLegalICmpImmediate - Return true if the specified immediate is legal
+ /// icmp immediate, that is the target has icmp instructions which can compare
+ /// a register against the immediate without having to materialize the
+ /// immediate into a register.
+ virtual bool isLegalICmpImmediate(int64_t) const {
+ return false;
+ }
+ /// isLegalAddressingMode - Return true if the addressing mode represented by
+ /// AM is legal for this target, for a load/store of the specified type.
+ /// The type may be VoidTy, in which case only return true if the addressing
+ /// mode is legal for a load/store of any legal type.
+ /// TODO: Handle pre/postinc as well.
+ virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const {
+ return false;
+ }
+ /// isTruncateFree - Return true if it's free to truncate a value of
+ /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
+ /// register EAX to i16 by referencing its sub-register AX.
+ virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const {
+ return false;
+ }
+ /// Is this type legal.
+ virtual bool isTypeLegal(Type *Ty) const {
+ return false;
+ }
+ /// getJumpBufAlignment - returns the target's jmp_buf alignment in bytes
+ virtual unsigned getJumpBufAlignment() const {
+ return 0;
+ }
+ /// getJumpBufSize - returns the target's jmp_buf size in bytes.
+ virtual unsigned getJumpBufSize() const {
+ return 0;
+ }
+ /// shouldBuildLookupTables - Return true if switches should be turned into
+ /// lookup tables for the target.
+ virtual bool shouldBuildLookupTables() const {
+ return true;
+ }
+};
+
+/// VectorTargetTransformInfo - This interface is used by the vectorizers
+/// to estimate the profitability of vectorization for different instructions.
+class VectorTargetTransformInfo {
+public:
+ virtual ~VectorTargetTransformInfo() {}
+
+ /// Returns the expected cost of the instruction opcode. The opcode is one of
+ /// the enums like Instruction::Add. The type arguments are the type of the
+ /// operation.
+ /// Most instructions only use the first type and in that case the second
+ /// operand is ignored.
+ ///
+ /// Exceptions:
+ /// * Br instructions do not use any of the types.
+ /// * Select instructions pass the return type as Ty1 and the selector as Ty2.
+ /// * Cast instructions pass the destination as Ty1 and the source as Ty2.
+ /// * Insert/Extract element pass only the vector type as Ty1.
+ /// * ShuffleVector, Load, Store do not use this call.
+ virtual unsigned getInstrCost(unsigned Opcode,
+ Type *Ty1 = 0,
+ Type *Ty2 = 0) const {
+ return 1;
+ }
+
+ /// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc.
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
+ return 1;
+ }
+
+ /// Returns the cost of a vector broadcast of a scalar at place zero to a
+ /// vector of type 'Tp'.
+ virtual unsigned getBroadcastCost(Type *Tp) const {
+ return 1;
+ }
+
+ /// Returns the expected cost of cast instructions, such as bitcast, trunc,
+ /// zext, etc.
+ virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const {
+ return 1;
+ }
+
+ /// Returns the expected cost of control-flow related instructions such as
+ /// Phi, Ret, Br.
+ virtual unsigned getCFInstrCost(unsigned Opcode) const {
+ return 1;
+ }
+
+ /// Returns the expected cost of compare and select instructions.
+ virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy = 0) const {
+ return 1;
+ }
+
+ /// Returns the expected cost of vector Insert and Extract.
+ /// Use -1 to indicate that there is no information on the index value.
+ virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index = -1) const {
+ return 1;
+ }
+
+ /// Returns the cost of Load and Store instructions.
+ virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const {
+ return 1;
+ }
+
+ /// Returns the number of pieces into which the provided type must be
+ /// split during legalization. Zero is returned when the answer is unknown.
+ virtual unsigned getNumberOfParts(Type *Tp) const {
+ return 0;
+ }
+};
+
+} // End llvm namespace
+
+#endif
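
The intended layering is perhaps clearest as a tiny stand-alone sketch: a conservative interface that IR-level code queries, and a codegen-level implementation that overrides the answers. The classes and the 12-bit immediate rule below are invented for illustration and do not correspond to a real target:

#include <cstdio>

struct ScalarTTI {                               // stand-in for ScalarTargetTransformInfo
  virtual ~ScalarTTI() {}
  virtual bool isLegalAddImmediate(long long) const { return false; }  // conservative default
  virtual bool shouldBuildLookupTables() const { return true; }
};

struct MyTargetScalarTTI : ScalarTTI {           // stand-in for a codegen-level impl
  virtual bool isLegalAddImmediate(long long Imm) const {
    return Imm >= -4096 && Imm < 4096;           // pretend 12-bit immediates are legal
  }
};

// An IR-level transform only ever sees the abstract interface.
void runTransform(const ScalarTTI *STTI) {
  if (STTI && STTI->isLegalAddImmediate(42))
    std::printf("fold the add immediate\n");
  else
    std::printf("materialize the constant first\n");
}

int main() {
  MyTargetScalarTTI TTI;
  runTransform(&TTI);   // target info available: folds the immediate
  runTransform(0);      // no target info: stays conservative
}
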
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index 18176e8fdbb1..fc1cd59e4e10 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -104,23 +104,14 @@ Pass *createPruneEHPass();
//===----------------------------------------------------------------------===//
/// createInternalizePass - This pass loops over all of the functions in the
-/// input module, internalizing all globals (functions and variables) not part
-/// of the api. If a list of symbols is specified with the
-/// -internalize-public-api-* command line options, those symbols are not
-/// internalized and all others are. Otherwise if AllButMain is set and the
-/// main function is found, all other globals are marked as internal. If no api
-/// is supplied and AllButMain is not set, or no main function is found, nothing
-/// is internalized.
-///
-ModulePass *createInternalizePass(bool AllButMain);
-
-/// createInternalizePass - This pass loops over all of the functions in the
/// input module, internalizing all globals (functions and variables) not in the
/// given exportList.
///
/// Note that commandline options that are used with the above function are not
-/// used now! Also, when exportList is empty, nothing is internalized.
+/// used now!
ModulePass *createInternalizePass(const std::vector<const char *> &exportList);
+/// createInternalizePass - Same as above, but with an empty exportList.
+ModulePass *createInternalizePass();
//===----------------------------------------------------------------------===//
/// createDeadArgEliminationPass - This pass removes arguments from functions
@@ -192,6 +183,16 @@ ModulePass *createMergeFunctionsPass();
/// createPartialInliningPass - This pass inlines parts of functions.
///
ModulePass *createPartialInliningPass();
+
+//===----------------------------------------------------------------------===//
+// createMetaRenamerPass - Rename everything with metasyntactic names.
+//
+ModulePass *createMetaRenamerPass();
+
+//===----------------------------------------------------------------------===//
+/// createBarrierNoopPass - This pass is purely a module pass barrier in a pass
+/// manager.
+ModulePass *createBarrierNoopPass();
} // End llvm namespace
diff --git a/include/llvm/Transforms/IPO/InlinerPass.h b/include/llvm/Transforms/IPO/InlinerPass.h
index 7c3cfc870156..b036040f5121 100644
--- a/include/llvm/Transforms/IPO/InlinerPass.h
+++ b/include/llvm/Transforms/IPO/InlinerPass.h
@@ -21,7 +21,7 @@
namespace llvm {
class CallSite;
- class TargetData;
+ class DataLayout;
class InlineCost;
template<class PtrType, unsigned SmallSize>
class SmallPtrSet;
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 47ce90265bd5..3ea0a427200d 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -104,6 +104,7 @@ public:
bool DisableUnitAtATime;
bool DisableUnrollLoops;
bool Vectorize;
+ bool LoopVectorize;
private:
/// ExtensionList - This is list of all of the extensions that are registered.
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index 4b0c448acfce..8e63aaa4e873 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -34,7 +34,7 @@ ModulePass *createGCOVProfilerPass(bool EmitNotes = true, bool EmitData = true,
bool UseExtraChecksum = false);
// Insert AddressSanitizer (address sanity checking) instrumentation
-ModulePass *createAddressSanitizerPass();
+FunctionPass *createAddressSanitizerPass();
// Insert ThreadSanitizer (race detection) instrumentation
FunctionPass *createThreadSanitizerPass();
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 3dce6fe37fd4..a5d8eed74622 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -70,6 +70,12 @@ FunctionPass *createAggressiveDCEPass();
//===----------------------------------------------------------------------===//
//
+// SROA - Replace aggregates or pieces of aggregates with scalar SSA values.
+//
+FunctionPass *createSROAPass(bool RequiresDomTree = true);
+
+//===----------------------------------------------------------------------===//
+//
// ScalarReplAggregates - Break up alloca's of aggregates into multiple allocas
// if possible.
//
diff --git a/include/llvm/Transforms/Utils/AddrModeMatcher.h b/include/llvm/Transforms/Utils/AddrModeMatcher.h
index 90485eb4c69c..7d672839a630 100644
--- a/include/llvm/Transforms/Utils/AddrModeMatcher.h
+++ b/include/llvm/Transforms/Utils/AddrModeMatcher.h
@@ -19,6 +19,7 @@
#ifndef LLVM_TRANSFORMS_UTILS_ADDRMODEMATCHER_H
#define LLVM_TRANSFORMS_UTILS_ADDRMODEMATCHER_H
+#include "llvm/AddressingMode.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Target/TargetLowering.h"
@@ -33,7 +34,7 @@ class raw_ostream;
/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode
/// which holds actual Value*'s for register values.
-struct ExtAddrMode : public TargetLowering::AddrMode {
+struct ExtAddrMode : public AddrMode {
Value *BaseReg;
Value *ScaledReg;
ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
diff --git a/include/llvm/Transforms/Utils/BasicBlockUtils.h b/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 8a939cc75ed3..b810f1a818c6 100644
--- a/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -25,8 +25,11 @@ namespace llvm {
class AliasAnalysis;
class Instruction;
+class MDNode;
class Pass;
class ReturnInst;
+class TargetLibraryInfo;
+class TerminatorInst;
/// DeleteDeadBlock - Delete the specified block, which must have no
/// predecessors.
@@ -44,7 +47,7 @@ void FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P = 0);
/// a result. This includes tracing the def-use list from the PHI to see if
/// it is ultimately unused or if it reaches an unused cycle. Return true
/// if any PHIs were deleted.
-bool DeleteDeadPHIs(BasicBlock *BB);
+bool DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI = 0);
/// MergeBlockIntoPredecessor - Attempts to merge a block into its predecessor,
/// if possible. The return value indicates success or failure.
@@ -202,6 +205,29 @@ void SplitLandingPadPredecessors(BasicBlock *OrigBB,ArrayRef<BasicBlock*> Preds,
ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
BasicBlock *Pred);
+/// SplitBlockAndInsertIfThen - Split the containing block at the
+/// specified instruction - everything before and including Cmp stays
+/// in the old basic block, and everything after Cmp is moved to a
+/// new block. The two blocks are connected by a conditional branch
+/// (with value of Cmp being the condition).
+/// Before:
+/// Head
+/// Cmp
+/// Tail
+/// After:
+/// Head
+/// Cmp
+/// if (Cmp)
+/// ThenBlock
+/// Tail
+///
+/// If Unreachable is true, then ThenBlock ends with
+/// UnreachableInst, otherwise it branches to Tail.
+/// Returns the NewBasicBlock's terminator.
+
+TerminatorInst *SplitBlockAndInsertIfThen(Instruction *Cmp,
+ bool Unreachable, MDNode *BranchWeights = 0);
+
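
As a usage sketch, an instrumentation-style pass of this era could guard a slow path with the new helper roughly as follows; the instrumentCheck function, the ErrorFn callee, and the include paths are assumptions for illustration, not code from this patch:

#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

// Insert "if (Cond) call ErrorFn()" after Cond, leaving the rest of the
// original block to run on the fast path. Meant to be called from within a
// pass that is already iterating over existing IR.
static void instrumentCheck(Instruction *Cond, Value *ErrorFn) {
  // ThenTerm is the branch terminating the newly created then-block.
  TerminatorInst *ThenTerm =
      SplitBlockAndInsertIfThen(Cond, /*Unreachable=*/false);
  IRBuilder<> IRB(ThenTerm);
  IRB.CreateCall(ErrorFn);   // slow path: call the (hypothetical) reporter
  // Control then falls through to the original tail block.
}
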
} // End llvm namespace
#endif
diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index a6e41f0a27a8..ab9fc475faee 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -19,7 +19,7 @@
namespace llvm {
class Value;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
/// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*.
@@ -28,52 +28,52 @@ namespace llvm {
/// EmitStrLen - Emit a call to the strlen function to the builder, for the
/// specified pointer. Ptr is required to be some pointer type, and the
/// return value has 'intptr_t' type.
- Value *EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
+ Value *EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI);
/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the
/// specified pointer. Ptr is required to be some pointer type, MaxLen must
/// be of size_t type, and the return value has 'intptr_t' type.
Value *EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ const DataLayout *TD, const TargetLibraryInfo *TLI);
/// EmitStrChr - Emit a call to the strchr function to the builder, for the
/// specified pointer and character. Ptr is required to be some pointer type,
/// and the return value has 'i8*' type.
- Value *EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, const TargetData *TD,
+ Value *EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI);
/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
Value *EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ const DataLayout *TD, const TargetLibraryInfo *TLI);
/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
/// specified pointer arguments.
Value *EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
StringRef Name = "strcpy");
/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
/// specified pointer arguments and length.
Value *EmitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
StringRef Name = "strncpy");
/// EmitMemCpyChk - Emit a call to the __memcpy_chk function to the builder.
/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
/// are pointers.
Value *EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilder<> &B, const TargetData *TD,
+ IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI);
/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
Value *EmitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ const DataLayout *TD, const TargetLibraryInfo *TLI);
/// EmitMemCmp - Emit a call to the memcmp function.
Value *EmitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ const DataLayout *TD, const TargetLibraryInfo *TLI);
/// EmitUnaryFloatFnCall - Emit a call to the unary function named 'Name'
/// (e.g. 'floor'). This function is known to take a single of type matching
@@ -85,28 +85,28 @@ namespace llvm {
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
- Value *EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
+ Value *EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI);
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
/// some pointer.
- Value *EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
+ Value *EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI);
/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
/// an i32, and File is a pointer to FILE.
Value *EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ const DataLayout *TD, const TargetLibraryInfo *TLI);
/// EmitFPutS - Emit a call to the fputs function. Str is required to be a
/// pointer and File is a pointer to FILE.
- Value *EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const TargetData *TD,
+ Value *EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI);
/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
Value *EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI);
+ const DataLayout *TD, const TargetLibraryInfo *TLI);
/// SimplifyFortifiedLibCalls - Helper class for folding checked library
/// calls (e.g. __strcpy_chk) into their unchecked counterparts.
@@ -118,7 +118,7 @@ namespace llvm {
bool isString) const = 0;
public:
virtual ~SimplifyFortifiedLibCalls();
- bool fold(CallInst *CI, const TargetData *TD, const TargetLibraryInfo *TLI);
+ bool fold(CallInst *CI, const DataLayout *TD, const TargetLibraryInfo *TLI);
};
}
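
For context on the signature change above, a sketch of how one of these emitters is typically invoked after this patch; emitLengthOf is a hypothetical wrapper, and the DataLayout/TargetLibraryInfo pointers stand in for whatever analyses the calling pass has available.

    #include "llvm/IRBuilder.h"
    #include "llvm/DataLayout.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"

    using namespace llvm;

    // Emit strlen(Str) at the builder's insertion point.  TD now has type
    // const DataLayout*, matching the rename in this header.
    static Value *emitLengthOf(Value *Str, IRBuilder<> &B,
                               const DataLayout *TD,
                               const TargetLibraryInfo *TLI) {
      return EmitStrLen(Str, B, TD, TLI); // 0 when TLI says strlen is unavailable
    }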
diff --git a/include/llvm/Transforms/Utils/BypassSlowDivision.h b/include/llvm/Transforms/Utils/BypassSlowDivision.h
new file mode 100644
index 000000000000..ac8af122f038
--- /dev/null
+++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h
@@ -0,0 +1,33 @@
+//===- llvm/Transforms/Utils/BypassSlowDivision.h --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom, 32-bit divides are slow enough that at run
+// time it is profitable to check the values of the operands and, if both
+// are positive and less than 256, use an unsigned 8-bit divide.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+#define TRANSFORMS_UTILS_BYPASSSLOWDIVISION_H
+
+#include "llvm/Function.h"
+
+namespace llvm {
+
+/// This optimization identifies DIV instructions that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool bypassSlowDivision(Function &F,
+ Function::iterator &I,
+ const DenseMap<unsigned int, unsigned int> &BypassWidth);
+
+} // End llvm namespace
+
+#endif
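
A hypothetical driver for the new utility, loosely modelled on how a codegen-preparation pass might call it; the 32-to-8 mapping mirrors the Atom example in the header comment and is not mandated by the interface.

    #include "llvm/Function.h"
    #include "llvm/ADT/DenseMap.h"
    #include "llvm/Transforms/Utils/BypassSlowDivision.h"

    using namespace llvm;

    // Ask the utility to guard i32 udiv/urem with a cheap range check and an
    // i8 divide on the fast path, block by block.
    static bool bypassDividesIn(Function &F) {
      DenseMap<unsigned int, unsigned int> BypassWidths;
      BypassWidths[32] = 8; // divide width -> bypass width
      bool Changed = false;
      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
        Changed |= bypassSlowDivision(F, I, BypassWidths);
      return Changed;
    }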
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index b7b5d29b320f..1780025a2797 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -39,7 +39,7 @@ class ReturnInst;
class CallSite;
class Trace;
class CallGraph;
-class TargetData;
+class DataLayout;
class Loop;
class LoopInfo;
class AllocaInst;
@@ -116,13 +116,6 @@ Function *CloneFunction(const Function *F,
bool ModuleLevelChanges,
ClonedCodeInfo *CodeInfo = 0);
-/// CloneFunction - Version of the function that doesn't need the VMap.
-///
-inline Function *CloneFunction(const Function *F, ClonedCodeInfo *CodeInfo = 0){
- ValueToValueMapTy VMap;
- return CloneFunction(F, VMap, CodeInfo);
-}
-
/// Clone OldFunc into NewFunc, transforming the old arguments into references
/// to VMap values. Note that if NewFunc already has basic blocks, the ones
/// cloned into it will be added to the end of the function. This function
@@ -157,7 +150,7 @@ void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVectorImpl<ReturnInst*> &Returns,
const char *NameSuffix = "",
ClonedCodeInfo *CodeInfo = 0,
- const TargetData *TD = 0,
+ const DataLayout *TD = 0,
Instruction *TheCall = 0);
@@ -165,13 +158,13 @@ void CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
/// InlineFunction call, and records the auxiliary results produced by it.
class InlineFunctionInfo {
public:
- explicit InlineFunctionInfo(CallGraph *cg = 0, const TargetData *td = 0)
+ explicit InlineFunctionInfo(CallGraph *cg = 0, const DataLayout *td = 0)
: CG(cg), TD(td) {}
/// CG - If non-null, InlineFunction will update the callgraph to reflect the
/// changes it makes.
CallGraph *CG;
- const TargetData *TD;
+ const DataLayout *TD;
/// StaticAllocas - InlineFunction fills this in with all static allocas that
/// get copied into the caller.
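
Since the parameter-free CloneFunction overload is deleted above, call sites now supply the map themselves; a minimal sketch of the replacement pattern, with cloneWithFreshMap as a hypothetical wrapper.

    #include "llvm/Function.h"
    #include "llvm/Transforms/Utils/Cloning.h"
    #include "llvm/Transforms/Utils/ValueMapper.h"

    using namespace llvm;

    // Exactly what the removed inline overload used to do: build an empty
    // old-value -> new-value map and hand it to the remaining overload.
    static Function *cloneWithFreshMap(const Function *F) {
      ValueToValueMapTy VMap;
      return CloneFunction(F, VMap, /*ModuleLevelChanges=*/false);
    }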
diff --git a/include/llvm/Transforms/Utils/IntegerDivision.h b/include/llvm/Transforms/Utils/IntegerDivision.h
new file mode 100644
index 000000000000..cecc8075de7d
--- /dev/null
+++ b/include/llvm/Transforms/Utils/IntegerDivision.h
@@ -0,0 +1,48 @@
+//===- llvm/Transforms/Utils/IntegerDivision.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32-bit integer division for targets
+// that don't have native support. It's largely derived from compiler-rt's
+// implementation of __udivsi3, but hand-tuned for targets that prefer less
+// control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TRANSFORMS_UTILS_INTEGERDIVISION_H
+#define TRANSFORMS_UTILS_INTEGERDIVISION_H
+
+namespace llvm {
+ class BinaryOperator;
+}
+
+namespace llvm {
+
+ /// Generate code to calculate the remainder of two integers, replacing Rem
+ /// with the generated code. This currently generates code using the udiv
+ /// expansion, but future work includes generating more specialized code,
+ /// e.g. when more information about the operands are known. Currently only
+ /// implements 32bit scalar division (due to udiv's limitation), but future
+ /// work is removing this limitation.
+ ///
+ /// @brief Replace Rem with generated code.
+ bool expandRemainder(BinaryOperator *Rem);
+
+ /// Generate code to divide two integers, replacing Div with the generated
+ /// code. This currently generates code similar to compiler-rt's
+ /// implementations, but future work includes generating more specialized
+ /// code when more information about the operands is known. Currently it
+ /// only handles 32-bit scalar division; removing that limitation is
+ /// future work.
+ ///
+ /// @brief Replace Div with generated code.
+ bool expandDivision(BinaryOperator* Div);
+
+} // End llvm namespace
+
+#endif
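
A sketch of how a lowering pass might drive the new expansion; expandAllDivides is hypothetical, and the iterate-then-advance idiom is only there so erasing the udiv/sdiv does not invalidate the loop.

    #include "llvm/Function.h"
    #include "llvm/Instructions.h"
    #include "llvm/Transforms/Utils/IntegerDivision.h"

    using namespace llvm;

    // Replace every 32-bit udiv/sdiv in F with the branch-lean expansion.
    static bool expandAllDivides(Function &F) {
      bool Changed = false;
      for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
        for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ) {
          Instruction *I = It++;             // advance before any erasure
          if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
            if (BO->getType()->isIntegerTy(32) &&
                (BO->getOpcode() == Instruction::UDiv ||
                 BO->getOpcode() == Instruction::SDiv))
              Changed |= expandDivision(BO);
        }
      return Changed;
    }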
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 495eab73289e..be3029e545de 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -18,7 +18,7 @@
#include "llvm/IRBuilder.h"
#include "llvm/Operator.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
namespace llvm {
@@ -35,7 +35,9 @@ class Pass;
class PHINode;
class AllocaInst;
class ConstantExpr;
-class TargetData;
+class DataLayout;
+class TargetLibraryInfo;
+class TargetTransformInfo;
class DIBuilder;
template<typename T> class SmallVectorImpl;
@@ -51,7 +53,8 @@ template<typename T> class SmallVectorImpl;
/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
-bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false);
+bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
+ const TargetLibraryInfo *TLI = 0);
//===----------------------------------------------------------------------===//
// Local dead code elimination.
@@ -60,20 +63,21 @@ bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false);
/// isInstructionTriviallyDead - Return true if the result produced by the
/// instruction is not used, and the instruction has no side effects.
///
-bool isInstructionTriviallyDead(Instruction *I);
+bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=0);
/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
/// trivially dead instruction, delete it. If that makes any of its operands
/// trivially dead, delete them too, recursively. Return true if any
/// instructions were deleted.
-bool RecursivelyDeleteTriviallyDeadInstructions(Value *V);
+bool RecursivelyDeleteTriviallyDeadInstructions(Value *V,
+ const TargetLibraryInfo *TLI=0);
/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
/// dead PHI node, due to being a def-use chain of single-use nodes that
/// either forms a cycle or is terminated by a trivially dead instruction,
/// delete it. If that makes any of its operands trivially dead, delete them
/// too, recursively. Return true if a change was made.
-bool RecursivelyDeleteDeadPHINode(PHINode *PN);
+bool RecursivelyDeleteDeadPHINode(PHINode *PN, const TargetLibraryInfo *TLI=0);
/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
@@ -81,7 +85,8 @@ bool RecursivelyDeleteDeadPHINode(PHINode *PN);
///
/// This returns true if it changed the code, note that it can delete
/// instructions in other blocks as well in this block.
-bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD = 0);
+bool SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD = 0,
+ const TargetLibraryInfo *TLI = 0);
//===----------------------------------------------------------------------===//
// Control Flow Graph Restructuring.
@@ -99,7 +104,7 @@ bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD = 0);
/// .. and delete the predecessor corresponding to the '1', this will attempt to
/// recursively fold the 'and' to 0.
void RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
- TargetData *TD = 0);
+ DataLayout *TD = 0);
/// MergeBasicBlockIntoOnlyPred - BB is a block with one predecessor and its
@@ -130,7 +135,8 @@ bool EliminateDuplicatePHINodes(BasicBlock *BB);
/// of the CFG. It returns true if a modification was made, possibly deleting
/// the basic block that was pointed to.
///
-bool SimplifyCFG(BasicBlock *BB, const TargetData *TD = 0);
+bool SimplifyCFG(BasicBlock *BB, const DataLayout *TD = 0,
+ const TargetTransformInfo *TTI = 0);
/// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch,
/// and if a predecessor branches to us and one of our successors, fold the
@@ -158,10 +164,10 @@ AllocaInst *DemotePHIToStack(PHINode *P, Instruction *AllocaPoint = 0);
/// and it is more than the alignment of the ultimate object, see if we can
/// increase the alignment of the ultimate object, making this check succeed.
unsigned getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
- const TargetData *TD = 0);
+ const DataLayout *TD = 0);
/// getKnownAlignment - Try to infer an alignment for the specified pointer.
-static inline unsigned getKnownAlignment(Value *V, const TargetData *TD = 0) {
+static inline unsigned getKnownAlignment(Value *V, const DataLayout *TD = 0) {
return getOrEnforceKnownAlignment(V, 0, TD);
}
@@ -171,7 +177,7 @@ static inline unsigned getKnownAlignment(Value *V, const TargetData *TD = 0) {
/// When NoAssumptions is true, no assumptions about index computation not
/// overflowing is made.
template<typename IRBuilderTy>
-Value *EmitGEPOffset(IRBuilderTy *Builder, const TargetData &TD, User *GEP,
+Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &TD, User *GEP,
bool NoAssumptions = false) {
gep_type_iterator GTI = gep_type_begin(GEP);
Type *IntPtrTy = TD.getIntPtrType(GEP->getContext());
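
To show where the new TargetLibraryInfo parameters above get threaded through, here is a minimal sketch of a dead-code sweep over one block; deleteDeadInBlock is hypothetical, while isInstructionTriviallyDead and its TLI argument are exactly as declared in this header.

    #include "llvm/BasicBlock.h"
    #include "llvm/Instruction.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/Local.h"

    using namespace llvm;

    // Delete instructions whose results are unused and that have no side
    // effects; with TLI available, calls to known side-effect-free library
    // functions can be recognized as dead too.
    static bool deleteDeadInBlock(BasicBlock &BB, const TargetLibraryInfo *TLI) {
      bool Changed = false;
      for (BasicBlock::iterator It = BB.begin(), E = BB.end(); It != E; ) {
        Instruction *I = It++;               // advance before erasing
        if (isInstructionTriviallyDead(I, TLI)) {
          I->eraseFromParent();
          Changed = true;
        }
      }
      return Changed;
    }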
diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index 4c821491b210..db65a47e972d 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -109,8 +109,8 @@ public:
private:
Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
- void operator=(const SSAUpdater&); // DO NOT IMPLEMENT
- SSAUpdater(const SSAUpdater&); // DO NOT IMPLEMENT
+ void operator=(const SSAUpdater&) LLVM_DELETED_FUNCTION;
+ SSAUpdater(const SSAUpdater&) LLVM_DELETED_FUNCTION;
};
/// LoadAndStorePromoter - This little helper class provides a convenient way to
diff --git a/include/llvm/Transforms/Utils/SimplifyIndVar.h b/include/llvm/Transforms/Utils/SimplifyIndVar.h
index 2632d186ff9b..7e97e218fb0b 100644
--- a/include/llvm/Transforms/Utils/SimplifyIndVar.h
+++ b/include/llvm/Transforms/Utils/SimplifyIndVar.h
@@ -21,8 +21,6 @@
namespace llvm {
-extern cl::opt<bool> DisableIVRewrite;
-
class CastInst;
class IVUsers;
class Loop;
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
new file mode 100644
index 000000000000..fde452bca235
--- /dev/null
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -0,0 +1,52 @@
+//===- SimplifyLibCalls.h - Library call simplifier -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes an interface for simplifying well-formed calls to C
+// library functions into more optimal forms.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H
+#define LLVM_TRANSFORMS_UTILS_SIMPLIFYLIBCALLS_H
+
+namespace llvm {
+ class Value;
+ class CallInst;
+ class DataLayout;
+ class Instruction;
+ class TargetLibraryInfo;
+ class LibCallSimplifierImpl;
+
+ /// LibCallSimplifier - This class implements a collection of optimizations
+ /// that replace well-formed calls to library functions with a more optimal
+ /// form. For example, replacing 'printf("Hello!\n")' with 'puts("Hello!")'.
+ class LibCallSimplifier {
+ /// Impl - A pointer to the actual implementation of the library call
+ /// simplifier.
+ LibCallSimplifierImpl *Impl;
+ public:
+ LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI);
+ virtual ~LibCallSimplifier();
+
+ /// optimizeCall - Take the given call instruction and return a more
+ /// optimal value to replace the instruction with, or 0 if a more
+ /// optimal form can't be found. Note that the returned value may
+ /// be equal to the instruction being optimized. In this case, all
+ /// other instructions that use the given instruction were modified
+ /// and the given instruction is dead.
+ Value *optimizeCall(CallInst *CI);
+
+ /// replaceAllUsesWith - This method is used when the library call
+ /// simplifier needs to replace instructions other than the library
+ /// call being modified.
+ virtual void replaceAllUsesWith(Instruction *I, Value *With) const;
+ };
+} // End llvm namespace
+
+#endif
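
A sketch of the intended call-site shape for the new class; trySimplify is a hypothetical helper, and constructing the simplifier as LibCallSimplifier(TD, TLI) with the pass's DataLayout and TargetLibraryInfo follows the constructor declared above.

    #include "llvm/Instructions.h"
    #include "llvm/Transforms/Utils/SimplifyLibCalls.h"

    using namespace llvm;

    // Fold one library call if the simplifier can: a distinct return value is
    // substituted for CI; per the comment above, a return equal to CI means
    // its users were already rewritten, so CI is simply erased either way.
    static bool trySimplify(LibCallSimplifier &LCS, CallInst *CI) {
      Value *Repl = LCS.optimizeCall(CI);
      if (!Repl)
        return false;
      if (Repl != CI)
        CI->replaceAllUsesWith(Repl);
      CI->eraseFromParent();
      return true;
    }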
diff --git a/include/llvm/Transforms/Utils/ValueMapper.h b/include/llvm/Transforms/Utils/ValueMapper.h
index 8594707a8482..5390c5e8ed47 100644
--- a/include/llvm/Transforms/Utils/ValueMapper.h
+++ b/include/llvm/Transforms/Utils/ValueMapper.h
@@ -25,7 +25,7 @@ namespace llvm {
/// ValueMapTypeRemapper - This is a class that can be implemented by clients
/// to remap types when cloning constants and instructions.
class ValueMapTypeRemapper {
- virtual void Anchor(); // Out of line method.
+ virtual void anchor(); // Out of line method.
public:
virtual ~ValueMapTypeRemapper() {}
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
index 1e49a9c01e6b..41e53a83e2f8 100644
--- a/include/llvm/Transforms/Vectorize.h
+++ b/include/llvm/Transforms/Vectorize.h
@@ -107,6 +107,12 @@ BasicBlockPass *
createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
//===----------------------------------------------------------------------===//
+//
+// LoopVectorize - Create a loop vectorization pass.
+//
+Pass * createLoopVectorizePass();
+
+//===----------------------------------------------------------------------===//
/// @brief Vectorize the BasicBlock.
///
/// @param BB The BasicBlock to be vectorized
diff --git a/include/llvm/Type.h b/include/llvm/Type.h
index 185258d8ff2a..def45750dd71 100644
--- a/include/llvm/Type.h
+++ b/include/llvm/Type.h
@@ -153,7 +153,7 @@ public:
/// isPPC_FP128Ty - Return true if this is powerpc long double.
bool isPPC_FP128Ty() const { return getTypeID() == PPC_FP128TyID; }
- /// isFloatingPointTy - Return true if this is one of the five floating point
+ /// isFloatingPointTy - Return true if this is one of the six floating point
/// types
bool isFloatingPointTy() const {
return getTypeID() == HalfTyID || getTypeID() == FloatTyID ||
@@ -167,7 +167,7 @@ public:
/// isFPOrFPVectorTy - Return true if this is a FP type or a vector of FP.
///
- bool isFPOrFPVectorTy() const;
+ bool isFPOrFPVectorTy() const { return getScalarType()->isFloatingPointTy(); }
/// isLabelTy - Return true if this is 'label'.
bool isLabelTy() const { return getTypeID() == LabelTyID; }
@@ -185,7 +185,7 @@ public:
/// isIntOrIntVectorTy - Return true if this is an integer type or a vector of
/// integer types.
///
- bool isIntOrIntVectorTy() const;
+ bool isIntOrIntVectorTy() const { return getScalarType()->isIntegerTy(); }
/// isFunctionTy - True if this is an instance of FunctionType.
///
@@ -203,6 +203,11 @@ public:
///
bool isPointerTy() const { return getTypeID() == PointerTyID; }
+ /// isPtrOrPtrVectorTy - Return true if this is a pointer type or a vector of
+ /// pointer types.
+ ///
+ bool isPtrOrPtrVectorTy() const { return getScalarType()->isPointerTy(); }
+
/// isVectorTy - True if this is an instance of VectorType.
///
bool isVectorTy() const { return getTypeID() == VectorTyID; }
@@ -252,7 +257,7 @@ public:
/// isSized - Return true if it makes sense to take the size of this type. To
/// get the actual size for a particular target, it is reasonable to use the
- /// TargetData subsystem to do this.
+ /// DataLayout subsystem to do this.
///
bool isSized() const {
// If it's a primitive, it is always sized.
@@ -276,7 +281,7 @@ public:
///
/// Note that this may not reflect the size of memory allocated for an
/// instance of the type or the number of bytes that are written when an
- /// instance of the type is stored to memory. The TargetData class provides
+ /// instance of the type is stored to memory. The DataLayout class provides
/// additional query functions to provide this information.
///
unsigned getPrimitiveSizeInBits() const;
@@ -293,6 +298,7 @@ public:
/// getScalarType - If this is a vector type, return the element type,
/// otherwise return 'this'.
+ const Type *getScalarType() const;
Type *getScalarType();
//===--------------------------------------------------------------------===//
@@ -340,8 +346,10 @@ public:
unsigned getVectorNumElements() const;
Type *getVectorElementType() const { return getSequentialElementType(); }
- unsigned getPointerAddressSpace() const;
Type *getPointerElementType() const { return getSequentialElementType(); }
+
+ /// \brief Get the address space of this pointer or pointer vector type.
+ unsigned getPointerAddressSpace() const;
//===--------------------------------------------------------------------===//
// Static members exported by the Type class itself. Useful for getting
@@ -389,9 +397,6 @@ public:
static PointerType *getInt32PtrTy(LLVMContext &C, unsigned AS = 0);
static PointerType *getInt64PtrTy(LLVMContext &C, unsigned AS = 0);
- /// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const Type *) { return true; }
-
/// getPointerTo - Return a pointer to the current type. This is equivalent
/// to PointerType::get(Foo, AddrSpace).
PointerType *getPointerTo(unsigned AddrSpace = 0);
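
The scalar-aware predicates added above make vector-of-pointer handling uniform; a small sketch, where addressSpaceOf is a hypothetical helper and Ty may be a plain pointer or a vector of pointers.

    #include "llvm/Type.h"

    using namespace llvm;

    // isPtrOrPtrVectorTy looks through a vector to its scalar element, and
    // getPointerAddressSpace then reports that element's address space.
    static unsigned addressSpaceOf(Type *Ty) {
      if (Ty->isPtrOrPtrVectorTy())
        return Ty->getPointerAddressSpace();
      return 0; // not a pointer (or pointer vector) type
    }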
diff --git a/include/llvm/Use.h b/include/llvm/Use.h
index a496325c1fc6..80804459cc33 100644
--- a/include/llvm/Use.h
+++ b/include/llvm/Use.h
@@ -26,6 +26,7 @@
#define LLVM_USE_H
#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/Support/Compiler.h"
#include <cstddef>
#include <iterator>
@@ -66,7 +67,7 @@ public:
private:
/// Copy ctor - do not implement
- Use(const Use &U);
+ Use(const Use &U) LLVM_DELETED_FUNCTION;
/// Destructor - Only for zap()
~Use() {
diff --git a/include/llvm/User.h b/include/llvm/User.h
index 5d5460cd6fff..df303d0dd5f2 100644
--- a/include/llvm/User.h
+++ b/include/llvm/User.h
@@ -31,8 +31,8 @@ template <class>
struct OperandTraits;
class User : public Value {
- User(const User &); // Do not implement
- void *operator new(size_t); // Do not implement
+ User(const User &) LLVM_DELETED_FUNCTION;
+ void *operator new(size_t) LLVM_DELETED_FUNCTION;
template <unsigned>
friend struct HungoffOperandTraits;
virtual void anchor();
@@ -104,7 +104,7 @@ public:
assert(i < NumOperands && "getOperandUse() out of range!");
return OperandList[i];
}
-
+
unsigned getNumOperands() const { return NumOperands; }
// ---------------------------------------------------------------------------
@@ -118,6 +118,45 @@ public:
inline op_iterator op_end() { return OperandList+NumOperands; }
inline const_op_iterator op_end() const { return OperandList+NumOperands; }
+ /// Convenience iterator for directly iterating over the Values in the
+ /// OperandList
+ class value_op_iterator : public std::iterator<std::forward_iterator_tag,
+ Value*> {
+ op_iterator OI;
+ public:
+ explicit value_op_iterator(Use *U) : OI(U) {}
+
+ bool operator==(const value_op_iterator &x) const {
+ return OI == x.OI;
+ }
+ bool operator!=(const value_op_iterator &x) const {
+ return !operator==(x);
+ }
+
+ /// Iterator traversal: forward iteration only
+ value_op_iterator &operator++() { // Preincrement
+ ++OI;
+ return *this;
+ }
+ value_op_iterator operator++(int) { // Postincrement
+ value_op_iterator tmp = *this; ++*this; return tmp;
+ }
+
+ /// Retrieve a pointer to the current Value.
+ Value *operator*() const {
+ return *OI;
+ }
+
+ Value *operator->() const { return operator*(); }
+ };
+
+ inline value_op_iterator value_op_begin() {
+ return value_op_iterator(op_begin());
+ }
+ inline value_op_iterator value_op_end() {
+ return value_op_iterator(op_end());
+ }
+
// dropAllReferences() - This function is in charge of "letting go" of all
// objects that this User refers to. This allows one to
// 'delete' a whole class at a time, even though there may be circular
@@ -137,7 +176,6 @@ public:
void replaceUsesOfWith(Value *From, Value *To);
// Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const User *) { return true; }
static inline bool classof(const Value *V) {
return isa<Instruction>(V) || isa<Constant>(V);
}
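
A quick sketch of the new value_op_iterator in use; countUsesAsOperand is hypothetical, but the value_op_begin/value_op_end pair and the Value* it yields match the code added above.

    #include "llvm/User.h"
    #include "llvm/Value.h"

    using namespace llvm;

    // Count how many operand slots of U hold Target, without touching Use.
    static unsigned countUsesAsOperand(User &U, const Value *Target) {
      unsigned N = 0;
      for (User::value_op_iterator I = U.value_op_begin(),
                                   E = U.value_op_end(); I != E; ++I)
        if (*I == Target)
          ++N;
      return N;
    }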
diff --git a/include/llvm/Value.h b/include/llvm/Value.h
index a82ac45c49ed..5b19435ebaf4 100644
--- a/include/llvm/Value.h
+++ b/include/llvm/Value.h
@@ -16,6 +16,7 @@
#include "llvm/Use.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -80,8 +81,8 @@ private:
friend class ValueHandleBase;
ValueName *Name;
- void operator=(const Value &); // Do not implement
- Value(const Value &); // Do not implement
+ void operator=(const Value &) LLVM_DELETED_FUNCTION;
+ Value(const Value &) LLVM_DELETED_FUNCTION;
protected:
/// printCustom - Value subclasses can override this to implement custom
@@ -120,7 +121,7 @@ public:
/// setName() - Change the name of the value, choosing a new unique name if
/// the provided name is taken.
///
- /// \arg Name - The new name; or "" if the value's name should be removed.
+ /// \param Name The new name; or "" if the value's name should be removed.
void setName(const Twine &Name);
@@ -256,11 +257,6 @@ public:
/// hasValueHandle - Return true if there is a value handle associated with
/// this value.
bool hasValueHandle() const { return HasValueHandle; }
-
- // Methods for support type inquiry through isa, cast, and dyn_cast:
- static inline bool classof(const Value *) {
- return true; // Values are always values.
- }
/// stripPointerCasts - This method strips off any unneeded pointer casts and
/// all-zero GEPs from the specified value, returning the original uncasted
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index 3b6aab13a568..752edd52b454 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -35,7 +35,8 @@
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/Type.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
// Register the AliasAnalysis interface, providing a nice name to refer to.
@@ -451,7 +452,8 @@ AliasAnalysis::~AliasAnalysis() {}
/// AliasAnalysis interface before any other methods are called.
///
void AliasAnalysis::InitializeAliasAnalysis(Pass *P) {
- TD = P->getAnalysisIfAvailable<TargetData>();
+ TD = P->getAnalysisIfAvailable<DataLayout>();
+ TLI = P->getAnalysisIfAvailable<TargetLibraryInfo>();
AA = &P->getAnalysis<AliasAnalysis>();
}
@@ -461,7 +463,7 @@ void AliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AliasAnalysis>(); // All AA's chain
}
-/// getTypeStoreSize - Return the TargetData store size for the given type,
+/// getTypeStoreSize - Return the DataLayout store size for the given type,
/// if known, or a conservative value otherwise.
///
uint64_t AliasAnalysis::getTypeStoreSize(Type *Ty) {
@@ -501,7 +503,7 @@ bool AliasAnalysis::canInstructionRangeModify(const Instruction &I1,
bool llvm::isNoAliasCall(const Value *V) {
if (isa<CallInst>(V) || isa<InvokeInst>(V))
return ImmutableCallSite(cast<Instruction>(V))
- .paramHasAttr(0, Attribute::NoAlias);
+ .paramHasAttr(0, Attributes::NoAlias);
return false;
}
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 92e89068e440..388c755cbd31 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -18,7 +18,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/Pass.h"
#include "llvm/Type.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -550,7 +550,7 @@ void AliasSetTracker::copyValue(Value *From, Value *To) {
//===----------------------------------------------------------------------===//
void AliasSet::print(raw_ostream &OS) const {
- OS << " AliasSet[" << (void*)this << ", " << RefCount << "] ";
+ OS << " AliasSet[" << (const void*)this << ", " << RefCount << "] ";
OS << (AliasTy == MustAlias ? "must" : "may") << " alias, ";
switch (AccessTy) {
case NoModRef: OS << "No access "; break;
@@ -590,8 +590,10 @@ void AliasSetTracker::print(raw_ostream &OS) const {
OS << "\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void AliasSet::dump() const { print(dbgs()); }
void AliasSetTracker::dump() const { print(dbgs()); }
+#endif
//===----------------------------------------------------------------------===//
// ASTCallbackVH Class Implementation
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 0ba6af93b511..9dc81a6a630f 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -26,11 +26,13 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeBasicAliasAnalysisPass(Registry);
initializeBlockFrequencyInfoPass(Registry);
initializeBranchProbabilityInfoPass(Registry);
+ initializeCostModelAnalysisPass(Registry);
initializeCFGViewerPass(Registry);
initializeCFGPrinterPass(Registry);
initializeCFGOnlyViewerPass(Registry);
initializeCFGOnlyPrinterPass(Registry);
initializePrintDbgInfoPass(Registry);
+ initializeDependenceAnalysisPass(Registry);
initializeDominanceFrontierPass(Registry);
initializeDomViewerPass(Registry);
initializeDomPrinterPass(Registry);
@@ -46,7 +48,6 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializeLazyValueInfoPass(Registry);
initializeLibCallAliasAnalysisPass(Registry);
initializeLintPass(Registry);
- initializeLoopDependenceAnalysisPass(Registry);
initializeLoopInfoPass(Registry);
initializeMemDepPrinterPass(Registry);
initializeMemoryDependenceAnalysisPass(Registry);
@@ -61,6 +62,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
initializePathProfileLoaderPassPass(Registry);
initializeProfileVerifierPassPass(Registry);
initializePathProfileVerifierPass(Registry);
+ initializeProfileMetadataLoaderPassPass(Registry);
initializeRegionInfoPass(Registry);
initializeRegionViewerPass(Registry);
initializeRegionPrinterPass(Registry);
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 1d028c27b8c3..4bb93ee88a49 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -29,7 +29,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -58,12 +58,12 @@ static bool isNonEscapingLocalObject(const Value *V) {
// then it has not escaped before entering the function. Check if it escapes
// inside the function.
if (const Argument *A = dyn_cast<Argument>(V))
- if (A->hasByValAttr() || A->hasNoAliasAttr()) {
- // Don't bother analyzing arguments already known not to escape.
- if (A->hasNoCaptureAttr())
- return true;
+ if (A->hasByValAttr() || A->hasNoAliasAttr())
+ // Note that even if the argument is marked nocapture, we still need to check
+ // for copies made inside the function. The nocapture attribute only
+ // specifies that there are no copies made that outlive the function.
return !PointerMayBeCaptured(V, false, /*StoreCaptures=*/true);
- }
+
return false;
}
@@ -84,10 +84,11 @@ static bool isEscapeSource(const Value *V) {
/// getObjectSize - Return the size of the object specified by V, or
/// UnknownSize if unknown.
-static uint64_t getObjectSize(const Value *V, const TargetData &TD,
+static uint64_t getObjectSize(const Value *V, const DataLayout &TD,
+ const TargetLibraryInfo &TLI,
bool RoundToAlign = false) {
uint64_t Size;
- if (getObjectSize(V, Size, &TD, RoundToAlign))
+ if (getObjectSize(V, Size, &TD, &TLI, RoundToAlign))
return Size;
return AliasAnalysis::UnknownSize;
}
@@ -95,10 +96,11 @@ static uint64_t getObjectSize(const Value *V, const TargetData &TD,
/// isObjectSmallerThan - Return true if we can prove that the object specified
/// by V is smaller than Size.
static bool isObjectSmallerThan(const Value *V, uint64_t Size,
- const TargetData &TD) {
+ const DataLayout &TD,
+ const TargetLibraryInfo &TLI) {
// This function needs to use the aligned object size because we allow
// reads a bit past the end given sufficient alignment.
- uint64_t ObjectSize = getObjectSize(V, TD, /*RoundToAlign*/true);
+ uint64_t ObjectSize = getObjectSize(V, TD, TLI, /*RoundToAlign*/true);
return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize < Size;
}
@@ -106,8 +108,8 @@ static bool isObjectSmallerThan(const Value *V, uint64_t Size,
/// isObjectSize - Return true if we can prove that the object specified
/// by V has size Size.
static bool isObjectSize(const Value *V, uint64_t Size,
- const TargetData &TD) {
- uint64_t ObjectSize = getObjectSize(V, TD);
+ const DataLayout &TD, const TargetLibraryInfo &TLI) {
+ uint64_t ObjectSize = getObjectSize(V, TD, TLI);
return ObjectSize != AliasAnalysis::UnknownSize && ObjectSize == Size;
}
@@ -126,6 +128,15 @@ namespace {
const Value *V;
ExtensionKind Extension;
int64_t Scale;
+
+ bool operator==(const VariableGEPIndex &Other) const {
+ return V == Other.V && Extension == Other.Extension &&
+ Scale == Other.Scale;
+ }
+
+ bool operator!=(const VariableGEPIndex &Other) const {
+ return !operator==(Other);
+ }
};
}
@@ -140,7 +151,7 @@ namespace {
/// represented in the result.
static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
ExtensionKind &Extension,
- const TargetData &TD, unsigned Depth) {
+ const DataLayout &TD, unsigned Depth) {
assert(V->getType()->isIntegerTy() && "Not an integer value");
// Limit our recursion depth.
@@ -215,14 +226,14 @@ static Value *GetLinearExpression(Value *V, APInt &Scale, APInt &Offset,
/// specified amount, but which may have other unrepresented high bits. As such,
/// the gep cannot necessarily be reconstructed from its decomposed form.
///
-/// When TargetData is around, this function is capable of analyzing everything
+/// When DataLayout is around, this function is capable of analyzing everything
/// that GetUnderlyingObject can look through. When not, it just looks
/// through pointer casts.
///
static const Value *
DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
SmallVectorImpl<VariableGEPIndex> &VarIndices,
- const TargetData *TD) {
+ const DataLayout *TD) {
// Limit recursion depth to limit compile time in crazy cases.
unsigned MaxLookup = 6;
@@ -266,7 +277,7 @@ DecomposeGEPExpression(const Value *V, int64_t &BaseOffs,
->getElementType()->isSized())
return V;
- // If we are lacking TargetData information, we can't compute the offets of
+ // If we are lacking DataLayout information, we can't compute the offsets of
// elements computed by GEPs. However, we can handle bitcast equivalent
// GEPs.
if (TD == 0) {
@@ -417,13 +428,7 @@ namespace {
/// BasicAliasAnalysis - This is the primary alias analysis implementation.
struct BasicAliasAnalysis : public ImmutablePass, public AliasAnalysis {
static char ID; // Class identification, replacement for typeinfo
- BasicAliasAnalysis() : ImmutablePass(ID),
- // AliasCache rarely has more than 1 or 2 elements,
- // so start it off fairly small so that clear()
- // doesn't have to tromp through 64 (the default)
- // elements on each alias query. This really wants
- // something like a SmallDenseMap.
- AliasCache(8) {
+ BasicAliasAnalysis() : ImmutablePass(ID) {
initializeBasicAliasAnalysisPass(*PassRegistry::getPassRegistry());
}
@@ -443,7 +448,11 @@ namespace {
"BasicAliasAnalysis doesn't support interprocedural queries.");
AliasResult Alias = aliasCheck(LocA.Ptr, LocA.Size, LocA.TBAATag,
LocB.Ptr, LocB.Size, LocB.TBAATag);
- AliasCache.clear();
+ // AliasCache rarely has more than 1 or 2 elements; always use
+ // shrink_and_clear so it quickly returns to the inline capacity of the
+ // SmallDenseMap if it ever grows larger.
+ // FIXME: This should really be shrink_to_inline_capacity_and_clear().
+ AliasCache.shrink_and_clear();
return Alias;
}
@@ -481,7 +490,7 @@ namespace {
private:
// AliasCache - Track alias queries to guard against recursion.
typedef std::pair<Location, Location> LocPair;
- typedef DenseMap<LocPair, AliasResult> AliasCacheTy;
+ typedef SmallDenseMap<LocPair, AliasResult, 8> AliasCacheTy;
AliasCacheTy AliasCache;
// Visited - Track instructions visited by pointsToConstantMemory.
@@ -490,6 +499,7 @@ namespace {
// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP
// instruction against another.
AliasResult aliasGEP(const GEPOperator *V1, uint64_t V1Size,
+ const MDNode *V1TBAAInfo,
const Value *V2, uint64_t V2Size,
const MDNode *V2TBAAInfo,
const Value *UnderlyingV1, const Value *UnderlyingV2);
@@ -807,6 +817,21 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
return ModRefResult(AliasAnalysis::getModRefInfo(CS, Loc) & Min);
}
+static bool areVarIndicesEqual(SmallVector<VariableGEPIndex, 4> &Indices1,
+ SmallVector<VariableGEPIndex, 4> &Indices2) {
+ unsigned Size1 = Indices1.size();
+ unsigned Size2 = Indices2.size();
+
+ if (Size1 != Size2)
+ return false;
+
+ for (unsigned I = 0; I != Size1; ++I)
+ if (Indices1[I] != Indices2[I])
+ return false;
+
+ return true;
+}
+
/// aliasGEP - Provide a bunch of ad-hoc rules to disambiguate a GEP instruction
/// against another pointer. We know that V1 is a GEP, but we don't know
/// anything about V2. UnderlyingV1 is GetUnderlyingObject(GEP1, TD),
@@ -814,6 +839,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS,
///
AliasAnalysis::AliasResult
BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
+ const MDNode *V1TBAAInfo,
const Value *V2, uint64_t V2Size,
const MDNode *V2TBAAInfo,
const Value *UnderlyingV1,
@@ -821,9 +847,41 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
int64_t GEP1BaseOffset;
SmallVector<VariableGEPIndex, 4> GEP1VariableIndices;
- // If we have two gep instructions with must-alias'ing base pointers, figure
- // out if the indexes to the GEP tell us anything about the derived pointer.
+ // If we have two gep instructions with must-aliasing or non-aliasing base
+ // pointers, figure out if the indexes to the GEP tell us anything about the
+ // derived pointer.
if (const GEPOperator *GEP2 = dyn_cast<GEPOperator>(V2)) {
+ // Check for geps of non-aliasing underlying pointers where the offsets are
+ // identical.
+ if (V1Size == V2Size) {
+ // Do the base pointers alias assuming type and size.
+ AliasResult PreciseBaseAlias = aliasCheck(UnderlyingV1, V1Size,
+ V1TBAAInfo, UnderlyingV2,
+ V2Size, V2TBAAInfo);
+ if (PreciseBaseAlias == NoAlias) {
+ // See if the computed offset from the common pointer tells us about the
+ // relation of the resulting pointer.
+ int64_t GEP2BaseOffset;
+ SmallVector<VariableGEPIndex, 4> GEP2VariableIndices;
+ const Value *GEP2BasePtr =
+ DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
+ const Value *GEP1BasePtr =
+ DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
+ // DecomposeGEPExpression and GetUnderlyingObject should return the
+ // same result except when DecomposeGEPExpression has no DataLayout.
+ if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
+ assert(TD == 0 &&
+ "DecomposeGEPExpression and GetUnderlyingObject disagree!");
+ return MayAlias;
+ }
+ // Same offsets.
+ if (GEP1BaseOffset == GEP2BaseOffset &&
+ areVarIndicesEqual(GEP1VariableIndices, GEP2VariableIndices))
+ return NoAlias;
+ GEP1VariableIndices.clear();
+ }
+ }
+
// Do the base pointers alias?
AliasResult BaseAlias = aliasCheck(UnderlyingV1, UnknownSize, 0,
UnderlyingV2, UnknownSize, 0);
@@ -843,9 +901,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
const Value *GEP2BasePtr =
DecomposeGEPExpression(GEP2, GEP2BaseOffset, GEP2VariableIndices, TD);
- // If DecomposeGEPExpression isn't able to look all the way through the
- // addressing operation, we must not have TD and this is too complex for us
- // to handle without it.
+ // DecomposeGEPExpression and GetUnderlyingObject should return the
+ // same result except when DecomposeGEPExpression has no DataLayout.
if (GEP1BasePtr != UnderlyingV1 || GEP2BasePtr != UnderlyingV2) {
assert(TD == 0 &&
"DecomposeGEPExpression and GetUnderlyingObject disagree!");
@@ -879,9 +936,8 @@ BasicAliasAnalysis::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
const Value *GEP1BasePtr =
DecomposeGEPExpression(GEP1, GEP1BaseOffset, GEP1VariableIndices, TD);
- // If DecomposeGEPExpression isn't able to look all the way through the
- // addressing operation, we must not have TD and this is too complex for us
- // to handle without it.
+ // DecomposeGEPExpression and GetUnderlyingObject should return the
+ // same result except when DecomposeGEPExpression has no DataLayout.
if (GEP1BasePtr != UnderlyingV1) {
assert(TD == 0 &&
"DecomposeGEPExpression and GetUnderlyingObject disagree!");
@@ -1004,12 +1060,42 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
// on corresponding edges.
if (const PHINode *PN2 = dyn_cast<PHINode>(V2))
if (PN2->getParent() == PN->getParent()) {
+ LocPair Locs(Location(PN, PNSize, PNTBAAInfo),
+ Location(V2, V2Size, V2TBAAInfo));
+ if (PN > V2)
+ std::swap(Locs.first, Locs.second);
+
AliasResult Alias =
aliasCheck(PN->getIncomingValue(0), PNSize, PNTBAAInfo,
PN2->getIncomingValueForBlock(PN->getIncomingBlock(0)),
V2Size, V2TBAAInfo);
if (Alias == MayAlias)
return MayAlias;
+
+ // If the first sources of the PHI nodes do not alias and the other inputs
+ // are the PHI nodes themselves (through some recursion), they add no new
+ // information, so just return NoAlias.
+ // bb:
+ // ptr = ptr2 + 1
+ // loop:
+ // ptr_phi = phi [bb, ptr], [loop, ptr_plus_one]
+ // ptr2_phi = phi [bb, ptr2], [loop, ptr2_plus_one]
+ // ...
+ // ptr_plus_one = gep ptr_phi, 1
+ // ptr2_plus_one = gep ptr2_phi, 1
+ // We assume for the recursion that the phis (ptr_phi, ptr2_phi) do
+ // not alias each other.
+ bool ArePhisAssumedNoAlias = false;
+ AliasResult OrigAliasResult = NoAlias;
+ if (Alias == NoAlias) {
+ // Pretend the phis do not alias.
+ assert(AliasCache.count(Locs) &&
+ "There must exist an entry for the phi node");
+ OrigAliasResult = AliasCache[Locs];
+ AliasCache[Locs] = NoAlias;
+ ArePhisAssumedNoAlias = true;
+ }
+
for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
AliasResult ThisAlias =
aliasCheck(PN->getIncomingValue(i), PNSize, PNTBAAInfo,
@@ -1019,6 +1105,11 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
if (Alias == MayAlias)
break;
}
+
+ // Reset if speculation failed.
+ if (ArePhisAssumedNoAlias && Alias != NoAlias)
+ AliasCache[Locs] = OrigAliasResult;
+
return Alias;
}
@@ -1133,8 +1224,8 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
// If the size of one access is larger than the entire object on the other
// side, then we know such behavior is undefined and can assume no alias.
if (TD)
- if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD)) ||
- (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD)))
+ if ((V1Size != UnknownSize && isObjectSmallerThan(O2, V1Size, *TD, *TLI)) ||
+ (V2Size != UnknownSize && isObjectSmallerThan(O1, V2Size, *TD, *TLI)))
return NoAlias;
// Check the cache before climbing up use-def chains. This also terminates
@@ -1154,15 +1245,17 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
std::swap(V1, V2);
std::swap(V1Size, V2Size);
std::swap(O1, O2);
+ std::swap(V1TBAAInfo, V2TBAAInfo);
}
if (const GEPOperator *GV1 = dyn_cast<GEPOperator>(V1)) {
- AliasResult Result = aliasGEP(GV1, V1Size, V2, V2Size, V2TBAAInfo, O1, O2);
+ AliasResult Result = aliasGEP(GV1, V1Size, V1TBAAInfo, V2, V2Size, V2TBAAInfo, O1, O2);
if (Result != MayAlias) return AliasCache[Locs] = Result;
}
if (isa<PHINode>(V2) && !isa<PHINode>(V1)) {
std::swap(V1, V2);
std::swap(V1Size, V2Size);
+ std::swap(V1TBAAInfo, V2TBAAInfo);
}
if (const PHINode *PN = dyn_cast<PHINode>(V1)) {
AliasResult Result = aliasPHI(PN, V1Size, V1TBAAInfo,
@@ -1173,6 +1266,7 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
if (isa<SelectInst>(V2) && !isa<SelectInst>(V1)) {
std::swap(V1, V2);
std::swap(V1Size, V2Size);
+ std::swap(V1TBAAInfo, V2TBAAInfo);
}
if (const SelectInst *S1 = dyn_cast<SelectInst>(V1)) {
AliasResult Result = aliasSelect(S1, V1Size, V1TBAAInfo,
@@ -1184,8 +1278,8 @@ BasicAliasAnalysis::aliasCheck(const Value *V1, uint64_t V1Size,
// accesses is accessing the entire object, then the accesses must
// overlap in some way.
if (TD && O1 == O2)
- if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD)) ||
- (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD)))
+ if ((V1Size != UnknownSize && isObjectSize(O1, V1Size, *TD, *TLI)) ||
+ (V2Size != UnknownSize && isObjectSize(O2, V2Size, *TD, *TLI)))
return AliasCache[Locs] = PartialAlias;
AliasResult Result =
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index b255ce6dba51..04a6560262cb 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -115,14 +115,14 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
return false;
}
- SmallPtrSet<BasicBlock *, 4> UnreachableEdges;
- SmallPtrSet<BasicBlock *, 4> ReachableEdges;
+ SmallVector<unsigned, 4> UnreachableEdges;
+ SmallVector<unsigned, 4> ReachableEdges;
for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
if (PostDominatedByUnreachable.count(*I))
- UnreachableEdges.insert(*I);
+ UnreachableEdges.push_back(I.getSuccessorIndex());
else
- ReachableEdges.insert(*I);
+ ReachableEdges.push_back(I.getSuccessorIndex());
}
// If all successors are in the set of blocks post-dominated by unreachable,
@@ -136,18 +136,19 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(BasicBlock *BB) {
return false;
uint32_t UnreachableWeight =
- std::max(UR_TAKEN_WEIGHT / UnreachableEdges.size(), MIN_WEIGHT);
- for (SmallPtrSet<BasicBlock *, 4>::iterator I = UnreachableEdges.begin(),
- E = UnreachableEdges.end();
+ std::max(UR_TAKEN_WEIGHT / (unsigned)UnreachableEdges.size(), MIN_WEIGHT);
+ for (SmallVector<unsigned, 4>::iterator I = UnreachableEdges.begin(),
+ E = UnreachableEdges.end();
I != E; ++I)
setEdgeWeight(BB, *I, UnreachableWeight);
if (ReachableEdges.empty())
return true;
uint32_t ReachableWeight =
- std::max(UR_NONTAKEN_WEIGHT / ReachableEdges.size(), NORMAL_WEIGHT);
- for (SmallPtrSet<BasicBlock *, 4>::iterator I = ReachableEdges.begin(),
- E = ReachableEdges.end();
+ std::max(UR_NONTAKEN_WEIGHT / (unsigned)ReachableEdges.size(),
+ NORMAL_WEIGHT);
+ for (SmallVector<unsigned, 4>::iterator I = ReachableEdges.begin(),
+ E = ReachableEdges.end();
I != E; ++I)
setEdgeWeight(BB, *I, ReachableWeight);
@@ -187,7 +188,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(BasicBlock *BB) {
}
assert(Weights.size() == TI->getNumSuccessors() && "Checked above");
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- setEdgeWeight(BB, TI->getSuccessor(i), Weights[i]);
+ setEdgeWeight(BB, i, Weights[i]);
return true;
}
@@ -211,19 +212,17 @@ bool BranchProbabilityInfo::calcPointerHeuristics(BasicBlock *BB) {
assert(CI->getOperand(1)->getType()->isPointerTy());
- BasicBlock *Taken = BI->getSuccessor(0);
- BasicBlock *NonTaken = BI->getSuccessor(1);
-
// p != 0 -> isProb = true
// p == 0 -> isProb = false
// p != q -> isProb = true
// p == q -> isProb = false;
+ unsigned TakenIdx = 0, NonTakenIdx = 1;
bool isProb = CI->getPredicate() == ICmpInst::ICMP_NE;
if (!isProb)
- std::swap(Taken, NonTaken);
+ std::swap(TakenIdx, NonTakenIdx);
- setEdgeWeight(BB, Taken, PH_TAKEN_WEIGHT);
- setEdgeWeight(BB, NonTaken, PH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, TakenIdx, PH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, NonTakenIdx, PH_NONTAKEN_WEIGHT);
return true;
}
@@ -234,17 +233,17 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
if (!L)
return false;
- SmallPtrSet<BasicBlock *, 8> BackEdges;
- SmallPtrSet<BasicBlock *, 8> ExitingEdges;
- SmallPtrSet<BasicBlock *, 8> InEdges; // Edges from header to the loop.
+ SmallVector<unsigned, 8> BackEdges;
+ SmallVector<unsigned, 8> ExitingEdges;
+ SmallVector<unsigned, 8> InEdges; // Edges from header to the loop.
for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
if (!L->contains(*I))
- ExitingEdges.insert(*I);
+ ExitingEdges.push_back(I.getSuccessorIndex());
else if (L->getHeader() == *I)
- BackEdges.insert(*I);
+ BackEdges.push_back(I.getSuccessorIndex());
else
- InEdges.insert(*I);
+ InEdges.push_back(I.getSuccessorIndex());
}
if (uint32_t numBackEdges = BackEdges.size()) {
@@ -252,10 +251,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
if (backWeight < NORMAL_WEIGHT)
backWeight = NORMAL_WEIGHT;
- for (SmallPtrSet<BasicBlock *, 8>::iterator EI = BackEdges.begin(),
+ for (SmallVector<unsigned, 8>::iterator EI = BackEdges.begin(),
EE = BackEdges.end(); EI != EE; ++EI) {
- BasicBlock *Back = *EI;
- setEdgeWeight(BB, Back, backWeight);
+ setEdgeWeight(BB, *EI, backWeight);
}
}
@@ -264,10 +262,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
if (inWeight < NORMAL_WEIGHT)
inWeight = NORMAL_WEIGHT;
- for (SmallPtrSet<BasicBlock *, 8>::iterator EI = InEdges.begin(),
+ for (SmallVector<unsigned, 8>::iterator EI = InEdges.begin(),
EE = InEdges.end(); EI != EE; ++EI) {
- BasicBlock *Back = *EI;
- setEdgeWeight(BB, Back, inWeight);
+ setEdgeWeight(BB, *EI, inWeight);
}
}
@@ -276,10 +273,9 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(BasicBlock *BB) {
if (exitWeight < MIN_WEIGHT)
exitWeight = MIN_WEIGHT;
- for (SmallPtrSet<BasicBlock *, 8>::iterator EI = ExitingEdges.begin(),
+ for (SmallVector<unsigned, 8>::iterator EI = ExitingEdges.begin(),
EE = ExitingEdges.end(); EI != EE; ++EI) {
- BasicBlock *Exiting = *EI;
- setEdgeWeight(BB, Exiting, exitWeight);
+ setEdgeWeight(BB, *EI, exitWeight);
}
}
@@ -335,14 +331,13 @@ bool BranchProbabilityInfo::calcZeroHeuristics(BasicBlock *BB) {
return false;
}
- BasicBlock *Taken = BI->getSuccessor(0);
- BasicBlock *NonTaken = BI->getSuccessor(1);
+ unsigned TakenIdx = 0, NonTakenIdx = 1;
if (!isProb)
- std::swap(Taken, NonTaken);
+ std::swap(TakenIdx, NonTakenIdx);
- setEdgeWeight(BB, Taken, ZH_TAKEN_WEIGHT);
- setEdgeWeight(BB, NonTaken, ZH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, TakenIdx, ZH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, NonTakenIdx, ZH_NONTAKEN_WEIGHT);
return true;
}
@@ -372,14 +367,13 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(BasicBlock *BB) {
return false;
}
- BasicBlock *Taken = BI->getSuccessor(0);
- BasicBlock *NonTaken = BI->getSuccessor(1);
+ unsigned TakenIdx = 0, NonTakenIdx = 1;
if (!isProb)
- std::swap(Taken, NonTaken);
+ std::swap(TakenIdx, NonTakenIdx);
- setEdgeWeight(BB, Taken, FPH_TAKEN_WEIGHT);
- setEdgeWeight(BB, NonTaken, FPH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, TakenIdx, FPH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, NonTakenIdx, FPH_NONTAKEN_WEIGHT);
return true;
}
@@ -389,11 +383,8 @@ bool BranchProbabilityInfo::calcInvokeHeuristics(BasicBlock *BB) {
if (!II)
return false;
- BasicBlock *Normal = II->getNormalDest();
- BasicBlock *Unwind = II->getUnwindDest();
-
- setEdgeWeight(BB, Normal, IH_TAKEN_WEIGHT);
- setEdgeWeight(BB, Unwind, IH_NONTAKEN_WEIGHT);
+ setEdgeWeight(BB, 0/*Index for Normal*/, IH_TAKEN_WEIGHT);
+ setEdgeWeight(BB, 1/*Index for Unwind*/, IH_NONTAKEN_WEIGHT);
return true;
}
@@ -450,8 +441,7 @@ uint32_t BranchProbabilityInfo::getSumForBlock(const BasicBlock *BB) const {
uint32_t Sum = 0;
for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
- const BasicBlock *Succ = *I;
- uint32_t Weight = getEdgeWeight(BB, Succ);
+ uint32_t Weight = getEdgeWeight(BB, I.getSuccessorIndex());
uint32_t PrevSum = Sum;
Sum += Weight;
@@ -494,11 +484,13 @@ BasicBlock *BranchProbabilityInfo::getHotSucc(BasicBlock *BB) const {
return 0;
}
-// Return edge's weight. If can't find it, return DEFAULT_WEIGHT value.
+/// Get the raw edge weight for the edge. If it cannot be found, return the
+/// DEFAULT_WEIGHT value. Here an edge is specified by its source block and
+/// an index into that block's successor list.
uint32_t BranchProbabilityInfo::
-getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
- Edge E(Src, Dst);
- DenseMap<Edge, uint32_t>::const_iterator I = Weights.find(E);
+getEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors) const {
+ DenseMap<Edge, uint32_t>::const_iterator I =
+ Weights.find(std::make_pair(Src, IndexInSuccessors));
if (I != Weights.end())
return I->second;
@@ -506,15 +498,43 @@ getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
return DEFAULT_WEIGHT;
}
+/// Get the raw edge weight calculated for the block pair. This returns the sum
+/// of all raw edge weights from Src to Dst.
+uint32_t BranchProbabilityInfo::
+getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const {
+ uint32_t Weight = 0;
+ DenseMap<Edge, uint32_t>::const_iterator MapI;
+ for (succ_const_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I)
+ if (*I == Dst) {
+ MapI = Weights.find(std::make_pair(Src, I.getSuccessorIndex()));
+ if (MapI != Weights.end())
+ Weight += MapI->second;
+ }
+ return (Weight == 0) ? DEFAULT_WEIGHT : Weight;
+}
+
+/// Set the edge weight for a given edge specified by PredBlock and an index
+/// to the successors.
void BranchProbabilityInfo::
-setEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst, uint32_t Weight) {
- Weights[std::make_pair(Src, Dst)] = Weight;
+setEdgeWeight(const BasicBlock *Src, unsigned IndexInSuccessors,
+ uint32_t Weight) {
+ Weights[std::make_pair(Src, IndexInSuccessors)] = Weight;
DEBUG(dbgs() << "set edge " << Src->getName() << " -> "
- << Dst->getName() << " weight to " << Weight
- << (isEdgeHot(Src, Dst) ? " [is HOT now]\n" : "\n"));
+ << IndexInSuccessors << " successor weight to "
+ << Weight << "\n");
}
+/// Get an edge's probability, relative to other out-edges from Src.
+BranchProbability BranchProbabilityInfo::
+getEdgeProbability(const BasicBlock *Src, unsigned IndexInSuccessors) const {
+ uint32_t N = getEdgeWeight(Src, IndexInSuccessors);
+ uint32_t D = getSumForBlock(Src);
+
+ return BranchProbability(N, D);
+}
+/// Get the probability of going from Src to Dst. It returns the sum of all
+/// probabilities for edges from Src to Dst.
BranchProbability BranchProbabilityInfo::
getEdgeProbability(const BasicBlock *Src, const BasicBlock *Dst) const {
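
To illustrate the move from (block, block) keys to (block, successor index) keys above, a hypothetical dump routine; it assumes the index-based getEdgeWeight overload is reachable from client code, which is an assumption rather than something this hunk shows.

    #include "llvm/BasicBlock.h"
    #include "llvm/Analysis/BranchProbabilityInfo.h"
    #include "llvm/Support/CFG.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Print one weight per successor slot: duplicate successors (e.g. a switch
    // with several cases going to the same block) now keep separate weights.
    static void printEdgeWeights(const BasicBlock *BB,
                                 const BranchProbabilityInfo &BPI) {
      for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB);
           I != E; ++I)
        errs() << BB->getName() << " -> #" << I.getSuccessorIndex()
               << " (" << (*I)->getName() << "): "
               << BPI.getEdgeWeight(BB, I.getSuccessorIndex()) << "\n";
    }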
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index 96e68b419917..b3a40bee4211 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -10,9 +10,11 @@ add_llvm_library(LLVMAnalysis
BranchProbabilityInfo.cpp
CFGPrinter.cpp
CaptureTracking.cpp
+ CostModel.cpp
CodeMetrics.cpp
ConstantFolding.cpp
DbgInfoPrinter.cpp
+ DependenceAnalysis.cpp
DomPrinter.cpp
DominanceFrontier.cpp
IVUsers.cpp
@@ -26,7 +28,6 @@ add_llvm_library(LLVMAnalysis
LibCallSemantics.cpp
Lint.cpp
Loads.cpp
- LoopDependenceAnalysis.cpp
LoopInfo.cpp
LoopPass.cpp
MemDepPrinter.cpp
@@ -44,6 +45,8 @@ add_llvm_library(LLVMAnalysis
ProfileInfoLoader.cpp
ProfileInfoLoaderPass.cpp
ProfileVerifierPass.cpp
+ ProfileDataLoader.cpp
+ ProfileDataLoaderPass.cpp
RegionInfo.cpp
RegionPass.cpp
RegionPrinter.cpp
diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp
index 974b906b34ec..d9c02990a801 100644
--- a/lib/Analysis/CaptureTracking.cpp
+++ b/lib/Analysis/CaptureTracking.cpp
@@ -23,6 +23,8 @@ using namespace llvm;
CaptureTracker::~CaptureTracker() {}
+bool CaptureTracker::shouldExplore(Use *U) { return true; }
+
namespace {
struct SimpleCaptureTracker : public CaptureTracker {
explicit SimpleCaptureTracker(bool ReturnCaptures)
@@ -30,8 +32,6 @@ namespace {
void tooManyUses() { Captured = true; }
- bool shouldExplore(Use *U) { return true; }
-
bool captured(Use *U) {
if (isa<ReturnInst>(U->getUser()) && !ReturnCaptures)
return false;
diff --git a/lib/Analysis/CodeMetrics.cpp b/lib/Analysis/CodeMetrics.cpp
index acda34ba14b5..651a54be1b9e 100644
--- a/lib/Analysis/CodeMetrics.cpp
+++ b/lib/Analysis/CodeMetrics.cpp
@@ -15,7 +15,7 @@
#include "llvm/Function.h"
#include "llvm/Support/CallSite.h"
#include "llvm/IntrinsicInst.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
@@ -54,7 +54,7 @@ bool llvm::callIsSmall(ImmutableCallSite CS) {
return false;
}
-bool llvm::isInstructionFree(const Instruction *I, const TargetData *TD) {
+bool llvm::isInstructionFree(const Instruction *I, const DataLayout *TD) {
if (isa<PHINode>(I))
return true;
@@ -119,7 +119,7 @@ bool llvm::isInstructionFree(const Instruction *I, const TargetData *TD) {
/// analyzeBasicBlock - Fill in the current structure with information gleaned
/// from the specified block.
void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
- const TargetData *TD) {
+ const DataLayout *TD) {
++NumBlocks;
unsigned NumInstsBeforeThisBB = NumInsts;
for (BasicBlock::const_iterator II = BB->begin(), E = BB->end();
@@ -189,14 +189,14 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
NumBBInsts[BB] = NumInsts - NumInstsBeforeThisBB;
}
-void CodeMetrics::analyzeFunction(Function *F, const TargetData *TD) {
+void CodeMetrics::analyzeFunction(Function *F, const DataLayout *TD) {
// If this function contains a call that "returns twice" (e.g., setjmp or
// _setjmp) and it isn't marked with "returns twice" itself, never inline it.
// This is a hack because we depend on the user marking their local variables
// as volatile if they are live across a setjmp call, and they probably
// won't do this in callers.
exposesReturnsTwice = F->callsFunctionThatReturnsTwice() &&
- !F->hasFnAttr(Attribute::ReturnsTwice);
+ !F->getFnAttributes().hasAttribute(Attributes::ReturnsTwice);
// Look at the size of the callee.
for (Function::const_iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index f5e619c6736c..91a5b84e8a63 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -11,7 +11,7 @@
//
// Also, to supplement the basic VMCore ConstantExpr simplifications,
// this file defines some additional folding routines that can make use of
-// TargetData information. These functions cannot go in VMCore due to library
+// DataLayout information. These functions cannot go in VMCore due to library
// dependency issues.
//
//===----------------------------------------------------------------------===//
@@ -25,7 +25,7 @@
#include "llvm/Intrinsics.h"
#include "llvm/Operator.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -41,11 +41,11 @@ using namespace llvm;
// Constant Folding internal helper functions
//===----------------------------------------------------------------------===//
-/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with
-/// TargetData. This always returns a non-null constant, but it may be a
+/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with
+/// DataLayout. This always returns a non-null constant, but it may be a
/// ConstantExpr if unfoldable.
static Constant *FoldBitCast(Constant *C, Type *DestTy,
- const TargetData &TD) {
+ const DataLayout &TD) {
// Catch the obvious splat cases.
if (C->isNullValue() && !DestTy->isX86_MMXTy())
return Constant::getNullValue(DestTy);
@@ -59,9 +59,9 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
return ConstantExpr::getBitCast(C, DestTy);
unsigned NumSrcElts = CDV->getType()->getNumElements();
-
+
Type *SrcEltTy = CDV->getType()->getElementType();
-
+
// If the vector is a vector of floating point, convert it to vector of int
// to simplify things.
if (SrcEltTy->isFloatingPointTy()) {
@@ -72,7 +72,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
C = ConstantExpr::getBitCast(C, SrcIVTy);
CDV = cast<ConstantDataVector>(C);
}
-
+
// Now that we know that the input value is a vector of integers, just shift
// and insert them into our result.
unsigned BitShift = TD.getTypeAllocSizeInBits(SrcEltTy);
@@ -84,43 +84,43 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
else
Result |= CDV->getElementAsInteger(i);
}
-
+
return ConstantInt::get(IT, Result);
}
-
+
// The code below only handles casts to vectors currently.
VectorType *DestVTy = dyn_cast<VectorType>(DestTy);
if (DestVTy == 0)
return ConstantExpr::getBitCast(C, DestTy);
-
+
// If this is a scalar -> vector cast, convert the input into a <1 x scalar>
// vector so the code below can handle it uniformly.
if (isa<ConstantFP>(C) || isa<ConstantInt>(C)) {
Constant *Ops = C; // don't take the address of C!
return FoldBitCast(ConstantVector::get(Ops), DestTy, TD);
}
-
+
// If this is a bitcast from constant vector -> vector, fold it.
if (!isa<ConstantDataVector>(C) && !isa<ConstantVector>(C))
return ConstantExpr::getBitCast(C, DestTy);
-
+
// If the element types match, VMCore can fold it.
unsigned NumDstElt = DestVTy->getNumElements();
unsigned NumSrcElt = C->getType()->getVectorNumElements();
if (NumDstElt == NumSrcElt)
return ConstantExpr::getBitCast(C, DestTy);
-
+
Type *SrcEltTy = C->getType()->getVectorElementType();
Type *DstEltTy = DestVTy->getElementType();
-
- // Otherwise, we're changing the number of elements in a vector, which
+
+ // Otherwise, we're changing the number of elements in a vector, which
// requires endianness information to do the right thing. For example,
// bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
// folds to (little endian):
// <4 x i32> <i32 0, i32 0, i32 1, i32 0>
// and to (big endian):
// <4 x i32> <i32 0, i32 0, i32 0, i32 1>
-
+
// First thing is first. We only want to think about integer here, so if
// we have something in FP form, recast it as integer.
if (DstEltTy->isFloatingPointTy()) {
@@ -130,11 +130,11 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumDstElt);
// Recursively handle this integer conversion, if possible.
C = FoldBitCast(C, DestIVTy, TD);
-
+
// Finally, VMCore can handle this now that #elts line up.
return ConstantExpr::getBitCast(C, DestTy);
}
-
+
// Okay, we know the destination is integer, if the input is FP, convert
// it to integer first.
if (SrcEltTy->isFloatingPointTy()) {
@@ -148,13 +148,13 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
!isa<ConstantDataVector>(C))
return C;
}
-
+
// Now we know that the input and output vectors are both integer vectors
// of the same size, and that their #elements is not the same. Do the
// conversion here, which depends on whether the input or output has
// more elements.
bool isLittleEndian = TD.isLittleEndian();
-
+
SmallVector<Constant*, 32> Result;
if (NumDstElt < NumSrcElt) {
// Handle: bitcast (<4 x i32> <i32 0, i32 1, i32 2, i32 3> to <2 x i64>)
@@ -170,15 +170,15 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
Constant *Src =dyn_cast<ConstantInt>(C->getAggregateElement(SrcElt++));
if (!Src) // Reject constantexpr elements.
return ConstantExpr::getBitCast(C, DestTy);
-
+
// Zero extend the element to the right size.
Src = ConstantExpr::getZExt(Src, Elt->getType());
-
+
// Shift it to the right place, depending on endianness.
- Src = ConstantExpr::getShl(Src,
+ Src = ConstantExpr::getShl(Src,
ConstantInt::get(Src->getType(), ShiftAmt));
ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize;
-
+
// Mix it in.
Elt = ConstantExpr::getOr(Elt, Src);
}
@@ -186,30 +186,30 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
}
return ConstantVector::get(Result);
}
-
+
// Handle: bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>)
unsigned Ratio = NumDstElt/NumSrcElt;
unsigned DstBitSize = DstEltTy->getPrimitiveSizeInBits();
-
+
// Loop over each source value, expanding into multiple results.
for (unsigned i = 0; i != NumSrcElt; ++i) {
Constant *Src = dyn_cast<ConstantInt>(C->getAggregateElement(i));
if (!Src) // Reject constantexpr elements.
return ConstantExpr::getBitCast(C, DestTy);
-
+
unsigned ShiftAmt = isLittleEndian ? 0 : DstBitSize*(Ratio-1);
for (unsigned j = 0; j != Ratio; ++j) {
// Shift the piece of the value into the right place, depending on
// endianness.
- Constant *Elt = ConstantExpr::getLShr(Src,
+ Constant *Elt = ConstantExpr::getLShr(Src,
ConstantInt::get(Src->getType(), ShiftAmt));
ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize;
-
+
// Truncate and remember this piece.
Result.push_back(ConstantExpr::getTrunc(Elt, DstEltTy));
}
}
-
+
return ConstantVector::get(Result);
}
@@ -218,34 +218,34 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy,
/// from a global, return the global and the constant. Because of
/// constantexprs, this function is recursive.
static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
- int64_t &Offset, const TargetData &TD) {
+ int64_t &Offset, const DataLayout &TD) {
// Trivial case, constant is the global.
if ((GV = dyn_cast<GlobalValue>(C))) {
Offset = 0;
return true;
}
-
+
// Otherwise, if this isn't a constant expr, bail out.
ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
if (!CE) return false;
-
+
// Look through ptr->int and ptr->ptr casts.
if (CE->getOpcode() == Instruction::PtrToInt ||
CE->getOpcode() == Instruction::BitCast)
return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD);
-
- // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
+
+ // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5)
if (CE->getOpcode() == Instruction::GetElementPtr) {
// Cannot compute this if the element type of the pointer is missing size
// info.
if (!cast<PointerType>(CE->getOperand(0)->getType())
->getElementType()->isSized())
return false;
-
+
// If the base isn't a global+constant, we aren't either.
if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD))
return false;
-
+
// Otherwise, add any offset that our operands provide.
gep_type_iterator GTI = gep_type_begin(CE);
for (User::const_op_iterator i = CE->op_begin() + 1, e = CE->op_end();
@@ -253,7 +253,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
ConstantInt *CI = dyn_cast<ConstantInt>(*i);
if (!CI) return false; // Index isn't a simple constant?
if (CI->isZero()) continue; // Not adding anything.
-
+
if (StructType *ST = dyn_cast<StructType>(*GTI)) {
// N = N + Offset
Offset += TD.getStructLayout(ST)->getElementOffset(CI->getZExtValue());
@@ -264,7 +264,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
}
return true;
}
-
+
return false;
}
@@ -274,30 +274,33 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV,
/// the CurPtr buffer. TD is the target data.
static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
unsigned char *CurPtr, unsigned BytesLeft,
- const TargetData &TD) {
+ const DataLayout &TD) {
assert(ByteOffset <= TD.getTypeAllocSize(C->getType()) &&
"Out of range access");
-
+
// If this element is zero or undefined, we can just return since *CurPtr is
// zero initialized.
if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
return true;
-
+
if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
if (CI->getBitWidth() > 64 ||
(CI->getBitWidth() & 7) != 0)
return false;
-
+
uint64_t Val = CI->getZExtValue();
unsigned IntBytes = unsigned(CI->getBitWidth()/8);
-
+
for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) {
- CurPtr[i] = (unsigned char)(Val >> (ByteOffset * 8));
+ int n = ByteOffset;
+ if (!TD.isLittleEndian())
+ n = IntBytes - n - 1;
+ CurPtr[i] = (unsigned char)(Val >> (n * 8));
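+ // (Illustrative, not from the original patch: on the first iteration,
+ // ByteOffset == 0, a 32-bit Val of 0x04030201 stores 0x01 here on a
+ // little-endian target, while the mirrored index n = IntBytes - 1 stores
+ // 0x04 on a big-endian one.)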
++ByteOffset;
}
return true;
}
-
+
if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
if (CFP->getType()->isDoubleTy()) {
C = FoldBitCast(C, Type::getInt64Ty(C->getContext()), TD);
@@ -309,13 +312,13 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
}
return false;
}
-
+
if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) {
const StructLayout *SL = TD.getStructLayout(CS->getType());
unsigned Index = SL->getElementContainingOffset(ByteOffset);
uint64_t CurEltOffset = SL->getElementOffset(Index);
ByteOffset -= CurEltOffset;
-
+
while (1) {
// If the element access is to the element itself and not to tail padding,
// read the bytes from the element.
@@ -325,9 +328,9 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
!ReadDataFromGlobal(CS->getOperand(Index), ByteOffset, CurPtr,
BytesLeft, TD))
return false;
-
+
++Index;
-
+
// Check to see if we read from the last struct element, if so we're done.
if (Index == CS->getType()->getNumElements())
return true;
@@ -375,11 +378,11 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
}
return true;
}
-
+
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
if (CE->getOpcode() == Instruction::IntToPtr &&
- CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext()))
- return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
+ CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext()))
+ return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr,
BytesLeft, TD);
}
@@ -388,10 +391,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
}
static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
- const TargetData &TD) {
+ const DataLayout &TD) {
Type *LoadTy = cast<PointerType>(C->getType())->getElementType();
IntegerType *IntType = dyn_cast<IntegerType>(LoadTy);
-
+
// If this isn't an integer load we can't fold it directly.
if (!IntType) {
// If this is a float/double load, we can try folding it as an int32/64 load
@@ -415,15 +418,15 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
return FoldBitCast(Res, LoadTy, TD);
return 0;
}
-
+
unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8;
if (BytesLoaded > 32 || BytesLoaded == 0) return 0;
-
+
GlobalValue *GVal;
int64_t Offset;
if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD))
return 0;
-
+
GlobalVariable *GV = dyn_cast<GlobalVariable>(GVal);
if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
!GV->getInitializer()->getType()->isSized())
@@ -432,20 +435,29 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
// If we're loading off the beginning of the global, some bytes may be valid,
// but we don't try to handle this.
if (Offset < 0) return 0;
-
+
// If we're not accessing anything in this constant, the result is undefined.
if (uint64_t(Offset) >= TD.getTypeAllocSize(GV->getInitializer()->getType()))
return UndefValue::get(IntType);
-
+
unsigned char RawBytes[32] = {0};
if (!ReadDataFromGlobal(GV->getInitializer(), Offset, RawBytes,
BytesLoaded, TD))
return 0;
- APInt ResultVal = APInt(IntType->getBitWidth(), RawBytes[BytesLoaded-1]);
- for (unsigned i = 1; i != BytesLoaded; ++i) {
- ResultVal <<= 8;
- ResultVal |= RawBytes[BytesLoaded-1-i];
+ APInt ResultVal = APInt(IntType->getBitWidth(), 0);
+ if (TD.isLittleEndian()) {
+ ResultVal = RawBytes[BytesLoaded - 1];
+ for (unsigned i = 1; i != BytesLoaded; ++i) {
+ ResultVal <<= 8;
+ ResultVal |= RawBytes[BytesLoaded-1-i];
+ }
+ } else {
+ ResultVal = RawBytes[0];
+ for (unsigned i = 1; i != BytesLoaded; ++i) {
+ ResultVal <<= 8;
+ ResultVal |= RawBytes[i];
+ }
}
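+ // (Illustrative, not part of the patch: given raw bytes {0x01, 0x02} for an
+ // i16 load, the little-endian branch assembles 0x0201 and the big-endian
+ // branch assembles 0x0102.)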
return ConstantInt::get(IntType->getContext(), ResultVal);
@@ -455,7 +467,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
/// produce if it is constant and determinable. If this is not determinable,
/// return null.
Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
- const TargetData *TD) {
+ const DataLayout *TD) {
// First, try the easy cases:
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
@@ -464,15 +476,15 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
// If the loaded value isn't a constant expr, we can't handle it.
ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
if (!CE) return 0;
-
+
if (CE->getOpcode() == Instruction::GetElementPtr) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
if (GV->isConstant() && GV->hasDefinitiveInitializer())
- if (Constant *V =
+ if (Constant *V =
ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
return V;
}
-
+
// Instead of loading constant c string, use corresponding integer value
// directly if string length is small enough.
StringRef Str;
@@ -500,14 +512,14 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
SingleChar = 0;
StrVal = (StrVal << 8) | SingleChar;
}
-
+
Constant *Res = ConstantInt::get(CE->getContext(), StrVal);
if (Ty->isFloatingPointTy())
Res = ConstantExpr::getBitCast(Res, Ty);
return Res;
}
}
-
+
// If this load comes from anywhere in a constant global, and if the global
// is all undef or zero, we know what it loads.
if (GlobalVariable *GV =
@@ -520,18 +532,16 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
return UndefValue::get(ResTy);
}
}
-
- // Try hard to fold loads from bitcasted strange and non-type-safe things. We
- // currently don't do any of this for big endian systems. It can be
- // generalized in the future if someone is interested.
- if (TD && TD->isLittleEndian())
+
+ // Try hard to fold loads from bitcasted strange and non-type-safe things.
+ if (TD)
return FoldReinterpretLoadFromConstPtr(CE, *TD);
return 0;
}
-static Constant *ConstantFoldLoadInst(const LoadInst *LI, const TargetData *TD){
+static Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout *TD){
if (LI->isVolatile()) return 0;
-
+
if (Constant *C = dyn_cast<Constant>(LI->getOperand(0)))
return ConstantFoldLoadFromConstPtr(C, TD);
@@ -540,23 +550,23 @@ static Constant *ConstantFoldLoadInst(const LoadInst *LI, const TargetData *TD){
/// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression.
/// Attempt to symbolically evaluate the result of a binary operator merging
-/// these together. If target data info is available, it is provided as TD,
+/// these together. If target data info is available, it is provided as TD,
/// otherwise TD is null.
static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
- Constant *Op1, const TargetData *TD){
+ Constant *Op1, const DataLayout *TD){
// SROA
-
+
// Fold (and 0xffffffff00000000, (shl x, 32)) -> shl.
// Fold (lshr (or X, Y), 32) -> (lshr [X/Y], 32) if one doesn't contribute
// bits.
-
-
+
+
// If the constant expr is something like &A[123] - &A[4].f, fold this into a
// constant. This happens frequently when iterating over a global array.
if (Opc == Instruction::Sub && TD) {
GlobalValue *GV1, *GV2;
int64_t Offs1, Offs2;
-
+
if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *TD))
if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *TD) &&
GV1 == GV2) {
@@ -564,7 +574,7 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
return ConstantInt::get(Op0->getType(), Offs1-Offs2);
}
}
-
+
return 0;
}
@@ -572,7 +582,7 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0,
/// explicitly cast them so that they aren't implicitly casted by the
/// getelementptr.
static Constant *CastGEPIndices(ArrayRef<Constant *> Ops,
- Type *ResultTy, const TargetData *TD,
+ Type *ResultTy, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TD) return 0;
Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext());
@@ -622,20 +632,20 @@ static Constant* StripPtrCastKeepAS(Constant* Ptr) {
/// SymbolicallyEvaluateGEP - If we can symbolically evaluate the specified GEP
/// constant expression, do so.
static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
- Type *ResultTy, const TargetData *TD,
+ Type *ResultTy, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
Constant *Ptr = Ops[0];
if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized() ||
!Ptr->getType()->isPointerTy())
return 0;
-
+
Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext());
// If this is a constant expr gep that is effectively computing an
// "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12'
for (unsigned i = 1, e = Ops.size(); i != e; ++i)
if (!isa<ConstantInt>(Ops[i])) {
-
+
// If this is "gep i8* Ptr, (sub 0, V)", fold this as:
// "inttoptr (sub (ptrtoint Ptr), V)"
if (Ops.size() == 2 &&
@@ -659,7 +669,8 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
unsigned BitWidth = TD->getTypeSizeInBits(IntPtrTy);
APInt Offset =
APInt(BitWidth, TD->getIndexedOffset(Ptr->getType(),
- makeArrayRef((Value **)Ops.data() + 1,
+ makeArrayRef((Value *const*)
+ Ops.data() + 1,
Ops.size() - 1)));
Ptr = StripPtrCastKeepAS(Ptr);
@@ -708,12 +719,12 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
// The only pointer indexing we'll do is on the first index of the GEP.
if (!NewIdxs.empty())
break;
-
+
// Only handle pointers to sized types, not pointers to functions.
if (!ATy->getElementType()->isSized())
return 0;
}
-
+
// Determine which element of the array the offset points into.
APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType()));
IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext());
@@ -785,7 +796,7 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops,
/// this function can only fail when attempting to fold instructions like loads
/// and stores, which have no constant expression form.
Constant *llvm::ConstantFoldInstruction(Instruction *I,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI) {
// Handle PHI nodes quickly here...
if (PHINode *PN = dyn_cast<PHINode>(I)) {
@@ -836,7 +847,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
if (const CmpInst *CI = dyn_cast<CmpInst>(I))
return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1],
TD, TLI);
-
+
if (const LoadInst *LI = dyn_cast<LoadInst>(I))
return ConstantFoldLoadInst(LI, TD);
@@ -855,10 +866,10 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I,
}
/// ConstantFoldConstantExpression - Attempt to fold the constant expression
-/// using the specified TargetData. If successful, the constant result is
+/// using the specified DataLayout. If successful, the constant result is
/// result is returned, if not, null is returned.
Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI) {
SmallVector<Constant*, 8> Ops;
for (User::const_op_iterator i = CE->op_begin(), e = CE->op_end();
@@ -886,19 +897,19 @@ Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE,
/// information, due to only being passed an opcode and operands. Constant
/// folding using this function strips this information.
///
-Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
+Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
ArrayRef<Constant *> Ops,
- const TargetData *TD,
- const TargetLibraryInfo *TLI) {
+ const DataLayout *TD,
+ const TargetLibraryInfo *TLI) {
// Handle easy binops first.
if (Instruction::isBinaryOp(Opcode)) {
if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1]))
if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD))
return C;
-
+
return ConstantExpr::get(Opcode, Ops[0], Ops[1]);
}
-
+
switch (Opcode) {
default: return 0;
case Instruction::ICmp:
@@ -916,7 +927,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
Constant *Input = CE->getOperand(0);
unsigned InWidth = Input->getType()->getScalarSizeInBits();
if (TD->getPointerSizeInBits() < InWidth) {
- Constant *Mask =
+ Constant *Mask =
ConstantInt::get(CE->getContext(), APInt::getLowBitsSet(InWidth,
TD->getPointerSizeInBits()));
Input = ConstantExpr::getAnd(Input, Mask);
@@ -964,7 +975,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
return C;
if (Constant *C = SymbolicallyEvaluateGEP(Ops, DestTy, TD, TLI))
return C;
-
+
return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1));
}
}
@@ -974,8 +985,8 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy,
/// returns a constant expression of the specified operands.
///
Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
- Constant *Ops0, Constant *Ops1,
- const TargetData *TD,
+ Constant *Ops0, Constant *Ops1,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI) {
// fold: icmp (inttoptr x), null -> icmp x, 0
// fold: icmp (ptrtoint x), 0 -> icmp x, null
@@ -995,17 +1006,17 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
Constant *Null = Constant::getNullValue(C->getType());
return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
}
-
+
// Only do this transformation if the int is intptrty in size, otherwise
// there is a truncation or extension that we aren't modeling.
- if (CE0->getOpcode() == Instruction::PtrToInt &&
+ if (CE0->getOpcode() == Instruction::PtrToInt &&
CE0->getType() == IntPtrTy) {
Constant *C = CE0->getOperand(0);
Constant *Null = Constant::getNullValue(C->getType());
return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI);
}
}
-
+
if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) {
if (TD && CE0->getOpcode() == CE1->getOpcode()) {
Type *IntPtrTy = TD->getIntPtrType(CE0->getContext());
@@ -1029,24 +1040,24 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
CE1->getOperand(0), TD, TLI);
}
}
-
+
// icmp eq (or x, y), 0 -> (icmp eq x, 0) & (icmp eq y, 0)
// icmp ne (or x, y), 0 -> (icmp ne x, 0) | (icmp ne y, 0)
if ((Predicate == ICmpInst::ICMP_EQ || Predicate == ICmpInst::ICMP_NE) &&
CE0->getOpcode() == Instruction::Or && Ops1->isNullValue()) {
- Constant *LHS =
+ Constant *LHS =
ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0), Ops1,
TD, TLI);
- Constant *RHS =
+ Constant *RHS =
ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(1), Ops1,
TD, TLI);
- unsigned OpC =
+ unsigned OpC =
Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
Constant *Ops[] = { LHS, RHS };
return ConstantFoldInstOperands(OpC, LHS->getType(), Ops, TD, TLI);
}
}
-
+
return ConstantExpr::getCompare(Predicate, Ops0, Ops1);
}
@@ -1054,7 +1065,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate,
/// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a
/// getelementptr constantexpr, return the constant value being addressed by the
/// constant expression, or null if something is funny and we can't decide.
-Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
+Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C,
ConstantExpr *CE) {
if (!CE->getOperand(1)->isNullValue())
return 0; // Do not allow stepping over the value!
@@ -1124,14 +1135,14 @@ llvm::canConstantFoldCallTo(const Function *F) {
if (!F->hasName()) return false;
StringRef Name = F->getName();
-
+
// In these cases, the check of the length is required. We don't want to
// return true for a name like "cos\0blah" which strcmp would return equal to
// "cos", but has length 8.
switch (Name[0]) {
default: return false;
case 'a':
- return Name == "acos" || Name == "asin" ||
+ return Name == "acos" || Name == "asin" ||
Name == "atan" || Name == "atan2";
case 'c':
return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh";
@@ -1151,7 +1162,7 @@ llvm::canConstantFoldCallTo(const Function *F) {
}
}
-static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
+static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
Type *Ty) {
sys::llvm_fenv_clearexcept();
V = NativeFP(V);
@@ -1159,7 +1170,7 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V,
sys::llvm_fenv_clearexcept();
return 0;
}
-
+
if (Ty->isFloatTy())
return ConstantFP::get(Ty->getContext(), APFloat((float)V));
if (Ty->isDoubleTy())
@@ -1175,7 +1186,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double),
sys::llvm_fenv_clearexcept();
return 0;
}
-
+
if (Ty->isFloatTy())
return ConstantFP::get(Ty->getContext(), APFloat((float)V));
if (Ty->isDoubleTy())
@@ -1269,7 +1280,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
case 'e':
if (Name == "exp" && TLI->has(LibFunc::exp))
return ConstantFoldFP(exp, V, Ty);
-
+
if (Name == "exp2" && TLI->has(LibFunc::exp2)) {
// Constant fold exp2(x) as pow(2,x) in case the host doesn't have a
// C99 library.
@@ -1345,7 +1356,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
}
// Support ConstantVector in case we have an Undef in the top.
- if (isa<ConstantVector>(Operands[0]) ||
+ if (isa<ConstantVector>(Operands[0]) ||
isa<ConstantDataVector>(Operands[0])) {
Constant *Op = cast<Constant>(Operands[0]);
switch (F->getIntrinsicID()) {
@@ -1364,11 +1375,11 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
case Intrinsic::x86_sse2_cvttsd2si64:
if (ConstantFP *FPOp =
dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U)))
- return ConstantFoldConvertToInt(FPOp->getValueAPF(),
+ return ConstantFoldConvertToInt(FPOp->getValueAPF(),
/*roundTowardZero=*/true, Ty);
}
}
-
+
if (isa<UndefValue>(Operands[0])) {
if (F->getIntrinsicID() == Intrinsic::bswap)
return Operands[0];
@@ -1382,14 +1393,14 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) {
if (!Ty->isFloatTy() && !Ty->isDoubleTy())
return 0;
- double Op1V = Ty->isFloatTy() ?
+ double Op1V = Ty->isFloatTy() ?
(double)Op1->getValueAPF().convertToFloat() :
Op1->getValueAPF().convertToDouble();
if (ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) {
if (Op2->getType() != Op1->getType())
return 0;
- double Op2V = Ty->isFloatTy() ?
+ double Op2V = Ty->isFloatTy() ?
(double)Op2->getValueAPF().convertToFloat():
Op2->getValueAPF().convertToDouble();
@@ -1416,7 +1427,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
}
return 0;
}
-
+
if (ConstantInt *Op1 = dyn_cast<ConstantInt>(Operands[0])) {
if (ConstantInt *Op2 = dyn_cast<ConstantInt>(Operands[1])) {
switch (F->getIntrinsicID()) {
@@ -1466,7 +1477,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands,
return ConstantInt::get(Ty, Op1->getValue().countLeadingZeros());
}
}
-
+
return 0;
}
return 0;
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
new file mode 100644
index 000000000000..5adbf458104e
--- /dev/null
+++ b/lib/Analysis/CostModel.cpp
@@ -0,0 +1,193 @@
+//===- CostModel.cpp ------ Cost Model Analysis ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the cost model analysis. It provides a very basic cost
+// estimation for LLVM-IR. The cost result can be thought of as cycles, but it
+// is really unit-less. The estimated cost is meant to be used for comparing
+// alternatives.
+//
+//===----------------------------------------------------------------------===//
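+//
+// Usage sketch (added for illustration; it assumes the standard opt driver
+// and the CM_NAME "cost-model" registration below):
+//
+//   opt -cost-model -analyze input.ll
+//
+// runs the pass over each function and has print() emit one
+// "Cost Model: ..." line per instruction.
+//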
+
+#define CM_NAME "cost-model"
+#define DEBUG_TYPE CM_NAME
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Value.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+ class CostModelAnalysis : public FunctionPass {
+
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ CostModelAnalysis() : FunctionPass(ID), F(0), VTTI(0) {
+ initializeCostModelAnalysisPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ /// Returns the expected cost of the instruction.
+ /// Returns -1 if the cost is unknown.
+ /// Note, this method does not cache the cost calculation and it
+ /// can be expensive in some cases.
+ unsigned getInstructionCost(Instruction *I) const;
+
+ private:
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual bool runOnFunction(Function &F);
+ virtual void print(raw_ostream &OS, const Module*) const;
+
+ /// The function that we analyze.
+ Function *F;
+ /// Vector target information.
+ const VectorTargetTransformInfo *VTTI;
+ };
+} // End of anonymous namespace
+
+// Register this pass.
+char CostModelAnalysis::ID = 0;
+static const char cm_name[] = "Cost Model Analysis";
+INITIALIZE_PASS_BEGIN(CostModelAnalysis, CM_NAME, cm_name, false, true)
+INITIALIZE_PASS_END (CostModelAnalysis, CM_NAME, cm_name, false, true)
+
+FunctionPass *llvm::createCostModelAnalysisPass() {
+ return new CostModelAnalysis();
+}
+
+void
+CostModelAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
+
+bool
+CostModelAnalysis::runOnFunction(Function &F) {
+ this->F = &F;
+
+ // Target information.
+ TargetTransformInfo *TTI;
+ TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ if (TTI)
+ VTTI = TTI->getVectorTargetTransformInfo();
+
+ return false;
+}
+
+unsigned CostModelAnalysis::getInstructionCost(Instruction *I) const {
+ if (!VTTI)
+ return -1;
+
+ switch (I->getOpcode()) {
+ case Instruction::Ret:
+ case Instruction::PHI:
+ case Instruction::Br: {
+ return VTTI->getCFInstrCost(I->getOpcode());
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ return VTTI->getArithmeticInstrCost(I->getOpcode(), I->getType());
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ Type *CondTy = SI->getCondition()->getType();
+ return VTTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ return VTTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ Type *ValTy = SI->getValueOperand()->getType();
+ return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+ SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ }
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(I);
+ return VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
+ LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = I->getOperand(0)->getType();
+ return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+ }
+ case Instruction::ExtractElement: {
+ ExtractElementInst * EEI = cast<ExtractElementInst>(I);
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+ unsigned Idx = -1;
+ if (CI)
+ Idx = CI->getZExtValue();
+ return VTTI->getVectorInstrCost(I->getOpcode(),
+ EEI->getOperand(0)->getType(), Idx);
+ }
+ case Instruction::InsertElement: {
+ InsertElementInst * IE = cast<InsertElementInst>(I);
+ ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
+ unsigned Idx = -1;
+ if (CI)
+ Idx = CI->getZExtValue();
+ return VTTI->getVectorInstrCost(I->getOpcode(),
+ IE->getType(), Idx);
+ }
+ default:
+ // We don't have any information on this instruction.
+ return -1;
+ }
+}
+
+void CostModelAnalysis::print(raw_ostream &OS, const Module*) const {
+ if (!F)
+ return;
+
+ for (Function::iterator B = F->begin(), BE = F->end(); B != BE; ++B) {
+ for (BasicBlock::iterator it = B->begin(), e = B->end(); it != e; ++it) {
+ Instruction *Inst = it;
+ unsigned Cost = getInstructionCost(Inst);
+ if (Cost != (unsigned)-1)
+ OS << "Cost Model: Found an estimated cost of " << Cost;
+ else
+ OS << "Cost Model: Unknown cost";
+
+ OS << " for instruction: "<< *Inst << "\n";
+ }
+ }
+}
diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp
new file mode 100644
index 000000000000..95ac5ea233b1
--- /dev/null
+++ b/lib/Analysis/DependenceAnalysis.cpp
@@ -0,0 +1,3786 @@
+//===-- DependenceAnalysis.cpp - DA Implementation --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// DependenceAnalysis is an LLVM pass that analyses dependences between memory
+// accesses. Currently, it is an (incomplete) implementation of the approach
+// described in
+//
+// Practical Dependence Testing
+// Goff, Kennedy, Tseng
+// PLDI 1991
+//
+// There's a single entry point that analyzes the dependence between a pair
+// of memory references in a function, returning either NULL, for no dependence,
+// or a more-or-less detailed description of the dependence between them.
+//
+// Currently, the implementation cannot propagate constraints between
+// coupled RDIV subscripts and lacks a multi-subscript MIV test.
+// Both of these are conservative weaknesses;
+// that is, not a source of correctness problems.
+//
+// The implementation depends on the GEP instruction to
+// differentiate subscripts. Since Clang linearizes subscripts
+// for most arrays, we give up some precision (though the existing MIV tests
+// will help). We trust that the GEP instruction will eventually be extended.
+// In the meantime, we should explore Maslov's ideas about delinearization.
+//
+// We should pay some careful attention to the possibility of integer overflow
+// in the implementation of the various tests. This could happen with Add,
+// Subtract, or Multiply, with both APInt's and SCEV's.
+//
+// Some non-linear subscript pairs can be handled by the GCD test
+// (and perhaps other tests).
+// Should explore how often these things occur.
+//
+// Finally, it seems like certain test cases expose weaknesses in the SCEV
+// simplification, especially in the handling of sign and zero extensions.
+// It could be useful to spend time exploring these.
+//
+// Please note that this is work in progress and the interface is subject to
+// change.
+//
+//===----------------------------------------------------------------------===//
+// //
+// In memory of Ken Kennedy, 1945 - 2007 //
+// //
+//===----------------------------------------------------------------------===//
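+
+// Usage sketch (illustrative only, assuming the standard opt driver and the
+// "da" registration below); the pass is typically exercised as:
+//
+//   opt -analyze -basicaa -da input.ll
+//
+// which calls print() and emits "da analyze - ..." lines describing the
+// dependences found between the store/load pairs it inspects.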
+
+#define DEBUG_TYPE "da"
+
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Operator.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// statistics
+
+STATISTIC(TotalArrayPairs, "Array pairs tested");
+STATISTIC(SeparableSubscriptPairs, "Separable subscript pairs");
+STATISTIC(CoupledSubscriptPairs, "Coupled subscript pairs");
+STATISTIC(NonlinearSubscriptPairs, "Nonlinear subscript pairs");
+STATISTIC(ZIVapplications, "ZIV applications");
+STATISTIC(ZIVindependence, "ZIV independence");
+STATISTIC(StrongSIVapplications, "Strong SIV applications");
+STATISTIC(StrongSIVsuccesses, "Strong SIV successes");
+STATISTIC(StrongSIVindependence, "Strong SIV independence");
+STATISTIC(WeakCrossingSIVapplications, "Weak-Crossing SIV applications");
+STATISTIC(WeakCrossingSIVsuccesses, "Weak-Crossing SIV successes");
+STATISTIC(WeakCrossingSIVindependence, "Weak-Crossing SIV independence");
+STATISTIC(ExactSIVapplications, "Exact SIV applications");
+STATISTIC(ExactSIVsuccesses, "Exact SIV successes");
+STATISTIC(ExactSIVindependence, "Exact SIV independence");
+STATISTIC(WeakZeroSIVapplications, "Weak-Zero SIV applications");
+STATISTIC(WeakZeroSIVsuccesses, "Weak-Zero SIV successes");
+STATISTIC(WeakZeroSIVindependence, "Weak-Zero SIV independence");
+STATISTIC(ExactRDIVapplications, "Exact RDIV applications");
+STATISTIC(ExactRDIVindependence, "Exact RDIV independence");
+STATISTIC(SymbolicRDIVapplications, "Symbolic RDIV applications");
+STATISTIC(SymbolicRDIVindependence, "Symbolic RDIV independence");
+STATISTIC(DeltaApplications, "Delta applications");
+STATISTIC(DeltaSuccesses, "Delta successes");
+STATISTIC(DeltaIndependence, "Delta independence");
+STATISTIC(DeltaPropagations, "Delta propagations");
+STATISTIC(GCDapplications, "GCD applications");
+STATISTIC(GCDsuccesses, "GCD successes");
+STATISTIC(GCDindependence, "GCD independence");
+STATISTIC(BanerjeeApplications, "Banerjee applications");
+STATISTIC(BanerjeeIndependence, "Banerjee independence");
+STATISTIC(BanerjeeSuccesses, "Banerjee successes");
+
+//===----------------------------------------------------------------------===//
+// basics
+
+INITIALIZE_PASS_BEGIN(DependenceAnalysis, "da",
+ "Dependence Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(LoopInfo)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(DependenceAnalysis, "da",
+ "Dependence Analysis", true, true)
+
+char DependenceAnalysis::ID = 0;
+
+
+FunctionPass *llvm::createDependenceAnalysisPass() {
+ return new DependenceAnalysis();
+}
+
+
+bool DependenceAnalysis::runOnFunction(Function &F) {
+ this->F = &F;
+ AA = &getAnalysis<AliasAnalysis>();
+ SE = &getAnalysis<ScalarEvolution>();
+ LI = &getAnalysis<LoopInfo>();
+ return false;
+}
+
+
+void DependenceAnalysis::releaseMemory() {
+}
+
+
+void DependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<AliasAnalysis>();
+ AU.addRequiredTransitive<ScalarEvolution>();
+ AU.addRequiredTransitive<LoopInfo>();
+}
+
+
+// Used to test the dependence analyzer.
+// Looks through the function, noting the first store instruction
+// and the first load instruction
+// (which always follows the first store in our tests).
+// Calls depends() and prints out the result.
+// Ignores all other instructions.
+static
+void dumpExampleDependence(raw_ostream &OS, Function *F,
+ DependenceAnalysis *DA) {
+ for (inst_iterator SrcI = inst_begin(F), SrcE = inst_end(F);
+ SrcI != SrcE; ++SrcI) {
+ if (const StoreInst *Src = dyn_cast<StoreInst>(&*SrcI)) {
+ for (inst_iterator DstI = SrcI, DstE = inst_end(F);
+ DstI != DstE; ++DstI) {
+ if (const LoadInst *Dst = dyn_cast<LoadInst>(&*DstI)) {
+ OS << "da analyze - ";
+ if (Dependence *D = DA->depends(Src, Dst, true)) {
+ D->dump(OS);
+ for (unsigned Level = 1; Level <= D->getLevels(); Level++) {
+ if (D->isSplitable(Level)) {
+ OS << "da analyze - split level = " << Level;
+ OS << ", iteration = " << *DA->getSplitIteration(D, Level);
+ OS << "!\n";
+ }
+ }
+ delete D;
+ }
+ else
+ OS << "none!\n";
+ return;
+ }
+ }
+ }
+ }
+}
+
+
+void DependenceAnalysis::print(raw_ostream &OS, const Module*) const {
+ dumpExampleDependence(OS, F, const_cast<DependenceAnalysis *>(this));
+}
+
+//===----------------------------------------------------------------------===//
+// Dependence methods
+
+// Returns true if this is an input dependence.
+bool Dependence::isInput() const {
+ return Src->mayReadFromMemory() && Dst->mayReadFromMemory();
+}
+
+
+// Returns true if this is an output dependence.
+bool Dependence::isOutput() const {
+ return Src->mayWriteToMemory() && Dst->mayWriteToMemory();
+}
+
+
+// Returns true if this is a flow (aka true) dependence.
+bool Dependence::isFlow() const {
+ return Src->mayWriteToMemory() && Dst->mayReadFromMemory();
+}
+
+
+// Returns true if this is an anti dependence.
+bool Dependence::isAnti() const {
+ return Src->mayReadFromMemory() && Dst->mayWriteToMemory();
+}
+
+
+// Returns true if a particular level is scalar; that is,
+// if no subscript in the source or destination mentions the induction
+// variable associated with the loop at this level.
+// Leave this out of line, so it will serve as a virtual method anchor
+bool Dependence::isScalar(unsigned level) const {
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// FullDependence methods
+
+FullDependence::FullDependence(const Instruction *Source,
+ const Instruction *Destination,
+ bool PossiblyLoopIndependent,
+ unsigned CommonLevels) :
+ Dependence(Source, Destination),
+ Levels(CommonLevels),
+ LoopIndependent(PossiblyLoopIndependent) {
+ Consistent = true;
+ DV = CommonLevels ? new DVEntry[CommonLevels] : NULL;
+}
+
+// The rest are simple getters that hide the implementation.
+
+// getDirection - Returns the direction associated with a particular level.
+unsigned FullDependence::getDirection(unsigned Level) const {
+ assert(0 < Level && Level <= Levels && "Level out of range");
+ return DV[Level - 1].Direction;
+}
+
+
+// Returns the distance (or NULL) associated with a particular level.
+const SCEV *FullDependence::getDistance(unsigned Level) const {
+ assert(0 < Level && Level <= Levels && "Level out of range");
+ return DV[Level - 1].Distance;
+}
+
+
+// Returns true if a particular level is scalar; that is,
+// if no subscript in the source or destination mentions the induction
+// variable associated with the loop at this level.
+bool FullDependence::isScalar(unsigned Level) const {
+ assert(0 < Level && Level <= Levels && "Level out of range");
+ return DV[Level - 1].Scalar;
+}
+
+
+// Returns true if peeling the first iteration from this loop
+// will break this dependence.
+bool FullDependence::isPeelFirst(unsigned Level) const {
+ assert(0 < Level && Level <= Levels && "Level out of range");
+ return DV[Level - 1].PeelFirst;
+}
+
+
+// Returns true if peeling the last iteration from this loop
+// will break this dependence.
+bool FullDependence::isPeelLast(unsigned Level) const {
+ assert(0 < Level && Level <= Levels && "Level out of range");
+ return DV[Level - 1].PeelLast;
+}
+
+
+// Returns true if splitting this loop will break the dependence.
+bool FullDependence::isSplitable(unsigned Level) const {
+ assert(0 < Level && Level <= Levels && "Level out of range");
+ return DV[Level - 1].Splitable;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DependenceAnalysis::Constraint methods
+
+// If constraint is a point <X, Y>, returns X.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getX() const {
+ assert(Kind == Point && "Kind should be Point");
+ return A;
+}
+
+
+// If constraint is a point <X, Y>, returns Y.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getY() const {
+ assert(Kind == Point && "Kind should be Point");
+ return B;
+}
+
+
+// If constraint is a line AX + BY = C, returns A.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getA() const {
+ assert((Kind == Line || Kind == Distance) &&
+ "Kind should be Line (or Distance)");
+ return A;
+}
+
+
+// If constraint is a line AX + BY = C, returns B.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getB() const {
+ assert((Kind == Line || Kind == Distance) &&
+ "Kind should be Line (or Distance)");
+ return B;
+}
+
+
+// If constraint is a line AX + BY = C, returns C.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getC() const {
+ assert((Kind == Line || Kind == Distance) &&
+ "Kind should be Line (or Distance)");
+ return C;
+}
+
+
+// If constraint is a distance, returns D.
+// Otherwise assert.
+const SCEV *DependenceAnalysis::Constraint::getD() const {
+ assert(Kind == Distance && "Kind should be Distance");
+ return SE->getNegativeSCEV(C);
+}
+
+
+// Returns the loop associated with this constraint.
+const Loop *DependenceAnalysis::Constraint::getAssociatedLoop() const {
+ assert((Kind == Distance || Kind == Line || Kind == Point) &&
+ "Kind should be Distance, Line, or Point");
+ return AssociatedLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setPoint(const SCEV *X,
+ const SCEV *Y,
+ const Loop *CurLoop) {
+ Kind = Point;
+ A = X;
+ B = Y;
+ AssociatedLoop = CurLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setLine(const SCEV *AA,
+ const SCEV *BB,
+ const SCEV *CC,
+ const Loop *CurLoop) {
+ Kind = Line;
+ A = AA;
+ B = BB;
+ C = CC;
+ AssociatedLoop = CurLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setDistance(const SCEV *D,
+ const Loop *CurLoop) {
+ Kind = Distance;
+ A = SE->getConstant(D->getType(), 1);
+ B = SE->getNegativeSCEV(A);
+ C = SE->getNegativeSCEV(D);
+ AssociatedLoop = CurLoop;
+}
+
+
+void DependenceAnalysis::Constraint::setEmpty() {
+ Kind = Empty;
+}
+
+
+void DependenceAnalysis::Constraint::setAny(ScalarEvolution *NewSE) {
+ SE = NewSE;
+ Kind = Any;
+}
+
+
+// For debugging purposes. Dumps the constraint out to OS.
+void DependenceAnalysis::Constraint::dump(raw_ostream &OS) const {
+ if (isEmpty())
+ OS << " Empty\n";
+ else if (isAny())
+ OS << " Any\n";
+ else if (isPoint())
+ OS << " Point is <" << *getX() << ", " << *getY() << ">\n";
+ else if (isDistance())
+ OS << " Distance is " << *getD() <<
+ " (" << *getA() << "*X + " << *getB() << "*Y = " << *getC() << ")\n";
+ else if (isLine())
+ OS << " Line is " << *getA() << "*X + " <<
+ *getB() << "*Y = " << *getC() << "\n";
+ else
+ llvm_unreachable("unknown constraint type in Constraint::dump");
+}
+
+
+// Updates X with the intersection
+// of the Constraints X and Y. Returns true if X has changed.
+// Corresponds to Figure 4 from the paper
+//
+// Practical Dependence Testing
+// Goff, Kennedy, Tseng
+// PLDI 1991
+bool DependenceAnalysis::intersectConstraints(Constraint *X,
+ const Constraint *Y) {
+ ++DeltaApplications;
+ DEBUG(dbgs() << "\tintersect constraints\n");
+ DEBUG(dbgs() << "\t X ="; X->dump(dbgs()));
+ DEBUG(dbgs() << "\t Y ="; Y->dump(dbgs()));
+ assert(!Y->isPoint() && "Y must not be a Point");
+ if (X->isAny()) {
+ if (Y->isAny())
+ return false;
+ *X = *Y;
+ return true;
+ }
+ if (X->isEmpty())
+ return false;
+ if (Y->isEmpty()) {
+ X->setEmpty();
+ return true;
+ }
+
+ if (X->isDistance() && Y->isDistance()) {
+ DEBUG(dbgs() << "\t intersect 2 distances\n");
+ if (isKnownPredicate(CmpInst::ICMP_EQ, X->getD(), Y->getD()))
+ return false;
+ if (isKnownPredicate(CmpInst::ICMP_NE, X->getD(), Y->getD())) {
+ X->setEmpty();
+ ++DeltaSuccesses;
+ return true;
+ }
+ // Hmmm, interesting situation.
+ // I guess if either is constant, keep it and ignore the other.
+ if (isa<SCEVConstant>(Y->getD())) {
+ *X = *Y;
+ return true;
+ }
+ return false;
+ }
+
+ // At this point, the pseudo-code in Figure 4 of the paper
+ // checks if (X->isPoint() && Y->isPoint()).
+ // This case can't occur in our implementation,
+ // since a Point can only arise as the result of intersecting
+ // two Line constraints, and the right-hand value, Y, is never
+ // the result of an intersection.
+ assert(!(X->isPoint() && Y->isPoint()) &&
+ "We shouldn't ever see X->isPoint() && Y->isPoint()");
+
+ if (X->isLine() && Y->isLine()) {
+ DEBUG(dbgs() << "\t intersect 2 lines\n");
+ const SCEV *Prod1 = SE->getMulExpr(X->getA(), Y->getB());
+ const SCEV *Prod2 = SE->getMulExpr(X->getB(), Y->getA());
+ if (isKnownPredicate(CmpInst::ICMP_EQ, Prod1, Prod2)) {
+ // slopes are equal, so lines are parallel
+ DEBUG(dbgs() << "\t\tsame slope\n");
+ Prod1 = SE->getMulExpr(X->getC(), Y->getB());
+ Prod2 = SE->getMulExpr(X->getB(), Y->getC());
+ if (isKnownPredicate(CmpInst::ICMP_EQ, Prod1, Prod2))
+ return false;
+ if (isKnownPredicate(CmpInst::ICMP_NE, Prod1, Prod2)) {
+ X->setEmpty();
+ ++DeltaSuccesses;
+ return true;
+ }
+ return false;
+ }
+ if (isKnownPredicate(CmpInst::ICMP_NE, Prod1, Prod2)) {
+ // slopes differ, so lines intersect
+ DEBUG(dbgs() << "\t\tdifferent slopes\n");
+ const SCEV *C1B2 = SE->getMulExpr(X->getC(), Y->getB());
+ const SCEV *C1A2 = SE->getMulExpr(X->getC(), Y->getA());
+ const SCEV *C2B1 = SE->getMulExpr(Y->getC(), X->getB());
+ const SCEV *C2A1 = SE->getMulExpr(Y->getC(), X->getA());
+ const SCEV *A1B2 = SE->getMulExpr(X->getA(), Y->getB());
+ const SCEV *A2B1 = SE->getMulExpr(Y->getA(), X->getB());
+ const SCEVConstant *C1A2_C2A1 =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1A2, C2A1));
+ const SCEVConstant *C1B2_C2B1 =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(C1B2, C2B1));
+ const SCEVConstant *A1B2_A2B1 =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(A1B2, A2B1));
+ const SCEVConstant *A2B1_A1B2 =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(A2B1, A1B2));
+ if (!C1B2_C2B1 || !C1A2_C2A1 ||
+ !A1B2_A2B1 || !A2B1_A1B2)
+ return false;
+ APInt Xtop = C1B2_C2B1->getValue()->getValue();
+ APInt Xbot = A1B2_A2B1->getValue()->getValue();
+ APInt Ytop = C1A2_C2A1->getValue()->getValue();
+ APInt Ybot = A2B1_A1B2->getValue()->getValue();
+ DEBUG(dbgs() << "\t\tXtop = " << Xtop << "\n");
+ DEBUG(dbgs() << "\t\tXbot = " << Xbot << "\n");
+ DEBUG(dbgs() << "\t\tYtop = " << Ytop << "\n");
+ DEBUG(dbgs() << "\t\tYbot = " << Ybot << "\n");
+ APInt Xq = Xtop; // these need to be initialized, even
+ APInt Xr = Xtop; // though they're just going to be overwritten
+ APInt::sdivrem(Xtop, Xbot, Xq, Xr);
+ APInt Yq = Ytop;
+ APInt Yr = Ytop;
+ APInt::sdivrem(Ytop, Ybot, Yq, Yr);
+ if (Xr != 0 || Yr != 0) {
+ X->setEmpty();
+ ++DeltaSuccesses;
+ return true;
+ }
+ DEBUG(dbgs() << "\t\tX = " << Xq << ", Y = " << Yq << "\n");
+ if (Xq.slt(0) || Yq.slt(0)) {
+ X->setEmpty();
+ ++DeltaSuccesses;
+ return true;
+ }
+ if (const SCEVConstant *CUB =
+ collectConstantUpperBound(X->getAssociatedLoop(), Prod1->getType())) {
+ APInt UpperBound = CUB->getValue()->getValue();
+ DEBUG(dbgs() << "\t\tupper bound = " << UpperBound << "\n");
+ if (Xq.sgt(UpperBound) || Yq.sgt(UpperBound)) {
+ X->setEmpty();
+ ++DeltaSuccesses;
+ return true;
+ }
+ }
+ X->setPoint(SE->getConstant(Xq),
+ SE->getConstant(Yq),
+ X->getAssociatedLoop());
+ ++DeltaSuccesses;
+ return true;
+ }
+ return false;
+ }
+
+ // if (X->isLine() && Y->isPoint()) This case can't occur.
+ assert(!(X->isLine() && Y->isPoint()) && "This case should never occur");
+
+ if (X->isPoint() && Y->isLine()) {
+ DEBUG(dbgs() << "\t intersect Point and Line\n");
+ const SCEV *A1X1 = SE->getMulExpr(Y->getA(), X->getX());
+ const SCEV *B1Y1 = SE->getMulExpr(Y->getB(), X->getY());
+ const SCEV *Sum = SE->getAddExpr(A1X1, B1Y1);
+ if (isKnownPredicate(CmpInst::ICMP_EQ, Sum, Y->getC()))
+ return false;
+ if (isKnownPredicate(CmpInst::ICMP_NE, Sum, Y->getC())) {
+ X->setEmpty();
+ ++DeltaSuccesses;
+ return true;
+ }
+ return false;
+ }
+
+ llvm_unreachable("shouldn't reach the end of Constraint intersection");
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// DependenceAnalysis methods
+
+// For debugging purposes. Dumps a dependence to OS.
+void Dependence::dump(raw_ostream &OS) const {
+ bool Splitable = false;
+ if (isConfused())
+ OS << "confused";
+ else {
+ if (isConsistent())
+ OS << "consistent ";
+ if (isFlow())
+ OS << "flow";
+ else if (isOutput())
+ OS << "output";
+ else if (isAnti())
+ OS << "anti";
+ else if (isInput())
+ OS << "input";
+ unsigned Levels = getLevels();
+ if (Levels) {
+ OS << " [";
+ for (unsigned II = 1; II <= Levels; ++II) {
+ if (isSplitable(II))
+ Splitable = true;
+ if (isPeelFirst(II))
+ OS << 'p';
+ const SCEV *Distance = getDistance(II);
+ if (Distance)
+ OS << *Distance;
+ else if (isScalar(II))
+ OS << "S";
+ else {
+ unsigned Direction = getDirection(II);
+ if (Direction == DVEntry::ALL)
+ OS << "*";
+ else {
+ if (Direction & DVEntry::LT)
+ OS << "<";
+ if (Direction & DVEntry::EQ)
+ OS << "=";
+ if (Direction & DVEntry::GT)
+ OS << ">";
+ }
+ }
+ if (isPeelLast(II))
+ OS << 'p';
+ if (II < Levels)
+ OS << " ";
+ }
+ if (isLoopIndependent())
+ OS << "|<";
+ OS << "]";
+ if (Splitable)
+ OS << " splitable";
+ }
+ }
+ OS << "!\n";
+}
+
+
+
+static
+AliasAnalysis::AliasResult underlyingObjectsAlias(AliasAnalysis *AA,
+ const Value *A,
+ const Value *B) {
+ const Value *AObj = GetUnderlyingObject(A);
+ const Value *BObj = GetUnderlyingObject(B);
+ return AA->alias(AObj, AA->getTypeStoreSize(AObj->getType()),
+ BObj, AA->getTypeStoreSize(BObj->getType()));
+}
+
+
+// Returns true if the load or store can be analyzed. Atomic and volatile
+// operations have properties which this analysis does not understand.
+static
+bool isLoadOrStore(const Instruction *I) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isUnordered();
+ else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+ return false;
+}
+
+
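+// Returns the pointer operand of a load or store; any other instruction
+// is unreachable here.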
+static
+const Value *getPointerOperand(const Instruction *I) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperand();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperand();
+ llvm_unreachable("Value is not load or store instruction");
+ return 0;
+}
+
+
+// Examines the loop nesting of the Src and Dst
+// instructions and establishes their shared loops. Sets the variables
+// CommonLevels, SrcLevels, and MaxLevels.
+// The source and destination instructions needn't be contained in the same
+// loop. The routine establishNestingLevels finds the level of most deeply
+// nested loop that contains them both, CommonLevels. An instruction that's
+// not contained in a loop is at level = 0. MaxLevels is equal to the level
+// of the source plus the level of the destination, minus CommonLevels.
+// This lets us allocate vectors MaxLevels in length, with room for every
+// distinct loop referenced in both the source and destination subscripts.
+// The variable SrcLevels is the nesting depth of the source instruction.
+// It's used to help calculate distinct loops referenced by the destination.
+// Here's the map from loops to levels:
+// 0 - unused
+// 1 - outermost common loop
+// ... - other common loops
+// CommonLevels - innermost common loop
+// ... - loops containing Src but not Dst
+// SrcLevels - innermost loop containing Src but not Dst
+// ... - loops containing Dst but not Src
+// MaxLevels - innermost loop containing Dst but not Src
+// Consider the following code fragment:
+// for (a = ...) {
+// for (b = ...) {
+// for (c = ...) {
+// for (d = ...) {
+// A[] = ...;
+// }
+// }
+// for (e = ...) {
+// for (f = ...) {
+// for (g = ...) {
+// ... = A[];
+// }
+// }
+// }
+// }
+// }
+// If we're looking at the possibility of a dependence between the store
+// to A (the Src) and the load from A (the Dst), we'll note that they
+// have 2 loops in common, so CommonLevels will equal 2 and the direction
+// vector for Result will have 2 entries. SrcLevels = 4 and MaxLevels = 7.
+// A map from loop names to loop numbers would look like
+// a - 1
+// b - 2 = CommonLevels
+// c - 3
+// d - 4 = SrcLevels
+// e - 5
+// f - 6
+// g - 7 = MaxLevels
+void DependenceAnalysis::establishNestingLevels(const Instruction *Src,
+ const Instruction *Dst) {
+ const BasicBlock *SrcBlock = Src->getParent();
+ const BasicBlock *DstBlock = Dst->getParent();
+ unsigned SrcLevel = LI->getLoopDepth(SrcBlock);
+ unsigned DstLevel = LI->getLoopDepth(DstBlock);
+ const Loop *SrcLoop = LI->getLoopFor(SrcBlock);
+ const Loop *DstLoop = LI->getLoopFor(DstBlock);
+ SrcLevels = SrcLevel;
+ MaxLevels = SrcLevel + DstLevel;
+ while (SrcLevel > DstLevel) {
+ SrcLoop = SrcLoop->getParentLoop();
+ SrcLevel--;
+ }
+ while (DstLevel > SrcLevel) {
+ DstLoop = DstLoop->getParentLoop();
+ DstLevel--;
+ }
+ while (SrcLoop != DstLoop) {
+ SrcLoop = SrcLoop->getParentLoop();
+ DstLoop = DstLoop->getParentLoop();
+ SrcLevel--;
+ }
+ CommonLevels = SrcLevel;
+ MaxLevels -= CommonLevels;
+}
+
+
+// Given one of the loops containing the source, return
+// its level index in our numbering scheme.
+unsigned DependenceAnalysis::mapSrcLoop(const Loop *SrcLoop) const {
+ return SrcLoop->getLoopDepth();
+}
+
+
+// Given one of the loops containing the destination,
+// return its level index in our numbering scheme.
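+// For instance, with CommonLevels = 2 and SrcLevels = 4 (as in the nesting
+// example above), a destination-only loop at depth 3 maps to level 3 - 2 + 4 = 5.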
+unsigned DependenceAnalysis::mapDstLoop(const Loop *DstLoop) const {
+ unsigned D = DstLoop->getLoopDepth();
+ if (D > CommonLevels)
+ return D - CommonLevels + SrcLevels;
+ else
+ return D;
+}
+
+
+// Returns true if Expression is loop invariant in LoopNest.
+bool DependenceAnalysis::isLoopInvariant(const SCEV *Expression,
+ const Loop *LoopNest) const {
+ if (!LoopNest)
+ return true;
+ return SE->isLoopInvariant(Expression, LoopNest) &&
+ isLoopInvariant(Expression, LoopNest->getParentLoop());
+}
+
+
+
+// Finds the set of loops from the LoopNest that
+// have a level <= CommonLevels and are referred to by the SCEV Expression.
+void DependenceAnalysis::collectCommonLoops(const SCEV *Expression,
+ const Loop *LoopNest,
+ SmallBitVector &Loops) const {
+ while (LoopNest) {
+ unsigned Level = LoopNest->getLoopDepth();
+ if (Level <= CommonLevels && !SE->isLoopInvariant(Expression, LoopNest))
+ Loops.set(Level);
+ LoopNest = LoopNest->getParentLoop();
+ }
+}
+
+
+// removeMatchingExtensions - Examines a subscript pair.
+// If the source and destination are identically sign (or zero)
+// extended, it strips off the extension in an effort to simplify
+// the actual analysis.
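+// For example, if the pair is (sext %x to i64, sext %y to i64), it becomes
+// (%x, %y); mismatched or differently-typed extensions are left alone.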
+void DependenceAnalysis::removeMatchingExtensions(Subscript *Pair) {
+ const SCEV *Src = Pair->Src;
+ const SCEV *Dst = Pair->Dst;
+ if ((isa<SCEVZeroExtendExpr>(Src) && isa<SCEVZeroExtendExpr>(Dst)) ||
+ (isa<SCEVSignExtendExpr>(Src) && isa<SCEVSignExtendExpr>(Dst))) {
+ const SCEVCastExpr *SrcCast = cast<SCEVCastExpr>(Src);
+ const SCEVCastExpr *DstCast = cast<SCEVCastExpr>(Dst);
+ if (SrcCast->getType() == DstCast->getType()) {
+ Pair->Src = SrcCast->getOperand();
+ Pair->Dst = DstCast->getOperand();
+ }
+ }
+}
+
+
+// Examines the SCEV and returns true iff it's linear.
+// Any loops it mentions are recorded in the set Loops.
+bool DependenceAnalysis::checkSrcSubscript(const SCEV *Src,
+ const Loop *LoopNest,
+ SmallBitVector &Loops) {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Src);
+ if (!AddRec)
+ return isLoopInvariant(Src, LoopNest);
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (!isLoopInvariant(Step, LoopNest))
+ return false;
+ Loops.set(mapSrcLoop(AddRec->getLoop()));
+ return checkSrcSubscript(Start, LoopNest, Loops);
+}
+
+
+
+// Examines the SCEV and returns true iff it's linear.
+// Any loops it mentions are recorded in the set Loops.
+bool DependenceAnalysis::checkDstSubscript(const SCEV *Dst,
+ const Loop *LoopNest,
+ SmallBitVector &Loops) {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Dst);
+ if (!AddRec)
+ return isLoopInvariant(Dst, LoopNest);
+ const SCEV *Start = AddRec->getStart();
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (!isLoopInvariant(Step, LoopNest))
+ return false;
+ Loops.set(mapDstLoop(AddRec->getLoop()));
+ return checkDstSubscript(Start, LoopNest, Loops);
+}
+
+
+// Examines the subscript pair (the Src and Dst SCEVs)
+// and classifies it as either ZIV, SIV, RDIV, MIV, or Nonlinear.
+// Collects the associated loops in a set.
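+// For instance (hypothetical subscripts): [5] vs. [7] is ZIV, [i] vs. [i + 1]
+// is SIV, [i] vs. [j] is RDIV, and [i + j] vs. [i] is MIV.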
+DependenceAnalysis::Subscript::ClassificationKind
+DependenceAnalysis::classifyPair(const SCEV *Src, const Loop *SrcLoopNest,
+ const SCEV *Dst, const Loop *DstLoopNest,
+ SmallBitVector &Loops) {
+ SmallBitVector SrcLoops(MaxLevels + 1);
+ SmallBitVector DstLoops(MaxLevels + 1);
+ if (!checkSrcSubscript(Src, SrcLoopNest, SrcLoops))
+ return Subscript::NonLinear;
+ if (!checkDstSubscript(Dst, DstLoopNest, DstLoops))
+ return Subscript::NonLinear;
+ Loops = SrcLoops;
+ Loops |= DstLoops;
+ unsigned N = Loops.count();
+ if (N == 0)
+ return Subscript::ZIV;
+ if (N == 1)
+ return Subscript::SIV;
+ if (N == 2 && (SrcLoops.count() == 0 ||
+ DstLoops.count() == 0 ||
+ (SrcLoops.count() == 1 && DstLoops.count() == 1)))
+ return Subscript::RDIV;
+ return Subscript::MIV;
+}
+
+
+// A wrapper around SCEV::isKnownPredicate.
+// Looks for cases where we're interested in comparing for equality.
+// If both X and Y have been identically sign or zero extended,
+// it strips off the (confusing) extensions before invoking
+// SCEV::isKnownPredicate. Perhaps, someday, the ScalarEvolution package
+// will be similarly updated.
+//
+// If SCEV::isKnownPredicate can't prove the predicate,
+// we try simple subtraction, which seems to help in some cases
+// involving symbolics.
+bool DependenceAnalysis::isKnownPredicate(ICmpInst::Predicate Pred,
+ const SCEV *X,
+ const SCEV *Y) const {
+ if (Pred == CmpInst::ICMP_EQ ||
+ Pred == CmpInst::ICMP_NE) {
+ if ((isa<SCEVSignExtendExpr>(X) &&
+ isa<SCEVSignExtendExpr>(Y)) ||
+ (isa<SCEVZeroExtendExpr>(X) &&
+ isa<SCEVZeroExtendExpr>(Y))) {
+ const SCEVCastExpr *CX = cast<SCEVCastExpr>(X);
+ const SCEVCastExpr *CY = cast<SCEVCastExpr>(Y);
+ const SCEV *Xop = CX->getOperand();
+ const SCEV *Yop = CY->getOperand();
+ if (Xop->getType() == Yop->getType()) {
+ X = Xop;
+ Y = Yop;
+ }
+ }
+ }
+ if (SE->isKnownPredicate(Pred, X, Y))
+ return true;
+ // If SE->isKnownPredicate can't prove the condition,
+ // we try the brute-force approach of subtracting
+ // and testing the difference.
+ // By testing with SE->isKnownPredicate first, we avoid
+ // the possibility of overflow when the arguments are constants.
+ const SCEV *Delta = SE->getMinusSCEV(X, Y);
+ switch (Pred) {
+ case CmpInst::ICMP_EQ:
+ return Delta->isZero();
+ case CmpInst::ICMP_NE:
+ return SE->isKnownNonZero(Delta);
+ case CmpInst::ICMP_SGE:
+ return SE->isKnownNonNegative(Delta);
+ case CmpInst::ICMP_SLE:
+ return SE->isKnownNonPositive(Delta);
+ case CmpInst::ICMP_SGT:
+ return SE->isKnownPositive(Delta);
+ case CmpInst::ICMP_SLT:
+ return SE->isKnownNegative(Delta);
+ default:
+ llvm_unreachable("unexpected predicate in isKnownPredicate");
+ }
+}
+
+
+// All subscripts are the same type.
+// Loop bound may be smaller (e.g., a char).
+// Should zero extend loop bound, since it's always >= 0.
+// This routine collects upper bound and extends if needed.
+// Return null if no bound available.
+const SCEV *DependenceAnalysis::collectUpperBound(const Loop *L,
+ Type *T) const {
+ if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+ const SCEV *UB = SE->getBackedgeTakenCount(L);
+ return SE->getNoopOrZeroExtend(UB, T);
+ }
+ return NULL;
+}
+
+
+// Calls collectUpperBound(), then attempts to cast it to SCEVConstant.
+// If the cast fails, returns NULL.
+const SCEVConstant *DependenceAnalysis::collectConstantUpperBound(const Loop *L,
+ Type *T
+ ) const {
+ if (const SCEV *UB = collectUpperBound(L, T))
+ return dyn_cast<SCEVConstant>(UB);
+ return NULL;
+}
+
+
+// testZIV -
+// When we have a pair of subscripts of the form [c1] and [c2],
+// where c1 and c2 are both loop invariant, we attack it using
+// the ZIV test. Basically, we test by comparing the two values,
+// but there are actually three possible results:
+// 1) the values are equal, so there's a dependence
+// 2) the values are different, so there's no dependence
+// 3) the values might be equal, so we have to assume a dependence.
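+// For instance, [5] and [5] fall into case 1, [5] and [7] into case 2, and a
+// pair of distinct symbolic constants like [N] and [M] typically into case 3.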
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::testZIV(const SCEV *Src,
+ const SCEV *Dst,
+ FullDependence &Result) const {
+ DEBUG(dbgs() << " src = " << *Src << "\n");
+ DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ ++ZIVapplications;
+ if (isKnownPredicate(CmpInst::ICMP_EQ, Src, Dst)) {
+ DEBUG(dbgs() << " provably dependent\n");
+ return false; // provably dependent
+ }
+ if (isKnownPredicate(CmpInst::ICMP_NE, Src, Dst)) {
+ DEBUG(dbgs() << " provably independent\n");
+ ++ZIVindependence;
+ return true; // provably independent
+ }
+ DEBUG(dbgs() << " possibly dependent\n");
+ Result.Consistent = false;
+ return false; // possibly dependent
+}
+
+
+// strongSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.1
+//
+// When we have a pair of subscripts of the form [c1 + a*i] and [c2 + a*i],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the Strong SIV test.
+//
+// Can prove independence. Failing that, can compute distance (and direction).
+// In the presence of symbolic terms, we can sometimes make progress.
+//
+// If there's a dependence,
+//
+// c1 + a*i = c2 + a*i'
+//
+// The dependence distance is
+//
+// d = i' - i = (c1 - c2)/a
+//
+// A dependence only exists if d is an integer and abs(d) <= U, where U is the
+// loop's upper bound. If a dependence exists, the dependence direction is
+// defined as
+//
+// { < if d > 0
+// direction = { = if d = 0
+// { > if d < 0
+//
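+// For instance, with hypothetical subscripts [2 + 3*i] and [8 + 3*i],
+// d = (2 - 8)/3 = -2, so if the loop runs at least 3 iterations there is a
+// dependence with distance -2 and direction >.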
+// Return true if dependence disproved.
+bool DependenceAnalysis::strongSIVtest(const SCEV *Coeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
+ DEBUG(dbgs() << "\tStrong SIV test\n");
+ DEBUG(dbgs() << "\t Coeff = " << *Coeff);
+ DEBUG(dbgs() << ", " << *Coeff->getType() << "\n");
+ DEBUG(dbgs() << "\t SrcConst = " << *SrcConst);
+ DEBUG(dbgs() << ", " << *SrcConst->getType() << "\n");
+ DEBUG(dbgs() << "\t DstConst = " << *DstConst);
+ DEBUG(dbgs() << ", " << *DstConst->getType() << "\n");
+ ++StrongSIVapplications;
+ assert(0 < Level && Level <= CommonLevels && "level out of range");
+ Level--;
+
+ const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
+ DEBUG(dbgs() << "\t Delta = " << *Delta);
+ DEBUG(dbgs() << ", " << *Delta->getType() << "\n");
+
+ // check that |Delta| < iteration count
+ if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+ DEBUG(dbgs() << "\t UpperBound = " << *UpperBound);
+ DEBUG(dbgs() << ", " << *UpperBound->getType() << "\n");
+ const SCEV *AbsDelta =
+ SE->isKnownNonNegative(Delta) ? Delta : SE->getNegativeSCEV(Delta);
+ const SCEV *AbsCoeff =
+ SE->isKnownNonNegative(Coeff) ? Coeff : SE->getNegativeSCEV(Coeff);
+ const SCEV *Product = SE->getMulExpr(UpperBound, AbsCoeff);
+ if (isKnownPredicate(CmpInst::ICMP_SGT, AbsDelta, Product)) {
+ // Distance greater than trip count - no dependence
+ ++StrongSIVindependence;
+ ++StrongSIVsuccesses;
+ return true;
+ }
+ }
+
+ // Can we compute distance?
+ if (isa<SCEVConstant>(Delta) && isa<SCEVConstant>(Coeff)) {
+ APInt ConstDelta = cast<SCEVConstant>(Delta)->getValue()->getValue();
+ APInt ConstCoeff = cast<SCEVConstant>(Coeff)->getValue()->getValue();
+ APInt Distance = ConstDelta; // these need to be initialized
+ APInt Remainder = ConstDelta;
+ APInt::sdivrem(ConstDelta, ConstCoeff, Distance, Remainder);
+ DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
+ DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
+ // Make sure Coeff divides Delta exactly
+ if (Remainder != 0) {
+ // Coeff doesn't divide Delta, no dependence
+ ++StrongSIVindependence;
+ ++StrongSIVsuccesses;
+ return true;
+ }
+ Result.DV[Level].Distance = SE->getConstant(Distance);
+ NewConstraint.setDistance(SE->getConstant(Distance), CurLoop);
+ if (Distance.sgt(0))
+ Result.DV[Level].Direction &= Dependence::DVEntry::LT;
+ else if (Distance.slt(0))
+ Result.DV[Level].Direction &= Dependence::DVEntry::GT;
+ else
+ Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
+ ++StrongSIVsuccesses;
+ }
+ else if (Delta->isZero()) {
+ // since 0/X == 0
+ Result.DV[Level].Distance = Delta;
+ NewConstraint.setDistance(Delta, CurLoop);
+ Result.DV[Level].Direction &= Dependence::DVEntry::EQ;
+ ++StrongSIVsuccesses;
+ }
+ else {
+ if (Coeff->isOne()) {
+ DEBUG(dbgs() << "\t Distance = " << *Delta << "\n");
+ Result.DV[Level].Distance = Delta; // since X/1 == X
+ NewConstraint.setDistance(Delta, CurLoop);
+ }
+ else {
+ Result.Consistent = false;
+ NewConstraint.setLine(Coeff,
+ SE->getNegativeSCEV(Coeff),
+ SE->getNegativeSCEV(Delta), CurLoop);
+ }
+
+ // maybe we can get a useful direction
+ bool DeltaMaybeZero = !SE->isKnownNonZero(Delta);
+ bool DeltaMaybePositive = !SE->isKnownNonPositive(Delta);
+ bool DeltaMaybeNegative = !SE->isKnownNonNegative(Delta);
+ bool CoeffMaybePositive = !SE->isKnownNonPositive(Coeff);
+ bool CoeffMaybeNegative = !SE->isKnownNonNegative(Coeff);
+ // The double negatives above are confusing.
+ // It helps to read !SE->isKnownNonZero(Delta)
+ // as "Delta might be Zero"
+ unsigned NewDirection = Dependence::DVEntry::NONE;
+ if ((DeltaMaybePositive && CoeffMaybePositive) ||
+ (DeltaMaybeNegative && CoeffMaybeNegative))
+ NewDirection = Dependence::DVEntry::LT;
+ if (DeltaMaybeZero)
+ NewDirection |= Dependence::DVEntry::EQ;
+ if ((DeltaMaybeNegative && CoeffMaybePositive) ||
+ (DeltaMaybePositive && CoeffMaybeNegative))
+ NewDirection |= Dependence::DVEntry::GT;
+ if (NewDirection < Result.DV[Level].Direction)
+ ++StrongSIVsuccesses;
+ Result.DV[Level].Direction &= NewDirection;
+ }
+ return false;
+}
+
+
+// weakCrossingSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.2
+//
+// When we have a pair of subscripts of the form [c1 + a*i] and [c2 - a*i],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the
+// Weak-Crossing SIV test.
+//
+// Given c1 + a*i = c2 - a*i', we can look for the intersection of
+// the two lines, where i = i', yielding
+//
+// c1 + a*i = c2 - a*i
+// 2a*i = c2 - c1
+// i = (c2 - c1)/2a
+//
+// If i < 0, there is no dependence.
+// If i > upperbound, there is no dependence.
+// If i = 0 (i.e., if c1 = c2), there's a dependence with distance = 0.
+// If i = upperbound, there's a dependence with distance = 0.
+// If i is integral, there's a dependence (all directions).
+// If the non-integer part = 1/2, there's a dependence (<> directions).
+// Otherwise, there's no dependence.
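+// For instance (hypothetical subscripts): with [2 + 3*i] and [8 - 3*i],
+// i = 6/(2*3) = 1, so all directions remain possible; with [2 + 3*i] and
+// [5 - 3*i], i = 1/2, so only the <> directions remain; with [2 + 3*i] and
+// [7 - 3*i], 3 does not divide 5, so there is no dependence.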
+//
+// Can prove independence. Failing that,
+// can sometimes refine the directions.
+// Can determine iteration for splitting.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::weakCrossingSIVtest(const SCEV *Coeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint,
+ const SCEV *&SplitIter) const {
+ DEBUG(dbgs() << "\tWeak-Crossing SIV test\n");
+ DEBUG(dbgs() << "\t Coeff = " << *Coeff << "\n");
+ DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ ++WeakCrossingSIVapplications;
+ assert(0 < Level && Level <= CommonLevels && "Level out of range");
+ Level--;
+ Result.Consistent = false;
+ const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+ DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ NewConstraint.setLine(Coeff, Coeff, Delta, CurLoop);
+ if (Delta->isZero()) {
+ Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::LT);
+ Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::GT);
+ ++WeakCrossingSIVsuccesses;
+ if (!Result.DV[Level].Direction) {
+ ++WeakCrossingSIVindependence;
+ return true;
+ }
+ Result.DV[Level].Distance = Delta; // = 0
+ return false;
+ }
+ const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(Coeff);
+ if (!ConstCoeff)
+ return false;
+
+ Result.DV[Level].Splitable = true;
+ if (SE->isKnownNegative(ConstCoeff)) {
+ ConstCoeff = dyn_cast<SCEVConstant>(SE->getNegativeSCEV(ConstCoeff));
+ assert(ConstCoeff &&
+ "dynamic cast of negative of ConstCoeff should yield constant");
+ Delta = SE->getNegativeSCEV(Delta);
+ }
+ assert(SE->isKnownPositive(ConstCoeff) && "ConstCoeff should be positive");
+
+ // compute SplitIter for use by DependenceAnalysis::getSplitIteration()
+ SplitIter =
+ SE->getUDivExpr(SE->getSMaxExpr(SE->getConstant(Delta->getType(), 0),
+ Delta),
+ SE->getMulExpr(SE->getConstant(Delta->getType(), 2),
+ ConstCoeff));
+ DEBUG(dbgs() << "\t Split iter = " << *SplitIter << "\n");
+
+ const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
+ if (!ConstDelta)
+ return false;
+
+ // We're certain that ConstCoeff > 0; therefore,
+ // if Delta < 0, then no dependence.
+ DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ DEBUG(dbgs() << "\t ConstCoeff = " << *ConstCoeff << "\n");
+ if (SE->isKnownNegative(Delta)) {
+ // No dependence, Delta < 0
+ ++WeakCrossingSIVindependence;
+ ++WeakCrossingSIVsuccesses;
+ return true;
+ }
+
+ // We're certain that Delta > 0 and ConstCoeff > 0.
+ // Check Delta/(2*ConstCoeff) against upper loop bound
+ if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+ DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
+ const SCEV *ConstantTwo = SE->getConstant(UpperBound->getType(), 2);
+ const SCEV *ML = SE->getMulExpr(SE->getMulExpr(ConstCoeff, UpperBound),
+ ConstantTwo);
+ DEBUG(dbgs() << "\t ML = " << *ML << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, ML)) {
+ // Delta too big, no dependence
+ ++WeakCrossingSIVindependence;
+ ++WeakCrossingSIVsuccesses;
+ return true;
+ }
+ if (isKnownPredicate(CmpInst::ICMP_EQ, Delta, ML)) {
+ // i = i' = UB
+ Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::LT);
+ Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::GT);
+ ++WeakCrossingSIVsuccesses;
+ if (!Result.DV[Level].Direction) {
+ ++WeakCrossingSIVindependence;
+ return true;
+ }
+ Result.DV[Level].Splitable = false;
+ Result.DV[Level].Distance = SE->getConstant(Delta->getType(), 0);
+ return false;
+ }
+ }
+
+ // check that Coeff divides Delta
+ APInt APDelta = ConstDelta->getValue()->getValue();
+ APInt APCoeff = ConstCoeff->getValue()->getValue();
+ APInt Distance = APDelta; // these need to be initialized
+ APInt Remainder = APDelta;
+ APInt::sdivrem(APDelta, APCoeff, Distance, Remainder);
+ DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
+ if (Remainder != 0) {
+ // Coeff doesn't divide Delta, no dependence
+ ++WeakCrossingSIVindependence;
+ ++WeakCrossingSIVsuccesses;
+ return true;
+ }
+ DEBUG(dbgs() << "\t Distance = " << Distance << "\n");
+
+ // if 2*Coeff doesn't divide Delta, then the equal direction isn't possible
+ APInt Two = APInt(Distance.getBitWidth(), 2, true);
+ Remainder = Distance.srem(Two);
+ DEBUG(dbgs() << "\t Remainder = " << Remainder << "\n");
+ if (Remainder != 0) {
+ // Equal direction isn't possible
+ Result.DV[Level].Direction &= unsigned(~Dependence::DVEntry::EQ);
+ ++WeakCrossingSIVsuccesses;
+ }
+ return false;
+}
+
+
+// Kirch's algorithm, from
+//
+// Optimizing Supercompilers for Supercomputers
+// Michael Wolfe
+// MIT Press, 1989
+//
+// Program 2.1, page 29.
+// Computes the GCD of AM and BM.
+// Also finds a solution to the equation ax - by = gcd(a, b).
+// Returns true iff the gcd does not divide Delta (i.e., dependence disproved).
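+// For instance, with AM = 2, BM = 4, and Delta = 6: G = 2 and the particular
+// solution is X = 1, Y = 0 (2*1 - 4*0 = 2); since 2 divides 6, X and Y are
+// scaled by 3, giving 2*3 - 4*0 = Delta.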
+static
+bool findGCD(unsigned Bits, APInt AM, APInt BM, APInt Delta,
+ APInt &G, APInt &X, APInt &Y) {
+ APInt A0(Bits, 1, true), A1(Bits, 0, true);
+ APInt B0(Bits, 0, true), B1(Bits, 1, true);
+ APInt G0 = AM.abs();
+ APInt G1 = BM.abs();
+ APInt Q = G0; // these need to be initialized
+ APInt R = G0;
+ APInt::sdivrem(G0, G1, Q, R);
+ while (R != 0) {
+ APInt A2 = A0 - Q*A1; A0 = A1; A1 = A2;
+ APInt B2 = B0 - Q*B1; B0 = B1; B1 = B2;
+ G0 = G1; G1 = R;
+ APInt::sdivrem(G0, G1, Q, R);
+ }
+ G = G1;
+ DEBUG(dbgs() << "\t GCD = " << G << "\n");
+ X = AM.slt(0) ? -A1 : A1;
+ Y = BM.slt(0) ? B1 : -B1;
+
+ // make sure gcd divides Delta
+ R = Delta.srem(G);
+ if (R != 0)
+ return true; // gcd doesn't divide Delta, no dependence
+ Q = Delta.sdiv(G);
+ X *= Q;
+ Y *= Q;
+ return false;
+}
+
+
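+// Computes floor(A/B) for signed APInts. sdivrem truncates toward zero, so
+// the quotient is adjusted down by one when the operands have opposite signs
+// and the remainder is nonzero.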
+static
+APInt floorOfQuotient(APInt A, APInt B) {
+ APInt Q = A; // these need to be initialized
+ APInt R = A;
+ APInt::sdivrem(A, B, Q, R);
+ if (R == 0)
+ return Q;
+ if ((A.sgt(0) && B.sgt(0)) ||
+ (A.slt(0) && B.slt(0)))
+ return Q;
+ else
+ return Q - 1;
+}
+
+
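+// Computes ceil(A/B) for signed APInts. sdivrem truncates toward zero, so
+// the quotient is adjusted up by one when the operands have the same sign
+// and the remainder is nonzero.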
+static
+APInt ceilingOfQuotient(APInt A, APInt B) {
+ APInt Q = A; // these need to be initialized
+ APInt R = A;
+ APInt::sdivrem(A, B, Q, R);
+ if (R == 0)
+ return Q;
+ if ((A.sgt(0) && B.sgt(0)) ||
+ (A.slt(0) && B.slt(0)))
+ return Q + 1;
+ else
+ return Q;
+}
+
+
+static
+APInt maxAPInt(APInt A, APInt B) {
+ return A.sgt(B) ? A : B;
+}
+
+
+static
+APInt minAPInt(APInt A, APInt B) {
+ return A.slt(B) ? A : B;
+}
+
+
+// exactSIVtest -
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*i],
+// where i is an induction variable, c1 and c2 are loop invariant, and a1
+// and a2 are constant, we can solve it exactly using an algorithm developed
+// by Banerjee and Wolfe. See Section 2.5.3 in
+//
+// Optimizing Supercompilers for Supercomputers
+// Michael Wolfe
+// MIT Press, 1989
+//
+// It's slower than the specialized tests (strong SIV, weak-zero SIV, etc),
+// so use them if possible. They're also a bit better with symbolics and,
+// in the case of the strong SIV test, can compute Distances.
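+//
+// For instance, with hypothetical subscripts [1 + 2*i] and [4*i], the constant
+// difference is -1 and gcd(2, 4) = 2 does not divide -1, so independence is
+// proved immediately.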
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::exactSIVtest(const SCEV *SrcCoeff,
+ const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
+ DEBUG(dbgs() << "\tExact SIV test\n");
+ DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
+ DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
+ DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ ++ExactSIVapplications;
+ assert(0 < Level && Level <= CommonLevels && "Level out of range");
+ Level--;
+ Result.Consistent = false;
+ const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+ DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ NewConstraint.setLine(SrcCoeff, SE->getNegativeSCEV(DstCoeff),
+ Delta, CurLoop);
+ const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
+ const SCEVConstant *ConstSrcCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
+ const SCEVConstant *ConstDstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
+ if (!ConstDelta || !ConstSrcCoeff || !ConstDstCoeff)
+ return false;
+
+ // find gcd
+ APInt G, X, Y;
+ APInt AM = ConstSrcCoeff->getValue()->getValue();
+ APInt BM = ConstDstCoeff->getValue()->getValue();
+ unsigned Bits = AM.getBitWidth();
+ if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) {
+ // gcd doesn't divide Delta, no dependence
+ ++ExactSIVindependence;
+ ++ExactSIVsuccesses;
+ return true;
+ }
+
+ DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
+
+ // since SCEV construction normalizes, LM = 0
+ APInt UM(Bits, 1, true);
+ bool UMvalid = false;
+ // UM is perhaps unavailable, let's check
+ if (const SCEVConstant *CUB =
+ collectConstantUpperBound(CurLoop, Delta->getType())) {
+ UM = CUB->getValue()->getValue();
+ DEBUG(dbgs() << "\t UM = " << UM << "\n");
+ UMvalid = true;
+ }
+
+ APInt TU(APInt::getSignedMaxValue(Bits));
+ APInt TL(APInt::getSignedMinValue(Bits));
+
+ // test(BM/G, LM-X) and test(-BM/G, X-UM)
+ APInt TMUL = BM.sdiv(G);
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(-X, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ if (UMvalid) {
+ TU = minAPInt(TU, floorOfQuotient(UM - X, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ }
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(-X, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ if (UMvalid) {
+ TL = maxAPInt(TL, ceilingOfQuotient(UM - X, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ }
+ }
+
+ // test(AM/G, LM-Y) and test(-AM/G, Y-UM)
+ TMUL = AM.sdiv(G);
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(-Y, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ if (UMvalid) {
+ TU = minAPInt(TU, floorOfQuotient(UM - Y, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ }
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(-Y, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ if (UMvalid) {
+ TL = maxAPInt(TL, ceilingOfQuotient(UM - Y, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ }
+ }
+ if (TL.sgt(TU)) {
+ ++ExactSIVindependence;
+ ++ExactSIVsuccesses;
+ return true;
+ }
+
+ // explore directions
+ unsigned NewDirection = Dependence::DVEntry::NONE;
+
+ // less than
+ APInt SaveTU(TU); // save these
+ APInt SaveTL(TL);
+ DEBUG(dbgs() << "\t exploring LT direction\n");
+ TMUL = AM - BM;
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(X - Y + 1, TMUL));
+ DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(X - Y + 1, TMUL));
+ DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ }
+ if (TL.sle(TU)) {
+ NewDirection |= Dependence::DVEntry::LT;
+ ++ExactSIVsuccesses;
+ }
+
+ // equal
+ TU = SaveTU; // restore
+ TL = SaveTL;
+ DEBUG(dbgs() << "\t exploring EQ direction\n");
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(X - Y, TMUL));
+ DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(X - Y, TMUL));
+ DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ }
+ TMUL = BM - AM;
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(Y - X, TMUL));
+ DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(Y - X, TMUL));
+ DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ }
+ if (TL.sle(TU)) {
+ NewDirection |= Dependence::DVEntry::EQ;
+ ++ExactSIVsuccesses;
+ }
+
+ // greater than
+ TU = SaveTU; // restore
+ TL = SaveTL;
+ DEBUG(dbgs() << "\t exploring GT direction\n");
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(Y - X + 1, TMUL));
+ DEBUG(dbgs() << "\t\t TL = " << TL << "\n");
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(Y - X + 1, TMUL));
+ DEBUG(dbgs() << "\t\t TU = " << TU << "\n");
+ }
+ if (TL.sle(TU)) {
+ NewDirection |= Dependence::DVEntry::GT;
+ ++ExactSIVsuccesses;
+ }
+
+ // finished
+ Result.DV[Level].Direction &= NewDirection;
+ if (Result.DV[Level].Direction == Dependence::DVEntry::NONE)
+ ++ExactSIVindependence;
+ return Result.DV[Level].Direction == Dependence::DVEntry::NONE;
+}
+
+
+
+// Return true if the divisor evenly divides the dividend.
+static
+bool isRemainderZero(const SCEVConstant *Dividend,
+ const SCEVConstant *Divisor) {
+ APInt ConstDividend = Dividend->getValue()->getValue();
+ APInt ConstDivisor = Divisor->getValue()->getValue();
+ return ConstDividend.srem(ConstDivisor) == 0;
+}
+
+
+// weakZeroSrcSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.2
+//
+// When we have a pair of subscripts of the form [c1] and [c2 + a*i],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the
+// Weak-Zero SIV test.
+//
+// Given
+//
+// c1 = c2 + a*i
+//
+// we get
+//
+// (c1 - c2)/a = i
+//
+// If i is not an integer, there's no dependence.
+// If i < 0 or > UB, there's no dependence.
+// If i = 0, the direction is <= and peeling the
+// 1st iteration will break the dependence.
+// If i = UB, the direction is >= and peeling the
+// last iteration will break the dependence.
+// Otherwise, the direction is *.
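+//
+// For instance, with hypothetical subscripts [12] and [2 + 5*i] and an upper
+// bound of 10, i = (12 - 2)/5 = 2, so a dependence is possible and the
+// direction stays *; with [13] instead, 5 does not divide 11, so there is
+// no dependence.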
+//
+// Can prove independence. Failing that, we can sometimes refine
+// the directions. Can sometimes show that first or last
+// iteration carries all the dependences (so worth peeling).
+//
+// (see also weakZeroDstSIVtest)
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::weakZeroSrcSIVtest(const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
+ // For the WeakSIV test, it's possible the loop isn't common to
+ // the Src and Dst loops. If it isn't, then there's no need to
+ // record a direction.
+ DEBUG(dbgs() << "\tWeak-Zero (src) SIV test\n");
+ DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << "\n");
+ DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ ++WeakZeroSIVapplications;
+ assert(0 < Level && Level <= MaxLevels && "Level out of range");
+ Level--;
+ Result.Consistent = false;
+ const SCEV *Delta = SE->getMinusSCEV(SrcConst, DstConst);
+ NewConstraint.setLine(SE->getConstant(Delta->getType(), 0),
+ DstCoeff, Delta, CurLoop);
+ DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_EQ, SrcConst, DstConst)) {
+ if (Level < CommonLevels) {
+ Result.DV[Level].Direction &= Dependence::DVEntry::LE;
+ Result.DV[Level].PeelFirst = true;
+ ++WeakZeroSIVsuccesses;
+ }
+ return false; // dependences caused by first iteration
+ }
+ const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
+ if (!ConstCoeff)
+ return false;
+ const SCEV *AbsCoeff =
+ SE->isKnownNegative(ConstCoeff) ?
+ SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+ const SCEV *NewDelta =
+ SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+
+ // check that Delta/SrcCoeff < iteration count
+ // really check NewDelta < count*AbsCoeff
+ if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+ DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
+ const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
+ if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
+ ++WeakZeroSIVindependence;
+ ++WeakZeroSIVsuccesses;
+ return true;
+ }
+ if (isKnownPredicate(CmpInst::ICMP_EQ, NewDelta, Product)) {
+ // dependences caused by last iteration
+ if (Level < CommonLevels) {
+ Result.DV[Level].Direction &= Dependence::DVEntry::GE;
+ Result.DV[Level].PeelLast = true;
+ ++WeakZeroSIVsuccesses;
+ }
+ return false;
+ }
+ }
+
+ // check that Delta/SrcCoeff >= 0
+ // really check that NewDelta >= 0
+ if (SE->isKnownNegative(NewDelta)) {
+ // No dependence, newDelta < 0
+ ++WeakZeroSIVindependence;
+ ++WeakZeroSIVsuccesses;
+ return true;
+ }
+
+ // if SrcCoeff doesn't divide Delta, then no dependence
+ if (isa<SCEVConstant>(Delta) &&
+ !isRemainderZero(cast<SCEVConstant>(Delta), ConstCoeff)) {
+ ++WeakZeroSIVindependence;
+ ++WeakZeroSIVsuccesses;
+ return true;
+ }
+ return false;
+}
+
+
+// weakZeroDstSIVtest -
+// From the paper, Practical Dependence Testing, Section 4.2.2
+//
+// When we have a pair of subscripts of the form [c1 + a*i] and [c2],
+// where i is an induction variable, c1 and c2 are loop invariant,
+// and a is a constant, we can solve it exactly using the
+// Weak-Zero SIV test.
+//
+// Given
+//
+// c1 + a*i = c2
+//
+// we get
+//
+// i = (c2 - c1)/a
+//
+// If i is not an integer, there's no dependence.
+// If i < 0 or > UB, there's no dependence.
+// If i = 0, the direction is <= and peeling the
+// 1st iteration will break the dependence.
+// If i = UB, the direction is >= and peeling the
+// last iteration will break the dependence.
+// Otherwise, the direction is *.
+//
+// Can prove independence. Failing that, we can sometimes refine
+// the directions. Can sometimes show that first or last
+// iteration carries all the dependences (so worth peeling).
+//
+// (see also weakZeroSrcSIVtest)
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::weakZeroDstSIVtest(const SCEV *SrcCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *CurLoop,
+ unsigned Level,
+ FullDependence &Result,
+ Constraint &NewConstraint) const {
+ // For the WeakSIV test, it's possible the loop isn't common to the
+ // Src and Dst loops. If it isn't, then there's no need to record a direction.
+ DEBUG(dbgs() << "\tWeak-Zero (dst) SIV test\n");
+ DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << "\n");
+ DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ ++WeakZeroSIVapplications;
+ assert(0 < Level && Level <= SrcLevels && "Level out of range");
+ Level--;
+ Result.Consistent = false;
+ const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+ NewConstraint.setLine(SrcCoeff, SE->getConstant(Delta->getType(), 0),
+ Delta, CurLoop);
+ DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_EQ, DstConst, SrcConst)) {
+ if (Level < CommonLevels) {
+ Result.DV[Level].Direction &= Dependence::DVEntry::LE;
+ Result.DV[Level].PeelFirst = true;
+ ++WeakZeroSIVsuccesses;
+ }
+ return false; // dependences caused by first iteration
+ }
+ const SCEVConstant *ConstCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
+ if (!ConstCoeff)
+ return false;
+ const SCEV *AbsCoeff =
+ SE->isKnownNegative(ConstCoeff) ?
+ SE->getNegativeSCEV(ConstCoeff) : ConstCoeff;
+ const SCEV *NewDelta =
+ SE->isKnownNegative(ConstCoeff) ? SE->getNegativeSCEV(Delta) : Delta;
+
+ // check that Delta/SrcCoeff < iteration count
+ // really check NewDelta < count*AbsCoeff
+ if (const SCEV *UpperBound = collectUpperBound(CurLoop, Delta->getType())) {
+ DEBUG(dbgs() << "\t UpperBound = " << *UpperBound << "\n");
+ const SCEV *Product = SE->getMulExpr(AbsCoeff, UpperBound);
+ if (isKnownPredicate(CmpInst::ICMP_SGT, NewDelta, Product)) {
+ ++WeakZeroSIVindependence;
+ ++WeakZeroSIVsuccesses;
+ return true;
+ }
+ if (isKnownPredicate(CmpInst::ICMP_EQ, NewDelta, Product)) {
+ // dependences caused by last iteration
+ if (Level < CommonLevels) {
+ Result.DV[Level].Direction &= Dependence::DVEntry::GE;
+ Result.DV[Level].PeelLast = true;
+ ++WeakZeroSIVsuccesses;
+ }
+ return false;
+ }
+ }
+
+ // check that Delta/SrcCoeff >= 0
+ // really check that NewDelta >= 0
+ if (SE->isKnownNegative(NewDelta)) {
+ // No dependence, newDelta < 0
+ ++WeakZeroSIVindependence;
+ ++WeakZeroSIVsuccesses;
+ return true;
+ }
+
+ // if SrcCoeff doesn't divide Delta, then no dependence
+ if (isa<SCEVConstant>(Delta) &&
+ !isRemainderZero(cast<SCEVConstant>(Delta), ConstCoeff)) {
+ ++WeakZeroSIVindependence;
+ ++WeakZeroSIVsuccesses;
+ return true;
+ }
+ return false;
+}
+
+
+// exactRDIVtest - Tests the RDIV subscript pair for dependence.
+// Things of the form [c1 + a*i] and [c2 + b*j],
+// where i and j are induction variable, c1 and c2 are loop invariant,
+// and a and b are constants.
+// Returns true if any possible dependence is disproved.
+// Marks the result as inconsistent.
+// Works in some cases that symbolicRDIVtest doesn't, and vice versa.
+bool DependenceAnalysis::exactRDIVtest(const SCEV *SrcCoeff,
+ const SCEV *DstCoeff,
+ const SCEV *SrcConst,
+ const SCEV *DstConst,
+ const Loop *SrcLoop,
+ const Loop *DstLoop,
+ FullDependence &Result) const {
+ DEBUG(dbgs() << "\tExact RDIV test\n");
+ DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
+ DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
+ DEBUG(dbgs() << "\t SrcConst = " << *SrcConst << "\n");
+ DEBUG(dbgs() << "\t DstConst = " << *DstConst << "\n");
+ ++ExactRDIVapplications;
+ Result.Consistent = false;
+ const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+ DEBUG(dbgs() << "\t Delta = " << *Delta << "\n");
+ const SCEVConstant *ConstDelta = dyn_cast<SCEVConstant>(Delta);
+ const SCEVConstant *ConstSrcCoeff = dyn_cast<SCEVConstant>(SrcCoeff);
+ const SCEVConstant *ConstDstCoeff = dyn_cast<SCEVConstant>(DstCoeff);
+ if (!ConstDelta || !ConstSrcCoeff || !ConstDstCoeff)
+ return false;
+
+ // find gcd
+ APInt G, X, Y;
+ APInt AM = ConstSrcCoeff->getValue()->getValue();
+ APInt BM = ConstDstCoeff->getValue()->getValue();
+ unsigned Bits = AM.getBitWidth();
+ if (findGCD(Bits, AM, BM, ConstDelta->getValue()->getValue(), G, X, Y)) {
+ // gcd doesn't divide Delta, no dependence
+ ++ExactRDIVindependence;
+ return true;
+ }
+
+ DEBUG(dbgs() << "\t X = " << X << ", Y = " << Y << "\n");
+
+ // since SCEV construction seems to normalize, LM = 0
+ APInt SrcUM(Bits, 1, true);
+ bool SrcUMvalid = false;
+ // SrcUM is perhaps unavailable, let's check
+ if (const SCEVConstant *UpperBound =
+ collectConstantUpperBound(SrcLoop, Delta->getType())) {
+ SrcUM = UpperBound->getValue()->getValue();
+ DEBUG(dbgs() << "\t SrcUM = " << SrcUM << "\n");
+ SrcUMvalid = true;
+ }
+
+ APInt DstUM(Bits, 1, true);
+ bool DstUMvalid = false;
+ // DstUM is perhaps unavailable, let's check
+ if (const SCEVConstant *UpperBound =
+ collectConstantUpperBound(DstLoop, Delta->getType())) {
+ DstUM = UpperBound->getValue()->getValue();
+ DEBUG(dbgs() << "\t DstUM = " << DstUM << "\n");
+ DstUMvalid = true;
+ }
+
+ APInt TU(APInt::getSignedMaxValue(Bits));
+ APInt TL(APInt::getSignedMinValue(Bits));
+
+ // test(BM/G, LM-X) and test(-BM/G, X-UM)
+ APInt TMUL = BM.sdiv(G);
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(-X, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ if (SrcUMvalid) {
+ TU = minAPInt(TU, floorOfQuotient(SrcUM - X, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ }
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(-X, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ if (SrcUMvalid) {
+ TL = maxAPInt(TL, ceilingOfQuotient(SrcUM - X, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ }
+ }
+
+ // test(AM/G, LM-Y) and test(-AM/G, Y-UM)
+ TMUL = AM.sdiv(G);
+ if (TMUL.sgt(0)) {
+ TL = maxAPInt(TL, ceilingOfQuotient(-Y, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ if (DstUMvalid) {
+ TU = minAPInt(TU, floorOfQuotient(DstUM - Y, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ }
+ }
+ else {
+ TU = minAPInt(TU, floorOfQuotient(-Y, TMUL));
+ DEBUG(dbgs() << "\t TU = " << TU << "\n");
+ if (DstUMvalid) {
+ TL = maxAPInt(TL, ceilingOfQuotient(DstUM - Y, TMUL));
+ DEBUG(dbgs() << "\t TL = " << TL << "\n");
+ }
+ }
+ if (TL.sgt(TU))
+ ++ExactRDIVindependence;
+ return TL.sgt(TU);
+}
+
+
+// symbolicRDIVtest -
+// In Section 4.5 of the Practical Dependence Testing paper, the authors
+// introduce a special case of Banerjee's Inequalities (also called the
+// Extreme-Value Test) that can handle some of the SIV and RDIV cases,
+// particularly cases with symbolics. Since it's only able to disprove
+// dependence (not compute distances or directions), we'll use it as a
+// fall back for the other tests.
+//
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j]
+// where i and j are induction variables and c1 and c2 are loop invariants,
+// we can use the symbolic tests to disprove some dependences, serving as a
+// backup for the RDIV test. Note that i and j can be the same variable,
+// letting this test serve as a backup for the various SIV tests.
+//
+// For a dependence to exist, c1 + a1*i must equal c2 + a2*j for some
+// 0 <= i <= N1 and some 0 <= j <= N2, where N1 and N2 are the (normalized)
+// loop bounds for the i and j loops, respectively. So, ...
+//
+// c1 + a1*i = c2 + a2*j
+// a1*i - a2*j = c2 - c1
+//
+// To test for a dependence, we compute c2 - c1 and make sure it's in the
+// range of the maximum and minimum possible values of a1*i - a2*j.
+// Considering the signs of a1 and a2, we have 4 possible cases:
+//
+// 1) If a1 >= 0 and a2 >= 0, then
+// a1*0 - a2*N2 <= c2 - c1 <= a1*N1 - a2*0
+// -a2*N2 <= c2 - c1 <= a1*N1
+//
+// 2) If a1 >= 0 and a2 <= 0, then
+// a1*0 - a2*0 <= c2 - c1 <= a1*N1 - a2*N2
+// 0 <= c2 - c1 <= a1*N1 - a2*N2
+//
+// 3) If a1 <= 0 and a2 >= 0, then
+// a1*N1 - a2*N2 <= c2 - c1 <= a1*0 - a2*0
+// a1*N1 - a2*N2 <= c2 - c1 <= 0
+//
+// 4) If a1 <= 0 and a2 <= 0, then
+// a1*N1 - a2*0 <= c2 - c1 <= a1*0 - a2*N2
+// a1*N1 <= c2 - c1 <= -a2*N2
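+//
+// For instance (case 1), with hypothetical subscripts [c1 + i] and [c2 + 2*j],
+// N1 = N2 = 10, a dependence requires -20 <= c2 - c1 <= 10; if we can prove
+// that c2 - c1 > 10 (or < -20), the dependence is disproved.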
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::symbolicRDIVtest(const SCEV *A1,
+ const SCEV *A2,
+ const SCEV *C1,
+ const SCEV *C2,
+ const Loop *Loop1,
+ const Loop *Loop2) const {
+ ++SymbolicRDIVapplications;
+ DEBUG(dbgs() << "\ttry symbolic RDIV test\n");
+ DEBUG(dbgs() << "\t A1 = " << *A1);
+ DEBUG(dbgs() << ", type = " << *A1->getType() << "\n");
+ DEBUG(dbgs() << "\t A2 = " << *A2 << "\n");
+ DEBUG(dbgs() << "\t C1 = " << *C1 << "\n");
+ DEBUG(dbgs() << "\t C2 = " << *C2 << "\n");
+ const SCEV *N1 = collectUpperBound(Loop1, A1->getType());
+ const SCEV *N2 = collectUpperBound(Loop2, A1->getType());
+ DEBUG(if (N1) dbgs() << "\t N1 = " << *N1 << "\n");
+ DEBUG(if (N2) dbgs() << "\t N2 = " << *N2 << "\n");
+ const SCEV *C2_C1 = SE->getMinusSCEV(C2, C1);
+ const SCEV *C1_C2 = SE->getMinusSCEV(C1, C2);
+ DEBUG(dbgs() << "\t C2 - C1 = " << *C2_C1 << "\n");
+ DEBUG(dbgs() << "\t C1 - C2 = " << *C1_C2 << "\n");
+ if (SE->isKnownNonNegative(A1)) {
+ if (SE->isKnownNonNegative(A2)) {
+ // A1 >= 0 && A2 >= 0
+ if (N1) {
+ // make sure that c2 - c1 <= a1*N1
+ const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+ DEBUG(dbgs() << "\t A1*N1 = " << *A1N1 << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SGT, C2_C1, A1N1)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ if (N2) {
+ // make sure that -a2*N2 <= c2 - c1, or a2*N2 >= c1 - c2
+ const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+ DEBUG(dbgs() << "\t A2*N2 = " << *A2N2 << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SLT, A2N2, C1_C2)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ }
+ else if (SE->isKnownNonPositive(A2)) {
+ // a1 >= 0 && a2 <= 0
+ if (N1 && N2) {
+ // make sure that c2 - c1 <= a1*N1 - a2*N2
+ const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+ const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+ const SCEV *A1N1_A2N2 = SE->getMinusSCEV(A1N1, A2N2);
+ DEBUG(dbgs() << "\t A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SGT, C2_C1, A1N1_A2N2)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ // make sure that 0 <= c2 - c1
+ if (SE->isKnownNegative(C2_C1)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ }
+ else if (SE->isKnownNonPositive(A1)) {
+ if (SE->isKnownNonNegative(A2)) {
+ // a1 <= 0 && a2 >= 0
+ if (N1 && N2) {
+ // make sure that a1*N1 - a2*N2 <= c2 - c1
+ const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+ const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+ const SCEV *A1N1_A2N2 = SE->getMinusSCEV(A1N1, A2N2);
+ DEBUG(dbgs() << "\t A1*N1 - A2*N2 = " << *A1N1_A2N2 << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SGT, A1N1_A2N2, C2_C1)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ // make sure that c2 - c1 <= 0
+ if (SE->isKnownPositive(C2_C1)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ else if (SE->isKnownNonPositive(A2)) {
+ // a1 <= 0 && a2 <= 0
+ if (N1) {
+ // make sure that a1*N1 <= c2 - c1
+ const SCEV *A1N1 = SE->getMulExpr(A1, N1);
+ DEBUG(dbgs() << "\t A1*N1 = " << *A1N1 << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SGT, A1N1, C2_C1)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ if (N2) {
+ // make sure that c2 - c1 <= -a2*N2, or c1 - c2 >= a2*N2
+ const SCEV *A2N2 = SE->getMulExpr(A2, N2);
+ DEBUG(dbgs() << "\t A2*N2 = " << *A2N2 << "\n");
+ if (isKnownPredicate(CmpInst::ICMP_SLT, C1_C2, A2N2)) {
+ ++SymbolicRDIVindependence;
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+
+// testSIV -
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 - a2*i]
+// where i is an induction variable, c1 and c2 are loop invariant, and a1 and
+// a2 are constant, we attack it with an SIV test. While they can all be
+// solved with the Exact SIV test, it's worthwhile to use simpler tests when
+// they apply; they're cheaper and sometimes more precise.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::testSIV(const SCEV *Src,
+ const SCEV *Dst,
+ unsigned &Level,
+ FullDependence &Result,
+ Constraint &NewConstraint,
+ const SCEV *&SplitIter) const {
+ DEBUG(dbgs() << " src = " << *Src << "\n");
+ DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
+ const SCEVAddRecExpr *DstAddRec = dyn_cast<SCEVAddRecExpr>(Dst);
+ if (SrcAddRec && DstAddRec) {
+ const SCEV *SrcConst = SrcAddRec->getStart();
+ const SCEV *DstConst = DstAddRec->getStart();
+ const SCEV *SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
+ const SCEV *DstCoeff = DstAddRec->getStepRecurrence(*SE);
+ const Loop *CurLoop = SrcAddRec->getLoop();
+ assert(CurLoop == DstAddRec->getLoop() &&
+ "both loops in SIV should be same");
+ Level = mapSrcLoop(CurLoop);
+ bool disproven;
+ if (SrcCoeff == DstCoeff)
+ disproven = strongSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
+ Level, Result, NewConstraint);
+ else if (SrcCoeff == SE->getNegativeSCEV(DstCoeff))
+ disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
+ Level, Result, NewConstraint, SplitIter);
+ else
+ disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop,
+ Level, Result, NewConstraint);
+ return disproven ||
+ gcdMIVtest(Src, Dst, Result) ||
+ symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurLoop, CurLoop);
+ }
+ if (SrcAddRec) {
+ const SCEV *SrcConst = SrcAddRec->getStart();
+ const SCEV *SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
+ const SCEV *DstConst = Dst;
+ const Loop *CurLoop = SrcAddRec->getLoop();
+ Level = mapSrcLoop(CurLoop);
+ return weakZeroDstSIVtest(SrcCoeff, SrcConst, DstConst, CurLoop,
+ Level, Result, NewConstraint) ||
+ gcdMIVtest(Src, Dst, Result);
+ }
+ if (DstAddRec) {
+ const SCEV *DstConst = DstAddRec->getStart();
+ const SCEV *DstCoeff = DstAddRec->getStepRecurrence(*SE);
+ const SCEV *SrcConst = Src;
+ const Loop *CurLoop = DstAddRec->getLoop();
+ Level = mapDstLoop(CurLoop);
+ return weakZeroSrcSIVtest(DstCoeff, SrcConst, DstConst,
+ CurLoop, Level, Result, NewConstraint) ||
+ gcdMIVtest(Src, Dst, Result);
+ }
+ llvm_unreachable("SIV test expected at least one AddRec");
+ return false;
+}
+
+
+// testRDIV -
+// When we have a pair of subscripts of the form [c1 + a1*i] and [c2 + a2*j]
+// where i and j are induction variables, c1 and c2 are loop invariant,
+// and a1 and a2 are constant, we can solve it exactly with an easy adaptation
+// of the Exact SIV test, the Restricted Double Index Variable (RDIV) test.
+// It doesn't make sense to talk about distance or direction in this case,
+// so there's no point in making special versions of the Strong SIV test or
+// the Weak-crossing SIV test.
+//
+// With minor algebra, this test can also be used for things like
+// [c1 + a1*i + a2*j][c2].
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::testRDIV(const SCEV *Src,
+ const SCEV *Dst,
+ FullDependence &Result) const {
+ // we have 3 possible situations here:
+ // 1) [a*i + b] and [c*j + d]
+ // 2) [a*i + c*j + b] and [d]
+ // 3) [b] and [a*i + c*j + d]
+ // We need to find what we've got and get organized
+
+ const SCEV *SrcConst, *DstConst;
+ const SCEV *SrcCoeff, *DstCoeff;
+ const Loop *SrcLoop, *DstLoop;
+
+ DEBUG(dbgs() << " src = " << *Src << "\n");
+ DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ const SCEVAddRecExpr *SrcAddRec = dyn_cast<SCEVAddRecExpr>(Src);
+ const SCEVAddRecExpr *DstAddRec = dyn_cast<SCEVAddRecExpr>(Dst);
+ if (SrcAddRec && DstAddRec) {
+ SrcConst = SrcAddRec->getStart();
+ SrcCoeff = SrcAddRec->getStepRecurrence(*SE);
+ SrcLoop = SrcAddRec->getLoop();
+ DstConst = DstAddRec->getStart();
+ DstCoeff = DstAddRec->getStepRecurrence(*SE);
+ DstLoop = DstAddRec->getLoop();
+ }
+ else if (SrcAddRec) {
+ if (const SCEVAddRecExpr *tmpAddRec =
+ dyn_cast<SCEVAddRecExpr>(SrcAddRec->getStart())) {
+ SrcConst = tmpAddRec->getStart();
+ SrcCoeff = tmpAddRec->getStepRecurrence(*SE);
+ SrcLoop = tmpAddRec->getLoop();
+ DstConst = Dst;
+ DstCoeff = SE->getNegativeSCEV(SrcAddRec->getStepRecurrence(*SE));
+ DstLoop = SrcAddRec->getLoop();
+ }
+ else
+ llvm_unreachable("RDIV reached by surprising SCEVs");
+ }
+ else if (DstAddRec) {
+ if (const SCEVAddRecExpr *tmpAddRec =
+ dyn_cast<SCEVAddRecExpr>(DstAddRec->getStart())) {
+ DstConst = tmpAddRec->getStart();
+ DstCoeff = tmpAddRec->getStepRecurrence(*SE);
+ DstLoop = tmpAddRec->getLoop();
+ SrcConst = Src;
+ SrcCoeff = SE->getNegativeSCEV(DstAddRec->getStepRecurrence(*SE));
+ SrcLoop = DstAddRec->getLoop();
+ }
+ else
+ llvm_unreachable("RDIV reached by surprising SCEVs");
+ }
+ else
+ llvm_unreachable("RDIV expected at least one AddRec");
+ return exactRDIVtest(SrcCoeff, DstCoeff,
+ SrcConst, DstConst,
+ SrcLoop, DstLoop,
+ Result) ||
+ gcdMIVtest(Src, Dst, Result) ||
+ symbolicRDIVtest(SrcCoeff, DstCoeff,
+ SrcConst, DstConst,
+ SrcLoop, DstLoop);
+}
+
+
+// Tests the single-subscript MIV pair (Src and Dst) for dependence.
+// Return true if dependence disproved.
+// Can sometimes refine direction vectors.
+bool DependenceAnalysis::testMIV(const SCEV *Src,
+ const SCEV *Dst,
+ const SmallBitVector &Loops,
+ FullDependence &Result) const {
+ DEBUG(dbgs() << " src = " << *Src << "\n");
+ DEBUG(dbgs() << " dst = " << *Dst << "\n");
+ Result.Consistent = false;
+ return gcdMIVtest(Src, Dst, Result) ||
+ banerjeeMIVtest(Src, Dst, Loops, Result);
+}
+
+
+// Given a product, e.g., 10*X*Y, returns the first constant operand,
+// in this case 10. If there is no constant part, returns NULL.
+static
+const SCEVConstant *getConstantPart(const SCEVMulExpr *Product) {
+ for (unsigned Op = 0, Ops = Product->getNumOperands(); Op < Ops; Op++) {
+ if (const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Product->getOperand(Op)))
+ return Constant;
+ }
+ return NULL;
+}
+
+
+//===----------------------------------------------------------------------===//
+// gcdMIVtest -
+// Tests an MIV subscript pair for dependence.
+// Returns true if any possible dependence is disproved.
+// Marks the result as inconsistent.
+// Can sometimes disprove the equal direction for 1 or more loops,
+// as discussed in Michael Wolfe's book,
+// High Performance Compilers for Parallel Computing, page 235.
+//
+// We spend some effort (code!) to handle cases like
+// [10*i + 5*N*j + 15*M + 6], where i and j are induction variables,
+// but M and N are just loop-invariant variables.
+// This should help us handle linearized subscripts;
+// also makes this test a useful backup to the various SIV tests.
+//
+// It occurs to me that the presence of loop-invariant variables
+// changes the nature of the test from "greatest common divisor"
+// to "a common divisor!"
+bool DependenceAnalysis::gcdMIVtest(const SCEV *Src,
+ const SCEV *Dst,
+ FullDependence &Result) const {
+ DEBUG(dbgs() << "starting gcd\n");
+ ++GCDapplications;
+ unsigned BitWidth = Src->getType()->getIntegerBitWidth();
+ APInt RunningGCD = APInt::getNullValue(BitWidth);
+
+ // Examine Src coefficients.
+ // Compute running GCD and record source constant.
+ // Because we're looking for the constant at the end of the chain,
+ // we can't quit the loop just because the GCD == 1.
+ const SCEV *Coefficients = Src;
+ while (const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+ const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+ const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Coeff);
+ if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Product);
+ if (!Constant)
+ return false;
+ APInt ConstCoeff = Constant->getValue()->getValue();
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+ Coefficients = AddRec->getStart();
+ }
+ const SCEV *SrcConst = Coefficients;
+
+ // Examine Dst coefficients.
+ // Compute running GCD and record destination constant.
+ // Because we're looking for the constant at the end of the chain,
+ // we can't quit the loop just because the GCD == 1.
+ Coefficients = Dst;
+ while (const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+ const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+ const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Coeff);
+ if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Product);
+ if (!Constant)
+ return false;
+ APInt ConstCoeff = Constant->getValue()->getValue();
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+ Coefficients = AddRec->getStart();
+ }
+ const SCEV *DstConst = Coefficients;
+
+ APInt ExtraGCD = APInt::getNullValue(BitWidth);
+ const SCEV *Delta = SE->getMinusSCEV(DstConst, SrcConst);
+ DEBUG(dbgs() << " Delta = " << *Delta << "\n");
+ const SCEVConstant *Constant = dyn_cast<SCEVConstant>(Delta);
+ if (const SCEVAddExpr *Sum = dyn_cast<SCEVAddExpr>(Delta)) {
+ // If Delta is a sum of products, we may be able to make further progress.
+ for (unsigned Op = 0, Ops = Sum->getNumOperands(); Op < Ops; Op++) {
+ const SCEV *Operand = Sum->getOperand(Op);
+ if (isa<SCEVConstant>(Operand)) {
+ assert(!Constant && "Surprised to find multiple constants");
+ Constant = cast<SCEVConstant>(Operand);
+ }
+ else if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Operand)) {
+ // Search for a constant operand to participate in the GCD;
+ // if none is found, return false.
+ const SCEVConstant *ConstOp = getConstantPart(Product);
+ if (!ConstOp)
+ return false;
+ APInt ConstOpValue = ConstOp->getValue()->getValue();
+ ExtraGCD = APIntOps::GreatestCommonDivisor(ExtraGCD,
+ ConstOpValue.abs());
+ }
+ else
+ return false;
+ }
+ }
+ if (!Constant)
+ return false;
+ APInt ConstDelta = cast<SCEVConstant>(Constant)->getValue()->getValue();
+ DEBUG(dbgs() << " ConstDelta = " << ConstDelta << "\n");
+ if (ConstDelta == 0)
+ return false;
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ExtraGCD);
+ DEBUG(dbgs() << " RunningGCD = " << RunningGCD << "\n");
+ APInt Remainder = ConstDelta.srem(RunningGCD);
+ if (Remainder != 0) {
+ ++GCDindependence;
+ return true;
+ }
+
+ // Try to disprove equal directions.
+ // For example, given a subscript pair [3*i + 2*j] and [i' + 2*j' - 1],
+ // the code above can't disprove the dependence because the GCD = 1.
+ // So we consider what happens if i = i' and what happens if j = j'.
+ // If i = i', we can simplify the subscript to [2*i + 2*j] and [2*j' - 1],
+ // which is infeasible, so we can disallow the = direction for the i level.
+ // Setting j = j' doesn't help matters, so we end up with a direction vector
+ // of [<>, *]
+ //
+ // Given A[5*i + 10*j*M + 9*M*N] and A[15*i + 20*j*M - 21*N*M + 5],
+ // we need to remember that the constant part is 5 and the RunningGCD should
+ // be initialized to ExtraGCD = 30.
+ DEBUG(dbgs() << " ExtraGCD = " << ExtraGCD << '\n');
+
+ bool Improved = false;
+ Coefficients = Src;
+ while (const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(Coefficients)) {
+ Coefficients = AddRec->getStart();
+ const Loop *CurLoop = AddRec->getLoop();
+ RunningGCD = ExtraGCD;
+ const SCEV *SrcCoeff = AddRec->getStepRecurrence(*SE);
+ const SCEV *DstCoeff = SE->getMinusSCEV(SrcCoeff, SrcCoeff);
+ const SCEV *Inner = Src;
+ while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) {
+ AddRec = cast<SCEVAddRecExpr>(Inner);
+ const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+ if (CurLoop == AddRec->getLoop())
+ ; // SrcCoeff == Coeff
+ else {
+ if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Product);
+ else
+ Constant = cast<SCEVConstant>(Coeff);
+ APInt ConstCoeff = Constant->getValue()->getValue();
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+ }
+ Inner = AddRec->getStart();
+ }
+ Inner = Dst;
+ while (RunningGCD != 1 && isa<SCEVAddRecExpr>(Inner)) {
+ AddRec = cast<SCEVAddRecExpr>(Inner);
+ const SCEV *Coeff = AddRec->getStepRecurrence(*SE);
+ if (CurLoop == AddRec->getLoop())
+ DstCoeff = Coeff;
+ else {
+ if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Coeff))
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Product);
+ else
+ Constant = cast<SCEVConstant>(Coeff);
+ APInt ConstCoeff = Constant->getValue()->getValue();
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+ }
+ Inner = AddRec->getStart();
+ }
+ Delta = SE->getMinusSCEV(SrcCoeff, DstCoeff);
+ if (const SCEVMulExpr *Product = dyn_cast<SCEVMulExpr>(Delta))
+ // If the coefficient is the product of a constant and other stuff,
+ // we can use the constant in the GCD computation.
+ Constant = getConstantPart(Product);
+ else if (isa<SCEVConstant>(Delta))
+ Constant = cast<SCEVConstant>(Delta);
+ else {
+ // The difference of the two coefficients might not be a product
+ // or constant, in which case we give up on this direction.
+ continue;
+ }
+ APInt ConstCoeff = Constant->getValue()->getValue();
+ RunningGCD = APIntOps::GreatestCommonDivisor(RunningGCD, ConstCoeff.abs());
+ DEBUG(dbgs() << "\tRunningGCD = " << RunningGCD << "\n");
+ if (RunningGCD != 0) {
+ Remainder = ConstDelta.srem(RunningGCD);
+ DEBUG(dbgs() << "\tRemainder = " << Remainder << "\n");
+ if (Remainder != 0) {
+ unsigned Level = mapSrcLoop(CurLoop);
+ Result.DV[Level - 1].Direction &= unsigned(~Dependence::DVEntry::EQ);
+ Improved = true;
+ }
+ }
+ }
+ if (Improved)
+ ++GCDsuccesses;
+ DEBUG(dbgs() << "all done\n");
+ return false;
+}
+
+
+//===----------------------------------------------------------------------===//
+// banerjeeMIVtest -
+// Use Banerjee's Inequalities to test an MIV subscript pair.
+// (Wolfe, in the race-car book, calls this the Extreme Value Test.)
+// Generally follows the discussion in Section 2.5.2 of
+//
+// Optimizing Supercompilers for Supercomputers
+// Michael Wolfe
+//
+// The inequalities given on page 25 are simplified in that loops are
+// normalized so that the lower bound is always 0 and the stride is always 1.
+// For example, Wolfe gives
+//
+// LB^<_k = (A^-_k - B_k)^- (U_k - L_k - N_k) + (A_k - B_k)L_k - B_k N_k
+//
+// where A_k is the coefficient of the kth index in the source subscript,
+// B_k is the coefficient of the kth index in the destination subscript,
+// U_k is the upper bound of the kth index, L_k is the lower bound of the kth
+// index, and N_k is the stride of the kth index. Since all loops are normalized
+// by the SCEV package, N_k = 1 and L_k = 0, allowing us to simplify the
+// equation to
+//
+// LB^<_k = (A^-_k - B_k)^- (U_k - 0 - 1) + (A_k - B_k)0 - B_k 1
+// = (A^-_k - B_k)^- (U_k - 1) - B_k
+//
+// Similar simplifications are possible for the other equations.
+//
+// When we can't determine the number of iterations for a loop,
+// we use NULL as an indicator for the worst case, infinity.
+// When computing the upper bound, NULL denotes +inf;
+// for the lower bound, NULL denotes -inf.
+//
+// Return true if dependence disproved.
+bool DependenceAnalysis::banerjeeMIVtest(const SCEV *Src,
+ const SCEV *Dst,
+ const SmallBitVector &Loops,
+ FullDependence &Result) const {
+ DEBUG(dbgs() << "starting Banerjee\n");
+ ++BanerjeeApplications;
+ DEBUG(dbgs() << " Src = " << *Src << '\n');
+ const SCEV *A0;
+ CoefficientInfo *A = collectCoeffInfo(Src, true, A0);
+ DEBUG(dbgs() << " Dst = " << *Dst << '\n');
+ const SCEV *B0;
+ CoefficientInfo *B = collectCoeffInfo(Dst, false, B0);
+ BoundInfo *Bound = new BoundInfo[MaxLevels + 1];
+ const SCEV *Delta = SE->getMinusSCEV(B0, A0);
+ DEBUG(dbgs() << "\tDelta = " << *Delta << '\n');
+
+ // Compute bounds for all the * directions.
+ DEBUG(dbgs() << "\tBounds[*]\n");
+ for (unsigned K = 1; K <= MaxLevels; ++K) {
+ Bound[K].Iterations = A[K].Iterations ? A[K].Iterations : B[K].Iterations;
+ Bound[K].Direction = Dependence::DVEntry::ALL;
+ Bound[K].DirSet = Dependence::DVEntry::NONE;
+ findBoundsALL(A, B, Bound, K);
+#ifndef NDEBUG
+ DEBUG(dbgs() << "\t " << K << '\t');
+ if (Bound[K].Lower[Dependence::DVEntry::ALL])
+ DEBUG(dbgs() << *Bound[K].Lower[Dependence::DVEntry::ALL] << '\t');
+ else
+ DEBUG(dbgs() << "-inf\t");
+ if (Bound[K].Upper[Dependence::DVEntry::ALL])
+ DEBUG(dbgs() << *Bound[K].Upper[Dependence::DVEntry::ALL] << '\n');
+ else
+ DEBUG(dbgs() << "+inf\n");
+#endif
+ }
+
+ // Test the *, *, *, ... case.
+ bool Disproved = false;
+ if (testBounds(Dependence::DVEntry::ALL, 0, Bound, Delta)) {
+ // Explore the direction vector hierarchy.
+ unsigned DepthExpanded = 0;
+ unsigned NewDeps = exploreDirections(1, A, B, Bound,
+ Loops, DepthExpanded, Delta);
+ if (NewDeps > 0) {
+ bool Improved = false;
+ for (unsigned K = 1; K <= CommonLevels; ++K) {
+ if (Loops[K]) {
+ unsigned Old = Result.DV[K - 1].Direction;
+ Result.DV[K - 1].Direction = Old & Bound[K].DirSet;
+ Improved |= Old != Result.DV[K - 1].Direction;
+ if (!Result.DV[K - 1].Direction) {
+ Improved = false;
+ Disproved = true;
+ break;
+ }
+ }
+ }
+ if (Improved)
+ ++BanerjeeSuccesses;
+ }
+ else {
+ ++BanerjeeIndependence;
+ Disproved = true;
+ }
+ }
+ else {
+ ++BanerjeeIndependence;
+ Disproved = true;
+ }
+ delete [] Bound;
+ delete [] A;
+ delete [] B;
+ return Disproved;
+}
+
+
+// Hierarchically expands the direction vector
+// search space, combining the directions of discovered dependences
+// in the DirSet field of Bound. Returns the number of distinct
+// dependences discovered. If the dependence is disproved,
+// it will return 0.
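+// For instance (illustrative): with two levels set in Loops, the recursion
+// visits up to nine leaf direction vectors (<<, <=, <>, =<, ==, =>, ><, >=,
+// >>), calling testBounds before each recursive step to prune infeasible
+// subtrees.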
+unsigned DependenceAnalysis::exploreDirections(unsigned Level,
+ CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ const SmallBitVector &Loops,
+ unsigned &DepthExpanded,
+ const SCEV *Delta) const {
+ if (Level > CommonLevels) {
+ // record result
+ DEBUG(dbgs() << "\t[");
+ for (unsigned K = 1; K <= CommonLevels; ++K) {
+ if (Loops[K]) {
+ Bound[K].DirSet |= Bound[K].Direction;
+#ifndef NDEBUG
+ switch (Bound[K].Direction) {
+ case Dependence::DVEntry::LT:
+ DEBUG(dbgs() << " <");
+ break;
+ case Dependence::DVEntry::EQ:
+ DEBUG(dbgs() << " =");
+ break;
+ case Dependence::DVEntry::GT:
+ DEBUG(dbgs() << " >");
+ break;
+ case Dependence::DVEntry::ALL:
+ DEBUG(dbgs() << " *");
+ break;
+ default:
+ llvm_unreachable("unexpected Bound[K].Direction");
+ }
+#endif
+ }
+ }
+ DEBUG(dbgs() << " ]\n");
+ return 1;
+ }
+ if (Loops[Level]) {
+ if (Level > DepthExpanded) {
+ DepthExpanded = Level;
+ // compute bounds for <, =, > at current level
+ findBoundsLT(A, B, Bound, Level);
+ findBoundsGT(A, B, Bound, Level);
+ findBoundsEQ(A, B, Bound, Level);
+#ifndef NDEBUG
+ DEBUG(dbgs() << "\tBound for level = " << Level << '\n');
+ DEBUG(dbgs() << "\t <\t");
+ if (Bound[Level].Lower[Dependence::DVEntry::LT])
+ DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::LT] << '\t');
+ else
+ DEBUG(dbgs() << "-inf\t");
+ if (Bound[Level].Upper[Dependence::DVEntry::LT])
+ DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::LT] << '\n');
+ else
+ DEBUG(dbgs() << "+inf\n");
+ DEBUG(dbgs() << "\t =\t");
+ if (Bound[Level].Lower[Dependence::DVEntry::EQ])
+ DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::EQ] << '\t');
+ else
+ DEBUG(dbgs() << "-inf\t");
+ if (Bound[Level].Upper[Dependence::DVEntry::EQ])
+ DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::EQ] << '\n');
+ else
+ DEBUG(dbgs() << "+inf\n");
+ DEBUG(dbgs() << "\t >\t");
+ if (Bound[Level].Lower[Dependence::DVEntry::GT])
+ DEBUG(dbgs() << *Bound[Level].Lower[Dependence::DVEntry::GT] << '\t');
+ else
+ DEBUG(dbgs() << "-inf\t");
+ if (Bound[Level].Upper[Dependence::DVEntry::GT])
+ DEBUG(dbgs() << *Bound[Level].Upper[Dependence::DVEntry::GT] << '\n');
+ else
+ DEBUG(dbgs() << "+inf\n");
+#endif
+ }
+
+ unsigned NewDeps = 0;
+
+ // test bounds for <, *, *, ...
+ if (testBounds(Dependence::DVEntry::LT, Level, Bound, Delta))
+ NewDeps += exploreDirections(Level + 1, A, B, Bound,
+ Loops, DepthExpanded, Delta);
+
+ // Test bounds for =, *, *, ...
+ if (testBounds(Dependence::DVEntry::EQ, Level, Bound, Delta))
+ NewDeps += exploreDirections(Level + 1, A, B, Bound,
+ Loops, DepthExpanded, Delta);
+
+ // test bounds for >, *, *, ...
+ if (testBounds(Dependence::DVEntry::GT, Level, Bound, Delta))
+ NewDeps += exploreDirections(Level + 1, A, B, Bound,
+ Loops, DepthExpanded, Delta);
+
+ Bound[Level].Direction = Dependence::DVEntry::ALL;
+ return NewDeps;
+ }
+ else
+ return exploreDirections(Level + 1, A, B, Bound, Loops, DepthExpanded, Delta);
+}
+
+
+// Returns true iff the current bounds are plausible.
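+// A NULL bound (that is, an unknown bound standing for +inf or -inf) never
+// disproves a direction; we return false only when the lower bound is
+// provably greater than Delta or Delta is provably greater than the upper
+// bound.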
+bool DependenceAnalysis::testBounds(unsigned char DirKind,
+ unsigned Level,
+ BoundInfo *Bound,
+ const SCEV *Delta) const {
+ Bound[Level].Direction = DirKind;
+ if (const SCEV *LowerBound = getLowerBound(Bound))
+ if (isKnownPredicate(CmpInst::ICMP_SGT, LowerBound, Delta))
+ return false;
+ if (const SCEV *UpperBound = getUpperBound(Bound))
+ if (isKnownPredicate(CmpInst::ICMP_SGT, Delta, UpperBound))
+ return false;
+ return true;
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the * direction. Records them in Bound.
+// Wolfe gives the equations
+//
+// LB^*_k = (A^-_k - B^+_k)(U_k - L_k) + (A_k - B_k)L_k
+// UB^*_k = (A^+_k - B^-_k)(U_k - L_k) + (A_k - B_k)L_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+// LB^*_k = (A^-_k - B^+_k)U_k
+// UB^*_k = (A^+_k - B^-_k)U_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
+// Note that the lower bound is always <= 0
+// and the upper bound is always >= 0.
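+//
+// For example (illustrative): with A_k = 2, B_k = -3, and U_k = 10,
+// A^-_k = 0 and B^+_k = 0, so LB^*_k = 0, while A^+_k = 2 and B^-_k = -3,
+// so UB^*_k = (2 - (-3)) * 10 = 50.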
+void DependenceAnalysis::findBoundsALL(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const {
+ Bound[K].Lower[Dependence::DVEntry::ALL] = NULL; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::ALL] = NULL; // Default value = +infinity.
+ if (Bound[K].Iterations) {
+ Bound[K].Lower[Dependence::DVEntry::ALL] =
+ SE->getMulExpr(SE->getMinusSCEV(A[K].NegPart, B[K].PosPart),
+ Bound[K].Iterations);
+ Bound[K].Upper[Dependence::DVEntry::ALL] =
+ SE->getMulExpr(SE->getMinusSCEV(A[K].PosPart, B[K].NegPart),
+ Bound[K].Iterations);
+ }
+ else {
+ // If the difference is 0, we won't need to know the number of iterations.
+ if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].NegPart, B[K].PosPart))
+ Bound[K].Lower[Dependence::DVEntry::ALL] =
+ SE->getConstant(A[K].Coeff->getType(), 0);
+ if (isKnownPredicate(CmpInst::ICMP_EQ, A[K].PosPart, B[K].NegPart))
+ Bound[K].Upper[Dependence::DVEntry::ALL] =
+ SE->getConstant(A[K].Coeff->getType(), 0);
+ }
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the = direction. Records them in Bound.
+// Wolfe gives the equations
+//
+// LB^=_k = (A_k - B_k)^- (U_k - L_k) + (A_k - B_k)L_k
+// UB^=_k = (A_k - B_k)^+ (U_k - L_k) + (A_k - B_k)L_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+// LB^=_k = (A_k - B_k)^- U_k
+// UB^=_k = (A_k - B_k)^+ U_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
+// Note that the lower bound is always <= 0
+// and the upper bound is always >= 0.
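+//
+// For example (illustrative): with A_k = 2, B_k = 3, and U_k = 10,
+// A_k - B_k = -1, so LB^=_k = (-1) * 10 = -10 and UB^=_k = 0 * 10 = 0.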
+void DependenceAnalysis::findBoundsEQ(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const {
+ Bound[K].Lower[Dependence::DVEntry::EQ] = NULL; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::EQ] = NULL; // Default value = +infinity.
+ if (Bound[K].Iterations) {
+ const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
+ const SCEV *NegativePart = getNegativePart(Delta);
+ Bound[K].Lower[Dependence::DVEntry::EQ] =
+ SE->getMulExpr(NegativePart, Bound[K].Iterations);
+ const SCEV *PositivePart = getPositivePart(Delta);
+ Bound[K].Upper[Dependence::DVEntry::EQ] =
+ SE->getMulExpr(PositivePart, Bound[K].Iterations);
+ }
+ else {
+ // If the positive/negative part of the difference is 0,
+ // we won't need to know the number of iterations.
+ const SCEV *Delta = SE->getMinusSCEV(A[K].Coeff, B[K].Coeff);
+ const SCEV *NegativePart = getNegativePart(Delta);
+ if (NegativePart->isZero())
+ Bound[K].Lower[Dependence::DVEntry::EQ] = NegativePart; // Zero
+ const SCEV *PositivePart = getPositivePart(Delta);
+ if (PositivePart->isZero())
+ Bound[K].Upper[Dependence::DVEntry::EQ] = PositivePart; // Zero
+ }
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the < direction. Records them in Bound.
+// Wolfe gives the equations
+//
+// LB^<_k = (A^-_k - B_k)^- (U_k - L_k - N_k) + (A_k - B_k)L_k - B_k N_k
+// UB^<_k = (A^+_k - B_k)^+ (U_k - L_k - N_k) + (A_k - B_k)L_k - B_k N_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+// LB^<_k = (A^-_k - B_k)^- (U_k - 1) - B_k
+// UB^<_k = (A^+_k - B_k)^+ (U_k - 1) - B_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
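+//
+// For example (illustrative): with A_k = 2, B_k = 3, and U_k = 10,
+// A^-_k = 0, so (A^-_k - B_k)^- = -3 and LB^<_k = -3 * (10 - 1) - 3 = -30,
+// while A^+_k = 2, so (A^+_k - B_k)^+ = 0 and UB^<_k = 0 * (10 - 1) - 3 = -3.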
+void DependenceAnalysis::findBoundsLT(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const {
+ Bound[K].Lower[Dependence::DVEntry::LT] = NULL; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::LT] = NULL; // Default value = +infinity.
+ if (Bound[K].Iterations) {
+ const SCEV *Iter_1 =
+ SE->getMinusSCEV(Bound[K].Iterations,
+ SE->getConstant(Bound[K].Iterations->getType(), 1));
+ const SCEV *NegPart =
+ getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+ Bound[K].Lower[Dependence::DVEntry::LT] =
+ SE->getMinusSCEV(SE->getMulExpr(NegPart, Iter_1), B[K].Coeff);
+ const SCEV *PosPart =
+ getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+ Bound[K].Upper[Dependence::DVEntry::LT] =
+ SE->getMinusSCEV(SE->getMulExpr(PosPart, Iter_1), B[K].Coeff);
+ }
+ else {
+ // If the positive/negative part of the difference is 0,
+ // we won't need to know the number of iterations.
+ const SCEV *NegPart =
+ getNegativePart(SE->getMinusSCEV(A[K].NegPart, B[K].Coeff));
+ if (NegPart->isZero())
+ Bound[K].Lower[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
+ const SCEV *PosPart =
+ getPositivePart(SE->getMinusSCEV(A[K].PosPart, B[K].Coeff));
+ if (PosPart->isZero())
+ Bound[K].Upper[Dependence::DVEntry::LT] = SE->getNegativeSCEV(B[K].Coeff);
+ }
+}
+
+
+// Computes the upper and lower bounds for level K
+// using the > direction. Records them in Bound.
+// Wolfe gives the equations
+//
+// LB^>_k = (A_k - B^+_k)^- (U_k - L_k - N_k) + (A_k - B_k)L_k + A_k N_k
+// UB^>_k = (A_k - B^-_k)^+ (U_k - L_k - N_k) + (A_k - B_k)L_k + A_k N_k
+//
+// Since we normalize loops, we can simplify these equations to
+//
+// LB^>_k = (A_k - B^+_k)^- (U_k - 1) + A_k
+// UB^>_k = (A_k - B^-_k)^+ (U_k - 1) + A_k
+//
+// We must be careful to handle the case where the upper bound is unknown.
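+//
+// For example (illustrative): with A_k = 2, B_k = 3, and U_k = 10,
+// B^+_k = 3, so (A_k - B^+_k)^- = -1 and LB^>_k = -1 * (10 - 1) + 2 = -7,
+// while B^-_k = 0, so (A_k - B^-_k)^+ = 2 and UB^>_k = 2 * (10 - 1) + 2 = 20.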
+void DependenceAnalysis::findBoundsGT(CoefficientInfo *A,
+ CoefficientInfo *B,
+ BoundInfo *Bound,
+ unsigned K) const {
+ Bound[K].Lower[Dependence::DVEntry::GT] = NULL; // Default value = -infinity.
+ Bound[K].Upper[Dependence::DVEntry::GT] = NULL; // Default value = +infinity.
+ if (Bound[K].Iterations) {
+ const SCEV *Iter_1 =
+ SE->getMinusSCEV(Bound[K].Iterations,
+ SE->getConstant(Bound[K].Iterations->getType(), 1));
+ const SCEV *NegPart =
+ getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+ Bound[K].Lower[Dependence::DVEntry::GT] =
+ SE->getAddExpr(SE->getMulExpr(NegPart, Iter_1), A[K].Coeff);
+ const SCEV *PosPart =
+ getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+ Bound[K].Upper[Dependence::DVEntry::GT] =
+ SE->getAddExpr(SE->getMulExpr(PosPart, Iter_1), A[K].Coeff);
+ }
+ else {
+ // If the positive/negative part of the difference is 0,
+ // we won't need to know the number of iterations.
+ const SCEV *NegPart = getNegativePart(SE->getMinusSCEV(A[K].Coeff, B[K].PosPart));
+ if (NegPart->isZero())
+ Bound[K].Lower[Dependence::DVEntry::GT] = A[K].Coeff;
+ const SCEV *PosPart = getPositivePart(SE->getMinusSCEV(A[K].Coeff, B[K].NegPart));
+ if (PosPart->isZero())
+ Bound[K].Upper[Dependence::DVEntry::GT] = A[K].Coeff;
+ }
+}
+
+
+// X^+ = max(X, 0)
+const SCEV *DependenceAnalysis::getPositivePart(const SCEV *X) const {
+ return SE->getSMaxExpr(X, SE->getConstant(X->getType(), 0));
+}
+
+
+// X^- = min(X, 0)
+const SCEV *DependenceAnalysis::getNegativePart(const SCEV *X) const {
+ return SE->getSMinExpr(X, SE->getConstant(X->getType(), 0));
+}
+
+
+// Walks through the subscript,
+// collecting each coefficient, the associated loop bounds,
+// and recording its positive and negative parts for later use.
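+// For example (illustrative): given the subscript {{c,+,2}<%i>,+,3}<%j>,
+// the walk records a coefficient of 3 at the level of loop %j and 2 at the
+// level of loop %i, and returns c through Constant.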
+DependenceAnalysis::CoefficientInfo *
+DependenceAnalysis::collectCoeffInfo(const SCEV *Subscript,
+ bool SrcFlag,
+ const SCEV *&Constant) const {
+ const SCEV *Zero = SE->getConstant(Subscript->getType(), 0);
+ CoefficientInfo *CI = new CoefficientInfo[MaxLevels + 1];
+ for (unsigned K = 1; K <= MaxLevels; ++K) {
+ CI[K].Coeff = Zero;
+ CI[K].PosPart = Zero;
+ CI[K].NegPart = Zero;
+ CI[K].Iterations = NULL;
+ }
+ while (const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Subscript)) {
+ const Loop *L = AddRec->getLoop();
+ unsigned K = SrcFlag ? mapSrcLoop(L) : mapDstLoop(L);
+ CI[K].Coeff = AddRec->getStepRecurrence(*SE);
+ CI[K].PosPart = getPositivePart(CI[K].Coeff);
+ CI[K].NegPart = getNegativePart(CI[K].Coeff);
+ CI[K].Iterations = collectUpperBound(L, Subscript->getType());
+ Subscript = AddRec->getStart();
+ }
+ Constant = Subscript;
+#ifndef NDEBUG
+ DEBUG(dbgs() << "\tCoefficient Info\n");
+ for (unsigned K = 1; K <= MaxLevels; ++K) {
+ DEBUG(dbgs() << "\t " << K << "\t" << *CI[K].Coeff);
+ DEBUG(dbgs() << "\tPos Part = ");
+ DEBUG(dbgs() << *CI[K].PosPart);
+ DEBUG(dbgs() << "\tNeg Part = ");
+ DEBUG(dbgs() << *CI[K].NegPart);
+ DEBUG(dbgs() << "\tUpper Bound = ");
+ if (CI[K].Iterations)
+ DEBUG(dbgs() << *CI[K].Iterations);
+ else
+ DEBUG(dbgs() << "+inf");
+ DEBUG(dbgs() << '\n');
+ }
+ DEBUG(dbgs() << "\t Constant = " << *Subscript << '\n');
+#endif
+ return CI;
+}
+
+
+// Looks through all the bounds info and
+// computes the lower bound given the current direction settings
+// at each level. If the lower bound for any level is -inf,
+// the result is -inf.
+const SCEV *DependenceAnalysis::getLowerBound(BoundInfo *Bound) const {
+ const SCEV *Sum = Bound[1].Lower[Bound[1].Direction];
+ for (unsigned K = 2; Sum && K <= MaxLevels; ++K) {
+ if (Bound[K].Lower[Bound[K].Direction])
+ Sum = SE->getAddExpr(Sum, Bound[K].Lower[Bound[K].Direction]);
+ else
+ Sum = NULL;
+ }
+ return Sum;
+}
+
+
+// Looks through all the bounds info and
+// computes the upper bound given the current direction settings
+// at each level. If the upper bound at any level is +inf,
+// the result is +inf.
+const SCEV *DependenceAnalysis::getUpperBound(BoundInfo *Bound) const {
+ const SCEV *Sum = Bound[1].Upper[Bound[1].Direction];
+ for (unsigned K = 2; Sum && K <= MaxLevels; ++K) {
+ if (Bound[K].Upper[Bound[K].Direction])
+ Sum = SE->getAddExpr(Sum, Bound[K].Upper[Bound[K].Direction]);
+ else
+ Sum = NULL;
+ }
+ return Sum;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Constraint manipulation for Delta test.
+
+// Given a linear SCEV,
+// return the coefficient (the step)
+// corresponding to the specified loop.
+// If there isn't one, return 0.
+// For example, given a*i + b*j + c*k, the coefficient
+// corresponding to the j loop is b.
+const SCEV *DependenceAnalysis::findCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop) const {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AddRec)
+ return SE->getConstant(Expr->getType(), 0);
+ if (AddRec->getLoop() == TargetLoop)
+ return AddRec->getStepRecurrence(*SE);
+ return findCoefficient(AddRec->getStart(), TargetLoop);
+}
+
+
+// Given a linear SCEV,
+// return the SCEV given by zeroing out the coefficient
+// corresponding to the specified loop.
+// For example, given a*i + b*j + c*k, zeroing the coefficient
+// corresponding to the j loop would yield a*i + c*k.
+const SCEV *DependenceAnalysis::zeroCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop) const {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AddRec)
+ return Expr; // ignore
+ if (AddRec->getLoop() == TargetLoop)
+ return AddRec->getStart();
+ return SE->getAddRecExpr(zeroCoefficient(AddRec->getStart(), TargetLoop),
+ AddRec->getStepRecurrence(*SE),
+ AddRec->getLoop(),
+ AddRec->getNoWrapFlags());
+}
+
+
+// Given a linear SCEV Expr,
+// return the SCEV given by adding some Value to the
+// coefficient corresponding to the specified TargetLoop.
+// For example, given a*i + b*j + c*k, adding 1 to the coefficient
+// corresponding to the j loop would yield a*i + (b+1)*j + c*k.
+const SCEV *DependenceAnalysis::addToCoefficient(const SCEV *Expr,
+ const Loop *TargetLoop,
+ const SCEV *Value) const {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr);
+ if (!AddRec) // create a new addRec
+ return SE->getAddRecExpr(Expr,
+ Value,
+ TargetLoop,
+ SCEV::FlagAnyWrap); // Worst case, with no info.
+ if (AddRec->getLoop() == TargetLoop) {
+ const SCEV *Sum = SE->getAddExpr(AddRec->getStepRecurrence(*SE), Value);
+ if (Sum->isZero())
+ return AddRec->getStart();
+ return SE->getAddRecExpr(AddRec->getStart(),
+ Sum,
+ AddRec->getLoop(),
+ AddRec->getNoWrapFlags());
+ }
+ return SE->getAddRecExpr(addToCoefficient(AddRec->getStart(),
+ TargetLoop, Value),
+ AddRec->getStepRecurrence(*SE),
+ AddRec->getLoop(),
+ AddRec->getNoWrapFlags());
+}
+
+
+// Review the constraints, looking for opportunities
+// to simplify a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+// If the simplification isn't exact (that is, if it is conservative
+// in terms of dependence), set consistent to false.
+// Corresponds to Figure 5 from the paper
+//
+// Practical Dependence Testing
+// Goff, Kennedy, Tseng
+// PLDI 1991
+bool DependenceAnalysis::propagate(const SCEV *&Src,
+ const SCEV *&Dst,
+ SmallBitVector &Loops,
+ SmallVector<Constraint, 4> &Constraints,
+ bool &Consistent) {
+ bool Result = false;
+ for (int LI = Loops.find_first(); LI >= 0; LI = Loops.find_next(LI)) {
+ DEBUG(dbgs() << "\t Constraint[" << LI << "] is");
+ DEBUG(Constraints[LI].dump(dbgs()));
+ if (Constraints[LI].isDistance())
+ Result |= propagateDistance(Src, Dst, Constraints[LI], Consistent);
+ else if (Constraints[LI].isLine())
+ Result |= propagateLine(Src, Dst, Constraints[LI], Consistent);
+ else if (Constraints[LI].isPoint())
+ Result |= propagatePoint(Src, Dst, Constraints[LI]);
+ }
+ return Result;
+}
+
+
+// Attempt to propagate a distance
+// constraint into a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+// If the simplification isn't exact (that is, if it is conservative
+// in terms of dependence), set consistent to false.
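+// For example (illustrative): with Src = 3*i + j, Dst = 3*i' + 2*j', and a
+// distance constraint D = 2 for the i loop, A_K = 3, so Src becomes
+// (3*i + j) - 6 with the i coefficient zeroed, i.e. j - 6, and Dst becomes
+// (3 - 3)*i' + 2*j' = 2*j'.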
+bool DependenceAnalysis::propagateDistance(const SCEV *&Src,
+ const SCEV *&Dst,
+ Constraint &CurConstraint,
+ bool &Consistent) {
+ const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+ DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
+ const SCEV *A_K = findCoefficient(Src, CurLoop);
+ if (A_K->isZero())
+ return false;
+ const SCEV *DA_K = SE->getMulExpr(A_K, CurConstraint.getD());
+ Src = SE->getMinusSCEV(Src, DA_K);
+ Src = zeroCoefficient(Src, CurLoop);
+ DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
+ DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
+ Dst = addToCoefficient(Dst, CurLoop, SE->getNegativeSCEV(A_K));
+ DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
+ if (!findCoefficient(Dst, CurLoop)->isZero())
+ Consistent = false;
+ return true;
+}
+
+
+// Attempt to propagate a line
+// constraint into a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
+// If the simplification isn't exact (that is, if it is conservative
+// in terms of dependence), set consistent to false.
+bool DependenceAnalysis::propagateLine(const SCEV *&Src,
+ const SCEV *&Dst,
+ Constraint &CurConstraint,
+ bool &Consistent) {
+ const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+ const SCEV *A = CurConstraint.getA();
+ const SCEV *B = CurConstraint.getB();
+ const SCEV *C = CurConstraint.getC();
+ DEBUG(dbgs() << "\t\tA = " << *A << ", B = " << *B << ", C = " << *C << "\n");
+ DEBUG(dbgs() << "\t\tSrc = " << *Src << "\n");
+ DEBUG(dbgs() << "\t\tDst = " << *Dst << "\n");
+ if (A->isZero()) {
+ const SCEVConstant *Bconst = dyn_cast<SCEVConstant>(B);
+ const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
+ if (!Bconst || !Cconst) return false;
+ APInt Beta = Bconst->getValue()->getValue();
+ APInt Charlie = Cconst->getValue()->getValue();
+ APInt CdivB = Charlie.sdiv(Beta);
+ assert(Charlie.srem(Beta) == 0 && "C should be evenly divisible by B");
+ const SCEV *AP_K = findCoefficient(Dst, CurLoop);
+ // Src = SE->getAddExpr(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
+ Src = SE->getMinusSCEV(Src, SE->getMulExpr(AP_K, SE->getConstant(CdivB)));
+ Dst = zeroCoefficient(Dst, CurLoop);
+ if (!findCoefficient(Src, CurLoop)->isZero())
+ Consistent = false;
+ }
+ else if (B->isZero()) {
+ const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
+ const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
+ if (!Aconst || !Cconst) return false;
+ APInt Alpha = Aconst->getValue()->getValue();
+ APInt Charlie = Cconst->getValue()->getValue();
+ APInt CdivA = Charlie.sdiv(Alpha);
+ assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A");
+ const SCEV *A_K = findCoefficient(Src, CurLoop);
+ Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, SE->getConstant(CdivA)));
+ Src = zeroCoefficient(Src, CurLoop);
+ if (!findCoefficient(Dst, CurLoop)->isZero())
+ Consistent = false;
+ }
+ else if (isKnownPredicate(CmpInst::ICMP_EQ, A, B)) {
+ const SCEVConstant *Aconst = dyn_cast<SCEVConstant>(A);
+ const SCEVConstant *Cconst = dyn_cast<SCEVConstant>(C);
+ if (!Aconst || !Cconst) return false;
+ APInt Alpha = Aconst->getValue()->getValue();
+ APInt Charlie = Cconst->getValue()->getValue();
+ APInt CdivA = Charlie.sdiv(Alpha);
+ assert(Charlie.srem(Alpha) == 0 && "C should be evenly divisible by A");
+ const SCEV *A_K = findCoefficient(Src, CurLoop);
+ Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, SE->getConstant(CdivA)));
+ Src = zeroCoefficient(Src, CurLoop);
+ Dst = addToCoefficient(Dst, CurLoop, A_K);
+ if (!findCoefficient(Dst, CurLoop)->isZero())
+ Consistent = false;
+ }
+ else {
+ // paper is incorrect here, or perhaps just misleading
+ const SCEV *A_K = findCoefficient(Src, CurLoop);
+ Src = SE->getMulExpr(Src, A);
+ Dst = SE->getMulExpr(Dst, A);
+ Src = SE->getAddExpr(Src, SE->getMulExpr(A_K, C));
+ Src = zeroCoefficient(Src, CurLoop);
+ Dst = addToCoefficient(Dst, CurLoop, SE->getMulExpr(A_K, B));
+ if (!findCoefficient(Dst, CurLoop)->isZero())
+ Consistent = false;
+ }
+ DEBUG(dbgs() << "\t\tnew Src = " << *Src << "\n");
+ DEBUG(dbgs() << "\t\tnew Dst = " << *Dst << "\n");
+ return true;
+}
+
+
+// Attempt to propagate a point
+// constraint into a subscript pair (Src and Dst).
+// Return true if some simplification occurs.
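+// For example (illustrative): with Src = 2*i + j, Dst = 3*i' + j', and a
+// point constraint (X, Y) = (1, 2) for the i loop, Src becomes
+// 2*i + j + (2*1 - 3*2) with the i coefficient zeroed, i.e. j - 4, and
+// Dst becomes j'.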
+bool DependenceAnalysis::propagatePoint(const SCEV *&Src,
+ const SCEV *&Dst,
+ Constraint &CurConstraint) {
+ const Loop *CurLoop = CurConstraint.getAssociatedLoop();
+ const SCEV *A_K = findCoefficient(Src, CurLoop);
+ const SCEV *AP_K = findCoefficient(Dst, CurLoop);
+ const SCEV *XA_K = SE->getMulExpr(A_K, CurConstraint.getX());
+ const SCEV *YAP_K = SE->getMulExpr(AP_K, CurConstraint.getY());
+ DEBUG(dbgs() << "\t\tSrc is " << *Src << "\n");
+ Src = SE->getAddExpr(Src, SE->getMinusSCEV(XA_K, YAP_K));
+ Src = zeroCoefficient(Src, CurLoop);
+ DEBUG(dbgs() << "\t\tnew Src is " << *Src << "\n");
+ DEBUG(dbgs() << "\t\tDst is " << *Dst << "\n");
+ Dst = zeroCoefficient(Dst, CurLoop);
+ DEBUG(dbgs() << "\t\tnew Dst is " << *Dst << "\n");
+ return true;
+}
+
+
+// Update direction vector entry based on the current constraint.
+void DependenceAnalysis::updateDirection(Dependence::DVEntry &Level,
+ const Constraint &CurConstraint
+ ) const {
+ DEBUG(dbgs() << "\tUpdate direction, constraint =");
+ DEBUG(CurConstraint.dump(dbgs()));
+ if (CurConstraint.isAny())
+ ; // use defaults
+ else if (CurConstraint.isDistance()) {
+ // this one is consistent, the others aren't
+ Level.Scalar = false;
+ Level.Distance = CurConstraint.getD();
+ unsigned NewDirection = Dependence::DVEntry::NONE;
+ if (!SE->isKnownNonZero(Level.Distance)) // if may be zero
+ NewDirection = Dependence::DVEntry::EQ;
+ if (!SE->isKnownNonPositive(Level.Distance)) // if may be positive
+ NewDirection |= Dependence::DVEntry::LT;
+ if (!SE->isKnownNonNegative(Level.Distance)) // if may be negative
+ NewDirection |= Dependence::DVEntry::GT;
+ Level.Direction &= NewDirection;
+ }
+ else if (CurConstraint.isLine()) {
+ Level.Scalar = false;
+ Level.Distance = NULL;
+ // direction should be accurate
+ }
+ else if (CurConstraint.isPoint()) {
+ Level.Scalar = false;
+ Level.Distance = NULL;
+ unsigned NewDirection = Dependence::DVEntry::NONE;
+ if (!isKnownPredicate(CmpInst::ICMP_NE,
+ CurConstraint.getY(),
+ CurConstraint.getX()))
+ // if X may be = Y
+ NewDirection |= Dependence::DVEntry::EQ;
+ if (!isKnownPredicate(CmpInst::ICMP_SLE,
+ CurConstraint.getY(),
+ CurConstraint.getX()))
+ // if Y may be > X
+ NewDirection |= Dependence::DVEntry::LT;
+ if (!isKnownPredicate(CmpInst::ICMP_SGE,
+ CurConstraint.getY(),
+ CurConstraint.getX()))
+ // if Y may be < X
+ NewDirection |= Dependence::DVEntry::GT;
+ Level.Direction &= NewDirection;
+ }
+ else
+ llvm_unreachable("constraint has unexpected kind");
+}
+
+
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+// For debugging purposes, dump a small bit vector to dbgs().
+static void dumpSmallBitVector(SmallBitVector &BV) {
+ dbgs() << "{";
+ for (int VI = BV.find_first(); VI >= 0; VI = BV.find_next(VI)) {
+ dbgs() << VI;
+ if (BV.find_next(VI) >= 0)
+ dbgs() << ' ';
+ }
+ dbgs() << "}\n";
+}
+#endif
+
+
+// depends -
+// Returns NULL if there is no dependence.
+// Otherwise, return a Dependence with as many details as possible.
+// Corresponds to Section 3.1 in the paper
+//
+// Practical Dependence Testing
+// Goff, Kennedy, Tseng
+// PLDI 1991
+//
+// Care is required to keep the code below up to date w.r.t. this routine.
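+//
+// Typical use (an illustrative sketch, not part of this patch): a client
+// holding two memory instructions S and D in the same function might call
+// depends(S, D, true), query the result (e.g., getDirection(Level) for each
+// common level), and delete the returned Dependence when done, since the
+// caller owns it.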
+Dependence *DependenceAnalysis::depends(const Instruction *Src,
+ const Instruction *Dst,
+ bool PossiblyLoopIndependent) {
+ if ((!Src->mayReadFromMemory() && !Src->mayWriteToMemory()) ||
+ (!Dst->mayReadFromMemory() && !Dst->mayWriteToMemory()))
+ // if either instruction doesn't reference memory, there's no dependence
+ return NULL;
+
+ if (!isLoadOrStore(Src) || !isLoadOrStore(Dst))
+ // can only analyze simple loads and stores, i.e., no calls, invokes, etc.
+ return new Dependence(Src, Dst);
+
+ const Value *SrcPtr = getPointerOperand(Src);
+ const Value *DstPtr = getPointerOperand(Dst);
+
+ switch (underlyingObjectsAlias(AA, DstPtr, SrcPtr)) {
+ case AliasAnalysis::MayAlias:
+ case AliasAnalysis::PartialAlias:
+ // cannot analyse objects if we don't understand their aliasing.
+ return new Dependence(Src, Dst);
+ case AliasAnalysis::NoAlias:
+ // If the objects noalias, they are distinct, accesses are independent.
+ return NULL;
+ case AliasAnalysis::MustAlias:
+ break; // The underlying objects alias; test accesses for dependence.
+ }
+
+ const GEPOperator *SrcGEP = dyn_cast<GEPOperator>(SrcPtr);
+ const GEPOperator *DstGEP = dyn_cast<GEPOperator>(DstPtr);
+ if (!SrcGEP || !DstGEP)
+ return new Dependence(Src, Dst); // missing GEP, assume dependence
+
+ if (SrcGEP->getPointerOperandType() != DstGEP->getPointerOperandType())
+ return new Dependence(Src, Dst); // different types, assume dependence
+
+ // establish loop nesting levels
+ establishNestingLevels(Src, Dst);
+ DEBUG(dbgs() << " common nesting levels = " << CommonLevels << "\n");
+ DEBUG(dbgs() << " maximum nesting levels = " << MaxLevels << "\n");
+
+ FullDependence Result(Src, Dst, PossiblyLoopIndependent, CommonLevels);
+ ++TotalArrayPairs;
+
+ // classify subscript pairs
+ unsigned Pairs = SrcGEP->idx_end() - SrcGEP->idx_begin();
+ SmallVector<Subscript, 4> Pair(Pairs);
+ for (unsigned SI = 0; SI < Pairs; ++SI) {
+ Pair[SI].Loops.resize(MaxLevels + 1);
+ Pair[SI].GroupLoops.resize(MaxLevels + 1);
+ Pair[SI].Group.resize(Pairs);
+ }
+ Pairs = 0;
+ for (GEPOperator::const_op_iterator SrcIdx = SrcGEP->idx_begin(),
+ SrcEnd = SrcGEP->idx_end(),
+ DstIdx = DstGEP->idx_begin(),
+ DstEnd = DstGEP->idx_end();
+ SrcIdx != SrcEnd && DstIdx != DstEnd;
+ ++SrcIdx, ++DstIdx, ++Pairs) {
+ Pair[Pairs].Src = SE->getSCEV(*SrcIdx);
+ Pair[Pairs].Dst = SE->getSCEV(*DstIdx);
+ removeMatchingExtensions(&Pair[Pairs]);
+ Pair[Pairs].Classification =
+ classifyPair(Pair[Pairs].Src, LI->getLoopFor(Src->getParent()),
+ Pair[Pairs].Dst, LI->getLoopFor(Dst->getParent()),
+ Pair[Pairs].Loops);
+ Pair[Pairs].GroupLoops = Pair[Pairs].Loops;
+ Pair[Pairs].Group.set(Pairs);
+ DEBUG(dbgs() << " subscript " << Pairs << "\n");
+ DEBUG(dbgs() << "\tsrc = " << *Pair[Pairs].Src << "\n");
+ DEBUG(dbgs() << "\tdst = " << *Pair[Pairs].Dst << "\n");
+ DEBUG(dbgs() << "\tclass = " << Pair[Pairs].Classification << "\n");
+ DEBUG(dbgs() << "\tloops = ");
+ DEBUG(dumpSmallBitVector(Pair[Pairs].Loops));
+ }
+
+ SmallBitVector Separable(Pairs);
+ SmallBitVector Coupled(Pairs);
+
+ // Partition subscripts into separable and minimally-coupled groups
+ // The algorithm in the paper has better asymptotic complexity;
+ // this may be faster in practice. Check someday.
+ //
+ // Here's an example of how it works. Consider this code:
+ //
+ // for (i = ...) {
+ // for (j = ...) {
+ // for (k = ...) {
+ // for (l = ...) {
+ // for (m = ...) {
+ // A[i][j][k][m] = ...;
+ // ... = A[0][j][l][i + j];
+ // }
+ // }
+ // }
+ // }
+ // }
+ //
+ // There are 4 subscripts here:
+ // 0 [i] and [0]
+ // 1 [j] and [j]
+ // 2 [k] and [l]
+ // 3 [m] and [i + j]
+ //
+ // We've already classified each subscript pair as ZIV, SIV, etc.,
+ // and collected all the loops mentioned by pair P in Pair[P].Loops.
+ // In addition, we've initialized Pair[P].GroupLoops to Pair[P].Loops
+ // and set Pair[P].Group = {P}.
+ //
+ // Src Dst Classification Loops GroupLoops Group
+ // 0 [i] [0] SIV {1} {1} {0}
+ // 1 [j] [j] SIV {2} {2} {1}
+ // 2 [k] [l] RDIV {3,4} {3,4} {2}
+ // 3 [m] [i + j] MIV {1,2,5} {1,2,5} {3}
+ //
+ // For each subscript SI 0 .. 3, we consider each remaining subscript, SJ.
+ // So, 0 is compared against 1, 2, and 3; 1 is compared against 2 and 3, etc.
+ //
+ // We begin by comparing 0 and 1. The intersection of the GroupLoops is empty.
+ // Next, 0 and 2. Again, the intersection of their GroupLoops is empty.
+ // Next, 0 and 3. The intersection of their GroupLoops = {1}, not empty,
+ // so Pair[3].Group = {0,3} and Done = false (that is, 0 will not be added
+ // to either Separable or Coupled).
+ //
+ // Next, we consider 1 and 2. The intersection of the GroupLoops is empty.
+ // Next, 1 and 3. The intersection of their GroupLoops = {2}, not empty,
+ // so Pair[3].Group = {0, 1, 3} and Done = false.
+ //
+ // Next, we compare 2 against 3. The intersection of the GroupLoops is empty.
+ // Since Done remains true, we add 2 to the set of Separable pairs.
+ //
+ // Finally, we consider 3. There's nothing to compare it with,
+ // so Done remains true and we add it to the Coupled set.
+ // Pair[3].Group = {0, 1, 3} and GroupLoops = {1, 2, 5}.
+ //
+ // In the end, we've got 1 separable subscript and 1 coupled group.
+ for (unsigned SI = 0; SI < Pairs; ++SI) {
+ if (Pair[SI].Classification == Subscript::NonLinear) {
+ // ignore these, but collect loops for later
+ ++NonlinearSubscriptPairs;
+ collectCommonLoops(Pair[SI].Src,
+ LI->getLoopFor(Src->getParent()),
+ Pair[SI].Loops);
+ collectCommonLoops(Pair[SI].Dst,
+ LI->getLoopFor(Dst->getParent()),
+ Pair[SI].Loops);
+ Result.Consistent = false;
+ }
+ else if (Pair[SI].Classification == Subscript::ZIV) {
+ // always separable
+ Separable.set(SI);
+ }
+ else {
+ // SIV, RDIV, or MIV, so check for coupled group
+ bool Done = true;
+ for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) {
+ SmallBitVector Intersection = Pair[SI].GroupLoops;
+ Intersection &= Pair[SJ].GroupLoops;
+ if (Intersection.any()) {
+ // accumulate set of all the loops in group
+ Pair[SJ].GroupLoops |= Pair[SI].GroupLoops;
+ // accumulate set of all subscripts in group
+ Pair[SJ].Group |= Pair[SI].Group;
+ Done = false;
+ }
+ }
+ if (Done) {
+ if (Pair[SI].Group.count() == 1) {
+ Separable.set(SI);
+ ++SeparableSubscriptPairs;
+ }
+ else {
+ Coupled.set(SI);
+ ++CoupledSubscriptPairs;
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << " Separable = ");
+ DEBUG(dumpSmallBitVector(Separable));
+ DEBUG(dbgs() << " Coupled = ");
+ DEBUG(dumpSmallBitVector(Coupled));
+
+ Constraint NewConstraint;
+ NewConstraint.setAny(SE);
+
+ // test separable subscripts
+ for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+ DEBUG(dbgs() << "testing subscript " << SI);
+ switch (Pair[SI].Classification) {
+ case Subscript::ZIV:
+ DEBUG(dbgs() << ", ZIV\n");
+ if (testZIV(Pair[SI].Src, Pair[SI].Dst, Result))
+ return NULL;
+ break;
+ case Subscript::SIV: {
+ DEBUG(dbgs() << ", SIV\n");
+ unsigned Level;
+ const SCEV *SplitIter = NULL;
+ if (testSIV(Pair[SI].Src, Pair[SI].Dst, Level,
+ Result, NewConstraint, SplitIter))
+ return NULL;
+ break;
+ }
+ case Subscript::RDIV:
+ DEBUG(dbgs() << ", RDIV\n");
+ if (testRDIV(Pair[SI].Src, Pair[SI].Dst, Result))
+ return NULL;
+ break;
+ case Subscript::MIV:
+ DEBUG(dbgs() << ", MIV\n");
+ if (testMIV(Pair[SI].Src, Pair[SI].Dst, Pair[SI].Loops, Result))
+ return NULL;
+ break;
+ default:
+ llvm_unreachable("subscript has unexpected classification");
+ }
+ }
+
+ if (Coupled.count()) {
+ // test coupled subscript groups
+ DEBUG(dbgs() << "starting on coupled subscripts\n");
+ DEBUG(dbgs() << "MaxLevels + 1 = " << MaxLevels + 1 << "\n");
+ SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
+ for (unsigned II = 0; II <= MaxLevels; ++II)
+ Constraints[II].setAny(SE);
+ for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+ DEBUG(dbgs() << "testing subscript group " << SI << " { ");
+ SmallBitVector Group(Pair[SI].Group);
+ SmallBitVector Sivs(Pairs);
+ SmallBitVector Mivs(Pairs);
+ SmallBitVector ConstrainedLevels(MaxLevels + 1);
+ for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+ DEBUG(dbgs() << SJ << " ");
+ if (Pair[SJ].Classification == Subscript::SIV)
+ Sivs.set(SJ);
+ else
+ Mivs.set(SJ);
+ }
+ DEBUG(dbgs() << "}\n");
+ while (Sivs.any()) {
+ bool Changed = false;
+ for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+ DEBUG(dbgs() << "testing subscript " << SJ << ", SIV\n");
+ // SJ is an SIV subscript that's part of the current coupled group
+ unsigned Level;
+ const SCEV *SplitIter = NULL;
+ DEBUG(dbgs() << "SIV\n");
+ if (testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level,
+ Result, NewConstraint, SplitIter))
+ return NULL;
+ ConstrainedLevels.set(Level);
+ if (intersectConstraints(&Constraints[Level], &NewConstraint)) {
+ if (Constraints[Level].isEmpty()) {
+ ++DeltaIndependence;
+ return NULL;
+ }
+ Changed = true;
+ }
+ Sivs.reset(SJ);
+ }
+ if (Changed) {
+ // propagate, possibly creating new SIVs and ZIVs
+ DEBUG(dbgs() << " propagating\n");
+ DEBUG(dbgs() << "\tMivs = ");
+ DEBUG(dumpSmallBitVector(Mivs));
+ for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ // SJ is an MIV subscript that's part of the current coupled group
+ DEBUG(dbgs() << "\tSJ = " << SJ << "\n");
+ if (propagate(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops,
+ Constraints, Result.Consistent)) {
+ DEBUG(dbgs() << "\t Changed\n");
+ ++DeltaPropagations;
+ Pair[SJ].Classification =
+ classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
+ Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()),
+ Pair[SJ].Loops);
+ switch (Pair[SJ].Classification) {
+ case Subscript::ZIV:
+ DEBUG(dbgs() << "ZIV\n");
+ if (testZIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
+ return NULL;
+ Mivs.reset(SJ);
+ break;
+ case Subscript::SIV:
+ Sivs.set(SJ);
+ Mivs.reset(SJ);
+ break;
+ case Subscript::RDIV:
+ case Subscript::MIV:
+ break;
+ default:
+ llvm_unreachable("bad subscript classification");
+ }
+ }
+ }
+ }
+ }
+
+ // test & propagate remaining RDIVs
+ for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ if (Pair[SJ].Classification == Subscript::RDIV) {
+ DEBUG(dbgs() << "RDIV test\n");
+ if (testRDIV(Pair[SJ].Src, Pair[SJ].Dst, Result))
+ return NULL;
+ // I don't yet understand how to propagate RDIV results
+ Mivs.reset(SJ);
+ }
+ }
+
+ // test remaining MIVs
+ // This code is temporary.
+ // Better to somehow test all remaining subscripts simultaneously.
+ for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ if (Pair[SJ].Classification == Subscript::MIV) {
+ DEBUG(dbgs() << "MIV test\n");
+ if (testMIV(Pair[SJ].Src, Pair[SJ].Dst, Pair[SJ].Loops, Result))
+ return NULL;
+ }
+ else
+ llvm_unreachable("expected only MIV subscripts at this point");
+ }
+
+ // update Result.DV from constraint vector
+ DEBUG(dbgs() << " updating\n");
+ for (int SJ = ConstrainedLevels.find_first();
+ SJ >= 0; SJ = ConstrainedLevels.find_next(SJ)) {
+ updateDirection(Result.DV[SJ - 1], Constraints[SJ]);
+ if (Result.DV[SJ - 1].Direction == Dependence::DVEntry::NONE)
+ return NULL;
+ }
+ }
+ }
+
+ // make sure Scalar flags are set correctly
+ SmallBitVector CompleteLoops(MaxLevels + 1);
+ for (unsigned SI = 0; SI < Pairs; ++SI)
+ CompleteLoops |= Pair[SI].Loops;
+ for (unsigned II = 1; II <= CommonLevels; ++II)
+ if (CompleteLoops[II])
+ Result.DV[II - 1].Scalar = false;
+
+ // make sure the LoopIndependent flag is set correctly
+ if (PossiblyLoopIndependent) {
+ for (unsigned II = 1; II <= CommonLevels; ++II) {
+ if (!(Result.getDirection(II) & Dependence::DVEntry::EQ)) {
+ Result.LoopIndependent = false;
+ break;
+ }
+ }
+ }
+
+ FullDependence *Final = new FullDependence(Result);
+ Result.DV = NULL;
+ return Final;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// getSplitIteration -
+// Rather than spend rarely-used space recording the splitting iteration
+// during the Weak-Crossing SIV test, we re-compute it on demand.
+// The re-computation is basically a repeat of the entire dependence test,
+// though simplified since we know that the dependence exists.
+// It's tedious, since we must go through all propagations, etc.
+//
+// Care is required to keep this code up to date w.r.t. the code above.
+//
+// Generally, the dependence analyzer will be used to build
+// a dependence graph for a function (basically a map from instructions
+// to dependences). Looking for cycles in the graph shows us loops
+// that cannot be trivially vectorized/parallelized.
+//
+// We can try to improve the situation by examining all the dependences
+// that make up the cycle, looking for ones we can break.
+// Sometimes, peeling the first or last iteration of a loop will break
+// dependences, and we've got flags for those possibilities.
+// Sometimes, splitting a loop at some other iteration will do the trick,
+// and we've got a flag for that case. Rather than waste the space to
+// record the exact iteration (since we rarely know), we provide
+// a method that calculates the iteration. It's a drag that it must work
+// from scratch, but wonderful in that it's possible.
+//
+// Here's an example:
+//
+// for (i = 0; i < 10; i++)
+// A[i] = ...
+// ... = A[11 - i]
+//
+// There's a loop-carried flow dependence from the store to the load,
+// found by the weak-crossing SIV test. The dependence will have a flag,
+// indicating that the dependence can be broken by splitting the loop.
+// Calling getSplitIteration will return 5.
+// Splitting the loop breaks the dependence, like so:
+//
+// for (i = 0; i <= 5; i++)
+// A[i] = ...
+// ... = A[11 - i]
+// for (i = 6; i < 10; i++)
+// A[i] = ...
+// ... = A[11 - i]
+//
+// breaks the dependence and allows us to vectorize/parallelize
+// both loops.
+const SCEV *DependenceAnalysis::getSplitIteration(const Dependence *Dep,
+ unsigned SplitLevel) {
+ assert(Dep && "expected a pointer to a Dependence");
+ assert(Dep->isSplitable(SplitLevel) &&
+ "Dep should be splitable at SplitLevel");
+ const Instruction *Src = Dep->getSrc();
+ const Instruction *Dst = Dep->getDst();
+ assert(Src->mayReadFromMemory() || Src->mayWriteToMemory());
+ assert(Dst->mayReadFromMemory() || Dst->mayWriteToMemory());
+ assert(isLoadOrStore(Src));
+ assert(isLoadOrStore(Dst));
+ const Value *SrcPtr = getPointerOperand(Src);
+ const Value *DstPtr = getPointerOperand(Dst);
+ assert(underlyingObjectsAlias(AA, DstPtr, SrcPtr) ==
+ AliasAnalysis::MustAlias);
+ const GEPOperator *SrcGEP = dyn_cast<GEPOperator>(SrcPtr);
+ const GEPOperator *DstGEP = dyn_cast<GEPOperator>(DstPtr);
+ assert(SrcGEP);
+ assert(DstGEP);
+ assert(SrcGEP->getPointerOperandType() == DstGEP->getPointerOperandType());
+
+ // establish loop nesting levels
+ establishNestingLevels(Src, Dst);
+
+ FullDependence Result(Src, Dst, false, CommonLevels);
+
+ // classify subscript pairs
+ unsigned Pairs = SrcGEP->idx_end() - SrcGEP->idx_begin();
+ SmallVector<Subscript, 4> Pair(Pairs);
+ for (unsigned SI = 0; SI < Pairs; ++SI) {
+ Pair[SI].Loops.resize(MaxLevels + 1);
+ Pair[SI].GroupLoops.resize(MaxLevels + 1);
+ Pair[SI].Group.resize(Pairs);
+ }
+ Pairs = 0;
+ for (GEPOperator::const_op_iterator SrcIdx = SrcGEP->idx_begin(),
+ SrcEnd = SrcGEP->idx_end(),
+ DstIdx = DstGEP->idx_begin(),
+ DstEnd = DstGEP->idx_end();
+ SrcIdx != SrcEnd && DstIdx != DstEnd;
+ ++SrcIdx, ++DstIdx, ++Pairs) {
+ Pair[Pairs].Src = SE->getSCEV(*SrcIdx);
+ Pair[Pairs].Dst = SE->getSCEV(*DstIdx);
+ Pair[Pairs].Classification =
+ classifyPair(Pair[Pairs].Src, LI->getLoopFor(Src->getParent()),
+ Pair[Pairs].Dst, LI->getLoopFor(Dst->getParent()),
+ Pair[Pairs].Loops);
+ Pair[Pairs].GroupLoops = Pair[Pairs].Loops;
+ Pair[Pairs].Group.set(Pairs);
+ }
+
+ SmallBitVector Separable(Pairs);
+ SmallBitVector Coupled(Pairs);
+
+ // partition subscripts into separable and minimally-coupled groups
+ for (unsigned SI = 0; SI < Pairs; ++SI) {
+ if (Pair[SI].Classification == Subscript::NonLinear) {
+ // ignore these, but collect loops for later
+ collectCommonLoops(Pair[SI].Src,
+ LI->getLoopFor(Src->getParent()),
+ Pair[SI].Loops);
+ collectCommonLoops(Pair[SI].Dst,
+ LI->getLoopFor(Dst->getParent()),
+ Pair[SI].Loops);
+ Result.Consistent = false;
+ }
+ else if (Pair[SI].Classification == Subscript::ZIV)
+ Separable.set(SI);
+ else {
+ // SIV, RDIV, or MIV, so check for coupled group
+ bool Done = true;
+ for (unsigned SJ = SI + 1; SJ < Pairs; ++SJ) {
+ SmallBitVector Intersection = Pair[SI].GroupLoops;
+ Intersection &= Pair[SJ].GroupLoops;
+ if (Intersection.any()) {
+ // accumulate set of all the loops in group
+ Pair[SJ].GroupLoops |= Pair[SI].GroupLoops;
+ // accumulate set of all subscripts in group
+ Pair[SJ].Group |= Pair[SI].Group;
+ Done = false;
+ }
+ }
+ if (Done) {
+ if (Pair[SI].Group.count() == 1)
+ Separable.set(SI);
+ else
+ Coupled.set(SI);
+ }
+ }
+ }
+
+ Constraint NewConstraint;
+ NewConstraint.setAny(SE);
+
+ // test separable subscripts
+ for (int SI = Separable.find_first(); SI >= 0; SI = Separable.find_next(SI)) {
+ switch (Pair[SI].Classification) {
+ case Subscript::SIV: {
+ unsigned Level;
+ const SCEV *SplitIter = NULL;
+ (void) testSIV(Pair[SI].Src, Pair[SI].Dst, Level,
+ Result, NewConstraint, SplitIter);
+ if (Level == SplitLevel) {
+ assert(SplitIter != NULL);
+ return SplitIter;
+ }
+ break;
+ }
+ case Subscript::ZIV:
+ case Subscript::RDIV:
+ case Subscript::MIV:
+ break;
+ default:
+ llvm_unreachable("subscript has unexpected classification");
+ }
+ }
+
+ if (Coupled.count()) {
+ // test coupled subscript groups
+ SmallVector<Constraint, 4> Constraints(MaxLevels + 1);
+ for (unsigned II = 0; II <= MaxLevels; ++II)
+ Constraints[II].setAny(SE);
+ for (int SI = Coupled.find_first(); SI >= 0; SI = Coupled.find_next(SI)) {
+ SmallBitVector Group(Pair[SI].Group);
+ SmallBitVector Sivs(Pairs);
+ SmallBitVector Mivs(Pairs);
+ SmallBitVector ConstrainedLevels(MaxLevels + 1);
+ for (int SJ = Group.find_first(); SJ >= 0; SJ = Group.find_next(SJ)) {
+ if (Pair[SJ].Classification == Subscript::SIV)
+ Sivs.set(SJ);
+ else
+ Mivs.set(SJ);
+ }
+ while (Sivs.any()) {
+ bool Changed = false;
+ for (int SJ = Sivs.find_first(); SJ >= 0; SJ = Sivs.find_next(SJ)) {
+ // SJ is an SIV subscript that's part of the current coupled group
+ unsigned Level;
+ const SCEV *SplitIter = NULL;
+ (void) testSIV(Pair[SJ].Src, Pair[SJ].Dst, Level,
+ Result, NewConstraint, SplitIter);
+ if (Level == SplitLevel && SplitIter)
+ return SplitIter;
+ ConstrainedLevels.set(Level);
+ if (intersectConstraints(&Constraints[Level], &NewConstraint))
+ Changed = true;
+ Sivs.reset(SJ);
+ }
+ if (Changed) {
+ // propagate, possibly creating new SIVs and ZIVs
+ for (int SJ = Mivs.find_first(); SJ >= 0; SJ = Mivs.find_next(SJ)) {
+ // SJ is an MIV subscript that's part of the current coupled group
+ if (propagate(Pair[SJ].Src, Pair[SJ].Dst,
+ Pair[SJ].Loops, Constraints, Result.Consistent)) {
+ Pair[SJ].Classification =
+ classifyPair(Pair[SJ].Src, LI->getLoopFor(Src->getParent()),
+ Pair[SJ].Dst, LI->getLoopFor(Dst->getParent()),
+ Pair[SJ].Loops);
+ switch (Pair[SJ].Classification) {
+ case Subscript::ZIV:
+ Mivs.reset(SJ);
+ break;
+ case Subscript::SIV:
+ Sivs.set(SJ);
+ Mivs.reset(SJ);
+ break;
+ case Subscript::RDIV:
+ case Subscript::MIV:
+ break;
+ default:
+ llvm_unreachable("bad subscript classification");
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ llvm_unreachable("somehow reached end of routine");
+ return NULL;
+}
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 1604576ec4ae..3e537e9f1a36 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -133,7 +133,9 @@ void DominanceFrontierBase::print(raw_ostream &OS, const Module* ) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void DominanceFrontierBase::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 0df3e8a38218..dec0eced2786 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -141,12 +141,13 @@ private:
for (BasicBlock::iterator II = BB->begin(), IE = BB->end();
II != IE; ++II) {
CallSite CS(cast<Value>(II));
- if (CS && !isa<IntrinsicInst>(II)) {
+ if (CS) {
const Function *Callee = CS.getCalledFunction();
- if (Callee)
- Node->addCalledFunction(CS, getOrInsertFunction(Callee));
- else
+ if (!Callee)
+ // Indirect calls of intrinsics are not allowed so no need to check.
Node->addCalledFunction(CS, CallsExternalNode);
+ else if (!Callee->isIntrinsic())
+ Node->addCalledFunction(CS, getOrInsertFunction(Callee));
}
}
}
@@ -198,9 +199,11 @@ void CallGraph::print(raw_ostream &OS, Module*) const {
for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
I->second->print(OS);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void CallGraph::dump() const {
print(dbgs(), 0);
}
+#endif
//===----------------------------------------------------------------------===//
// Implementations of public modification methods
@@ -267,7 +270,9 @@ void CallGraphNode::print(raw_ostream &OS) const {
OS << '\n';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void CallGraphNode::dump() const { print(dbgs()); }
+#endif
/// removeCallEdgeFor - This method removes the edge in the node for the
/// specified call site. Note that this method takes linear time, so it
diff --git a/lib/Analysis/IPA/GlobalsModRef.cpp b/lib/Analysis/IPA/GlobalsModRef.cpp
index 22f6e96b53d3..990caa80c8d2 100644
--- a/lib/Analysis/IPA/GlobalsModRef.cpp
+++ b/lib/Analysis/IPA/GlobalsModRef.cpp
@@ -263,7 +263,7 @@ bool GlobalsModRef::AnalyzeUsesOfPointer(Value *V,
} else if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
if (AnalyzeUsesOfPointer(BCI, Readers, Writers, OkayStoreDest))
return true;
- } else if (isFreeCall(U)) {
+ } else if (isFreeCall(U, TLI)) {
Writers.push_back(cast<Instruction>(U)->getParent()->getParent());
} else if (CallInst *CI = dyn_cast<CallInst>(U)) {
// Make sure that this is just the function being called, not that it is
@@ -329,7 +329,7 @@ bool GlobalsModRef::AnalyzeIndirectGlobalMemory(GlobalValue *GV) {
// Check the value being stored.
Value *Ptr = GetUnderlyingObject(SI->getOperand(0));
- if (!isAllocLikeFn(Ptr))
+ if (!isAllocLikeFn(Ptr, TLI))
return false; // Too hard to analyze.
// Analyze all uses of the allocation. If any of them are used in a
@@ -458,7 +458,7 @@ void GlobalsModRef::AnalyzeCallGraph(CallGraph &CG, Module &M) {
if (SI->isVolatile())
// Treat volatile stores as reading memory somewhere.
FunctionEffect |= Ref;
- } else if (isAllocationFn(&*II) || isFreeCall(&*II)) {
+ } else if (isAllocationFn(&*II, TLI) || isFreeCall(&*II, TLI)) {
FunctionEffect |= ModRef;
} else if (IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(&*II)) {
// The callgraph doesn't include intrinsic calls.
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index 0a6682a254f5..d4221b89e0f6 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -22,7 +22,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Debug.h"
@@ -235,7 +235,7 @@ bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
LI = &getAnalysis<LoopInfo>();
DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
// Find all uses of induction variables in this loop, and categorize
// them by stride. Start by finding all of the PHI nodes in the header for
@@ -273,9 +273,11 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void IVUsers::dump() const {
print(dbgs());
}
+#endif
void IVUsers::releaseMemory() {
Processed.clear();
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index bc1ecd2ea430..5f51f775f142 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -24,7 +24,7 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/Operator.h"
#include "llvm/GlobalAlias.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -41,8 +41,8 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
typedef InstVisitor<CallAnalyzer, bool> Base;
friend class InstVisitor<CallAnalyzer, bool>;
- // TargetData if available, or null.
- const TargetData *const TD;
+ // DataLayout if available, or null.
+ const DataLayout *const TD;
// The called function.
Function &F;
@@ -51,9 +51,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
int Cost;
const bool AlwaysInline;
- bool IsRecursive;
+ bool IsCallerRecursive;
+ bool IsRecursiveCall;
bool ExposesReturnsTwice;
bool HasDynamicAlloca;
+ /// Number of bytes allocated statically by the callee.
+ uint64_t AllocatedSize;
unsigned NumInstructions, NumVectorInstructions;
int FiftyPercentVectorBonus, TenPercentVectorBonus;
int VectorBonus;
@@ -123,10 +126,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
bool visitCallSite(CallSite CS);
public:
- CallAnalyzer(const TargetData *TD, Function &Callee, int Threshold)
+ CallAnalyzer(const DataLayout *TD, Function &Callee, int Threshold)
: TD(TD), F(Callee), Threshold(Threshold), Cost(0),
- AlwaysInline(F.hasFnAttr(Attribute::AlwaysInline)),
- IsRecursive(false), ExposesReturnsTwice(false), HasDynamicAlloca(false),
+ AlwaysInline(F.getFnAttributes().hasAttribute(Attributes::AlwaysInline)),
+ IsCallerRecursive(false), IsRecursiveCall(false),
+ ExposesReturnsTwice(false), HasDynamicAlloca(false), AllocatedSize(0),
NumInstructions(0), NumVectorInstructions(0),
FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
@@ -138,6 +142,7 @@ public:
int getThreshold() { return Threshold; }
int getCost() { return Cost; }
+ bool isAlwaysInline() { return AlwaysInline; }
// Keep a bunch of stats about the cost savings found so we can print them
// out when debugging.
@@ -269,6 +274,13 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// FIXME: Check whether inlining will turn a dynamic alloca into a static
// alloca, and handle that case.
+ // Accumulate the allocated size.
+ if (I.isStaticAlloca()) {
+ Type *Ty = I.getAllocatedType();
+ AllocatedSize += (TD ? TD->getTypeAllocSize(Ty) :
+ Ty->getPrimitiveSizeInBits());
+ }
+
// We will happily inline static alloca instructions or dynamic alloca
// instructions in always-inline situations.
if (AlwaysInline || I.isStaticAlloca())
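The accumulation above feeds the new recursive-caller guard added later in this file: static alloca sizes are summed (in bytes when DataLayout is available, with the type's primitive size as a fallback), and inlining is refused once the running total exceeds InlineConstants::TotalAllocaSizeRecursiveCaller. A standalone sketch of the bookkeeping, with hypothetical names; Limit stands in for that constant:

#include <stdint.h>

struct StaticAllocaBudget {
  uint64_t AllocatedSize;      // bytes of static allocas seen in the callee
  bool IsCallerRecursive;

  StaticAllocaBudget() : AllocatedSize(0), IsCallerRecursive(false) {}

  void noteStaticAlloca(uint64_t Bytes) { AllocatedSize += Bytes; }

  // Inlining a large-frame callee into a recursive caller multiplies the
  // extra stack by the recursion depth, so cap it.
  bool overBudget(uint64_t Limit) const {
    return IsCallerRecursive && AllocatedSize > Limit;
  }
};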
@@ -602,7 +614,7 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
bool CallAnalyzer::visitCallSite(CallSite CS) {
if (CS.isCall() && cast<CallInst>(CS.getInstruction())->canReturnTwice() &&
- !F.hasFnAttr(Attribute::ReturnsTwice)) {
+ !F.getFnAttributes().hasAttribute(Attributes::ReturnsTwice)) {
// This aborts the entire analysis.
ExposesReturnsTwice = true;
return false;
@@ -625,7 +637,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
if (F == CS.getInstruction()->getParent()->getParent()) {
// This flag will fully abort the analysis, so don't bother with anything
// else.
- IsRecursive = true;
+ IsRecursiveCall = true;
return false;
}
@@ -712,7 +724,14 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
Cost += InlineConstants::InstrCost;
// If visiting this instruction detected an uninlinable pattern, abort.
- if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+ if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+ return false;
+
+ // If the caller is a recursive function then we don't want to inline
+ // functions which allocate a lot of stack space because it would increase
+ // the caller stack usage dramatically.
+ if (IsCallerRecursive &&
+ AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
return false;
if (NumVectorInstructions > NumInstructions/2)
@@ -814,7 +833,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// one load and one store per word copied.
// FIXME: The maxStoresPerMemcpy setting from the target should be used
// here instead of a magic number of 8, but it's not available via
- // TargetData.
+ // DataLayout.
NumStores = std::min(NumStores, 8U);
Cost -= 2 * NumStores * InlineConstants::InstrCost;
@@ -831,12 +850,14 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
Cost += InlineConstants::LastCallToStaticBonus;
// If the instruction after the call, or if the normal destination of the
- // invoke is an unreachable instruction, the function is noreturn. As such,
- // there is little point in inlining this unless there is literally zero cost.
- if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ // invoke is an unreachable instruction, the function is noreturn. As such,
+ // there is little point in inlining this unless there is literally zero
+ // cost.
+ Instruction *Instr = CS.getInstruction();
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
if (isa<UnreachableInst>(II->getNormalDest()->begin()))
Threshold = 1;
- } else if (isa<UnreachableInst>(++BasicBlock::iterator(CS.getInstruction())))
+ } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
Threshold = 1;
// If this function uses the coldcc calling convention, prefer not to inline
@@ -852,6 +873,20 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
if (F.empty())
return true;
+ Function *Caller = CS.getInstruction()->getParent()->getParent();
+ // Check if the caller function is recursive itself.
+ for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
+ U != E; ++U) {
+ CallSite Site(cast<Value>(*U));
+ if (!Site)
+ continue;
+ Instruction *I = Site.getInstruction();
+ if (I->getParent()->getParent() == Caller) {
+ IsCallerRecursive = true;
+ break;
+ }
+ }
+
// Track whether we've seen a return instruction. The first return
// instruction is free, as at least one will usually disappear in inlining.
bool HasReturn = false;
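The use scan above answers one question: does the caller call itself anywhere? Restated as a free function against the same LLVM 3.x-era iterators (a sketch; the real code sets IsCallerRecursive and breaks instead of returning):

#include "llvm/Function.h"
#include "llvm/Support/CallSite.h"
using namespace llvm;

static bool callerIsRecursive(Function *Caller) {
  for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
       U != E; ++U) {
    CallSite Site(cast<Value>(*U));
    if (!Site)
      continue;                 // a use that is not a call or invoke
    if (Site.getInstruction()->getParent()->getParent() == Caller)
      return true;              // a call site inside Caller itself
  }
  return false;
}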
@@ -908,9 +943,9 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// We never want to inline functions that contain an indirectbr. This is
// incorrect because all the blockaddress's (in static global initializers
- // for example) would be referring to the original function, and this indirect
- // jump would jump from the inlined copy of the function into the original
- // function which is extremely undefined behavior.
+ // for example) would be referring to the original function, and this
+ // indirect jump would jump from the inlined copy of the function into the
+ // original function which is extremely undefined behavior.
// FIXME: This logic isn't really right; we can safely inline functions
// with indirectbr's as long as no other function or global references the
// blockaddress of a block within the current function. And as a QOI issue,
@@ -928,8 +963,16 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// Analyze the cost of this block. If we blow through the threshold, this
// returns false, and we can bail out.
if (!analyzeBlock(BB)) {
- if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+ if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
return false;
+
+ // If the caller is a recursive function then we don't want to inline
+ // functions which allocate a lot of stack space because it would increase
+ // the caller stack usage dramatically.
+ if (IsCallerRecursive &&
+ AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+ return false;
+
break;
}
@@ -955,7 +998,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// If we're unable to select a particular successor, just count all of
// them.
- for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; ++TIdx)
+ for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
+ ++TIdx)
BBWorklist.insert(TI->getSuccessor(TIdx));
// If we had any successors at this point, then post-inlining is likely to
@@ -974,6 +1018,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
return AlwaysInline || Cost < Threshold;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) llvm::dbgs() << " " #x ": " << x << "\n"
@@ -987,6 +1032,7 @@ void CallAnalyzer::dump() {
DEBUG_PRINT_STAT(SROACostSavingsLost);
#undef DEBUG_PRINT_STAT
}
+#endif
InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, int Threshold) {
return getInlineCost(CS, CS.getCalledFunction(), Threshold);
@@ -998,10 +1044,12 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
// something else. Don't inline functions marked noinline or call sites
// marked noinline.
if (!Callee || Callee->mayBeOverridden() ||
- Callee->hasFnAttr(Attribute::NoInline) || CS.isNoInline())
+ Callee->getFnAttributes().hasAttribute(Attributes::NoInline) ||
+ CS.isNoInline())
return llvm::InlineCost::getNever();
- DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n");
+ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
+ << "...\n");
CallAnalyzer CA(TD, *Callee, Threshold);
bool ShouldInline = CA.analyzeCall(CS);
@@ -1011,7 +1059,8 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
// Check if there was a reason to force inlining or no inlining.
if (!ShouldInline && CA.getCost() < CA.getThreshold())
return InlineCost::getNever();
- if (ShouldInline && CA.getCost() >= CA.getThreshold())
+ if (ShouldInline && (CA.isAlwaysInline() ||
+ CA.getCost() >= CA.getThreshold()))
return InlineCost::getAlways();
return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index 379a35ad379d..a76e5ad1b8f8 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -31,7 +31,7 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/ValueHandle.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -42,11 +42,11 @@ STATISTIC(NumFactor , "Number of factorizations");
STATISTIC(NumReassoc, "Number of reassociations");
struct Query {
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLibraryInfo *TLI;
const DominatorTree *DT;
- Query(const TargetData *td, const TargetLibraryInfo *tli,
+ Query(const DataLayout *td, const TargetLibraryInfo *tli,
const DominatorTree *dt) : TD(td), TLI(tli), DT(dt) {}
};
@@ -651,7 +651,7 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
}
Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyAddInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT),
RecursionLimit);
@@ -664,7 +664,7 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
/// if the GEP has all-constant indices. Returns false if any non-constant
/// index is encountered leaving the 'Offset' in an undefined state. The
/// 'Offset' APInt must be the bitwidth of the target's pointer size.
-static bool accumulateGEPOffset(const TargetData &TD, GEPOperator *GEP,
+static bool accumulateGEPOffset(const DataLayout &TD, GEPOperator *GEP,
APInt &Offset) {
unsigned IntPtrWidth = TD.getPointerSizeInBits();
assert(IntPtrWidth == Offset.getBitWidth());
@@ -696,7 +696,7 @@ static bool accumulateGEPOffset(const TargetData &TD, GEPOperator *GEP,
/// accumulates the total constant offset applied in the returned constant. It
/// returns 0 if V is not a pointer, and returns the constant '0' if there are
/// no constant offsets applied.
-static Constant *stripAndComputeConstantOffsets(const TargetData &TD,
+static Constant *stripAndComputeConstantOffsets(const DataLayout &TD,
Value *&V) {
if (!V->getType()->isPointerTy())
return 0;
@@ -731,7 +731,7 @@ static Constant *stripAndComputeConstantOffsets(const TargetData &TD,
/// \brief Compute the constant difference between two pointer values.
/// If the difference is not a constant, returns zero.
-static Constant *computePointerDifference(const TargetData &TD,
+static Constant *computePointerDifference(const DataLayout &TD,
Value *LHS, Value *RHS) {
Constant *LHSOffset = stripAndComputeConstantOffsets(TD, LHS);
if (!LHSOffset)
@@ -880,7 +880,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
}
Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifySubInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT),
RecursionLimit);
@@ -951,7 +951,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyMulInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1039,7 +1039,7 @@ static Value *SimplifySDivInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifySDivInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifySDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1055,7 +1055,7 @@ static Value *SimplifyUDivInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyUDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1074,7 +1074,7 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyFDivInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1144,7 +1144,7 @@ static Value *SimplifySRemInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifySRemInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifySRemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1160,7 +1160,7 @@ static Value *SimplifyURemInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyURemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1179,7 +1179,7 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, const Query &,
return 0;
}
-Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyFRemInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyFRemInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1248,7 +1248,7 @@ static Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
}
Value *llvm::SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyShlInst(Op0, Op1, isNSW, isNUW, Query (TD, TLI, DT),
RecursionLimit);
@@ -1275,7 +1275,7 @@ static Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
}
Value *llvm::SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyLShrInst(Op0, Op1, isExact, Query (TD, TLI, DT),
@@ -1307,7 +1307,7 @@ static Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
}
Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyAShrInst(Op0, Op1, isExact, Query (TD, TLI, DT),
@@ -1407,7 +1407,7 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyAndInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyAndInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1501,7 +1501,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyOrInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1561,7 +1561,7 @@ static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
return 0;
}
-Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const TargetData *TD,
+Value *llvm::SimplifyXorInst(Value *Op0, Value *Op1, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyXorInst(Op0, Op1, Query (TD, TLI, DT), RecursionLimit);
@@ -1591,7 +1591,7 @@ static Value *ExtractEquivalentCondition(Value *V, CmpInst::Predicate Pred,
return 0;
}
-static Constant *computePointerICmp(const TargetData &TD,
+static Constant *computePointerICmp(const DataLayout &TD,
CmpInst::Predicate Pred,
Value *LHS, Value *RHS) {
// We can only fold certain predicates on pointer comparisons.
@@ -2065,8 +2065,25 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (A && C && (A == C || A == D || B == C || B == D) &&
NoLHSWrapProblem && NoRHSWrapProblem) {
// Determine Y and Z in the form icmp (X+Y), (X+Z).
- Value *Y = (A == C || A == D) ? B : A;
- Value *Z = (C == A || C == B) ? D : C;
+ Value *Y, *Z;
+ if (A == C) {
+ // C + B == C + D -> B == D
+ Y = B;
+ Z = D;
+ } else if (A == D) {
+ // D + B == C + D -> B == C
+ Y = B;
+ Z = C;
+ } else if (B == C) {
+ // A + C == C + D -> A == D
+ Y = A;
+ Z = D;
+ } else {
+ assert(B == D);
+ // A + D == C + D -> A == C
+ Y = A;
+ Z = C;
+ }
if (Value *V = SimplifyICmpInst(Pred, Y, Z, Q, MaxRecurse-1))
return V;
}
@@ -2399,7 +2416,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyICmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT),
@@ -2496,7 +2513,7 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyFCmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT),
@@ -2531,7 +2548,7 @@ static Value *SimplifySelectInst(Value *CondVal, Value *TrueVal,
}
Value *llvm::SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifySelectInst(Cond, TrueVal, FalseVal, Query (TD, TLI, DT),
@@ -2579,7 +2596,7 @@ static Value *SimplifyGEPInst(ArrayRef<Value *> Ops, const Query &Q, unsigned) {
return ConstantExpr::getGetElementPtr(cast<Constant>(Ops[0]), Ops.slice(1));
}
-Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const TargetData *TD,
+Value *llvm::SimplifyGEPInst(ArrayRef<Value *> Ops, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyGEPInst(Ops, Query (TD, TLI, DT), RecursionLimit);
@@ -2616,7 +2633,7 @@ static Value *SimplifyInsertValueInst(Value *Agg, Value *Val,
Value *llvm::SimplifyInsertValueInst(Value *Agg, Value *Val,
ArrayRef<unsigned> Idxs,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyInsertValueInst(Agg, Val, Idxs, Query (TD, TLI, DT),
@@ -2664,7 +2681,7 @@ static Value *SimplifyTruncInst(Value *Op, Type *Ty, const Query &Q, unsigned) {
return 0;
}
-Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const TargetData *TD,
+Value *llvm::SimplifyTruncInst(Value *Op, Type *Ty, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyTruncInst(Op, Ty, Query (TD, TLI, DT), RecursionLimit);
@@ -2730,7 +2747,7 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyBinOp(Opcode, LHS, RHS, Query (TD, TLI, DT), RecursionLimit);
}
@@ -2745,7 +2762,7 @@ static Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
}
Value *llvm::SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return ::SimplifyCmpInst(Predicate, LHS, RHS, Query (TD, TLI, DT),
RecursionLimit);
@@ -2761,7 +2778,7 @@ static Value *SimplifyCallInst(CallInst *CI, const Query &) {
/// SimplifyInstruction - See if we can compute a simplified version of this
/// instruction. If not, this returns null.
-Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD,
+Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
Value *Result;
@@ -2881,7 +2898,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const TargetData *TD,
/// This routine returns 'true' only when *it* simplifies something. The passed
/// in simplified value does not count toward this.
static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
bool Simplified = false;
@@ -2936,14 +2953,14 @@ static bool replaceAndRecursivelySimplifyImpl(Instruction *I, Value *SimpleV,
}
bool llvm::recursivelySimplifyInstruction(Instruction *I,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
return replaceAndRecursivelySimplifyImpl(I, 0, TD, TLI, DT);
}
bool llvm::replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI,
const DominatorTree *DT) {
assert(I != SimpleV && "replaceAndRecursivelySimplify(X,X) is not valid!");
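Every entry point in InstructionSimplify keeps its shape; only the context type changes from TargetData to DataLayout. A hedged sketch of a caller after the rename (the helper name is made up, and any of the three analysis pointers may be null, exactly as the Simplify* entry points allow):

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Instructions.h"
using namespace llvm;

// Hypothetical helper: fold I in place if SimplifyInstruction finds a
// simpler value, using whatever context the caller happens to have.
static bool foldOne(Instruction *I, const DataLayout *TD,
                    const TargetLibraryInfo *TLI, const DominatorTree *DT) {
  if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) {
    I->replaceAllUsesWith(V);
    return true;
  }
  return false;
}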
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index 9140786a1ba0..2b87d80d3732 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -13,13 +13,14 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "lazy-value-info"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/ConstantRange.h"
@@ -212,7 +213,7 @@ public:
// Unless we can prove that the two Constants are different, we must
// move to overdefined.
- // FIXME: use TargetData/TargetLibraryInfo for smarter constant folding.
+ // FIXME: use DataLayout/TargetLibraryInfo for smarter constant folding.
if (ConstantInt *Res = dyn_cast<ConstantInt>(
ConstantFoldCompareInstOperands(CmpInst::ICMP_NE,
getConstant(),
@@ -238,7 +239,7 @@ public:
// Unless we can prove that the two Constants are different, we must
// move to overdefined.
- // FIXME: use TargetData/TargetLibraryInfo for smarter constant folding.
+ // FIXME: use DataLayout/TargetLibraryInfo for smarter constant folding.
if (ConstantInt *Res = dyn_cast<ConstantInt>(
ConstantFoldCompareInstOperands(CmpInst::ICMP_NE,
getNotConstant(),
@@ -294,7 +295,7 @@ raw_ostream &operator<<(raw_ostream &OS, const LVILatticeVal &Val) {
//===----------------------------------------------------------------------===//
namespace {
- /// LVIValueHandle - A callback value handle update the cache when
+ /// LVIValueHandle - A callback value handle updates the cache when
/// values are erased.
class LazyValueInfoCache;
struct LVIValueHandle : public CallbackVH {
@@ -470,8 +471,10 @@ bool LazyValueInfoCache::hasBlockValue(Value *Val, BasicBlock *BB) {
return true;
LVIValueHandle ValHandle(Val, this);
- if (!ValueCache.count(ValHandle)) return false;
- return ValueCache[ValHandle].count(BB);
+ std::map<LVIValueHandle, ValueCacheEntryTy>::iterator I =
+ ValueCache.find(ValHandle);
+ if (I == ValueCache.end()) return false;
+ return I->second.count(BB);
}
LVILatticeVal LazyValueInfoCache::getBlockValue(Value *Val, BasicBlock *BB) {
@@ -555,13 +558,11 @@ bool LazyValueInfoCache::solveBlockValue(Value *Val, BasicBlock *BB) {
static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
if (LoadInst *L = dyn_cast<LoadInst>(I)) {
return L->getPointerAddressSpace() == 0 &&
- GetUnderlyingObject(L->getPointerOperand()) ==
- GetUnderlyingObject(Ptr);
+ GetUnderlyingObject(L->getPointerOperand()) == Ptr;
}
if (StoreInst *S = dyn_cast<StoreInst>(I)) {
return S->getPointerAddressSpace() == 0 &&
- GetUnderlyingObject(S->getPointerOperand()) ==
- GetUnderlyingObject(Ptr);
+ GetUnderlyingObject(S->getPointerOperand()) == Ptr;
}
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
if (MI->isVolatile()) return false;
@@ -571,11 +572,11 @@ static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
if (!Len || Len->isZero()) return false;
if (MI->getDestAddressSpace() == 0)
- if (MI->getRawDest() == Ptr || MI->getDest() == Ptr)
+ if (GetUnderlyingObject(MI->getRawDest()) == Ptr)
return true;
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
if (MTI->getSourceAddressSpace() == 0)
- if (MTI->getRawSource() == Ptr || MTI->getSource() == Ptr)
+ if (GetUnderlyingObject(MTI->getRawSource()) == Ptr)
return true;
}
return false;
@@ -589,13 +590,19 @@ bool LazyValueInfoCache::solveBlockValueNonLocal(LVILatticeVal &BBLV,
// then we know that the pointer can't be NULL.
bool NotNull = false;
if (Val->getType()->isPointerTy()) {
- if (isa<AllocaInst>(Val)) {
+ if (isKnownNonNull(Val)) {
NotNull = true;
} else {
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();BI != BE;++BI){
- if (InstructionDereferencesPointer(BI, Val)) {
- NotNull = true;
- break;
+ Value *UnderlyingVal = GetUnderlyingObject(Val);
+ // If 'GetUnderlyingObject' didn't converge, skip it. It won't converge
+ // inside InstructionDereferencesPointer either.
+ if (UnderlyingVal == GetUnderlyingObject(UnderlyingVal, NULL, 1)) {
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE; ++BI) {
+ if (InstructionDereferencesPointer(BI, UnderlyingVal)) {
+ NotNull = true;
+ break;
+ }
}
}
}
@@ -845,9 +852,12 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
ConstantRange EdgeVal(i.getCaseValue()->getValue());
- if (DefaultCase)
- EdgesVals = EdgesVals.difference(EdgeVal);
- else if (i.getCaseSuccessor() == BBTo)
+ if (DefaultCase) {
+ // It is possible that the default destination is the destination of
+ // some cases. There is no need to perform difference for those cases.
+ if (i.getCaseSuccessor() != BBTo)
+ EdgesVals = EdgesVals.difference(EdgeVal);
+ } else if (i.getCaseSuccessor() == BBTo)
EdgesVals = EdgesVals.unionWith(EdgeVal);
}
Result = LVILatticeVal::getRange(EdgesVals);
@@ -1004,7 +1014,7 @@ bool LazyValueInfo::runOnFunction(Function &F) {
if (PImpl)
getCache(PImpl).clear();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
// Fully lazy.
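The hasBlockValue change earlier in this file trades a count() followed by operator[] for a single find(), so the outer cache map is traversed once per query instead of twice. The same idiom on a plain std::map, for reference:

#include <map>
#include <string>

// One traversal, and no accidental insertion (operator[] would
// default-construct a value for a missing key).
static bool hasEntry(const std::map<std::string, int> &M,
                     const std::string &Key) {
  return M.find(Key) != M.end();
}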
diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp
index 83bdf5286ad7..6d6d580ed19a 100644
--- a/lib/Analysis/Lint.cpp
+++ b/lib/Analysis/Lint.cpp
@@ -43,7 +43,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
@@ -103,7 +103,7 @@ namespace {
Module *Mod;
AliasAnalysis *AA;
DominatorTree *DT;
- TargetData *TD;
+ DataLayout *TD;
TargetLibraryInfo *TLI;
std::string Messages;
@@ -177,7 +177,7 @@ bool Lint::runOnFunction(Function &F) {
Mod = F.getParent();
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<DominatorTree>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
visit(F);
dbgs() << MessagesStr.str();
@@ -411,14 +411,50 @@ void Lint::visitMemoryReference(Instruction &I,
"Undefined behavior: Branch to non-blockaddress", &I);
}
+ // Check for buffer overflows and misalignment.
if (TD) {
- if (Align == 0 && Ty) Align = TD->getABITypeAlignment(Ty);
+ // Only handles memory references that read/write something simple like an
+ // alloca instruction or a global variable.
+ int64_t Offset = 0;
+ if (Value *Base = GetPointerBaseWithConstantOffset(Ptr, Offset, *TD)) {
+ // OK, so the access is to a constant offset from Ptr. Check that Ptr is
+ // something we can handle and if so extract the size of this base object
+ // along with its alignment.
+ uint64_t BaseSize = AliasAnalysis::UnknownSize;
+ unsigned BaseAlign = 0;
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) {
+ Type *ATy = AI->getAllocatedType();
+ if (!AI->isArrayAllocation() && ATy->isSized())
+ BaseSize = TD->getTypeAllocSize(ATy);
+ BaseAlign = AI->getAlignment();
+ if (BaseAlign == 0 && ATy->isSized())
+ BaseAlign = TD->getABITypeAlignment(ATy);
+ } else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+ // If the global may be defined differently in another compilation unit
+ // then don't warn about funky memory accesses.
+ if (GV->hasDefinitiveInitializer()) {
+ Type *GTy = GV->getType()->getElementType();
+ if (GTy->isSized())
+ BaseSize = TD->getTypeAllocSize(GTy);
+ BaseAlign = GV->getAlignment();
+ if (BaseAlign == 0 && GTy->isSized())
+ BaseAlign = TD->getABITypeAlignment(GTy);
+ }
+ }
- if (Align != 0) {
- unsigned BitWidth = TD->getTypeSizeInBits(Ptr->getType());
- APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
- ComputeMaskedBits(Ptr, KnownZero, KnownOne, TD);
- Assert1(!(KnownOne & APInt::getLowBitsSet(BitWidth, Log2_32(Align))),
+ // Accesses from before the start or after the end of the object are not
+ // defined.
+ Assert1(Size == AliasAnalysis::UnknownSize ||
+ BaseSize == AliasAnalysis::UnknownSize ||
+ (Offset >= 0 && Offset + Size <= BaseSize),
+ "Undefined behavior: Buffer overflow", &I);
+
+ // Accesses that say that the memory is more aligned than it is are not
+ // defined.
+ if (Align == 0 && Ty && Ty->isSized())
+ Align = TD->getABITypeAlignment(Ty);
+ Assert1(!BaseAlign || Align <= MinAlign(BaseAlign, Offset),
"Undefined behavior: Memory reference address is misaligned", &I);
}
}
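The new Lint checks assert two properties of a constant-offset access into a known base object: the byte range [Offset, Offset + Size) must lie inside the object, and the access's claimed alignment must not exceed what the base alignment, shifted by the offset, can guarantee (hence MinAlign(BaseAlign, Offset)). The bounds half, restated as a standalone predicate with AliasAnalysis::UnknownSize passed in as Unknown:

#include <stdint.h>

// Sketch: true when the access is either uncheckable or provably in bounds.
static bool accessInBounds(uint64_t Size, uint64_t BaseSize, int64_t Offset,
                           uint64_t Unknown) {
  if (Size == Unknown || BaseSize == Unknown)
    return true;               // unknown sizes get the benefit of the doubt
  return Offset >= 0 && (uint64_t)Offset + Size <= BaseSize;
}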
@@ -470,7 +506,7 @@ void Lint::visitShl(BinaryOperator &I) {
"Undefined result: Shift count out of range", &I);
}
-static bool isZero(Value *V, TargetData *TD) {
+static bool isZero(Value *V, DataLayout *TD) {
// Assume undef could be zero.
if (isa<UndefValue>(V)) return true;
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index 873a27543dd6..73aa8b49cda5 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -13,7 +13,7 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/GlobalAlias.h"
#include "llvm/GlobalVariable.h"
#include "llvm/IntrinsicInst.h"
@@ -52,8 +52,8 @@ static bool AreEquivalentAddressValues(const Value *A, const Value *B) {
/// bitcasts to get back to the underlying object being addressed, keeping
/// track of the offset in bytes from the GEPs relative to the result.
/// This is closely related to GetUnderlyingObject but is located
-/// here to avoid making VMCore depend on TargetData.
-static Value *getUnderlyingObjectWithOffset(Value *V, const TargetData *TD,
+/// here to avoid making VMCore depend on DataLayout.
+static Value *getUnderlyingObjectWithOffset(Value *V, const DataLayout *TD,
uint64_t &ByteOffset,
unsigned MaxLookup = 6) {
if (!V->getType()->isPointerTy())
@@ -85,7 +85,7 @@ static Value *getUnderlyingObjectWithOffset(Value *V, const TargetData *TD,
/// specified pointer, we do a quick local scan of the basic block containing
/// ScanFrom, to determine if the address is already accessed.
bool llvm::isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom,
- unsigned Align, const TargetData *TD) {
+ unsigned Align, const DataLayout *TD) {
uint64_t ByteOffset = 0;
Value *Base = V;
if (TD)
diff --git a/lib/Analysis/LoopDependenceAnalysis.cpp b/lib/Analysis/LoopDependenceAnalysis.cpp
deleted file mode 100644
index 463269d9d984..000000000000
--- a/lib/Analysis/LoopDependenceAnalysis.cpp
+++ /dev/null
@@ -1,362 +0,0 @@
-//===- LoopDependenceAnalysis.cpp - LDA Implementation ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is the (beginning) of an implementation of a loop dependence analysis
-// framework, which is used to detect dependences in memory accesses in loops.
-//
-// Please note that this is work in progress and the interface is subject to
-// change.
-//
-// TODO: adapt as implementation progresses.
-//
-// TODO: document lingo (pair, subscript, index)
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "lda"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/LoopDependenceAnalysis.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/Instructions.h"
-#include "llvm/Operator.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
-using namespace llvm;
-
-STATISTIC(NumAnswered, "Number of dependence queries answered");
-STATISTIC(NumAnalysed, "Number of distinct dependence pairs analysed");
-STATISTIC(NumDependent, "Number of pairs with dependent accesses");
-STATISTIC(NumIndependent, "Number of pairs with independent accesses");
-STATISTIC(NumUnknown, "Number of pairs with unknown accesses");
-
-LoopPass *llvm::createLoopDependenceAnalysisPass() {
- return new LoopDependenceAnalysis();
-}
-
-INITIALIZE_PASS_BEGIN(LoopDependenceAnalysis, "lda",
- "Loop Dependence Analysis", false, true)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
-INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
-INITIALIZE_PASS_END(LoopDependenceAnalysis, "lda",
- "Loop Dependence Analysis", false, true)
-char LoopDependenceAnalysis::ID = 0;
-
-//===----------------------------------------------------------------------===//
-// Utility Functions
-//===----------------------------------------------------------------------===//
-
-static inline bool IsMemRefInstr(const Value *V) {
- const Instruction *I = dyn_cast<const Instruction>(V);
- return I && (I->mayReadFromMemory() || I->mayWriteToMemory());
-}
-
-static void GetMemRefInstrs(const Loop *L,
- SmallVectorImpl<Instruction*> &Memrefs) {
- for (Loop::block_iterator b = L->block_begin(), be = L->block_end();
- b != be; ++b)
- for (BasicBlock::iterator i = (*b)->begin(), ie = (*b)->end();
- i != ie; ++i)
- if (IsMemRefInstr(i))
- Memrefs.push_back(i);
-}
-
-static bool IsLoadOrStoreInst(Value *I) {
- // Returns true if the load or store can be analyzed. Atomic and volatile
- // operations have properties which this analysis does not understand.
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isUnordered();
- else if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
- return false;
-}
-
-static Value *GetPointerOperand(Value *I) {
- if (LoadInst *i = dyn_cast<LoadInst>(I))
- return i->getPointerOperand();
- if (StoreInst *i = dyn_cast<StoreInst>(I))
- return i->getPointerOperand();
- llvm_unreachable("Value is no load or store instruction!");
-}
-
-static AliasAnalysis::AliasResult UnderlyingObjectsAlias(AliasAnalysis *AA,
- const Value *A,
- const Value *B) {
- const Value *aObj = GetUnderlyingObject(A);
- const Value *bObj = GetUnderlyingObject(B);
- return AA->alias(aObj, AA->getTypeStoreSize(aObj->getType()),
- bObj, AA->getTypeStoreSize(bObj->getType()));
-}
-
-static inline const SCEV *GetZeroSCEV(ScalarEvolution *SE) {
- return SE->getConstant(Type::getInt32Ty(SE->getContext()), 0L);
-}
-
-//===----------------------------------------------------------------------===//
-// Dependence Testing
-//===----------------------------------------------------------------------===//
-
-bool LoopDependenceAnalysis::isDependencePair(const Value *A,
- const Value *B) const {
- return IsMemRefInstr(A) &&
- IsMemRefInstr(B) &&
- (cast<const Instruction>(A)->mayWriteToMemory() ||
- cast<const Instruction>(B)->mayWriteToMemory());
-}
-
-bool LoopDependenceAnalysis::findOrInsertDependencePair(Value *A,
- Value *B,
- DependencePair *&P) {
- void *insertPos = 0;
- FoldingSetNodeID id;
- id.AddPointer(A);
- id.AddPointer(B);
-
- P = Pairs.FindNodeOrInsertPos(id, insertPos);
- if (P) return true;
-
- P = new (PairAllocator) DependencePair(id, A, B);
- Pairs.InsertNode(P, insertPos);
- return false;
-}
-
-void LoopDependenceAnalysis::getLoops(const SCEV *S,
- DenseSet<const Loop*>* Loops) const {
- // Refactor this into an SCEVVisitor, if efficiency becomes a concern.
- for (const Loop *L = this->L; L != 0; L = L->getParentLoop())
- if (!SE->isLoopInvariant(S, L))
- Loops->insert(L);
-}
-
-bool LoopDependenceAnalysis::isLoopInvariant(const SCEV *S) const {
- DenseSet<const Loop*> loops;
- getLoops(S, &loops);
- return loops.empty();
-}
-
-bool LoopDependenceAnalysis::isAffine(const SCEV *S) const {
- const SCEVAddRecExpr *rec = dyn_cast<SCEVAddRecExpr>(S);
- return isLoopInvariant(S) || (rec && rec->isAffine());
-}
-
-bool LoopDependenceAnalysis::isZIVPair(const SCEV *A, const SCEV *B) const {
- return isLoopInvariant(A) && isLoopInvariant(B);
-}
-
-bool LoopDependenceAnalysis::isSIVPair(const SCEV *A, const SCEV *B) const {
- DenseSet<const Loop*> loops;
- getLoops(A, &loops);
- getLoops(B, &loops);
- return loops.size() == 1;
-}
-
-LoopDependenceAnalysis::DependenceResult
-LoopDependenceAnalysis::analyseZIV(const SCEV *A,
- const SCEV *B,
- Subscript *S) const {
- assert(isZIVPair(A, B) && "Attempted to ZIV-test non-ZIV SCEVs!");
- return A == B ? Dependent : Independent;
-}
-
-LoopDependenceAnalysis::DependenceResult
-LoopDependenceAnalysis::analyseSIV(const SCEV *A,
- const SCEV *B,
- Subscript *S) const {
- return Unknown; // TODO: Implement.
-}
-
-LoopDependenceAnalysis::DependenceResult
-LoopDependenceAnalysis::analyseMIV(const SCEV *A,
- const SCEV *B,
- Subscript *S) const {
- return Unknown; // TODO: Implement.
-}
-
-LoopDependenceAnalysis::DependenceResult
-LoopDependenceAnalysis::analyseSubscript(const SCEV *A,
- const SCEV *B,
- Subscript *S) const {
- DEBUG(dbgs() << " Testing subscript: " << *A << ", " << *B << "\n");
-
- if (A == B) {
- DEBUG(dbgs() << " -> [D] same SCEV\n");
- return Dependent;
- }
-
- if (!isAffine(A) || !isAffine(B)) {
- DEBUG(dbgs() << " -> [?] not affine\n");
- return Unknown;
- }
-
- if (isZIVPair(A, B))
- return analyseZIV(A, B, S);
-
- if (isSIVPair(A, B))
- return analyseSIV(A, B, S);
-
- return analyseMIV(A, B, S);
-}
-
-LoopDependenceAnalysis::DependenceResult
-LoopDependenceAnalysis::analysePair(DependencePair *P) const {
- DEBUG(dbgs() << "Analysing:\n" << *P->A << "\n" << *P->B << "\n");
-
- // We only analyse loads and stores but no possible memory accesses by e.g.
- // free, call, or invoke instructions.
- if (!IsLoadOrStoreInst(P->A) || !IsLoadOrStoreInst(P->B)) {
- DEBUG(dbgs() << "--> [?] no load/store\n");
- return Unknown;
- }
-
- Value *aPtr = GetPointerOperand(P->A);
- Value *bPtr = GetPointerOperand(P->B);
-
- switch (UnderlyingObjectsAlias(AA, aPtr, bPtr)) {
- case AliasAnalysis::MayAlias:
- case AliasAnalysis::PartialAlias:
- // We can not analyse objects if we do not know about their aliasing.
- DEBUG(dbgs() << "---> [?] may alias\n");
- return Unknown;
-
- case AliasAnalysis::NoAlias:
- // If the objects noalias, they are distinct, accesses are independent.
- DEBUG(dbgs() << "---> [I] no alias\n");
- return Independent;
-
- case AliasAnalysis::MustAlias:
- break; // The underlying objects alias, test accesses for dependence.
- }
-
- const GEPOperator *aGEP = dyn_cast<GEPOperator>(aPtr);
- const GEPOperator *bGEP = dyn_cast<GEPOperator>(bPtr);
-
- if (!aGEP || !bGEP)
- return Unknown;
-
- // FIXME: Is filtering coupled subscripts necessary?
-
- // Collect GEP operand pairs (FIXME: use GetGEPOperands from BasicAA), adding
- // trailing zeroes to the smaller GEP, if needed.
- typedef SmallVector<std::pair<const SCEV*, const SCEV*>, 4> GEPOpdPairsTy;
- GEPOpdPairsTy opds;
- for(GEPOperator::const_op_iterator aIdx = aGEP->idx_begin(),
- aEnd = aGEP->idx_end(),
- bIdx = bGEP->idx_begin(),
- bEnd = bGEP->idx_end();
- aIdx != aEnd && bIdx != bEnd;
- aIdx += (aIdx != aEnd), bIdx += (bIdx != bEnd)) {
- const SCEV* aSCEV = (aIdx != aEnd) ? SE->getSCEV(*aIdx) : GetZeroSCEV(SE);
- const SCEV* bSCEV = (bIdx != bEnd) ? SE->getSCEV(*bIdx) : GetZeroSCEV(SE);
- opds.push_back(std::make_pair(aSCEV, bSCEV));
- }
-
- if (!opds.empty() && opds[0].first != opds[0].second) {
- // We cannot (yet) handle arbitrary GEP pointer offsets. By limiting
- //
- // TODO: this could be relaxed by adding the size of the underlying object
- // to the first subscript. If we have e.g. (GEP x,0,i; GEP x,2,-i) and we
- // know that x is a [100 x i8]*, we could modify the first subscript to be
- // (i, 200-i) instead of (i, -i).
- return Unknown;
- }
-
- // Now analyse the collected operand pairs (skipping the GEP ptr offsets).
- for (GEPOpdPairsTy::const_iterator i = opds.begin() + 1, end = opds.end();
- i != end; ++i) {
- Subscript subscript;
- DependenceResult result = analyseSubscript(i->first, i->second, &subscript);
- if (result != Dependent) {
- // We either proved independence or failed to analyse this subscript.
- // Further subscripts will not improve the situation, so abort early.
- return result;
- }
- P->Subscripts.push_back(subscript);
- }
- // We successfully analysed all subscripts but failed to prove independence.
- return Dependent;
-}
-
-bool LoopDependenceAnalysis::depends(Value *A, Value *B) {
- assert(isDependencePair(A, B) && "Values form no dependence pair!");
- ++NumAnswered;
-
- DependencePair *p;
- if (!findOrInsertDependencePair(A, B, p)) {
- // The pair is not cached, so analyse it.
- ++NumAnalysed;
- switch (p->Result = analysePair(p)) {
- case Dependent: ++NumDependent; break;
- case Independent: ++NumIndependent; break;
- case Unknown: ++NumUnknown; break;
- }
- }
- return p->Result != Independent;
-}
-
-//===----------------------------------------------------------------------===//
-// LoopDependenceAnalysis Implementation
-//===----------------------------------------------------------------------===//
-
-bool LoopDependenceAnalysis::runOnLoop(Loop *L, LPPassManager &) {
- this->L = L;
- AA = &getAnalysis<AliasAnalysis>();
- SE = &getAnalysis<ScalarEvolution>();
- return false;
-}
-
-void LoopDependenceAnalysis::releaseMemory() {
- Pairs.clear();
- PairAllocator.Reset();
-}
-
-void LoopDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequiredTransitive<AliasAnalysis>();
- AU.addRequiredTransitive<ScalarEvolution>();
-}
-
-static void PrintLoopInfo(raw_ostream &OS,
- LoopDependenceAnalysis *LDA, const Loop *L) {
- if (!L->empty()) return; // ignore non-innermost loops
-
- SmallVector<Instruction*, 8> memrefs;
- GetMemRefInstrs(L, memrefs);
-
- OS << "Loop at depth " << L->getLoopDepth() << ", header block: ";
- WriteAsOperand(OS, L->getHeader(), false);
- OS << "\n";
-
- OS << " Load/store instructions: " << memrefs.size() << "\n";
- for (SmallVector<Instruction*, 8>::const_iterator x = memrefs.begin(),
- end = memrefs.end(); x != end; ++x)
- OS << "\t" << (x - memrefs.begin()) << ": " << **x << "\n";
-
- OS << " Pairwise dependence results:\n";
- for (SmallVector<Instruction*, 8>::const_iterator x = memrefs.begin(),
- end = memrefs.end(); x != end; ++x)
- for (SmallVector<Instruction*, 8>::const_iterator y = x + 1;
- y != end; ++y)
- if (LDA->isDependencePair(*x, *y))
- OS << "\t" << (x - memrefs.begin()) << "," << (y - memrefs.begin())
- << ": " << (LDA->depends(*x, *y) ? "dependent" : "independent")
- << "\n";
-}
-
-void LoopDependenceAnalysis::print(raw_ostream &OS, const Module*) const {
- // TODO: doc why const_cast is safe
- PrintLoopInfo(OS, const_cast<LoopDependenceAnalysis*>(this), this->L);
-}
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 20c33a3d9d61..8341f9d83055 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -306,9 +306,11 @@ BasicBlock *Loop::getUniqueExitBlock() const {
return 0;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Loop::dump() const {
print(dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// UnloopUpdater implementation
@@ -429,8 +431,8 @@ void UnloopUpdater::updateSubloopParents() {
Unloop->removeChildLoop(llvm::prior(Unloop->end()));
assert(SubloopParents.count(Subloop) && "DFS failed to visit subloop");
- if (SubloopParents[Subloop])
- SubloopParents[Subloop]->addChildLoop(Subloop);
+ if (Loop *Parent = SubloopParents[Subloop])
+ Parent->addChildLoop(Subloop);
else
LI->addTopLevelLoop(Subloop);
}
@@ -456,9 +458,8 @@ Loop *UnloopUpdater::getNearestLoop(BasicBlock *BB, Loop *BBLoop) {
assert(Subloop && "subloop is not an ancestor of the original loop");
}
// Get the current nearest parent of the Subloop exits, initially Unloop.
- if (!SubloopParents.count(Subloop))
- SubloopParents[Subloop] = Unloop;
- NearLoop = SubloopParents[Subloop];
+ NearLoop =
+ SubloopParents.insert(std::make_pair(Subloop, Unloop)).first->second;
}
succ_iterator I = succ_begin(BB), E = succ_end(BB);
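Both LoopInfo cleanups above remove redundant map lookups: the first caches the SubloopParents[Subloop] result in a local before branching on it, and the second uses insert(), whose return value points at the existing entry when the key is already present, so the lookup and the default-initialization to Unloop happen in one step. The insert() idiom on a plain std::map:

#include <map>
#include <utility>

// Returns a reference to M[Key], inserting {Key, Default} only if Key was
// absent -- a single lookup either way.
static int &lookupOrInit(std::map<int, int> &M, int Key, int Default) {
  return M.insert(std::make_pair(Key, Default)).first->second;
}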
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index e77d2ff9e44e..0a539fe75825 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -25,7 +25,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -39,7 +40,7 @@ enum AllocType {
};
struct AllocFnsTy {
- const char *Name;
+ LibFunc::Func Func;
AllocType AllocTy;
unsigned char NumParams;
// First and Second size parameters (or -1 if unused)
@@ -49,22 +50,22 @@ struct AllocFnsTy {
// FIXME: certain users need more information. E.g., SimplifyLibCalls needs to
// know which functions are nounwind, noalias, nocapture parameters, etc.
static const AllocFnsTy AllocationFnData[] = {
- {"malloc", MallocLike, 1, 0, -1},
- {"valloc", MallocLike, 1, 0, -1},
- {"_Znwj", MallocLike, 1, 0, -1}, // new(unsigned int)
- {"_ZnwjRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
- {"_Znwm", MallocLike, 1, 0, -1}, // new(unsigned long)
- {"_ZnwmRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
- {"_Znaj", MallocLike, 1, 0, -1}, // new[](unsigned int)
- {"_ZnajRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
- {"_Znam", MallocLike, 1, 0, -1}, // new[](unsigned long)
- {"_ZnamRKSt9nothrow_t", MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
- {"posix_memalign", MallocLike, 3, 2, -1},
- {"calloc", CallocLike, 2, 0, 1},
- {"realloc", ReallocLike, 2, 1, -1},
- {"reallocf", ReallocLike, 2, 1, -1},
- {"strdup", StrDupLike, 1, -1, -1},
- {"strndup", StrDupLike, 2, 1, -1}
+ {LibFunc::malloc, MallocLike, 1, 0, -1},
+ {LibFunc::valloc, MallocLike, 1, 0, -1},
+ {LibFunc::Znwj, MallocLike, 1, 0, -1}, // new(unsigned int)
+ {LibFunc::ZnwjRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned int, nothrow)
+ {LibFunc::Znwm, MallocLike, 1, 0, -1}, // new(unsigned long)
+ {LibFunc::ZnwmRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new(unsigned long, nothrow)
+ {LibFunc::Znaj, MallocLike, 1, 0, -1}, // new[](unsigned int)
+ {LibFunc::ZnajRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned int, nothrow)
+ {LibFunc::Znam, MallocLike, 1, 0, -1}, // new[](unsigned long)
+ {LibFunc::ZnamRKSt9nothrow_t, MallocLike, 2, 0, -1}, // new[](unsigned long, nothrow)
+ {LibFunc::posix_memalign, MallocLike, 3, 2, -1},
+ {LibFunc::calloc, CallocLike, 2, 0, 1},
+ {LibFunc::realloc, ReallocLike, 2, 1, -1},
+ {LibFunc::reallocf, ReallocLike, 2, 1, -1},
+ {LibFunc::strdup, StrDupLike, 1, -1, -1},
+ {LibFunc::strndup, StrDupLike, 2, 1, -1}
};
@@ -85,15 +86,22 @@ static Function *getCalledFunction(const Value *V, bool LookThroughBitCast) {
/// \brief Returns the allocation data for the given value if it is a call to a
/// known allocation function, and NULL otherwise.
static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
+ const TargetLibraryInfo *TLI,
bool LookThroughBitCast = false) {
Function *Callee = getCalledFunction(V, LookThroughBitCast);
if (!Callee)
return 0;
+ // Make sure that the function is available.
+ StringRef FnName = Callee->getName();
+ LibFunc::Func TLIFn;
+ if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
+ return 0;
+
unsigned i = 0;
bool found = false;
for ( ; i < array_lengthof(AllocationFnData); ++i) {
- if (Callee->getName() == AllocationFnData[i].Name) {
+ if (AllocationFnData[i].Func == TLIFn) {
found = true;
break;
}
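With the allocation table now keyed by LibFunc enumerators rather than raw names, every classifier first asks TargetLibraryInfo whether the symbol really is that library function on the current target. That gate, pulled out as a reusable predicate (a sketch; the real code above inlines it):

#include "llvm/ADT/StringRef.h"
#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;

// False when TLI is missing, when the name is not a known libcall, or when
// the call is marked unavailable (for example, disabled by the frontend).
static bool recognizedLibCall(StringRef FnName, const TargetLibraryInfo *TLI,
                              LibFunc::Func &TLIFn) {
  return TLI && TLI->getLibFunc(FnName, TLIFn) && TLI->has(TLIFn);
}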
@@ -106,7 +114,6 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
return 0;
// Check function prototype.
- // FIXME: Check the nobuiltin metadata?? (PR5130)
int FstParam = FnData->FstParam;
int SndParam = FnData->SndParam;
FunctionType *FTy = Callee->getFunctionType();
@@ -125,64 +132,72 @@ static const AllocFnsTy *getAllocationData(const Value *V, AllocType AllocTy,
static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
- return CS && CS.hasFnAttr(Attribute::NoAlias);
+ return CS && CS.hasFnAttr(Attributes::NoAlias);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
/// like).
-bool llvm::isAllocationFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, AnyAlloc, LookThroughBitCast);
+bool llvm::isAllocationFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, AnyAlloc, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a function that returns a
/// NoAlias pointer (including malloc/calloc/realloc/strdup-like functions).
-bool llvm::isNoAliasFn(const Value *V, bool LookThroughBitCast) {
+bool llvm::isNoAliasFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
// it's safe to consider realloc as noalias since accessing the original
// pointer is undefined behavior
- return isAllocationFn(V, LookThroughBitCast) ||
+ return isAllocationFn(V, TLI, LookThroughBitCast) ||
hasNoAliasAttr(V, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates uninitialized memory (such as malloc).
-bool llvm::isMallocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, MallocLike, LookThroughBitCast);
+bool llvm::isMallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, MallocLike, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
-bool llvm::isCallocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, CallocLike, LookThroughBitCast);
+bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, CallocLike, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// allocates memory (either malloc, calloc, or strdup like).
-bool llvm::isAllocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, AllocLike, LookThroughBitCast);
+bool llvm::isAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, AllocLike, TLI, LookThroughBitCast);
}
/// \brief Tests if a value is a call or invoke to a library function that
/// reallocates memory (such as realloc).
-bool llvm::isReallocLikeFn(const Value *V, bool LookThroughBitCast) {
- return getAllocationData(V, ReallocLike, LookThroughBitCast);
+bool llvm::isReallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, ReallocLike, TLI, LookThroughBitCast);
}
/// extractMallocCall - Returns the corresponding CallInst if the instruction
/// is a malloc call. Since CallInst::CreateMalloc() only creates calls, we
/// ignore InvokeInst here.
-const CallInst *llvm::extractMallocCall(const Value *I) {
- return isMallocLikeFn(I) ? dyn_cast<CallInst>(I) : 0;
+const CallInst *llvm::extractMallocCall(const Value *I,
+ const TargetLibraryInfo *TLI) {
+ return isMallocLikeFn(I, TLI) ? dyn_cast<CallInst>(I) : 0;
}
-static Value *computeArraySize(const CallInst *CI, const TargetData *TD,
+static Value *computeArraySize(const CallInst *CI, const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
bool LookThroughSExt = false) {
if (!CI)
return NULL;
// The size of the malloc's result type must be known to determine array size.
- Type *T = getMallocAllocatedType(CI);
+ Type *T = getMallocAllocatedType(CI, TLI);
if (!T || !T->isSized() || !TD)
return NULL;
@@ -204,9 +219,11 @@ static Value *computeArraySize(const CallInst *CI, const TargetData *TD,
/// isArrayMalloc - Returns the corresponding CallInst if the instruction
/// is a call to malloc whose array size can be determined and the array size
/// is not constant 1. Otherwise, return NULL.
-const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) {
- const CallInst *CI = extractMallocCall(I);
- Value *ArraySize = computeArraySize(CI, TD);
+const CallInst *llvm::isArrayMalloc(const Value *I,
+ const DataLayout *TD,
+ const TargetLibraryInfo *TLI) {
+ const CallInst *CI = extractMallocCall(I, TLI);
+ Value *ArraySize = computeArraySize(CI, TD, TLI);
if (ArraySize &&
ArraySize != ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
@@ -221,8 +238,9 @@ const CallInst *llvm::isArrayMalloc(const Value *I, const TargetData *TD) {
/// 0: PointerType is the calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-PointerType *llvm::getMallocType(const CallInst *CI) {
- assert(isMallocLikeFn(CI) && "getMallocType and not malloc call");
+PointerType *llvm::getMallocType(const CallInst *CI,
+ const TargetLibraryInfo *TLI) {
+ assert(isMallocLikeFn(CI, TLI) && "getMallocType and not malloc call");
PointerType *MallocType = NULL;
unsigned NumOfBitCastUses = 0;
@@ -252,8 +270,9 @@ PointerType *llvm::getMallocType(const CallInst *CI) {
/// 0: PointerType is the malloc calls' return type.
/// 1: PointerType is the bitcast's result type.
/// >1: Unique PointerType cannot be determined, return NULL.
-Type *llvm::getMallocAllocatedType(const CallInst *CI) {
- PointerType *PT = getMallocType(CI);
+Type *llvm::getMallocAllocatedType(const CallInst *CI,
+ const TargetLibraryInfo *TLI) {
+ PointerType *PT = getMallocType(CI, TLI);
return PT ? PT->getElementType() : NULL;
}
@@ -262,22 +281,24 @@ Type *llvm::getMallocAllocatedType(const CallInst *CI) {
/// then return that multiple. For non-array mallocs, the multiple is
/// constant 1. Otherwise, return NULL for mallocs whose array size cannot be
/// determined.
-Value *llvm::getMallocArraySize(CallInst *CI, const TargetData *TD,
+Value *llvm::getMallocArraySize(CallInst *CI, const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
bool LookThroughSExt) {
- assert(isMallocLikeFn(CI) && "getMallocArraySize and not malloc call");
- return computeArraySize(CI, TD, LookThroughSExt);
+ assert(isMallocLikeFn(CI, TLI) && "getMallocArraySize and not malloc call");
+ return computeArraySize(CI, TD, TLI, LookThroughSExt);
}
/// extractCallocCall - Returns the corresponding CallInst if the instruction
/// is a calloc call.
-const CallInst *llvm::extractCallocCall(const Value *I) {
- return isCallocLikeFn(I) ? cast<CallInst>(I) : 0;
+const CallInst *llvm::extractCallocCall(const Value *I,
+ const TargetLibraryInfo *TLI) {
+ return isCallocLikeFn(I, TLI) ? cast<CallInst>(I) : 0;
}
/// isFreeCall - Returns non-null if the value is a call to the builtin free()
-const CallInst *llvm::isFreeCall(const Value *I) {
+const CallInst *llvm::isFreeCall(const Value *I, const TargetLibraryInfo *TLI) {
const CallInst *CI = dyn_cast<CallInst>(I);
if (!CI)
return 0;
@@ -285,9 +306,14 @@ const CallInst *llvm::isFreeCall(const Value *I) {
if (Callee == 0 || !Callee->isDeclaration())
return 0;
- if (Callee->getName() != "free" &&
- Callee->getName() != "_ZdlPv" && // operator delete(void*)
- Callee->getName() != "_ZdaPv") // operator delete[](void*)
+ StringRef FnName = Callee->getName();
+ LibFunc::Func TLIFn;
+ if (!TLI || !TLI->getLibFunc(FnName, TLIFn) || !TLI->has(TLIFn))
+ return 0;
+
+ if (TLIFn != LibFunc::free &&
+ TLIFn != LibFunc::ZdlPv && // operator delete(void*)
+ TLIFn != LibFunc::ZdaPv) // operator delete[](void*)
return 0;
// Check free prototype.
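A minimal standalone sketch of the pattern the hunk above introduces: a call is only treated as a deallocation if the target's library info says the function is actually available. The LibraryInfo type and helper below are illustrative stand-ins, not the LLVM API.

    #include <set>
    #include <string>

    struct LibraryInfo {
      std::set<std::string> Available;   // functions the target libc is known to provide
      bool has(const std::string &Name) const { return Available.count(Name) != 0; }
    };

    // Recognize a deallocation call only when the runtime provides that function.
    bool isFreeLikeName(const std::string &Callee, const LibraryInfo &TLI) {
      static const char *FreeNames[] = {"free", "_ZdlPv", "_ZdaPv"};
      for (const char *N : FreeNames)
        if (Callee == N && TLI.has(N))
          return true;
      return false;
    }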
@@ -315,12 +341,12 @@ const CallInst *llvm::isFreeCall(const Value *I) {
/// object size in Size if successful, and false otherwise.
/// If RoundToAlign is true, then Size is rounded up to the alignment of allocas,
/// byval arguments, and global variables.
-bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const TargetData *TD,
- bool RoundToAlign) {
+bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout *TD,
+ const TargetLibraryInfo *TLI, bool RoundToAlign) {
if (!TD)
return false;
- ObjectSizeOffsetVisitor Visitor(TD, Ptr->getContext(), RoundToAlign);
+ ObjectSizeOffsetVisitor Visitor(TD, TLI, Ptr->getContext(), RoundToAlign);
SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
if (!Visitor.bothKnown(Data))
return false;
@@ -347,10 +373,11 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
return Size;
}
-ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const TargetData *TD,
+ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
LLVMContext &Context,
bool RoundToAlign)
-: TD(TD), RoundToAlign(RoundToAlign) {
+: TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) {
IntegerType *IntTy = TD->getIntPtrType(Context);
IntTyBits = IntTy->getBitWidth();
Zero = APInt::getNullValue(IntTyBits);
@@ -358,11 +385,16 @@ ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const TargetData *TD,
SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
V = V->stripPointerCasts();
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // If we have already seen this instruction, bail out. Cycles can happen in
+ // unreachable code after constant propagation.
+ if (!SeenInsts.insert(I))
+ return unknown();
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
- return visitGEPOperator(*GEP);
- if (Instruction *I = dyn_cast<Instruction>(V))
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V))
+ return visitGEPOperator(*GEP);
return visit(*I);
+ }
if (Argument *A = dyn_cast<Argument>(V))
return visitArgument(*A);
if (ConstantPointerNull *P = dyn_cast<ConstantPointerNull>(V))
@@ -371,9 +403,12 @@ SizeOffsetType ObjectSizeOffsetVisitor::compute(Value *V) {
return visitGlobalVariable(*GV);
if (UndefValue *UV = dyn_cast<UndefValue>(V))
return visitUndefValue(*UV);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
if (CE->getOpcode() == Instruction::IntToPtr)
return unknown(); // clueless
+ if (CE->getOpcode() == Instruction::GetElementPtr)
+ return visitGEPOperator(cast<GEPOperator>(*CE));
+ }
DEBUG(dbgs() << "ObjectSizeOffsetVisitor::compute() unhandled value: " << *V
<< '\n');
@@ -408,7 +443,8 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
- const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc);
+ const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
+ TLI);
if (!FnData)
return unknown();
@@ -473,10 +509,6 @@ ObjectSizeOffsetVisitor::visitExtractValueInst(ExtractValueInst&) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitGEPOperator(GEPOperator &GEP) {
- // Ignore self-referencing GEPs, they can occur in unreachable code.
- if (&GEP == GEP.getPointerOperand())
- return unknown();
-
SizeOffsetType PtrData = compute(GEP.getPointerOperand());
if (!bothKnown(PtrData) || !GEP.hasAllConstantIndices())
return unknown();
@@ -510,10 +542,6 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitPHINode(PHINode&) {
}
SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
- // ignore malformed self-looping selects
- if (I.getTrueValue() == &I || I.getFalseValue() == &I)
- return unknown();
-
SizeOffsetType TrueSide = compute(I.getTrueValue());
SizeOffsetType FalseSide = compute(I.getFalseValue());
if (bothKnown(TrueSide) && bothKnown(FalseSide) && TrueSide == FalseSide)
@@ -531,10 +559,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) {
}
-ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const TargetData *TD,
+ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
LLVMContext &Context)
-: TD(TD), Context(Context), Builder(Context, TargetFolder(TD)),
-Visitor(TD, Context) {
+: TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) {
IntTy = TD->getIntPtrType(Context);
Zero = ConstantInt::get(IntTy, 0);
}
@@ -559,6 +587,7 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
+ ObjectSizeOffsetVisitor Visitor(TD, TLI, Context);
SizeOffsetType Const = Visitor.compute(V);
if (Visitor.bothKnown(Const))
return std::make_pair(ConstantInt::get(Context, Const.first),
@@ -621,7 +650,8 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
- const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc);
+ const AllocFnsTy *FnData = getAllocationData(CS.getInstruction(), AnyAlloc,
+ TLI);
if (!FnData)
return unknown();
@@ -719,10 +749,6 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitPHINode(PHINode &PHI) {
}
SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitSelectInst(SelectInst &I) {
- // ignore malformed self-looping selects
- if (I.getTrueValue() == &I || I.getFalseValue() == &I)
- return unknown();
-
SizeOffsetEvalType TrueSide = compute_(I.getTrueValue());
SizeOffsetEvalType FalseSide = compute_(I.getFalseValue());
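A short sketch of the RoundToAlign behaviour documented above for getObjectSize: when requested, the computed size is rounded up to the next multiple of the object's power-of-two alignment. This is a standalone illustration, not the ObjectSizeOffsetVisitor code itself.

    #include <cassert>
    #include <cstdint>

    uint64_t roundUpToAlign(uint64_t Size, uint64_t Align) {
      assert(Align != 0 && (Align & (Align - 1)) == 0 && "alignment must be a power of two");
      return (Size + Align - 1) & ~(Align - 1);
    }
    // roundUpToAlign(13, 8) == 16, roundUpToAlign(16, 8) == 16.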
diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp
index 059e57493b9f..987289049455 100644
--- a/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -30,7 +30,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/PredIteratorCache.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
STATISTIC(NumCacheNonLocal, "Number of fully cached non-local responses");
@@ -89,7 +89,7 @@ void MemoryDependenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
bool MemoryDependenceAnalysis::runOnFunction(Function &) {
AA = &getAnalysis<AliasAnalysis>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
DT = getAnalysisIfAvailable<DominatorTree>();
if (PredCache == 0)
PredCache.reset(new PredIteratorCache());
@@ -148,7 +148,7 @@ AliasAnalysis::ModRefResult GetLocation(const Instruction *Inst,
return AliasAnalysis::ModRef;
}
- if (const CallInst *CI = isFreeCall(Inst)) {
+ if (const CallInst *CI = isFreeCall(Inst, AA->getTargetLibraryInfo())) {
// calls to free() deallocate the entire structure
Loc = AliasAnalysis::Location(CI->getArgOperand(0));
return AliasAnalysis::Mod;
@@ -256,7 +256,7 @@ isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc,
const Value *&MemLocBase,
int64_t &MemLocOffs,
const LoadInst *LI,
- const TargetData *TD) {
+ const DataLayout *TD) {
// If we have no target data, we can't do this.
if (TD == 0) return false;
@@ -280,7 +280,7 @@ isLoadLoadClobberIfExtendedToFullWidth(const AliasAnalysis::Location &MemLoc,
unsigned MemoryDependenceAnalysis::
getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
unsigned MemLocSize, const LoadInst *LI,
- const TargetData &TD) {
+ const DataLayout &TD) {
// We can only extend simple integer loads.
if (!isa<IntegerType>(LI->getType()) || !LI->isSimple()) return 0;
@@ -327,12 +327,12 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs,
return 0;
if (LIOffs+NewLoadByteSize > MemLocEnd &&
- LI->getParent()->getParent()->hasFnAttr(Attribute::AddressSafety)) {
+ LI->getParent()->getParent()->getFnAttributes().
+ hasAttribute(Attributes::AddressSafety))
// We will be reading past the location accessed by the original program.
// While this is safe in a regular build, Address Safety analysis tools
// may start reporting false warnings. So, don't do widening.
return 0;
- }
// If a load of this width would include all of MemLoc, then we succeed.
if (LIOffs+NewLoadByteSize >= MemLocEnd)
@@ -479,12 +479,20 @@ getPointerDependencyFrom(const AliasAnalysis::Location &MemLoc, bool isLoad,
// a subsequent bitcast of the malloc call result. There can be stores to
// the malloced memory between the malloc call and its bitcast uses, and we
// need to continue scanning until the malloc call.
- if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst)) {
+ const TargetLibraryInfo *TLI = AA->getTargetLibraryInfo();
+ if (isa<AllocaInst>(Inst) || isNoAliasFn(Inst, TLI)) {
const Value *AccessPtr = GetUnderlyingObject(MemLoc.Ptr, TD);
if (AccessPtr == Inst || AA->isMustAlias(Inst, AccessPtr))
return MemDepResult::getDef(Inst);
- continue;
+ // Be conservative if the accessed pointer may alias the allocation.
+ if (AA->alias(Inst, AccessPtr) != AliasAnalysis::NoAlias)
+ return MemDepResult::getClobber(Inst);
+ // If the allocation is not aliased and does not read memory (like
+ // strdup), it is safe to ignore.
+ if (isa<AllocaInst>(Inst) ||
+ isMallocLikeFn(Inst, TLI) || isCallocLikeFn(Inst, TLI))
+ continue;
}
// See if this instruction (e.g. a call or vaarg) mod/ref's the pointer.
@@ -975,7 +983,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end();
I != E; ++I) {
Visited.insert(std::make_pair(I->getBB(), Addr));
- if (!I->getResult().isNonLocal())
+ if (!I->getResult().isNonLocal() && DT->isReachableFromEntry(I->getBB()))
Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(), Addr));
}
++NumCacheCompleteNonLocalPtr;
@@ -1021,7 +1029,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer,
NumSortedEntries);
// If we got a Def or Clobber, add this to the list of results.
- if (!Dep.isNonLocal()) {
+ if (!Dep.isNonLocal() && DT->isReachableFromEntry(BB)) {
Result.push_back(NonLocalDepResult(BB, Dep, Pointer.getAddr()));
continue;
}
diff --git a/lib/Analysis/NoAliasAnalysis.cpp b/lib/Analysis/NoAliasAnalysis.cpp
index 101c2d5b0285..2eb4137c533a 100644
--- a/lib/Analysis/NoAliasAnalysis.cpp
+++ b/lib/Analysis/NoAliasAnalysis.cpp
@@ -15,7 +15,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
namespace {
@@ -36,7 +36,7 @@ namespace {
virtual void initializePass() {
// Note: NoAA does not call InitializeAliasAnalysis because it's
// special and does not support chaining.
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
}
virtual AliasResult alias(const Location &LocA, const Location &LocB) {
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index 38cb1c91f8f8..c35737e4724c 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -41,6 +41,7 @@ static bool CanPHITrans(Instruction *Inst) {
return false;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void PHITransAddr::dump() const {
if (Addr == 0) {
dbgs() << "PHITransAddr: null\n";
@@ -50,6 +51,7 @@ void PHITransAddr::dump() const {
for (unsigned i = 0, e = InstInputs.size(); i != e; ++i)
dbgs() << " Input #" << i << " is " << *InstInputs[i] << "\n";
}
+#endif
static bool VerifySubExpr(Value *Expr,
diff --git a/lib/Analysis/ProfileDataLoader.cpp b/lib/Analysis/ProfileDataLoader.cpp
new file mode 100644
index 000000000000..a4f634af531e
--- /dev/null
+++ b/lib/Analysis/ProfileDataLoader.cpp
@@ -0,0 +1,155 @@
+//===- ProfileDataLoader.cpp - Load profile information from disk ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ProfileDataLoader class is used to load raw profiling data from the dump
+// file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Module.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Analysis/ProfileDataLoader.h"
+#include "llvm/Analysis/ProfileDataTypes.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include <cstdio>
+#include <cstdlib>
+using namespace llvm;
+
+raw_ostream &llvm::operator<<(raw_ostream &O, std::pair<const BasicBlock *,
+ const BasicBlock *> E) {
+ O << "(";
+
+ if (E.first)
+ O << E.first->getName();
+ else
+ O << "0";
+
+ O << ",";
+
+ if (E.second)
+ O << E.second->getName();
+ else
+ O << "0";
+
+ return O << ")";
+}
+
+/// AddCounts - Add 'A' and 'B', accounting for the fact that the value of one
+/// (or both) may not be defined.
+static unsigned AddCounts(unsigned A, unsigned B) {
+ // If either value is undefined, use the other.
+ // Undefined + undefined = undefined.
+ if (A == ProfileDataLoader::Uncounted) return B;
+ if (B == ProfileDataLoader::Uncounted) return A;
+
+ return A + B;
+}
+
+/// ReadProfilingData - Load 'NumEntries' items of type 'T' from file 'F'
+template <typename T>
+static void ReadProfilingData(const char *ToolName, FILE *F,
+ T *Data, size_t NumEntries) {
+ // Read in the block of data...
+ if (fread(Data, sizeof(T), NumEntries, F) != NumEntries)
+ report_fatal_error(Twine(ToolName) + ": Profiling data truncated");
+}
+
+/// ReadProfilingNumEntries - Read how many entries are in this profiling data
+/// packet.
+static unsigned ReadProfilingNumEntries(const char *ToolName, FILE *F,
+ bool ShouldByteSwap) {
+ unsigned Entry;
+ ReadProfilingData<unsigned>(ToolName, F, &Entry, 1);
+ return ShouldByteSwap ? ByteSwap_32(Entry) : Entry;
+}
+
+/// ReadProfilingBlock - Read the number of entries in the next profiling data
+/// packet and then accumulate the entries into 'Data'.
+static void ReadProfilingBlock(const char *ToolName, FILE *F,
+ bool ShouldByteSwap,
+ SmallVector<unsigned, 32> &Data) {
+ // Read the number of entries...
+ unsigned NumEntries = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
+
+ // Read in the data.
+ SmallVector<unsigned, 8> TempSpace(NumEntries);
+ ReadProfilingData<unsigned>(ToolName, F, TempSpace.data(), NumEntries);
+
+ // Make sure we have enough space ...
+ if (Data.size() < NumEntries)
+ Data.resize(NumEntries, ProfileDataLoader::Uncounted);
+
+ // Accumulate the data we just read into the existing data.
+ for (unsigned i = 0; i < NumEntries; ++i) {
+ unsigned Entry = ShouldByteSwap ? ByteSwap_32(TempSpace[i]) : TempSpace[i];
+ Data[i] = AddCounts(Entry, Data[i]);
+ }
+}
+
+/// ReadProfilingArgBlock - Read the command line arguments that the program was
+/// run with when the current profiling data packet(s) were generated.
+static void ReadProfilingArgBlock(const char *ToolName, FILE *F,
+ bool ShouldByteSwap,
+ SmallVector<std::string, 1> &CommandLines) {
+ // Read the number of bytes ...
+ unsigned ArgLength = ReadProfilingNumEntries(ToolName, F, ShouldByteSwap);
+
+ // Read in the arguments (if there are any to read). Round up the length to
+ // the nearest 4-byte multiple.
+ SmallVector<char, 8> Args(ArgLength+4);
+ if (ArgLength)
+ ReadProfilingData<char>(ToolName, F, Args.data(), (ArgLength+3) & ~3);
+
+ // Store the arguments.
+ CommandLines.push_back(std::string(&Args[0], &Args[ArgLength]));
+}
+
+const unsigned ProfileDataLoader::Uncounted = ~0U;
+
+/// ProfileDataLoader ctor - Read the specified profiling data file, reporting
+/// a fatal error if the file is invalid or broken.
+ProfileDataLoader::ProfileDataLoader(const char *ToolName,
+ const std::string &Filename)
+ : Filename(Filename) {
+ FILE *F = fopen(Filename.c_str(), "rb");
+ if (F == 0)
+ report_fatal_error(Twine(ToolName) + ": Error opening '" +
+ Filename + "': ");
+
+ // Keep reading packets until we run out of them.
+ unsigned PacketType;
+ while (fread(&PacketType, sizeof(unsigned), 1, F) == 1) {
+ // If the low eight bits of the packet are zero, we must be dealing with an
+ // endianness mismatch. Byteswap all words read from the profiling
+ // information. This can happen when the compiler host and target have
+ // different endianness.
+ bool ShouldByteSwap = (char)PacketType == 0;
+ PacketType = ShouldByteSwap ? ByteSwap_32(PacketType) : PacketType;
+
+ switch (PacketType) {
+ case ArgumentInfo:
+ ReadProfilingArgBlock(ToolName, F, ShouldByteSwap, CommandLines);
+ break;
+
+ case EdgeInfo:
+ ReadProfilingBlock(ToolName, F, ShouldByteSwap, EdgeCounts);
+ break;
+
+ default:
+ report_fatal_error(std::string(ToolName)
+ + ": Unknown profiling packet type");
+ break;
+ }
+ }
+
+ fclose(F);
+}
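A standalone sketch of the endianness check used when reading packets above: packet type tags are small positive integers, so a zero low byte indicates the profile was written with the opposite byte order and every word must be byte-swapped. The helper names are illustrative.

    #include <cstdint>

    static uint32_t byteSwap32(uint32_t V) {
      return (V >> 24) | ((V >> 8) & 0xff00u) | ((V << 8) & 0xff0000u) | (V << 24);
    }

    uint32_t normalizePacketType(uint32_t Raw, bool &ShouldByteSwap) {
      ShouldByteSwap = (Raw & 0xffu) == 0;   // low eight bits zero => endianness mismatch
      return ShouldByteSwap ? byteSwap32(Raw) : Raw;
    }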
diff --git a/lib/Analysis/ProfileDataLoaderPass.cpp b/lib/Analysis/ProfileDataLoaderPass.cpp
new file mode 100644
index 000000000000..c43cff05a4da
--- /dev/null
+++ b/lib/Analysis/ProfileDataLoaderPass.cpp
@@ -0,0 +1,188 @@
+//===- ProfileDataLoaderPass.cpp - Set branch weight metadata from prof ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loads profiling data from a dump file and sets branch weight
+// metadata.
+//
+// TODO: Replace all "profile-metadata-loader" strings with "profile-loader"
+// once ProfileInfo etc. has been removed.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "profile-metadata-loader"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Module.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
+#include "llvm/Metadata.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ProfileDataLoader.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Format.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumEdgesRead, "The # of edges read.");
+STATISTIC(NumTermsAnnotated, "The # of terminator instructions annotated.");
+
+static cl::opt<std::string>
+ProfileMetadataFilename("profile-file", cl::init("llvmprof.out"),
+ cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -profile-metadata-loader"));
+
+namespace {
+ /// This pass loads profiling data from a dump file and sets branch weight
+ /// metadata.
+ class ProfileMetadataLoaderPass : public ModulePass {
+ std::string Filename;
+ public:
+ static char ID; // Class identification, replacement for typeinfo
+ explicit ProfileMetadataLoaderPass(const std::string &filename = "")
+ : ModulePass(ID), Filename(filename) {
+ initializeProfileMetadataLoaderPassPass(*PassRegistry::getPassRegistry());
+ if (filename.empty()) Filename = ProfileMetadataFilename;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ virtual const char *getPassName() const {
+ return "Profile loader";
+ }
+
+ virtual void readEdge(unsigned, ProfileData&, ProfileData::Edge,
+ ArrayRef<unsigned>);
+ virtual unsigned matchEdges(Module&, ProfileData&, ArrayRef<unsigned>);
+ virtual void setBranchWeightMetadata(Module&, ProfileData&);
+
+ virtual bool runOnModule(Module &M);
+ };
+} // End of anonymous namespace
+
+char ProfileMetadataLoaderPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ProfileMetadataLoaderPass, "profile-metadata-loader",
+ "Load profile information from llvmprof.out", false, true)
+INITIALIZE_PASS_END(ProfileMetadataLoaderPass, "profile-metadata-loader",
+ "Load profile information from llvmprof.out", false, true)
+
+char &llvm::ProfileMetadataLoaderPassID = ProfileMetadataLoaderPass::ID;
+
+/// createProfileMetadataLoaderPass - This function returns a Pass that loads
+/// the profiling information for the module from the specified filename,
+/// making it available to the optimizers.
+ModulePass *llvm::createProfileMetadataLoaderPass() {
+ return new ProfileMetadataLoaderPass();
+}
+ModulePass *llvm::createProfileMetadataLoaderPass(const std::string &Filename) {
+ return new ProfileMetadataLoaderPass(Filename);
+}
+
+/// readEdge - Take the value from a profile counter and assign it to an edge.
+void ProfileMetadataLoaderPass::readEdge(unsigned ReadCount,
+ ProfileData &PB, ProfileData::Edge e,
+ ArrayRef<unsigned> Counters) {
+ if (ReadCount >= Counters.size()) return;
+
+ unsigned weight = Counters[ReadCount];
+ assert(weight != ProfileDataLoader::Uncounted);
+ PB.addEdgeWeight(e, weight);
+
+ DEBUG(dbgs() << "-- Read Edge Counter for " << e
+ << " (# "<< (ReadCount) << "): "
+ << PB.getEdgeWeight(e) << "\n");
+}
+
+/// matchEdges - Link every profile counter with an edge.
+unsigned ProfileMetadataLoaderPass::matchEdges(Module &M, ProfileData &PB,
+ ArrayRef<unsigned> Counters) {
+ if (Counters.size() == 0) return 0;
+
+ unsigned ReadCount = 0;
+
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->isDeclaration()) continue;
+ DEBUG(dbgs() << "Loading edges in '" << F->getName() << "'\n");
+ readEdge(ReadCount++, PB, PB.getEdge(0, &F->getEntryBlock()), Counters);
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ for (unsigned s = 0, e = TI->getNumSuccessors(); s != e; ++s) {
+ readEdge(ReadCount++, PB, PB.getEdge(BB,TI->getSuccessor(s)),
+ Counters);
+ }
+ }
+ }
+
+ return ReadCount;
+}
+
+/// setBranchWeightMetadata - Translate the counter values associated with each
+/// edge into branch weights for each conditional branch (a branch with 2 or
+/// more destinations).
+void ProfileMetadataLoaderPass::setBranchWeightMetadata(Module &M,
+ ProfileData &PB) {
+ for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+ if (F->isDeclaration()) continue;
+ DEBUG(dbgs() << "Setting branch metadata in '" << F->getName() << "'\n");
+
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ TerminatorInst *TI = BB->getTerminator();
+ unsigned NumSuccessors = TI->getNumSuccessors();
+
+ // If there is only one successor then we cannot set a branch
+ // probability as the target is certain.
+ if (NumSuccessors < 2) continue;
+
+ // Load the weights of all edges leading from this terminator.
+ DEBUG(dbgs() << "-- Terminator with " << NumSuccessors
+ << " successors:\n");
+ SmallVector<uint32_t, 4> Weights(NumSuccessors);
+ for (unsigned s = 0 ; s < NumSuccessors ; ++s) {
+ ProfileData::Edge edge = PB.getEdge(BB, TI->getSuccessor(s));
+ Weights[s] = (uint32_t)PB.getEdgeWeight(edge);
+ DEBUG(dbgs() << "---- Edge '" << edge << "' has weight "
+ << Weights[s] << "\n");
+ }
+
+ // Set branch weight metadata. This will set branch probabilities of
+ // 100%/0% if that is true of the dynamic execution.
+ // BranchProbabilityInfo can account for this when it loads this metadata
+ // (it gives the unexecuted branch a weight of 1 for the purposes of
+ // probability calculations).
+ MDBuilder MDB(TI->getContext());
+ MDNode *Node = MDB.createBranchWeights(Weights);
+ TI->setMetadata(LLVMContext::MD_prof, Node);
+ NumTermsAnnotated++;
+ }
+ }
+}
+
+bool ProfileMetadataLoaderPass::runOnModule(Module &M) {
+ ProfileDataLoader PDL("profile-data-loader", Filename);
+ ProfileData PB;
+
+ ArrayRef<unsigned> Counters = PDL.getRawEdgeCounts();
+
+ unsigned ReadCount = matchEdges(M, PB, Counters);
+
+ if (ReadCount != Counters.size()) {
+ errs() << "WARNING: profile information is inconsistent with "
+ << "the current program!\n";
+ }
+ NumEdgesRead = ReadCount;
+
+ setBranchWeightMetadata(M, PB);
+
+ return ReadCount > 0;
+}
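A rough sketch of the per-terminator weight computation done above: one count is collected per successor and handed to the branch-weight metadata, and blocks with fewer than two successors are skipped because their outcome is certain. Mapping the Uncounted sentinel to zero here is a simplification for illustration; the pass itself asserts that every edge it reads was actually counted.

    #include <cstdint>
    #include <vector>

    static const unsigned Uncounted = ~0u;   // mirrors ProfileDataLoader::Uncounted

    std::vector<uint32_t> edgeCountsToWeights(const std::vector<unsigned> &Counts) {
      std::vector<uint32_t> Weights;
      if (Counts.size() < 2)
        return Weights;                      // single successor: nothing to annotate
      for (unsigned C : Counts)
        Weights.push_back(C == Uncounted ? 0u : static_cast<uint32_t>(C));
      return Weights;
    }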
diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp
index 63468f842612..12b59e0a6fd5 100644
--- a/lib/Analysis/ProfileEstimatorPass.cpp
+++ b/lib/Analysis/ProfileEstimatorPass.cpp
@@ -286,7 +286,7 @@ void ProfileEstimatorPass::recurseBasicBlock(BasicBlock *BB) {
}
}
- double fraction = floor(BBWeight/Edges.size());
+ double fraction = Edges.size() ? floor(BBWeight/Edges.size()) : 0.0;
// Finally we know what flow is still not leaving the block, distribute this
// flow onto the empty edges.
for (SmallVector<Edge, 8>::iterator ei = Edges.begin(), ee = Edges.end();
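A tiny sketch of why the guard added above matters: dividing a double by a zero edge count yields +inf (or NaN for 0/0), which would then be distributed back onto edges as a weight. The guard keeps the per-edge fraction at 0.0 when there are no empty edges left.

    #include <cmath>
    #include <cstddef>

    double remainingFlowPerEdge(double BlockWeight, std::size_t NumEmptyEdges) {
      return NumEmptyEdges ? std::floor(BlockWeight / NumEmptyEdges) : 0.0;
    }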
diff --git a/lib/Analysis/ProfileInfo.cpp b/lib/Analysis/ProfileInfo.cpp
index 173de2c02791..b5b7ac1e5011 100644
--- a/lib/Analysis/ProfileInfo.cpp
+++ b/lib/Analysis/ProfileInfo.cpp
@@ -1016,40 +1016,14 @@ void ProfileInfoT<Function,BasicBlock>::repair(const Function *F) {
}
}
-raw_ostream& operator<<(raw_ostream &O, const Function *F) {
- return O << F->getName();
-}
-
raw_ostream& operator<<(raw_ostream &O, const MachineFunction *MF) {
return O << MF->getFunction()->getName() << "(MF)";
}
-raw_ostream& operator<<(raw_ostream &O, const BasicBlock *BB) {
- return O << BB->getName();
-}
-
raw_ostream& operator<<(raw_ostream &O, const MachineBasicBlock *MBB) {
return O << MBB->getBasicBlock()->getName() << "(MB)";
}
-raw_ostream& operator<<(raw_ostream &O, std::pair<const BasicBlock *, const BasicBlock *> E) {
- O << "(";
-
- if (E.first)
- O << E.first;
- else
- O << "0";
-
- O << ",";
-
- if (E.second)
- O << E.second;
- else
- O << "0";
-
- return O << ")";
-}
-
raw_ostream& operator<<(raw_ostream &O, std::pair<const MachineBasicBlock *, const MachineBasicBlock *> E) {
O << "(";
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 868f4834b7d8..30f0d2f10d86 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -47,7 +47,7 @@ static cl::opt<enum Region::PrintStyle> printStyle("print-region-style",
cl::values(
clEnumValN(Region::PrintNone, "none", "print no details"),
clEnumValN(Region::PrintBB, "bb",
- "print regions in detail with block_node_iterator"),
+ "print regions in detail with block_iterator"),
clEnumValN(Region::PrintRN, "rn",
"print regions in detail with element_iterator"),
clEnumValEnd));
@@ -246,22 +246,6 @@ void Region::verifyRegionNest() const {
verifyRegion();
}
-Region::block_node_iterator Region::block_node_begin() {
- return GraphTraits<FlatIt<Region*> >::nodes_begin(this);
-}
-
-Region::block_node_iterator Region::block_node_end() {
- return GraphTraits<FlatIt<Region*> >::nodes_end(this);
-}
-
-Region::const_block_node_iterator Region::block_node_begin() const {
- return GraphTraits<FlatIt<const Region*> >::nodes_begin(this);
-}
-
-Region::const_block_node_iterator Region::block_node_end() const {
- return GraphTraits<FlatIt<const Region*> >::nodes_end(this);
-}
-
Region::element_iterator Region::element_begin() {
return GraphTraits<Region*>::nodes_begin(this);
}
@@ -425,10 +409,8 @@ void Region::print(raw_ostream &OS, bool print_tree, unsigned level,
OS.indent(level*2 + 2);
if (Style == PrintBB) {
- for (const_block_node_iterator I = block_node_begin(),
- E = block_node_end();
- I != E; ++I)
- OS << **I << ", "; // TODO: remove the last ","
+ for (const_block_iterator I = block_begin(), E = block_end(); I != E; ++I)
+ OS << (*I)->getName() << ", "; // TODO: remove the last ","
} else if (Style == PrintRN) {
for (const_element_iterator I = element_begin(), E = element_end(); I!=E; ++I)
OS << **I << ", "; // TODO: remove the last ","
@@ -445,9 +427,11 @@ void Region::print(raw_ostream &OS, bool print_tree, unsigned level,
OS.indent(level*2) << "} \n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Region::dump() const {
print(dbgs(), true, getDepth(), printStyle.getValue());
}
+#endif
void Region::clearNodeCache() {
// Free the cached nodes.
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index c97b5ebd7d05..9208fa21d7ec 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -195,10 +195,9 @@ public:
virtual bool runOnRegion(Region *R, RGPassManager &RGM) {
Out << Banner;
- for (Region::block_node_iterator I = R->block_node_begin(),
- E = R->block_node_end();
+ for (Region::block_iterator I = R->block_begin(), E = R->block_end();
I != E; ++I)
- (*I)->getEntry()->print(Out);
+ (*I)->print(Out);
return false;
}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index a654648578bc..e3189ecc8994 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -73,7 +73,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ConstantRange.h"
@@ -105,6 +105,11 @@ MaxBruteForceIterations("scalar-evolution-max-iterations", cl::ReallyHidden,
"derived loop"),
cl::init(100));
+// FIXME: Enable this with XDEBUG when the test suite is clean.
+static cl::opt<bool>
+VerifySCEV("verify-scev",
+ cl::desc("Verify ScalarEvolution's backedge taken counts (slow)"));
+
INITIALIZE_PASS_BEGIN(ScalarEvolution, "scalar-evolution",
"Scalar Evolution Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
@@ -122,10 +127,12 @@ char ScalarEvolution::ID = 0;
// Implementation of the SCEV class.
//
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void SCEV::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
void SCEV::print(raw_ostream &OS) const {
switch (getSCEVType()) {
@@ -2580,7 +2587,7 @@ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS,
}
const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) {
- // If we have TargetData, we can bypass creating a target-independent
+ // If we have DataLayout, we can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
if (TD)
@@ -2606,7 +2613,7 @@ const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) {
const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy,
unsigned FieldNo) {
- // If we have TargetData, we can bypass creating a target-independent
+ // If we have DataLayout, we can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
if (TD)
@@ -2671,7 +2678,7 @@ bool ScalarEvolution::isSCEVable(Type *Ty) const {
uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
assert(isSCEVable(Ty) && "Type is not SCEVable!");
- // If we have a TargetData, use it!
+ // If we have a DataLayout, use it!
if (TD)
return TD->getTypeSizeInBits(Ty);
@@ -2679,7 +2686,7 @@ uint64_t ScalarEvolution::getTypeSizeInBits(Type *Ty) const {
if (Ty->isIntegerTy())
return Ty->getPrimitiveSizeInBits();
- // The only other supported type is pointer. Without TargetData, conservatively
+ // The only other supported type is pointer. Without DataLayout, conservatively
// assume pointers are 64-bit.
assert(Ty->isPointerTy() && "isSCEVable permitted a non-SCEVable type!");
return 64;
@@ -2699,7 +2706,7 @@ Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!");
if (TD) return TD->getIntPtrType(getContext());
- // Without TargetData, conservatively assume pointers are 64-bit.
+ // Without DataLayout, conservatively assume pointers are 64-bit.
return Type::getInt64Ty(getContext());
}
@@ -3978,8 +3985,11 @@ getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) {
ConstantInt *Result = MulC->getValue();
- // Guard against huge trip counts.
- if (!Result || Result->getValue().getActiveBits() > 32)
+ // Guard against huge trip counts (this requires checking
+ // for zero to handle the case where the trip count == -1 and the
+ // addition wraps).
+ if (!Result || Result->getValue().getActiveBits() > 32 ||
+ Result->getValue().getActiveBits() == 0)
return 1;
return (unsigned)Result->getZExtValue();
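A standalone sketch of the guard above: the "active bits" of an unsigned value is the index of its highest set bit plus one, so zero active bits means the value itself is zero (as happens when a trip count of -1 wraps to 0 after the +1), and more than 32 means it does not fit the unsigned return type. In either case the conservative multiple of 1 is returned.

    #include <cstdint>

    unsigned activeBits(uint64_t V) {
      unsigned Bits = 0;
      while (V) { ++Bits; V >>= 1; }
      return Bits;
    }

    unsigned clampTripMultiple(uint64_t Result) {
      unsigned Bits = activeBits(Result);
      if (Bits == 0 || Bits > 32)
        return 1;                            // conservative answer
      return static_cast<unsigned>(Result);
    }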
@@ -4749,7 +4759,7 @@ static PHINode *getConstantEvolvingPHI(Value *V, const Loop *L) {
/// reason, return null.
static Constant *EvaluateExpression(Value *V, const Loop *L,
DenseMap<Instruction *, Constant *> &Vals,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI) {
// Convenient constant check, but redundant for recursive calls.
if (Constant *C = dyn_cast<Constant>(V)) return C;
@@ -6141,7 +6151,7 @@ bool ScalarEvolution::isImpliedCond(ICmpInst::Predicate Pred,
return CmpInst::isTrueWhenEqual(Pred);
if (SimplifyICmpOperands(FoundPred, FoundLHS, FoundRHS))
if (FoundLHS == FoundRHS)
- return CmpInst::isFalseWhenEqual(Pred);
+ return CmpInst::isFalseWhenEqual(FoundPred);
// Check to see if we can make the LHS or RHS match.
if (LHS == FoundRHS || RHS == FoundLHS) {
@@ -6588,7 +6598,7 @@ ScalarEvolution::ScalarEvolution()
bool ScalarEvolution::runOnFunction(Function &F) {
this->F = &F;
LI = &getAnalysis<LoopInfo>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
DT = &getAnalysis<DominatorTree>();
return false;
@@ -6930,3 +6940,87 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
UnsignedRanges.erase(S);
SignedRanges.erase(S);
}
+
+typedef DenseMap<const Loop *, std::string> VerifyMap;
+
+/// replaceSubString - Replaces all occurrences of From in Str with To.
+static void replaceSubString(std::string &Str, StringRef From, StringRef To) {
+ size_t Pos = 0;
+ while ((Pos = Str.find(From, Pos)) != std::string::npos) {
+ Str.replace(Pos, From.size(), To.data(), To.size());
+ Pos += To.size();
+ }
+}
+
+/// getLoopBackedgeTakenCounts - Helper method for verifyAnalysis.
+static void
+getLoopBackedgeTakenCounts(Loop *L, VerifyMap &Map, ScalarEvolution &SE) {
+ for (Loop::reverse_iterator I = L->rbegin(), E = L->rend(); I != E; ++I) {
+ getLoopBackedgeTakenCounts(*I, Map, SE); // recurse.
+
+ std::string &S = Map[L];
+ if (S.empty()) {
+ raw_string_ostream OS(S);
+ SE.getBackedgeTakenCount(L)->print(OS);
+
+ // false and 0 are semantically equivalent. This can happen in dead loops.
+ replaceSubString(OS.str(), "false", "0");
+ // Remove wrap flags; their use in SCEV is highly fragile.
+ // FIXME: Remove this when SCEV gets smarter about them.
+ replaceSubString(OS.str(), "<nw>", "");
+ replaceSubString(OS.str(), "<nsw>", "");
+ replaceSubString(OS.str(), "<nuw>", "");
+ }
+ }
+}
+
+void ScalarEvolution::verifyAnalysis() const {
+ if (!VerifySCEV)
+ return;
+
+ ScalarEvolution &SE = *const_cast<ScalarEvolution *>(this);
+
+ // Gather stringified backedge taken counts for all loops using SCEV's caches.
+ // FIXME: It would be much better to store actual values instead of strings,
+ // but SCEV pointers will change if we drop the caches.
+ VerifyMap BackedgeDumpsOld, BackedgeDumpsNew;
+ for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
+ getLoopBackedgeTakenCounts(*I, BackedgeDumpsOld, SE);
+
+ // Gather stringified backedge taken counts for all loops without using
+ // SCEV's caches.
+ SE.releaseMemory();
+ for (LoopInfo::reverse_iterator I = LI->rbegin(), E = LI->rend(); I != E; ++I)
+ getLoopBackedgeTakenCounts(*I, BackedgeDumpsNew, SE);
+
+ // Now compare whether they're the same with and without caches. This allows
+ // verifying that no pass changed the cache.
+ assert(BackedgeDumpsOld.size() == BackedgeDumpsNew.size() &&
+ "New loops suddenly appeared!");
+
+ for (VerifyMap::iterator OldI = BackedgeDumpsOld.begin(),
+ OldE = BackedgeDumpsOld.end(),
+ NewI = BackedgeDumpsNew.begin();
+ OldI != OldE; ++OldI, ++NewI) {
+ assert(OldI->first == NewI->first && "Loop order changed!");
+
+ // Compare the stringified SCEVs. We don't care if an undef backedge taken count
+ // changes.
+ // FIXME: We currently ignore SCEV changes from/to CouldNotCompute. This
+ // means that a pass is buggy or SCEV has to learn a new pattern but is
+ // usually not harmful.
+ if (OldI->second != NewI->second &&
+ OldI->second.find("undef") == std::string::npos &&
+ NewI->second.find("undef") == std::string::npos &&
+ OldI->second != "***COULDNOTCOMPUTE***" &&
+ NewI->second != "***COULDNOTCOMPUTE***") {
+ dbgs() << "SCEVValidator: SCEV for loop '"
+ << OldI->first->getHeader()->getName()
+ << "' changed from '" << OldI->second
+ << "' to '" << NewI->second << "'!\n";
+ std::abort();
+ }
+ }
+
+ // TODO: Verify more things.
+}
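A minimal sketch of the string normalization the verifier above relies on: cached and freshly recomputed backedge taken counts are dumped to strings, textual noise that can legitimately differ is stripped, and only then are the two dumps compared. This is a standalone version using std::string only.

    #include <cstddef>
    #include <string>

    static void replaceAll(std::string &S, const std::string &From, const std::string &To) {
      for (std::size_t Pos = 0; (Pos = S.find(From, Pos)) != std::string::npos; Pos += To.size())
        S.replace(Pos, From.size(), To);
    }

    std::string normalizeSCEVDump(std::string S) {
      replaceAll(S, "false", "0");   // false and 0 are semantically equivalent
      replaceAll(S, "<nw>", "");     // wrap flags are too fragile to compare
      replaceAll(S, "<nsw>", "");
      replaceAll(S, "<nuw>", "");
      return S;
    }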
diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp
index 62710c5e8b84..111bfb4a6a76 100644
--- a/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/lib/Analysis/ScalarEvolutionExpander.cpp
@@ -18,7 +18,7 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/ADT/STLExtras.h"
@@ -212,7 +212,7 @@ static bool FactorOutConstant(const SCEV *&S,
const SCEV *&Remainder,
const SCEV *Factor,
ScalarEvolution &SE,
- const TargetData *TD) {
+ const DataLayout *TD) {
// Everything is divisible by one.
if (Factor->isOne())
return true;
@@ -253,7 +253,7 @@ static bool FactorOutConstant(const SCEV *&S,
// of the given factor.
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
if (TD) {
- // With TargetData, the size is known. Check if there is a constant
+ // With DataLayout, the size is known. Check if there is a constant
// operand which is a multiple of the given factor. If so, we can
// factor it.
const SCEVConstant *FC = cast<SCEVConstant>(Factor);
@@ -267,7 +267,7 @@ static bool FactorOutConstant(const SCEV *&S,
return true;
}
} else {
- // Without TargetData, check if Factor can be factored out of any of the
+ // Without DataLayout, check if Factor can be factored out of any of the
// Mul's operands. If so, we can just remove it.
for (unsigned i = 0, e = M->getNumOperands(); i != e; ++i) {
const SCEV *SOp = M->getOperand(i);
@@ -458,7 +458,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
// An empty struct has no fields.
if (STy->getNumElements() == 0) break;
if (SE.TD) {
- // With TargetData, field offsets are known. See if a constant offset
+ // With DataLayout, field offsets are known. See if a constant offset
// falls within any of the struct fields.
if (Ops.empty()) break;
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
@@ -477,7 +477,7 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
}
}
} else {
- // Without TargetData, just check for an offsetof expression of the
+ // Without DataLayout, just check for an offsetof expression of the
// appropriate struct type.
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Ops[i])) {
@@ -1618,6 +1618,17 @@ unsigned SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
PEnd = Phis.end(); PIter != PEnd; ++PIter) {
PHINode *Phi = *PIter;
+ // Fold constant phis. They may be congruent to other constant phis and
+ // would confuse the logic below that expects proper IVs.
+ if (Value *V = Phi->hasConstantValue()) {
+ Phi->replaceAllUsesWith(V);
+ DeadInsts.push_back(Phi);
+ ++NumElim;
+ DEBUG_WITH_TYPE(DebugType, dbgs()
+ << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
+ continue;
+ }
+
if (!SE.isSCEVable(Phi->getType()))
continue;
diff --git a/lib/Analysis/Trace.cpp b/lib/Analysis/Trace.cpp
index ff5010bad7bb..22da85762034 100644
--- a/lib/Analysis/Trace.cpp
+++ b/lib/Analysis/Trace.cpp
@@ -43,9 +43,11 @@ void Trace::print(raw_ostream &O) const {
O << "; Trace parent function: \n" << *F;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Debugger convenience method; writes trace to standard error
/// output stream.
///
void Trace::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index cea34e16e7bf..3beb373dc5cc 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -22,7 +22,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/Metadata.h"
#include "llvm/Operator.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
@@ -36,7 +36,7 @@ const unsigned MaxDepth = 6;
/// getBitWidth - Returns the bitwidth of the given scalar or pointer type (if
/// unknown returns 0). For vector types, returns the element type's bitwidth.
-static unsigned getBitWidth(Type *Ty, const TargetData *TD) {
+static unsigned getBitWidth(Type *Ty, const DataLayout *TD) {
if (unsigned BitWidth = Ty->getScalarSizeInBits())
return BitWidth;
assert(isa<PointerType>(Ty) && "Expected a pointer type!");
@@ -46,7 +46,7 @@ static unsigned getBitWidth(Type *Ty, const TargetData *TD) {
static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
APInt &KnownZero, APInt &KnownOne,
APInt &KnownZero2, APInt &KnownOne2,
- const TargetData *TD, unsigned Depth) {
+ const DataLayout *TD, unsigned Depth) {
if (!Add) {
if (ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
// We know that the top bits of C-X are clear if X contains less bits
@@ -132,7 +132,7 @@ static void ComputeMaskedBitsAddSub(bool Add, Value *Op0, Value *Op1, bool NSW,
static void ComputeMaskedBitsMul(Value *Op0, Value *Op1, bool NSW,
APInt &KnownZero, APInt &KnownOne,
APInt &KnownZero2, APInt &KnownOne2,
- const TargetData *TD, unsigned Depth) {
+ const DataLayout *TD, unsigned Depth) {
unsigned BitWidth = KnownZero.getBitWidth();
ComputeMaskedBits(Op1, KnownZero, KnownOne, TD, Depth+1);
ComputeMaskedBits(Op0, KnownZero2, KnownOne2, TD, Depth+1);
@@ -226,7 +226,7 @@ void llvm::computeMaskedBitsLoad(const MDNode &Ranges, APInt &KnownZero) {
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
- const TargetData *TD, unsigned Depth) {
+ const DataLayout *TD, unsigned Depth) {
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
unsigned BitWidth = KnownZero.getBitWidth();
@@ -308,11 +308,20 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
}
if (Argument *A = dyn_cast<Argument>(V)) {
- // Get alignment information off byval arguments if specified in the IR.
- if (A->hasByValAttr())
- if (unsigned Align = A->getParamAlignment())
- KnownZero = APInt::getLowBitsSet(BitWidth,
- CountTrailingZeros_32(Align));
+ unsigned Align = 0;
+
+ if (A->hasByValAttr()) {
+ // Get alignment information off byval arguments if specified in the IR.
+ Align = A->getParamAlignment();
+ } else if (TD && A->hasStructRetAttr()) {
+ // An sret parameter has at least the ABI alignment of the return type.
+ Type *EltTy = cast<PointerType>(A->getType())->getElementType();
+ if (EltTy->isSized())
+ Align = TD->getABITypeAlignment(EltTy);
+ }
+
+ if (Align)
+ KnownZero = APInt::getLowBitsSet(BitWidth, CountTrailingZeros_32(Align));
return;
}
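A standalone sketch of the fact this hunk exploits: a pointer known to be aligned to a power-of-two Align has log2(Align) low bits that are provably zero, which is exactly the mask handed to KnownZero above.

    #include <cstdint>

    unsigned countTrailingZeros32(uint32_t V) {
      if (V == 0) return 32;
      unsigned N = 0;
      while ((V & 1u) == 0) { ++N; V >>= 1; }
      return N;
    }

    uint64_t knownZeroLowBitMask(uint32_t Align) {
      unsigned Bits = countTrailingZeros32(Align);   // log2(Align) for powers of two
      return Bits >= 64 ? ~0ull : ((1ull << Bits) - 1);
    }
    // knownZeroLowBitMask(16) == 0xF: the low four bits of a 16-byte aligned pointer are zero.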
@@ -420,15 +429,13 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
case Instruction::ZExt:
case Instruction::Trunc: {
Type *SrcTy = I->getOperand(0)->getType();
-
+
unsigned SrcBitWidth;
// Note that we handle pointer operands here because of inttoptr/ptrtoint
// which fall through here.
- if (SrcTy->isPointerTy())
- SrcBitWidth = TD->getTypeSizeInBits(SrcTy);
- else
- SrcBitWidth = SrcTy->getScalarSizeInBits();
-
+ SrcBitWidth = TD->getTypeSizeInBits(SrcTy->getScalarType());
+
+ assert(SrcBitWidth && "SrcBitWidth can't be zero");
KnownZero = KnownZero.zextOrTrunc(SrcBitWidth);
KnownOne = KnownOne.zextOrTrunc(SrcBitWidth);
ComputeMaskedBits(I->getOperand(0), KnownZero, KnownOne, TD, Depth+1);
@@ -778,7 +785,7 @@ void llvm::ComputeMaskedBits(Value *V, APInt &KnownZero, APInt &KnownOne,
/// ComputeSignBit - Determine whether the sign bit is known to be zero or
/// one. Convenience wrapper around ComputeMaskedBits.
void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
- const TargetData *TD, unsigned Depth) {
+ const DataLayout *TD, unsigned Depth) {
unsigned BitWidth = getBitWidth(V->getType(), TD);
if (!BitWidth) {
KnownZero = false;
@@ -796,7 +803,7 @@ void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne,
/// bit set when defined. For vectors return true if every element is known to
/// be a power of two when defined. Supports values with integer or pointer
/// types and vectors of integers.
-bool llvm::isPowerOfTwo(Value *V, const TargetData *TD, bool OrZero,
+bool llvm::isPowerOfTwo(Value *V, const DataLayout *TD, bool OrZero,
unsigned Depth) {
if (Constant *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
@@ -859,7 +866,7 @@ bool llvm::isPowerOfTwo(Value *V, const TargetData *TD, bool OrZero,
/// when defined. For vectors return true if every element is known to be
/// non-zero when defined. Supports values with integer or pointer type and
/// vectors of integers.
-bool llvm::isKnownNonZero(Value *V, const TargetData *TD, unsigned Depth) {
+bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) {
if (Constant *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
return false;
@@ -986,7 +993,7 @@ bool llvm::isKnownNonZero(Value *V, const TargetData *TD, unsigned Depth) {
/// same width as the vector element, and the bit is set only if it is true
/// for all of the elements in the vector.
bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
- const TargetData *TD, unsigned Depth) {
+ const DataLayout *TD, unsigned Depth) {
APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
ComputeMaskedBits(V, KnownZero, KnownOne, TD, Depth);
assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
@@ -1003,10 +1010,10 @@ bool llvm::MaskedValueIsZero(Value *V, const APInt &Mask,
///
/// 'Op' must have a scalar integer type.
///
-unsigned llvm::ComputeNumSignBits(Value *V, const TargetData *TD,
+unsigned llvm::ComputeNumSignBits(Value *V, const DataLayout *TD,
unsigned Depth) {
assert((TD || V->getType()->isIntOrIntVectorTy()) &&
- "ComputeNumSignBits requires a TargetData object to operate "
+ "ComputeNumSignBits requires a DataLayout object to operate "
"on non-integer values!");
Type *Ty = V->getType();
unsigned TyBits = TD ? TD->getTypeSizeInBits(V->getType()->getScalarType()) :
@@ -1582,7 +1589,7 @@ Value *llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
/// it can be expressed as a base pointer plus a constant offset. Return the
/// base and offset to the caller.
Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
- const TargetData &TD) {
+ const DataLayout &TD) {
Operator *PtrOp = dyn_cast<Operator>(Ptr);
if (PtrOp == 0 || Ptr->getType()->isVectorTy())
return Ptr;
@@ -1614,7 +1621,7 @@ Value *llvm::GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
// right.
unsigned PtrSize = TD.getPointerSizeInBits();
if (PtrSize < 64)
- Offset = (Offset << (64-PtrSize)) >> (64-PtrSize);
+ Offset = SignExtend64(Offset, PtrSize);
return GetPointerBaseWithConstantOffset(GEP->getPointerOperand(), Offset, TD);
}
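A quick sketch of the equivalence behind the change above: sign-extending the low PtrSize bits of Offset is what the old shift pair computed by hand, and the named helper states that intent directly. The standalone version below assumes an arithmetic right shift on signed values, as the compilers LLVM supports provide.

    #include <cassert>
    #include <cstdint>

    int64_t signExtend64(uint64_t V, unsigned Bits) {
      assert(Bits > 0 && Bits <= 64 && "bit width out of range");
      // Shift the sign bit of the Bits-wide value into bit 63, then shift back
      // arithmetically so it is replicated through the upper bits.
      return static_cast<int64_t>(V << (64 - Bits)) >> (64 - Bits);
    }
    // signExtend64(0xFFFF, 16) == -1, signExtend64(0x7FFF, 16) == 32767.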
@@ -1768,7 +1775,7 @@ uint64_t llvm::GetStringLength(Value *V) {
}
Value *
-llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) {
+llvm::GetUnderlyingObject(Value *V, const DataLayout *TD, unsigned MaxLookup) {
if (!V->getType()->isPointerTy())
return V;
for (unsigned Count = 0; MaxLookup == 0 || Count < MaxLookup; ++Count) {
@@ -1799,7 +1806,7 @@ llvm::GetUnderlyingObject(Value *V, const TargetData *TD, unsigned MaxLookup) {
void
llvm::GetUnderlyingObjects(Value *V,
SmallVectorImpl<Value *> &Objects,
- const TargetData *TD,
+ const DataLayout *TD,
unsigned MaxLookup) {
SmallPtrSet<Value *, 4> Visited;
SmallVector<Value *, 4> Worklist;
@@ -1844,7 +1851,7 @@ bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
}
bool llvm::isSafeToSpeculativelyExecute(const Value *V,
- const TargetData *TD) {
+ const DataLayout *TD) {
const Operator *Inst = dyn_cast<Operator>(V);
if (!Inst)
return false;
diff --git a/lib/Archive/ArchiveInternals.h b/lib/Archive/ArchiveInternals.h
index 55684f7023d2..639f5ac2691b 100644
--- a/lib/Archive/ArchiveInternals.h
+++ b/lib/Archive/ArchiveInternals.h
@@ -66,7 +66,7 @@ namespace llvm {
fmag[1] = '\n';
}
- bool checkSignature() {
+ bool checkSignature() const {
return 0 == memcmp(fmag, ARFILE_MEMBER_MAGIC,2);
}
};
diff --git a/lib/Archive/ArchiveReader.cpp b/lib/Archive/ArchiveReader.cpp
index 5cfc8109bec0..5052495c0d62 100644
--- a/lib/Archive/ArchiveReader.cpp
+++ b/lib/Archive/ArchiveReader.cpp
@@ -79,7 +79,7 @@ Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
}
// Cast archive member header
- ArchiveMemberHeader* Hdr = (ArchiveMemberHeader*)At;
+ const ArchiveMemberHeader* Hdr = (const ArchiveMemberHeader*)At;
At += sizeof(ArchiveMemberHeader);
int flags = 0;
@@ -196,7 +196,7 @@ Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
/* FALL THROUGH */
default:
- char* slash = (char*) memchr(Hdr->name, '/', 16);
+ const char* slash = (const char*) memchr(Hdr->name, '/', 16);
if (slash == 0)
slash = Hdr->name + 16;
pathname.assign(Hdr->name, slash - Hdr->name);
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 481733dd4e40..a60e4aa41c42 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -456,11 +456,12 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(private);
KEYWORD(linker_private);
KEYWORD(linker_private_weak);
- KEYWORD(linker_private_weak_def_auto);
+ KEYWORD(linker_private_weak_def_auto); // FIXME: For backwards compatibility.
KEYWORD(internal);
KEYWORD(available_externally);
KEYWORD(linkonce);
KEYWORD(linkonce_odr);
+ KEYWORD(linkonce_odr_auto_hide);
KEYWORD(weak);
KEYWORD(weak_odr);
KEYWORD(appending);
@@ -509,6 +510,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(asm);
KEYWORD(sideeffect);
KEYWORD(alignstack);
+ KEYWORD(inteldialect);
KEYWORD(gc);
KEYWORD(ccc);
@@ -523,6 +525,9 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(msp430_intrcc);
KEYWORD(ptx_kernel);
KEYWORD(ptx_device);
+ KEYWORD(spir_kernel);
+ KEYWORD(spir_func);
+ KEYWORD(intel_ocl_bicc);
KEYWORD(cc);
KEYWORD(c);
@@ -553,7 +558,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(naked);
KEYWORD(nonlazybind);
KEYWORD(address_safety);
- KEYWORD(ia_nsdialect);
+ KEYWORD(minsize);
KEYWORD(type);
KEYWORD(opaque);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 0ff8edd61b89..b24291ffb329 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -184,12 +184,13 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::kw_private: // OptionalLinkage
case lltok::kw_linker_private: // OptionalLinkage
case lltok::kw_linker_private_weak: // OptionalLinkage
- case lltok::kw_linker_private_weak_def_auto: // OptionalLinkage
+ case lltok::kw_linker_private_weak_def_auto: // FIXME: backwards compat.
case lltok::kw_internal: // OptionalLinkage
case lltok::kw_weak: // OptionalLinkage
case lltok::kw_weak_odr: // OptionalLinkage
case lltok::kw_linkonce: // OptionalLinkage
case lltok::kw_linkonce_odr: // OptionalLinkage
+ case lltok::kw_linkonce_odr_auto_hide: // OptionalLinkage
case lltok::kw_appending: // OptionalLinkage
case lltok::kw_dllexport: // OptionalLinkage
case lltok::kw_common: // OptionalLinkage
@@ -576,8 +577,7 @@ bool LLParser::ParseAlias(const std::string &Name, LocTy NameLoc,
Linkage != GlobalValue::InternalLinkage &&
Linkage != GlobalValue::PrivateLinkage &&
Linkage != GlobalValue::LinkerPrivateLinkage &&
- Linkage != GlobalValue::LinkerPrivateWeakLinkage &&
- Linkage != GlobalValue::LinkerPrivateWeakDefAutoLinkage)
+ Linkage != GlobalValue::LinkerPrivateWeakLinkage)
return Error(LinkageLoc, "invalid linkage type for alias");
Constant *Aliasee;
@@ -779,7 +779,9 @@ GlobalValue *LLParser::GetGlobalVal(const std::string &Name, Type *Ty,
FwdVal = Function::Create(FT, GlobalValue::ExternalWeakLinkage, Name, M);
else
FwdVal = new GlobalVariable(*M, PTy->getElementType(), false,
- GlobalValue::ExternalWeakLinkage, 0, Name);
+ GlobalValue::ExternalWeakLinkage, 0, Name,
+ 0, GlobalVariable::NotThreadLocal,
+ PTy->getAddressSpace());
ForwardRefVals[Name] = std::make_pair(FwdVal, Loc);
return FwdVal;
@@ -916,59 +918,50 @@ bool LLParser::ParseOptionalAddrSpace(unsigned &AddrSpace) {
/// ParseOptionalAttrs - Parse a potentially empty attribute list. AttrKind
/// indicates what kind of attribute list this is: 0: function arg, 1: result,
/// 2: function attr.
-bool LLParser::ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind) {
- Attrs = Attribute::None;
+bool LLParser::ParseOptionalAttrs(AttrBuilder &B, unsigned AttrKind) {
LocTy AttrLoc = Lex.getLoc();
+ bool HaveError = false;
+
+ B.clear();
while (1) {
- switch (Lex.getKind()) {
+ lltok::Kind Token = Lex.getKind();
+ switch (Token) {
default: // End of attributes.
- if (AttrKind != 2 && (Attrs & Attribute::FunctionOnly))
- return Error(AttrLoc, "invalid use of function-only attribute");
-
- // As a hack, we allow "align 2" on functions as a synonym for
- // "alignstack 2".
- if (AttrKind == 2 &&
- (Attrs & ~(Attribute::FunctionOnly | Attribute::Alignment)))
- return Error(AttrLoc, "invalid use of attribute on a function");
-
- if (AttrKind != 0 && (Attrs & Attribute::ParameterOnly))
- return Error(AttrLoc, "invalid use of parameter-only attribute");
-
- return false;
- case lltok::kw_zeroext: Attrs |= Attribute::ZExt; break;
- case lltok::kw_signext: Attrs |= Attribute::SExt; break;
- case lltok::kw_inreg: Attrs |= Attribute::InReg; break;
- case lltok::kw_sret: Attrs |= Attribute::StructRet; break;
- case lltok::kw_noalias: Attrs |= Attribute::NoAlias; break;
- case lltok::kw_nocapture: Attrs |= Attribute::NoCapture; break;
- case lltok::kw_byval: Attrs |= Attribute::ByVal; break;
- case lltok::kw_nest: Attrs |= Attribute::Nest; break;
-
- case lltok::kw_noreturn: Attrs |= Attribute::NoReturn; break;
- case lltok::kw_nounwind: Attrs |= Attribute::NoUnwind; break;
- case lltok::kw_uwtable: Attrs |= Attribute::UWTable; break;
- case lltok::kw_returns_twice: Attrs |= Attribute::ReturnsTwice; break;
- case lltok::kw_noinline: Attrs |= Attribute::NoInline; break;
- case lltok::kw_readnone: Attrs |= Attribute::ReadNone; break;
- case lltok::kw_readonly: Attrs |= Attribute::ReadOnly; break;
- case lltok::kw_inlinehint: Attrs |= Attribute::InlineHint; break;
- case lltok::kw_alwaysinline: Attrs |= Attribute::AlwaysInline; break;
- case lltok::kw_optsize: Attrs |= Attribute::OptimizeForSize; break;
- case lltok::kw_ssp: Attrs |= Attribute::StackProtect; break;
- case lltok::kw_sspreq: Attrs |= Attribute::StackProtectReq; break;
- case lltok::kw_noredzone: Attrs |= Attribute::NoRedZone; break;
- case lltok::kw_noimplicitfloat: Attrs |= Attribute::NoImplicitFloat; break;
- case lltok::kw_naked: Attrs |= Attribute::Naked; break;
- case lltok::kw_nonlazybind: Attrs |= Attribute::NonLazyBind; break;
- case lltok::kw_address_safety: Attrs |= Attribute::AddressSafety; break;
- case lltok::kw_ia_nsdialect: Attrs |= Attribute::IANSDialect; break;
+ return HaveError;
+ case lltok::kw_zeroext: B.addAttribute(Attributes::ZExt); break;
+ case lltok::kw_signext: B.addAttribute(Attributes::SExt); break;
+ case lltok::kw_inreg: B.addAttribute(Attributes::InReg); break;
+ case lltok::kw_sret: B.addAttribute(Attributes::StructRet); break;
+ case lltok::kw_noalias: B.addAttribute(Attributes::NoAlias); break;
+ case lltok::kw_nocapture: B.addAttribute(Attributes::NoCapture); break;
+ case lltok::kw_byval: B.addAttribute(Attributes::ByVal); break;
+ case lltok::kw_nest: B.addAttribute(Attributes::Nest); break;
+
+ case lltok::kw_noreturn: B.addAttribute(Attributes::NoReturn); break;
+ case lltok::kw_nounwind: B.addAttribute(Attributes::NoUnwind); break;
+ case lltok::kw_uwtable: B.addAttribute(Attributes::UWTable); break;
+ case lltok::kw_returns_twice: B.addAttribute(Attributes::ReturnsTwice); break;
+ case lltok::kw_noinline: B.addAttribute(Attributes::NoInline); break;
+ case lltok::kw_readnone: B.addAttribute(Attributes::ReadNone); break;
+ case lltok::kw_readonly: B.addAttribute(Attributes::ReadOnly); break;
+ case lltok::kw_inlinehint: B.addAttribute(Attributes::InlineHint); break;
+ case lltok::kw_alwaysinline: B.addAttribute(Attributes::AlwaysInline); break;
+ case lltok::kw_optsize: B.addAttribute(Attributes::OptimizeForSize); break;
+ case lltok::kw_ssp: B.addAttribute(Attributes::StackProtect); break;
+ case lltok::kw_sspreq: B.addAttribute(Attributes::StackProtectReq); break;
+ case lltok::kw_noredzone: B.addAttribute(Attributes::NoRedZone); break;
+ case lltok::kw_noimplicitfloat: B.addAttribute(Attributes::NoImplicitFloat); break;
+ case lltok::kw_naked: B.addAttribute(Attributes::Naked); break;
+ case lltok::kw_nonlazybind: B.addAttribute(Attributes::NonLazyBind); break;
+ case lltok::kw_address_safety: B.addAttribute(Attributes::AddressSafety); break;
+ case lltok::kw_minsize: B.addAttribute(Attributes::MinSize); break;
case lltok::kw_alignstack: {
unsigned Alignment;
if (ParseOptionalStackAlignment(Alignment))
return true;
- Attrs |= Attribute::constructStackAlignmentFromInt(Alignment);
+ B.addStackAlignmentAttr(Alignment);
continue;
}
@@ -976,11 +969,57 @@ bool LLParser::ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind) {
unsigned Alignment;
if (ParseOptionalAlignment(Alignment))
return true;
- Attrs |= Attribute::constructAlignmentFromInt(Alignment);
+ B.addAlignmentAttr(Alignment);
continue;
}
}
+
+ // Perform some error checking.
+ switch (Token) {
+ default:
+ if (AttrKind == 2)
+ HaveError |= Error(AttrLoc, "invalid use of attribute on a function");
+ break;
+ case lltok::kw_align:
+ // As a hack, we allow "align 2" on functions as a synonym for
+ // "alignstack 2".
+ break;
+
+ // Parameter Only:
+ case lltok::kw_sret:
+ case lltok::kw_nocapture:
+ case lltok::kw_byval:
+ case lltok::kw_nest:
+ if (AttrKind != 0)
+ HaveError |= Error(AttrLoc, "invalid use of parameter-only attribute");
+ break;
+
+ // Function Only:
+ case lltok::kw_noreturn:
+ case lltok::kw_nounwind:
+ case lltok::kw_readnone:
+ case lltok::kw_readonly:
+ case lltok::kw_noinline:
+ case lltok::kw_alwaysinline:
+ case lltok::kw_optsize:
+ case lltok::kw_ssp:
+ case lltok::kw_sspreq:
+ case lltok::kw_noredzone:
+ case lltok::kw_noimplicitfloat:
+ case lltok::kw_naked:
+ case lltok::kw_inlinehint:
+ case lltok::kw_alignstack:
+ case lltok::kw_uwtable:
+ case lltok::kw_nonlazybind:
+ case lltok::kw_returns_twice:
+ case lltok::kw_address_safety:
+ case lltok::kw_minsize:
+ if (AttrKind != 2)
+ HaveError |= Error(AttrLoc, "invalid use of function-only attribute");
+ break;
+ }
+
Lex.Lex();
}
}
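(Illustrative aside, not part of the patch: the hunk above switches ParseOptionalAttrs to accumulate attributes in an AttrBuilder and materialize them later. Below is a minimal sketch of that builder-then-materialize pattern, reusing calls that appear in this diff; the header paths and the helper name are assumptions, not code from this commit.)

#include "llvm/Attributes.h"    // assumed header location at this revision
#include "llvm/LLVMContext.h"   // assumed header location at this revision

using namespace llvm;

// Hypothetical helper: collect attributes incrementally, then build the
// immutable Attributes value once parsing has succeeded.
static Attributes buildSampleFnAttrs(LLVMContext &Context) {
  AttrBuilder B;
  B.addAttribute(Attributes::NoUnwind);
  B.addAttribute(Attributes::ReadOnly);
  B.addStackAlignmentAttr(16);   // as set via ParseOptionalStackAlignment
  return Attributes::get(Context, B);
}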
@@ -990,12 +1029,12 @@ bool LLParser::ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind) {
/// ::= 'private'
/// ::= 'linker_private'
/// ::= 'linker_private_weak'
-/// ::= 'linker_private_weak_def_auto'
/// ::= 'internal'
/// ::= 'weak'
/// ::= 'weak_odr'
/// ::= 'linkonce'
/// ::= 'linkonce_odr'
+/// ::= 'linkonce_odr_auto_hide'
/// ::= 'available_externally'
/// ::= 'appending'
/// ::= 'dllexport'
@@ -1012,14 +1051,15 @@ bool LLParser::ParseOptionalLinkage(unsigned &Res, bool &HasLinkage) {
case lltok::kw_linker_private_weak:
Res = GlobalValue::LinkerPrivateWeakLinkage;
break;
- case lltok::kw_linker_private_weak_def_auto:
- Res = GlobalValue::LinkerPrivateWeakDefAutoLinkage;
- break;
case lltok::kw_internal: Res = GlobalValue::InternalLinkage; break;
case lltok::kw_weak: Res = GlobalValue::WeakAnyLinkage; break;
case lltok::kw_weak_odr: Res = GlobalValue::WeakODRLinkage; break;
case lltok::kw_linkonce: Res = GlobalValue::LinkOnceAnyLinkage; break;
case lltok::kw_linkonce_odr: Res = GlobalValue::LinkOnceODRLinkage; break;
+ case lltok::kw_linkonce_odr_auto_hide:
+ case lltok::kw_linker_private_weak_def_auto: // FIXME: For backwards compat.
+ Res = GlobalValue::LinkOnceODRAutoHideLinkage;
+ break;
case lltok::kw_available_externally:
Res = GlobalValue::AvailableExternallyLinkage;
break;
@@ -1056,6 +1096,7 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
/// ::= /*empty*/
/// ::= 'ccc'
/// ::= 'fastcc'
+/// ::= 'intel_ocl_bicc'
/// ::= 'coldcc'
/// ::= 'x86_stdcallcc'
/// ::= 'x86_fastcallcc'
@@ -1066,6 +1107,8 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
/// ::= 'msp430_intrcc'
/// ::= 'ptx_kernel'
/// ::= 'ptx_device'
+/// ::= 'spir_func'
+/// ::= 'spir_kernel'
/// ::= 'cc' UINT
///
bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
@@ -1083,6 +1126,9 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
case lltok::kw_msp430_intrcc: CC = CallingConv::MSP430_INTR; break;
case lltok::kw_ptx_kernel: CC = CallingConv::PTX_Kernel; break;
case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break;
+ case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
+ case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
+ case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
case lltok::kw_cc: {
unsigned ArbitraryCC;
Lex.Lex();
@@ -1395,16 +1441,16 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
// Parse the argument.
LocTy ArgLoc;
Type *ArgTy = 0;
- Attributes ArgAttrs1;
- Attributes ArgAttrs2;
+ AttrBuilder ArgAttrs;
Value *V;
if (ParseType(ArgTy, ArgLoc))
return true;
// Otherwise, handle normal operands.
- if (ParseOptionalAttrs(ArgAttrs1, 0) || ParseValue(ArgTy, V, PFS))
+ if (ParseOptionalAttrs(ArgAttrs, 0) || ParseValue(ArgTy, V, PFS))
return true;
- ArgList.push_back(ParamInfo(ArgLoc, V, ArgAttrs1|ArgAttrs2));
+ ArgList.push_back(ParamInfo(ArgLoc, V, Attributes::get(V->getContext(),
+ ArgAttrs)));
}
Lex.Lex(); // Lex the ')'.
@@ -1436,7 +1482,7 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
} else {
LocTy TypeLoc = Lex.getLoc();
Type *ArgTy = 0;
- Attributes Attrs;
+ AttrBuilder Attrs;
std::string Name;
if (ParseType(ArgTy) ||
@@ -1453,7 +1499,9 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
if (!FunctionType::isValidArgumentType(ArgTy))
return Error(TypeLoc, "invalid type for function argument");
- ArgList.push_back(ArgInfo(TypeLoc, ArgTy, Attrs, Name));
+ ArgList.push_back(ArgInfo(TypeLoc, ArgTy,
+ Attributes::get(ArgTy->getContext(),
+ Attrs), Name));
while (EatIfPresent(lltok::comma)) {
// Handle ... at end of arg list.
@@ -1479,7 +1527,9 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
if (!ArgTy->isFirstClassType())
return Error(TypeLoc, "invalid type for function argument");
- ArgList.push_back(ArgInfo(TypeLoc, ArgTy, Attrs, Name));
+ ArgList.push_back(ArgInfo(TypeLoc, ArgTy,
+ Attributes::get(ArgTy->getContext(), Attrs),
+ Name));
}
}
@@ -1503,7 +1553,7 @@ bool LLParser::ParseFunctionType(Type *&Result) {
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
if (!ArgList[i].Name.empty())
return Error(ArgList[i].Loc, "argument name invalid in function type");
- if (ArgList[i].Attrs)
+ if (ArgList[i].Attrs.hasAttributes())
return Error(ArgList[i].Loc,
"argument attributes invalid in function type");
}
@@ -2069,16 +2119,18 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
case lltok::kw_asm: {
// ValID ::= 'asm' SideEffect? AlignStack? STRINGCONSTANT ',' STRINGCONSTANT
- bool HasSideEffect, AlignStack;
+ bool HasSideEffect, AlignStack, AsmDialect;
Lex.Lex();
if (ParseOptionalToken(lltok::kw_sideeffect, HasSideEffect) ||
ParseOptionalToken(lltok::kw_alignstack, AlignStack) ||
+ ParseOptionalToken(lltok::kw_inteldialect, AsmDialect) ||
ParseStringConstant(ID.StrVal) ||
ParseToken(lltok::comma, "expected comma in inline asm expression") ||
ParseToken(lltok::StringConstant, "expected constraint string"))
return true;
ID.StrVal2 = Lex.getStrVal();
- ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1);
+ ID.UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack)<<1) |
+ (unsigned(AsmDialect)<<2);
ID.Kind = ValID::t_InlineAsm;
return false;
}
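(Illustrative aside: the new AsmDialect bit joins sideeffect and alignstack in ID.UIntVal. Here is a standalone sketch of that packing, and of the unpacking done later in ConvertValIDToValue; plain C++ with made-up names, not the LLVM API.)

#include <cassert>

int main() {
  bool HasSideEffect = true, AlignStack = false, AsmDialect = true; // inteldialect
  unsigned UIntVal = unsigned(HasSideEffect) | (unsigned(AlignStack) << 1) |
                     (unsigned(AsmDialect) << 2);
  // ConvertValIDToValue-style unpacking.
  assert((UIntVal & 1) == 1);        // hasSideEffects
  assert(((UIntVal >> 1) & 1) == 0); // isAlignStack
  assert((UIntVal >> 2) == 1);       // asm dialect, 1 = Intel
  return 0;
}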
@@ -2495,7 +2547,8 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
PTy ? dyn_cast<FunctionType>(PTy->getElementType()) : 0;
if (!FTy || !InlineAsm::Verify(FTy, ID.StrVal2))
return Error(ID.Loc, "invalid type for inline asm constraint string");
- V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1, ID.UIntVal>>1);
+ V = InlineAsm::get(FTy, ID.StrVal, ID.StrVal2, ID.UIntVal&1,
+ (ID.UIntVal>>1)&1, (InlineAsm::AsmDialect(ID.UIntVal>>2)));
return false;
}
case ValID::t_MDNode:
@@ -2630,7 +2683,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
unsigned Linkage;
unsigned Visibility;
- Attributes RetAttrs;
+ AttrBuilder RetAttrs;
CallingConv::ID CC;
Type *RetType = 0;
LocTy RetTypeLoc = Lex.getLoc();
@@ -2653,11 +2706,11 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
case GlobalValue::PrivateLinkage:
case GlobalValue::LinkerPrivateLinkage:
case GlobalValue::LinkerPrivateWeakLinkage:
- case GlobalValue::LinkerPrivateWeakDefAutoLinkage:
case GlobalValue::InternalLinkage:
case GlobalValue::AvailableExternallyLinkage:
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::LinkOnceODRAutoHideLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::DLLExportLinkage:
@@ -2694,7 +2747,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
SmallVector<ArgInfo, 8> ArgList;
bool isVarArg;
- Attributes FuncAttrs;
+ AttrBuilder FuncAttrs;
std::string Section;
unsigned Alignment;
std::string GC;
@@ -2713,9 +2766,9 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
return true;
// If the alignment was parsed as an attribute, move to the alignment field.
- if (FuncAttrs & Attribute::Alignment) {
- Alignment = Attribute::getAlignmentFromAttrs(FuncAttrs);
- FuncAttrs &= ~Attribute::Alignment;
+ if (FuncAttrs.hasAlignmentAttr()) {
+ Alignment = FuncAttrs.getAlignment();
+ FuncAttrs.removeAttribute(Attributes::Alignment);
}
// Okay, if we got here, the function is syntactically valid. Convert types
@@ -2723,21 +2776,28 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
std::vector<Type*> ParamTypeList;
SmallVector<AttributeWithIndex, 8> Attrs;
- if (RetAttrs != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
+ if (RetAttrs.hasAttributes())
+ Attrs.push_back(
+ AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ Attributes::get(RetType->getContext(),
+ RetAttrs)));
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
ParamTypeList.push_back(ArgList[i].Ty);
- if (ArgList[i].Attrs != Attribute::None)
+ if (ArgList[i].Attrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
}
- if (FuncAttrs != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(~0, FuncAttrs));
+ if (FuncAttrs.hasAttributes())
+ Attrs.push_back(
+ AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ Attributes::get(RetType->getContext(),
+ FuncAttrs)));
- AttrListPtr PAL = AttrListPtr::get(Attrs);
+ AttrListPtr PAL = AttrListPtr::get(Context, Attrs);
- if (PAL.paramHasAttr(1, Attribute::StructRet) && !RetType->isVoidTy())
+ if (PAL.getParamAttributes(1).hasAttribute(Attributes::StructRet) &&
+ !RetType->isVoidTy())
return Error(RetTypeLoc, "functions with 'sret' argument must return void");
FunctionType *FT =
@@ -2752,6 +2812,9 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
ForwardRefVals.find(FunctionName);
if (FRVI != ForwardRefVals.end()) {
Fn = M->getFunction(FunctionName);
+ if (!Fn)
+ return Error(FRVI->second.second, "invalid forward reference to "
+ "function as global value!");
if (Fn->getType() != PFT)
return Error(FRVI->second.second, "invalid forward reference to "
"function '" + FunctionName + "' with wrong type!");
@@ -3205,7 +3268,7 @@ bool LLParser::ParseIndirectBr(Instruction *&Inst, PerFunctionState &PFS) {
/// OptionalAttrs 'to' TypeAndValue 'unwind' TypeAndValue
bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
LocTy CallLoc = Lex.getLoc();
- Attributes RetAttrs, FnAttrs;
+ AttrBuilder RetAttrs, FnAttrs;
CallingConv::ID CC;
Type *RetType = 0;
LocTy RetTypeLoc;
@@ -3250,8 +3313,11 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
// Set up the Attributes for the function.
SmallVector<AttributeWithIndex, 8> Attrs;
- if (RetAttrs != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
+ if (RetAttrs.hasAttributes())
+ Attrs.push_back(
+ AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ Attributes::get(Callee->getContext(),
+ RetAttrs)));
SmallVector<Value*, 8> Args;
@@ -3271,18 +3337,21 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs != Attribute::None)
+ if (ArgList[i].Attrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
+ if (FnAttrs.hasAttributes())
+ Attrs.push_back(
+ AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ Attributes::get(Callee->getContext(),
+ FnAttrs)));
// Finish off the Attributes and check them
- AttrListPtr PAL = AttrListPtr::get(Attrs);
+ AttrListPtr PAL = AttrListPtr::get(Context, Attrs);
InvokeInst *II = InvokeInst::Create(Callee, NormalBB, UnwindBB, Args);
II->setCallingConv(CC);
@@ -3604,7 +3673,7 @@ bool LLParser::ParseLandingPad(Instruction *&Inst, PerFunctionState &PFS) {
/// ParameterList OptionalAttrs
bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
bool isTail) {
- Attributes RetAttrs, FnAttrs;
+ AttrBuilder RetAttrs, FnAttrs;
CallingConv::ID CC;
Type *RetType = 0;
LocTy RetTypeLoc;
@@ -3646,8 +3715,11 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
// Set up the Attributes for the function.
SmallVector<AttributeWithIndex, 8> Attrs;
- if (RetAttrs != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
+ if (RetAttrs.hasAttributes())
+ Attrs.push_back(
+ AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ Attributes::get(Callee->getContext(),
+ RetAttrs)));
SmallVector<Value*, 8> Args;
@@ -3667,18 +3739,21 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs != Attribute::None)
+ if (ArgList[i].Attrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs != Attribute::None)
- Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
+ if (FnAttrs.hasAttributes())
+ Attrs.push_back(
+ AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ Attributes::get(Callee->getContext(),
+ FnAttrs)));
// Finish off the Attributes and check them
- AttrListPtr PAL = AttrListPtr::get(Attrs);
+ AttrListPtr PAL = AttrListPtr::get(Context, Attrs);
CallInst *CI = CallInst::Create(Callee, Args);
CI->setTailCall(isTail);
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 257c726229e7..c6bbdb27aeef 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -175,7 +175,7 @@ namespace llvm {
bool ParseTLSModel(GlobalVariable::ThreadLocalMode &TLM);
bool ParseOptionalThreadLocal(GlobalVariable::ThreadLocalMode &TLM);
bool ParseOptionalAddrSpace(unsigned &AddrSpace);
- bool ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind);
+ bool ParseOptionalAttrs(AttrBuilder &Attrs, unsigned AttrKind);
bool ParseOptionalLinkage(unsigned &Linkage, bool &HasLinkage);
bool ParseOptionalLinkage(unsigned &Linkage) {
bool HasLinkage; return ParseOptionalLinkage(Linkage, HasLinkage);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 0b0b98036eab..036686d31823 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -37,8 +37,10 @@ namespace lltok {
kw_global, kw_constant,
kw_private, kw_linker_private, kw_linker_private_weak,
- kw_linker_private_weak_def_auto, kw_internal,
- kw_linkonce, kw_linkonce_odr, kw_weak, kw_weak_odr, kw_appending,
+ kw_linker_private_weak_def_auto, // FIXME: For backwards compatibility.
+ kw_internal,
+ kw_linkonce, kw_linkonce_odr, kw_linkonce_odr_auto_hide,
+ kw_weak, kw_weak_odr, kw_appending,
kw_dllimport, kw_dllexport, kw_common, kw_available_externally,
kw_default, kw_hidden, kw_protected,
kw_unnamed_addr,
@@ -70,14 +72,17 @@ namespace lltok {
kw_asm,
kw_sideeffect,
kw_alignstack,
+ kw_inteldialect,
kw_gc,
kw_c,
kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
+ kw_intel_ocl_bicc,
kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
kw_msp430_intrcc,
kw_ptx_kernel, kw_ptx_device,
+ kw_spir_kernel, kw_spir_func,
kw_signext,
kw_zeroext,
@@ -105,7 +110,7 @@ namespace lltok {
kw_naked,
kw_nonlazybind,
kw_address_safety,
- kw_ia_nsdialect,
+ kw_minsize,
kw_type,
kw_opaque,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 4ffee38c8eb8..4ec9da12ddcf 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -52,6 +52,8 @@ void BitcodeReader::FreeState() {
std::vector<Function*>().swap(FunctionsWithBodies);
DeferredFunctionInfo.clear();
MDKindMap.clear();
+
+ assert(BlockAddrFwdRefs.empty() && "Unresolved blockaddress fwd references");
}
//===----------------------------------------------------------------------===//
@@ -89,7 +91,7 @@ static GlobalValue::LinkageTypes GetDecodedLinkage(unsigned Val) {
case 12: return GlobalValue::AvailableExternallyLinkage;
case 13: return GlobalValue::LinkerPrivateLinkage;
case 14: return GlobalValue::LinkerPrivateWeakLinkage;
- case 15: return GlobalValue::LinkerPrivateWeakDefAutoLinkage;
+ case 15: return GlobalValue::LinkOnceODRAutoHideLinkage;
}
}
@@ -197,7 +199,7 @@ namespace {
/// @brief A class for maintaining the slot number definition
/// as a placeholder for the actual definition for forward constants defs.
class ConstantPlaceHolder : public ConstantExpr {
- void operator=(const ConstantPlaceHolder &); // DO NOT IMPLEMENT
+ void operator=(const ConstantPlaceHolder &) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -209,7 +211,6 @@ namespace {
}
/// @brief Methods to support type inquiry through isa, cast, and dyn_cast.
- //static inline bool classof(const ConstantPlaceHolder *) { return true; }
static bool classof(const Value *V) {
return isa<ConstantExpr>(V) &&
cast<ConstantExpr>(V)->getOpcode() == Instruction::UserOp1;
@@ -475,17 +476,18 @@ bool BitcodeReader::ParseAttributeBlock() {
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
Attributes ReconstitutedAttr =
- Attribute::decodeLLVMAttributesForBitcode(Record[i+1]);
+ Attributes::decodeLLVMAttributesForBitcode(Context, Record[i+1]);
Record[i+1] = ReconstitutedAttr.Raw();
}
for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
- if (Attributes(Record[i+1]) != Attribute::None)
+ AttrBuilder B(Record[i+1]);
+ if (B.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(Record[i],
- Attributes(Record[i+1])));
+ Attributes::get(Context, B)));
}
- MAttributes.push_back(AttrListPtr::get(Attrs));
+ MAttributes.push_back(AttrListPtr::get(Context, Attrs));
Attrs.clear();
break;
}
@@ -889,9 +891,9 @@ bool BitcodeReader::ParseMetadata() {
}
}
-/// DecodeSignRotatedValue - Decode a signed value stored with the sign bit in
+/// decodeSignRotatedValue - Decode a signed value stored with the sign bit in
/// the LSB for dense VBR encoding.
-static uint64_t DecodeSignRotatedValue(uint64_t V) {
+uint64_t BitcodeReader::decodeSignRotatedValue(uint64_t V) {
if ((V & 1) == 0)
return V >> 1;
if (V != 1)
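(Illustrative aside: decodeSignRotatedValue, together with the writer-side emitSignedInt64 further down, uses a sign-rotation scheme that keeps small negative values small under VBR. A self-contained round-trip follows, with illustrative names rather than the actual entry points; the encode side of this sketch does not handle INT64_MIN.)

#include <cassert>
#include <cstdint>

static uint64_t encodeSignRotated(int64_t V) {
  // Move the sign into the LSB so small magnitudes stay small under VBR.
  return V >= 0 ? (uint64_t(V) << 1) : ((uint64_t(-V) << 1) | 1);
}

static int64_t decodeSignRotated(uint64_t V) {
  if ((V & 1) == 0)
    return int64_t(V >> 1);
  if (V != 1)
    return -int64_t(V >> 1);
  return INT64_MIN;  // "-0" in this encoding stands for the minimum value.
}

int main() {
  const int64_t Tests[] = {0, 1, -1, 42, -42, INT64_MAX, INT64_MIN + 1};
  for (int64_t V : Tests)
    assert(decodeSignRotated(encodeSignRotated(V)) == V);
  return 0;
}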
@@ -941,7 +943,7 @@ bool BitcodeReader::ResolveGlobalAndAliasInits() {
static APInt ReadWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
SmallVector<uint64_t, 8> Words(Vals.size());
std::transform(Vals.begin(), Vals.end(), Words.begin(),
- DecodeSignRotatedValue);
+ BitcodeReader::decodeSignRotatedValue);
return APInt(TypeBits, Words);
}
@@ -995,7 +997,7 @@ bool BitcodeReader::ParseConstants() {
case bitc::CST_CODE_INTEGER: // INTEGER: [intval]
if (!CurTy->isIntegerTy() || Record.empty())
return Error("Invalid CST_INTEGER record");
- V = ConstantInt::get(CurTy, DecodeSignRotatedValue(Record[0]));
+ V = ConstantInt::get(CurTy, decodeSignRotatedValue(Record[0]));
break;
case bitc::CST_CODE_WIDE_INTEGER: {// WIDE_INTEGER: [n x intval]
if (!CurTy->isIntegerTy() || Record.empty())
@@ -1245,7 +1247,9 @@ bool BitcodeReader::ParseConstants() {
V = ConstantExpr::getICmp(Record[3], Op0, Op1);
break;
}
- case bitc::CST_CODE_INLINEASM: {
+ // This maintains backward compatibility with pre-asm-dialect keywords.
+ // FIXME: Remove with the 4.0 release.
+ case bitc::CST_CODE_INLINEASM_OLD: {
if (Record.size() < 2) return Error("Invalid INLINEASM record");
std::string AsmStr, ConstrStr;
bool HasSideEffects = Record[0] & 1;
@@ -1266,6 +1270,31 @@ bool BitcodeReader::ParseConstants() {
AsmStr, ConstrStr, HasSideEffects, IsAlignStack);
break;
}
+ // This version adds support for the asm dialect keywords (e.g.,
+ // inteldialect).
+ case bitc::CST_CODE_INLINEASM: {
+ if (Record.size() < 2) return Error("Invalid INLINEASM record");
+ std::string AsmStr, ConstrStr;
+ bool HasSideEffects = Record[0] & 1;
+ bool IsAlignStack = (Record[0] >> 1) & 1;
+ unsigned AsmDialect = Record[0] >> 2;
+ unsigned AsmStrSize = Record[1];
+ if (2+AsmStrSize >= Record.size())
+ return Error("Invalid INLINEASM record");
+ unsigned ConstStrSize = Record[2+AsmStrSize];
+ if (3+AsmStrSize+ConstStrSize > Record.size())
+ return Error("Invalid INLINEASM record");
+
+ for (unsigned i = 0; i != AsmStrSize; ++i)
+ AsmStr += (char)Record[2+i];
+ for (unsigned i = 0; i != ConstStrSize; ++i)
+ ConstrStr += (char)Record[3+AsmStrSize+i];
+ PointerType *PTy = cast<PointerType>(CurTy);
+ V = InlineAsm::get(cast<FunctionType>(PTy->getElementType()),
+ AsmStr, ConstrStr, HasSideEffects, IsAlignStack,
+ InlineAsm::AsmDialect(AsmDialect));
+ break;
+ }
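(Illustrative aside: a standalone decode of the record layout the new CST_CODE_INLINEASM case expects, namely [flags, AsmStrSize, AsmStr bytes..., ConstrStrSize, ConstrStr bytes...], using a made-up record and plain C++ rather than the BitcodeReader helpers.)

#include <cassert>
#include <cstdint>
#include <string>
#include <vector>

int main() {
  // Hypothetical record: flags 0b101 (sideeffect + Intel dialect),
  // asm string "nop", constraint string "~x".
  std::vector<uint64_t> Record = {5, 3, 'n', 'o', 'p', 2, '~', 'x'};
  bool HasSideEffects = Record[0] & 1;
  bool IsAlignStack = (Record[0] >> 1) & 1;
  unsigned AsmDialect = unsigned(Record[0] >> 2);
  unsigned AsmStrSize = unsigned(Record[1]);
  std::string AsmStr, ConstrStr;
  for (unsigned i = 0; i != AsmStrSize; ++i)
    AsmStr += char(Record[2 + i]);
  unsigned ConstrStrSize = unsigned(Record[2 + AsmStrSize]);
  for (unsigned i = 0; i != ConstrStrSize; ++i)
    ConstrStr += char(Record[3 + AsmStrSize + i]);
  assert(HasSideEffects && !IsAlignStack && AsmDialect == 1);
  assert(AsmStr == "nop" && ConstrStr == "~x");
  return 0;
}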
case bitc::CST_CODE_BLOCKADDRESS:{
if (Record.size() < 3) return Error("Invalid CE_BLOCKADDRESS record");
Type *FnTy = getTypeByID(Record[0]);
@@ -1273,13 +1302,27 @@ bool BitcodeReader::ParseConstants() {
Function *Fn =
dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
if (Fn == 0) return Error("Invalid CE_BLOCKADDRESS record");
-
- GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(),
- Type::getInt8Ty(Context),
+
+ // If the function is already parsed we can insert the block address right
+ // away.
+ if (!Fn->empty()) {
+ Function::iterator BBI = Fn->begin(), BBE = Fn->end();
+ for (size_t I = 0, E = Record[2]; I != E; ++I) {
+ if (BBI == BBE)
+ return Error("Invalid blockaddress block #");
+ ++BBI;
+ }
+ V = BlockAddress::get(Fn, BBI);
+ } else {
+ // Otherwise insert a placeholder and remember it so it can be inserted
+ // when the function is parsed.
+ GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(),
+ Type::getInt8Ty(Context),
false, GlobalValue::InternalLinkage,
- 0, "");
- BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef));
- V = FwdRef;
+ 0, "");
+ BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef));
+ V = FwdRef;
+ }
break;
}
}
@@ -1481,13 +1524,22 @@ bool BitcodeReader::ParseModule(bool Resume) {
// Read a record.
switch (Stream.ReadRecord(Code, Record)) {
default: break; // Default behavior, ignore unknown content.
- case bitc::MODULE_CODE_VERSION: // VERSION: [version#]
+ case bitc::MODULE_CODE_VERSION: { // VERSION: [version#]
if (Record.size() < 1)
return Error("Malformed MODULE_CODE_VERSION");
- // Only version #0 is supported so far.
- if (Record[0] != 0)
- return Error("Unknown bitstream version!");
+ // Only version #0 and #1 are supported so far.
+ unsigned module_version = Record[0];
+ switch (module_version) {
+ default: return Error("Unknown bitstream version!");
+ case 0:
+ UseRelativeIDs = false;
+ break;
+ case 1:
+ UseRelativeIDs = true;
+ break;
+ }
break;
+ }
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
@@ -1754,13 +1806,6 @@ bool BitcodeReader::ParseModuleTriple(std::string &Triple) {
// Read a record.
switch (Stream.ReadRecord(Code, Record)) {
default: break; // Default behavior, ignore unknown content.
- case bitc::MODULE_CODE_VERSION: // VERSION: [version#]
- if (Record.size() < 1)
- return Error("Malformed MODULE_CODE_VERSION");
- // Only version #0 is supported so far.
- if (Record[0] != 0)
- return Error("Unknown bitstream version!");
- break;
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
std::string S;
if (ConvertToString(Record, 0, S))
@@ -1973,7 +2018,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *LHS, *RHS;
if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
- getValue(Record, OpNum, LHS->getType(), RHS) ||
+ popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
OpNum+1 > Record.size())
return Error("Invalid BINOP record");
@@ -2088,8 +2133,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *TrueVal, *FalseVal, *Cond;
if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
- getValue(Record, OpNum, TrueVal->getType(), FalseVal) ||
- getValue(Record, OpNum, Type::getInt1Ty(Context), Cond))
+ popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
+ popValue(Record, OpNum, NextValueNo, Type::getInt1Ty(Context), Cond))
return Error("Invalid SELECT record");
I = SelectInst::Create(Cond, TrueVal, FalseVal);
@@ -2103,7 +2148,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *TrueVal, *FalseVal, *Cond;
if (getValueTypePair(Record, OpNum, NextValueNo, TrueVal) ||
- getValue(Record, OpNum, TrueVal->getType(), FalseVal) ||
+ popValue(Record, OpNum, NextValueNo, TrueVal->getType(), FalseVal) ||
getValueTypePair(Record, OpNum, NextValueNo, Cond))
return Error("Invalid SELECT record");
@@ -2128,7 +2173,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Vec, *Idx;
if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
- getValue(Record, OpNum, Type::getInt32Ty(Context), Idx))
+ popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
return Error("Invalid EXTRACTELT record");
I = ExtractElementInst::Create(Vec, Idx);
InstructionList.push_back(I);
@@ -2139,9 +2184,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Vec, *Elt, *Idx;
if (getValueTypePair(Record, OpNum, NextValueNo, Vec) ||
- getValue(Record, OpNum,
+ popValue(Record, OpNum, NextValueNo,
cast<VectorType>(Vec->getType())->getElementType(), Elt) ||
- getValue(Record, OpNum, Type::getInt32Ty(Context), Idx))
+ popValue(Record, OpNum, NextValueNo, Type::getInt32Ty(Context), Idx))
return Error("Invalid INSERTELT record");
I = InsertElementInst::Create(Vec, Elt, Idx);
InstructionList.push_back(I);
@@ -2152,7 +2197,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Vec1, *Vec2, *Mask;
if (getValueTypePair(Record, OpNum, NextValueNo, Vec1) ||
- getValue(Record, OpNum, Vec1->getType(), Vec2))
+ popValue(Record, OpNum, NextValueNo, Vec1->getType(), Vec2))
return Error("Invalid SHUFFLEVEC record");
if (getValueTypePair(Record, OpNum, NextValueNo, Mask))
@@ -2172,7 +2217,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *LHS, *RHS;
if (getValueTypePair(Record, OpNum, NextValueNo, LHS) ||
- getValue(Record, OpNum, LHS->getType(), RHS) ||
+ popValue(Record, OpNum, NextValueNo, LHS->getType(), RHS) ||
OpNum+1 != Record.size())
return Error("Invalid CMP record");
@@ -2217,7 +2262,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
}
else {
BasicBlock *FalseDest = getBasicBlock(Record[1]);
- Value *Cond = getFnValueByID(Record[2], Type::getInt1Ty(Context));
+ Value *Cond = getValue(Record, 2, NextValueNo,
+ Type::getInt1Ty(Context));
if (FalseDest == 0 || Cond == 0)
return Error("Invalid BR record");
I = BranchInst::Create(TrueDest, FalseDest, Cond);
@@ -2233,7 +2279,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
Type *OpTy = getTypeByID(Record[1]);
unsigned ValueBitWidth = cast<IntegerType>(OpTy)->getBitWidth();
- Value *Cond = getFnValueByID(Record[2], OpTy);
+ Value *Cond = getValue(Record, 2, NextValueNo, OpTy);
BasicBlock *Default = getBasicBlock(Record[3]);
if (OpTy == 0 || Cond == 0 || Default == 0)
return Error("Invalid SWITCH record");
@@ -2288,7 +2334,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (Record.size() < 3 || (Record.size() & 1) == 0)
return Error("Invalid SWITCH record");
Type *OpTy = getTypeByID(Record[0]);
- Value *Cond = getFnValueByID(Record[1], OpTy);
+ Value *Cond = getValue(Record, 1, NextValueNo, OpTy);
BasicBlock *Default = getBasicBlock(Record[2]);
if (OpTy == 0 || Cond == 0 || Default == 0)
return Error("Invalid SWITCH record");
@@ -2312,7 +2358,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (Record.size() < 2)
return Error("Invalid INDIRECTBR record");
Type *OpTy = getTypeByID(Record[0]);
- Value *Address = getFnValueByID(Record[1], OpTy);
+ Value *Address = getValue(Record, 1, NextValueNo, OpTy);
if (OpTy == 0 || Address == 0)
return Error("Invalid INDIRECTBR record");
unsigned NumDests = Record.size()-2;
@@ -2354,7 +2400,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
SmallVector<Value*, 16> Ops;
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i, ++OpNum) {
- Ops.push_back(getFnValueByID(Record[OpNum], FTy->getParamType(i)));
+ Ops.push_back(getValue(Record, OpNum, NextValueNo,
+ FTy->getParamType(i)));
if (Ops.back() == 0) return Error("Invalid INVOKE record");
}
@@ -2401,7 +2448,14 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
InstructionList.push_back(PN);
for (unsigned i = 0, e = Record.size()-1; i != e; i += 2) {
- Value *V = getFnValueByID(Record[1+i], Ty);
+ Value *V;
+ // With the new function encoding, it is possible that operands have
+ // negative IDs (for forward references). Use a signed VBR
+ // representation to keep the encoding small.
+ if (UseRelativeIDs)
+ V = getValueSigned(Record, 1+i, NextValueNo, Ty);
+ else
+ V = getValue(Record, 1+i, NextValueNo, Ty);
BasicBlock *BB = getBasicBlock(Record[2+i]);
if (!V || !BB) return Error("Invalid PHI record");
PN->addIncoming(V, BB);
@@ -2499,7 +2553,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
- getValue(Record, OpNum,
+ popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+2 != Record.size())
return Error("Invalid STORE record");
@@ -2513,7 +2567,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Val, *Ptr;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
- getValue(Record, OpNum,
+ popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+4 != Record.size())
return Error("Invalid STOREATOMIC record");
@@ -2536,9 +2590,9 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Ptr, *Cmp, *New;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
- getValue(Record, OpNum,
+ popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Cmp) ||
- getValue(Record, OpNum,
+ popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), New) ||
OpNum+3 != Record.size())
return Error("Invalid CMPXCHG record");
@@ -2556,7 +2610,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
unsigned OpNum = 0;
Value *Ptr, *Val;
if (getValueTypePair(Record, OpNum, NextValueNo, Ptr) ||
- getValue(Record, OpNum,
+ popValue(Record, OpNum, NextValueNo,
cast<PointerType>(Ptr->getType())->getElementType(), Val) ||
OpNum+4 != Record.size())
return Error("Invalid ATOMICRMW record");
@@ -2610,7 +2664,8 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (FTy->getParamType(i)->isLabelTy())
Args.push_back(getBasicBlock(Record[OpNum]));
else
- Args.push_back(getFnValueByID(Record[OpNum], FTy->getParamType(i)));
+ Args.push_back(getValue(Record, OpNum, NextValueNo,
+ FTy->getParamType(i)));
if (Args.back() == 0) return Error("Invalid CALL record");
}
@@ -2639,7 +2694,7 @@ bool BitcodeReader::ParseFunctionBody(Function *F) {
if (Record.size() < 3)
return Error("Invalid VAARG record");
Type *OpTy = getTypeByID(Record[0]);
- Value *Op = getFnValueByID(Record[1], OpTy);
+ Value *Op = getValue(Record, 1, NextValueNo, OpTy);
Type *ResTy = getTypeByID(Record[2]);
if (!OpTy || !Op || !ResTy)
return Error("Invalid VAARG record");
@@ -2837,7 +2892,7 @@ bool BitcodeReader::InitStream() {
}
bool BitcodeReader::InitStreamFromBuffer() {
- const unsigned char *BufPtr = (unsigned char *)Buffer->getBufferStart();
+ const unsigned char *BufPtr = (const unsigned char*)Buffer->getBufferStart();
const unsigned char *BufEnd = BufPtr+Buffer->getBufferSize();
if (Buffer->getBufferSize() & 3) {
diff --git a/lib/Bitcode/Reader/BitcodeReader.h b/lib/Bitcode/Reader/BitcodeReader.h
index e7c4e94f785f..3d5c0eb4def4 100644
--- a/lib/Bitcode/Reader/BitcodeReader.h
+++ b/lib/Bitcode/Reader/BitcodeReader.h
@@ -179,18 +179,27 @@ class BitcodeReader : public GVMaterializer {
typedef std::pair<unsigned, GlobalVariable*> BlockAddrRefTy;
DenseMap<Function*, std::vector<BlockAddrRefTy> > BlockAddrFwdRefs;
+ /// UseRelativeIDs - Indicates that we are using a new encoding for
+ /// instruction operands where most operands in the current
+ /// FUNCTION_BLOCK are encoded relative to the instruction number,
+ /// for a more compact encoding. Some instruction operands are not
+ /// relative to the instruction ID: basic block numbers, and types.
+ /// Once the old style function blocks have been phased out, we would
+ /// not need this flag.
+ bool UseRelativeIDs;
+
public:
explicit BitcodeReader(MemoryBuffer *buffer, LLVMContext &C)
: Context(C), TheModule(0), Buffer(buffer), BufferOwned(false),
LazyStreamer(0), NextUnreadBit(0), SeenValueSymbolTable(false),
ErrorString(0), ValueList(C), MDValueList(C),
- SeenFirstFunctionBody(false) {
+ SeenFirstFunctionBody(false), UseRelativeIDs(false) {
}
explicit BitcodeReader(DataStreamer *streamer, LLVMContext &C)
: Context(C), TheModule(0), Buffer(0), BufferOwned(false),
LazyStreamer(streamer), NextUnreadBit(0), SeenValueSymbolTable(false),
ErrorString(0), ValueList(C), MDValueList(C),
- SeenFirstFunctionBody(false) {
+ SeenFirstFunctionBody(false), UseRelativeIDs(false) {
}
~BitcodeReader() {
FreeState();
@@ -223,6 +232,9 @@ public:
/// @brief Cheap mechanism to just extract module triple
/// @returns true if an error occurred.
bool ParseTriple(std::string &Triple);
+
+ static uint64_t decodeSignRotatedValue(uint64_t V);
+
private:
Type *getTypeByID(unsigned ID);
Value *getFnValueByID(unsigned ID, Type *Ty) {
@@ -247,6 +259,9 @@ private:
unsigned InstNum, Value *&ResVal) {
if (Slot == Record.size()) return true;
unsigned ValNo = (unsigned)Record[Slot++];
+ // Adjust the ValNo, if it was encoded relative to the InstNum.
+ if (UseRelativeIDs)
+ ValNo = InstNum - ValNo;
if (ValNo < InstNum) {
// If this is not a forward reference, just return the value we already
// have.
@@ -255,20 +270,54 @@ private:
} else if (Slot == Record.size()) {
return true;
}
-
+
unsigned TypeNo = (unsigned)Record[Slot++];
ResVal = getFnValueByID(ValNo, getTypeByID(TypeNo));
return ResVal == 0;
}
- bool getValue(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
- Type *Ty, Value *&ResVal) {
- if (Slot == Record.size()) return true;
- unsigned ValNo = (unsigned)Record[Slot++];
- ResVal = getFnValueByID(ValNo, Ty);
+
+ /// popValue - Read a value out of the specified record from slot 'Slot'.
+ /// Increment Slot past the number of slots used by the value in the record.
+ /// Return true if there is an error.
+ bool popValue(SmallVector<uint64_t, 64> &Record, unsigned &Slot,
+ unsigned InstNum, Type *Ty, Value *&ResVal) {
+ if (getValue(Record, Slot, InstNum, Ty, ResVal))
+ return true;
+ // All values currently take a single record slot.
+ ++Slot;
+ return false;
+ }
+
+ /// getValue -- Like popValue, but does not increment the Slot number.
+ bool getValue(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+ unsigned InstNum, Type *Ty, Value *&ResVal) {
+ ResVal = getValue(Record, Slot, InstNum, Ty);
return ResVal == 0;
}
-
+ /// getValue -- Version of getValue that returns ResVal directly,
+ /// or 0 if there is an error.
+ Value *getValue(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+ unsigned InstNum, Type *Ty) {
+ if (Slot == Record.size()) return 0;
+ unsigned ValNo = (unsigned)Record[Slot];
+ // Adjust the ValNo, if it was encoded relative to the InstNum.
+ if (UseRelativeIDs)
+ ValNo = InstNum - ValNo;
+ return getFnValueByID(ValNo, Ty);
+ }
+
+ /// getValueSigned -- Like getValue, but decodes signed VBRs.
+ Value *getValueSigned(SmallVector<uint64_t, 64> &Record, unsigned Slot,
+ unsigned InstNum, Type *Ty) {
+ if (Slot == Record.size()) return 0;
+ unsigned ValNo = (unsigned)decodeSignRotatedValue(Record[Slot]);
+ // Adjust the ValNo, if it was encoded relative to the InstNum.
+ if (UseRelativeIDs)
+ ValNo = InstNum - ValNo;
+ return getFnValueByID(ValNo, Ty);
+ }
+
bool ParseModule(bool Resume);
bool ParseAttributeBlock();
bool ParseTypeTable();
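(Illustrative aside: a small arithmetic sketch of the relative operand encoding that UseRelativeIDs, getValue and popValue above, and pushValue/pushValueSigned in the writer below, implement; the numbers and names are made up.)

#include <cassert>
#include <cstdint>

int main() {
  // Writer side (module version 1): operands are emitted as InstID - ValID.
  unsigned InstID = 10, ValID = 7;
  unsigned Delta = InstID - ValID;     // pushValue() would emit 3
  // Reader side: recover the original value ID from the delta.
  assert(InstID - Delta == ValID);     // what getValue()/popValue() compute

  // Forward references (common for PHI operands) give a non-positive delta,
  // so those are emitted with the sign-rotated VBR form instead.
  unsigned FwdValID = 12;
  int64_t SignedDelta = int64_t(InstID) - int64_t(FwdValID);  // pushValueSigned()
  assert(SignedDelta == -2);
  assert(int64_t(InstID) - SignedDelta == int64_t(FwdValID));
  return 0;
}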
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index 5b1725f5508c..60c657ae6dd4 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -41,8 +41,6 @@ EnablePreserveUseListOrdering("enable-bc-uselist-preserve",
/// These are manifest constants used by the bitcode writer. They do not need to
/// be kept in sync with the reader, but need to be consistent within this file.
enum {
- CurVersion = 0,
-
// VALUE_SYMTAB_BLOCK abbrev id's.
VST_ENTRY_8_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
VST_ENTRY_7_ABBREV,
@@ -177,7 +175,7 @@ static void WriteAttributeTable(const ValueEnumerator &VE,
for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) {
const AttributeWithIndex &PAWI = A.getSlot(i);
Record.push_back(PAWI.Index);
- Record.push_back(Attribute::encodeLLVMAttributesForBitcode(PAWI.Attrs));
+ Record.push_back(Attributes::encodeLLVMAttributesForBitcode(PAWI.Attrs));
}
Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record);
@@ -365,7 +363,7 @@ static unsigned getEncodedLinkage(const GlobalValue *GV) {
case GlobalValue::AvailableExternallyLinkage: return 12;
case GlobalValue::LinkerPrivateLinkage: return 13;
case GlobalValue::LinkerPrivateWeakLinkage: return 14;
- case GlobalValue::LinkerPrivateWeakDefAutoLinkage: return 15;
+ case GlobalValue::LinkOnceODRAutoHideLinkage: return 15;
}
llvm_unreachable("Invalid linkage");
}
@@ -722,16 +720,20 @@ static void WriteModuleMetadataStore(const Module *M, BitstreamWriter &Stream) {
Stream.ExitBlock();
}
+static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
+ if ((int64_t)V >= 0)
+ Vals.push_back(V << 1);
+ else
+ Vals.push_back((-V << 1) | 1);
+}
+
static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
unsigned &Code, unsigned &AbbrevToUse, const APInt &Val,
bool EmitSizeForWideNumbers = false
) {
if (Val.getBitWidth() <= 64) {
uint64_t V = Val.getSExtValue();
- if ((int64_t)V >= 0)
- Vals.push_back(V << 1);
- else
- Vals.push_back((-V << 1) | 1);
+ emitSignedInt64(Vals, V);
Code = bitc::CST_CODE_INTEGER;
AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
} else {
@@ -747,11 +749,7 @@ static void EmitAPInt(SmallVectorImpl<uint64_t> &Vals,
const uint64_t *RawWords = Val.getRawData();
for (unsigned i = 0; i != NWords; ++i) {
- int64_t V = RawWords[i];
- if (V >= 0)
- Vals.push_back(V << 1);
- else
- Vals.push_back((-V << 1) | 1);
+ emitSignedInt64(Vals, RawWords[i]);
}
Code = bitc::CST_CODE_WIDE_INTEGER;
}
@@ -814,7 +812,8 @@ static void WriteConstants(unsigned FirstVal, unsigned LastVal,
if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
Record.push_back(unsigned(IA->hasSideEffects()) |
- unsigned(IA->isAlignStack()) << 1);
+ unsigned(IA->isAlignStack()) << 1 |
+ unsigned(IA->getDialect()&1) << 2);
// Add the asm string.
const std::string &AsmStr = IA->getAsmString();
@@ -1024,12 +1023,13 @@ static void WriteModuleConstants(const ValueEnumerator &VE,
///
/// This function adds V's value ID to Vals. If the value ID is higher than the
/// instruction ID, then it is a forward reference, and it also includes the
-/// type ID.
+/// type ID. The value ID that is written is encoded relative to the InstID.
static bool PushValueAndType(const Value *V, unsigned InstID,
SmallVector<unsigned, 64> &Vals,
ValueEnumerator &VE) {
unsigned ValID = VE.getValueID(V);
- Vals.push_back(ValID);
+ // Make encoding relative to the InstID.
+ Vals.push_back(InstID - ValID);
if (ValID >= InstID) {
Vals.push_back(VE.getTypeID(V->getType()));
return true;
@@ -1037,6 +1037,30 @@ static bool PushValueAndType(const Value *V, unsigned InstID,
return false;
}
+/// pushValue - Like PushValueAndType, but where the type of the value is
+/// omitted (perhaps it was already encoded in an earlier operand).
+static void pushValue(const Value *V, unsigned InstID,
+ SmallVector<unsigned, 64> &Vals,
+ ValueEnumerator &VE) {
+ unsigned ValID = VE.getValueID(V);
+ Vals.push_back(InstID - ValID);
+}
+
+static void pushValue64(const Value *V, unsigned InstID,
+ SmallVector<uint64_t, 128> &Vals,
+ ValueEnumerator &VE) {
+ uint64_t ValID = VE.getValueID(V);
+ Vals.push_back(InstID - ValID);
+}
+
+static void pushValueSigned(const Value *V, unsigned InstID,
+ SmallVector<uint64_t, 128> &Vals,
+ ValueEnumerator &VE) {
+ unsigned ValID = VE.getValueID(V);
+ int64_t diff = ((int32_t)InstID - (int32_t)ValID);
+ emitSignedInt64(Vals, diff);
+}
+
/// WriteInstruction - Emit an instruction to the specified stream.
static void WriteInstruction(const Instruction &I, unsigned InstID,
ValueEnumerator &VE, BitstreamWriter &Stream,
@@ -1057,7 +1081,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Code = bitc::FUNC_CODE_INST_BINOP;
if (!PushValueAndType(I.getOperand(0), InstID, Vals, VE))
AbbrevToUse = FUNCTION_INST_BINOP_ABBREV;
- Vals.push_back(VE.getValueID(I.getOperand(1)));
+ pushValue(I.getOperand(1), InstID, Vals, VE);
Vals.push_back(GetEncodedBinaryOpcode(I.getOpcode()));
uint64_t Flags = GetOptimizationFlags(&I);
if (Flags != 0) {
@@ -1095,32 +1119,32 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::Select:
Code = bitc::FUNC_CODE_INST_VSELECT;
PushValueAndType(I.getOperand(1), InstID, Vals, VE);
- Vals.push_back(VE.getValueID(I.getOperand(2)));
+ pushValue(I.getOperand(2), InstID, Vals, VE);
PushValueAndType(I.getOperand(0), InstID, Vals, VE);
break;
case Instruction::ExtractElement:
Code = bitc::FUNC_CODE_INST_EXTRACTELT;
PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- Vals.push_back(VE.getValueID(I.getOperand(1)));
+ pushValue(I.getOperand(1), InstID, Vals, VE);
break;
case Instruction::InsertElement:
Code = bitc::FUNC_CODE_INST_INSERTELT;
PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- Vals.push_back(VE.getValueID(I.getOperand(1)));
- Vals.push_back(VE.getValueID(I.getOperand(2)));
+ pushValue(I.getOperand(1), InstID, Vals, VE);
+ pushValue(I.getOperand(2), InstID, Vals, VE);
break;
case Instruction::ShuffleVector:
Code = bitc::FUNC_CODE_INST_SHUFFLEVEC;
PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- Vals.push_back(VE.getValueID(I.getOperand(1)));
- Vals.push_back(VE.getValueID(I.getOperand(2)));
+ pushValue(I.getOperand(1), InstID, Vals, VE);
+ pushValue(I.getOperand(2), InstID, Vals, VE);
break;
case Instruction::ICmp:
case Instruction::FCmp:
// compare returning Int1Ty or vector of Int1Ty
Code = bitc::FUNC_CODE_INST_CMP2;
PushValueAndType(I.getOperand(0), InstID, Vals, VE);
- Vals.push_back(VE.getValueID(I.getOperand(1)));
+ pushValue(I.getOperand(1), InstID, Vals, VE);
Vals.push_back(cast<CmpInst>(I).getPredicate());
break;
@@ -1146,7 +1170,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals.push_back(VE.getValueID(II.getSuccessor(0)));
if (II.isConditional()) {
Vals.push_back(VE.getValueID(II.getSuccessor(1)));
- Vals.push_back(VE.getValueID(II.getCondition()));
+ pushValue(II.getCondition(), InstID, Vals, VE);
}
}
break;
@@ -1163,7 +1187,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
Vals64.push_back(SwitchRecordHeader);
Vals64.push_back(VE.getTypeID(SI.getCondition()->getType()));
- Vals64.push_back(VE.getValueID(SI.getCondition()));
+ pushValue64(SI.getCondition(), InstID, Vals64, VE);
Vals64.push_back(VE.getValueID(SI.getDefaultDest()));
Vals64.push_back(SI.getNumCases());
for (SwitchInst::CaseIt i = SI.case_begin(), e = SI.case_end();
@@ -1214,7 +1238,9 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::IndirectBr:
Code = bitc::FUNC_CODE_INST_INDIRECTBR;
Vals.push_back(VE.getTypeID(I.getOperand(0)->getType()));
- for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+ // Encode the address operand as relative, but not the basic blocks.
+ pushValue(I.getOperand(0), InstID, Vals, VE);
+ for (unsigned i = 1, e = I.getNumOperands(); i != e; ++i)
Vals.push_back(VE.getValueID(I.getOperand(i)));
break;
@@ -1233,7 +1259,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
// Emit value #'s for the fixed parameters.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
- Vals.push_back(VE.getValueID(I.getOperand(i))); // fixed param.
+ pushValue(I.getOperand(i), InstID, Vals, VE); // fixed param.
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
@@ -1255,12 +1281,19 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::PHI: {
const PHINode &PN = cast<PHINode>(I);
Code = bitc::FUNC_CODE_INST_PHI;
- Vals.push_back(VE.getTypeID(PN.getType()));
+ // With the newer instruction encoding, forward references could give
+ // negative valued IDs. This is most common for PHIs, so we use
+ // signed VBRs.
+ SmallVector<uint64_t, 128> Vals64;
+ Vals64.push_back(VE.getTypeID(PN.getType()));
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- Vals.push_back(VE.getValueID(PN.getIncomingValue(i)));
- Vals.push_back(VE.getValueID(PN.getIncomingBlock(i)));
+ pushValueSigned(PN.getIncomingValue(i), InstID, Vals64, VE);
+ Vals64.push_back(VE.getValueID(PN.getIncomingBlock(i)));
}
- break;
+ // Emit a Vals64 vector and exit.
+ Stream.EmitRecord(Code, Vals64, AbbrevToUse);
+ Vals64.clear();
+ return;
}
case Instruction::LandingPad: {
@@ -1310,7 +1343,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
else
Code = bitc::FUNC_CODE_INST_STORE;
PushValueAndType(I.getOperand(1), InstID, Vals, VE); // ptrty + ptr
- Vals.push_back(VE.getValueID(I.getOperand(0))); // val.
+ pushValue(I.getOperand(0), InstID, Vals, VE); // val.
Vals.push_back(Log2_32(cast<StoreInst>(I).getAlignment())+1);
Vals.push_back(cast<StoreInst>(I).isVolatile());
if (cast<StoreInst>(I).isAtomic()) {
@@ -1321,8 +1354,8 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::AtomicCmpXchg:
Code = bitc::FUNC_CODE_INST_CMPXCHG;
PushValueAndType(I.getOperand(0), InstID, Vals, VE); // ptrty + ptr
- Vals.push_back(VE.getValueID(I.getOperand(1))); // cmp.
- Vals.push_back(VE.getValueID(I.getOperand(2))); // newval.
+ pushValue(I.getOperand(1), InstID, Vals, VE); // cmp.
+ pushValue(I.getOperand(2), InstID, Vals, VE); // newval.
Vals.push_back(cast<AtomicCmpXchgInst>(I).isVolatile());
Vals.push_back(GetEncodedOrdering(
cast<AtomicCmpXchgInst>(I).getOrdering()));
@@ -1332,7 +1365,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::AtomicRMW:
Code = bitc::FUNC_CODE_INST_ATOMICRMW;
PushValueAndType(I.getOperand(0), InstID, Vals, VE); // ptrty + ptr
- Vals.push_back(VE.getValueID(I.getOperand(1))); // val.
+ pushValue(I.getOperand(1), InstID, Vals, VE); // val.
Vals.push_back(GetEncodedRMWOperation(
cast<AtomicRMWInst>(I).getOperation()));
Vals.push_back(cast<AtomicRMWInst>(I).isVolatile());
@@ -1357,8 +1390,13 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
PushValueAndType(CI.getCalledValue(), InstID, Vals, VE); // Callee
// Emit value #'s for the fixed parameters.
- for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i)
- Vals.push_back(VE.getValueID(CI.getArgOperand(i))); // fixed param.
+ for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+ // Check for labels (can happen with asm labels).
+ if (FTy->getParamType(i)->isLabelTy())
+ Vals.push_back(VE.getValueID(CI.getArgOperand(i)));
+ else
+ pushValue(CI.getArgOperand(i), InstID, Vals, VE); // fixed param.
+ }
// Emit type/value pairs for varargs params.
if (FTy->isVarArg()) {
@@ -1371,7 +1409,7 @@ static void WriteInstruction(const Instruction &I, unsigned InstID,
case Instruction::VAArg:
Code = bitc::FUNC_CODE_INST_VAARG;
Vals.push_back(VE.getTypeID(I.getOperand(0)->getType())); // valistty
- Vals.push_back(VE.getValueID(I.getOperand(0))); // valist.
+ pushValue(I.getOperand(0), InstID, Vals, VE); // valist.
Vals.push_back(VE.getTypeID(I.getType())); // restype.
break;
}
@@ -1513,8 +1551,8 @@ static void WriteFunction(const Function &F, ValueEnumerator &VE,
// Emit blockinfo, which defines the standard abbreviations etc.
static void WriteBlockInfo(const ValueEnumerator &VE, BitstreamWriter &Stream) {
// We only want to emit block info records for blocks that have multiple
- // instances: CONSTANTS_BLOCK, FUNCTION_BLOCK and VALUE_SYMTAB_BLOCK. Other
- // blocks can defined their abbrevs inline.
+ // instances: CONSTANTS_BLOCK, FUNCTION_BLOCK and VALUE_SYMTAB_BLOCK.
+ // Other blocks can define their abbrevs inline.
Stream.EnterBlockInfoBlock(2);
{ // 8-bit fixed-width VST_ENTRY/VST_BBENTRY strings.
@@ -1772,12 +1810,10 @@ static void WriteModuleUseLists(const Module *M, ValueEnumerator &VE,
static void WriteModule(const Module *M, BitstreamWriter &Stream) {
Stream.EnterSubblock(bitc::MODULE_BLOCK_ID, 3);
- // Emit the version number if it is non-zero.
- if (CurVersion) {
- SmallVector<unsigned, 1> Vals;
- Vals.push_back(CurVersion);
- Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
- }
+ SmallVector<unsigned, 1> Vals;
+ unsigned CurVersion = 1;
+ Vals.push_back(CurVersion);
+ Stream.EmitRecord(bitc::MODULE_CODE_VERSION, Vals);
// Analyze the module, enumerating globals, functions, etc.
ValueEnumerator VE(M);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index a6ca53606248..75468e6c5e2e 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -78,9 +78,9 @@ private:
unsigned FirstFuncConstantID;
unsigned FirstInstID;
-
- ValueEnumerator(const ValueEnumerator &); // DO NOT IMPLEMENT
- void operator=(const ValueEnumerator &); // DO NOT IMPLEMENT
+
+ ValueEnumerator(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
+ void operator=(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
public:
ValueEnumerator(const Module *M);
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 205480a46922..7a1c049d522d 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -635,7 +635,7 @@ bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
--R;
const unsigned NewSuperReg = Order[R];
// Don't consider non-allocatable registers
- if (!RegClassInfo.isAllocatable(NewSuperReg)) continue;
+ if (!MRI.isAllocatable(NewSuperReg)) continue;
// Don't replace a register with itself.
if (NewSuperReg == SuperReg) continue;
@@ -818,7 +818,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
DEBUG(dbgs() << "\tAntidep reg: " << TRI->getName(AntiDepReg));
assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
- if (!RegClassInfo.isAllocatable(AntiDepReg)) {
+ if (!MRI.isAllocatable(AntiDepReg)) {
// Don't break anti-dependencies on non-allocatable registers.
DEBUG(dbgs() << " (non-allocatable)\n");
continue;
diff --git a/lib/CodeGen/AllocationOrder.cpp b/lib/CodeGen/AllocationOrder.cpp
index 32ad34a76d69..7cde136c5ef3 100644
--- a/lib/CodeGen/AllocationOrder.cpp
+++ b/lib/CodeGen/AllocationOrder.cpp
@@ -29,6 +29,7 @@ AllocationOrder::AllocationOrder(unsigned VirtReg,
const TargetRegisterClass *RC = VRM.getRegInfo().getRegClass(VirtReg);
std::pair<unsigned, unsigned> HintPair =
VRM.getRegInfo().getRegAllocationHint(VirtReg);
+ const MachineRegisterInfo &MRI = VRM.getRegInfo();
// HintPair.second is a register, phys or virt.
Hint = HintPair.second;
@@ -52,7 +53,7 @@ AllocationOrder::AllocationOrder(unsigned VirtReg,
unsigned *P = new unsigned[Order.size()];
Begin = P;
for (unsigned i = 0; i != Order.size(); ++i)
- if (!RCI.isReserved(Order[i]))
+ if (!MRI.isReserved(Order[i]))
*P++ = Order[i];
End = P;
@@ -69,7 +70,7 @@ AllocationOrder::AllocationOrder(unsigned VirtReg,
// The hint must be a valid physreg for allocation.
if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
- !RC->contains(Hint) || RCI.isReserved(Hint)))
+ !RC->contains(Hint) || MRI.isReserved(Hint)))
Hint = 0;
}
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 447f3981b521..5162ad762e73 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -21,7 +21,7 @@
#include "llvm/Module.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/ErrorHandling.h"
@@ -79,7 +79,7 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
uint64_t StartingOffset) {
// Given a struct type, recursively traverse the elements.
if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = TLI.getTargetData()->getStructLayout(STy);
+ const StructLayout *SL = TLI.getDataLayout()->getStructLayout(STy);
for (StructType::element_iterator EB = STy->element_begin(),
EI = EB,
EE = STy->element_end();
@@ -91,7 +91,7 @@ void llvm::ComputeValueVTs(const TargetLowering &TLI, Type *Ty,
// Given an array type, recursively traverse the elements.
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Type *EltTy = ATy->getElementType();
- uint64_t EltSize = TLI.getTargetData()->getTypeAllocSize(EltTy);
+ uint64_t EltSize = TLI.getDataLayout()->getTypeAllocSize(EltTy);
for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
ComputeValueVTs(TLI, EltTy, ValueVTs, Offsets,
StartingOffset + i * EltSize);
@@ -314,11 +314,13 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr,
// the return. Ignore noalias because it doesn't affect the call sequence.
const Function *F = ExitBB->getParent();
Attributes CallerRetAttr = F->getAttributes().getRetAttributes();
- if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias)
+ if (AttrBuilder(CalleeRetAttr).removeAttribute(Attributes::NoAlias) !=
+ AttrBuilder(CallerRetAttr).removeAttribute(Attributes::NoAlias))
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt))
+ if (CallerRetAttr.hasAttribute(Attributes::ZExt) ||
+ CallerRetAttr.hasAttribute(Attributes::SExt))
return false;
// Otherwise, make sure the unmodified return value of I is the return value.
@@ -354,11 +356,13 @@ bool llvm::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
// Conservatively require the attributes of the call to match those of
// the return. Ignore noalias because it doesn't affect the call sequence.
Attributes CallerRetAttr = F->getAttributes().getRetAttributes();
- if (CallerRetAttr & ~Attribute::NoAlias)
+ if (AttrBuilder(CallerRetAttr)
+ .removeAttribute(Attributes::NoAlias).hasAttributes())
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt))
+ if (CallerRetAttr.hasAttribute(Attributes::ZExt) ||
+ CallerRetAttr.hasAttribute(Attributes::SExt))
return false;
// Check if the only use is a function return node.
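The isInTailCallPosition hunks above replace the old bitmask arithmetic on return attributes with AttrBuilder-based queries. A minimal standalone sketch of the same rule, using plain std::set<std::string> in place of LLVM's Attributes class (the type and the retAttrsCompatibleForTailCall name are illustrative assumptions, not the in-tree API):

#include <set>
#include <string>

// Sketch of the tail-call compatibility rule: caller and callee return
// attributes must match once "noalias" (which does not affect the call
// sequence) is ignored, and the caller must not rely on a sign/zero
// extension of the return value.
static bool retAttrsCompatibleForTailCall(std::set<std::string> Caller,
                                          std::set<std::string> Callee) {
  Caller.erase("noalias");
  Callee.erase("noalias");
  if (Caller != Callee)
    return false;
  return !Caller.count("zeroext") && !Caller.count("signext");
}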
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index bf5d8c488000..b2ebf04e518f 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -24,7 +24,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index d9be7a1a58f7..d74a70362a2a 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -33,7 +33,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -67,7 +67,7 @@ static gcp_map_type &getGCMap(void *&P) {
/// getGVAlignmentLog2 - Return the alignment to use for the specified global
/// value in log2 form. This rounds up to the preferred alignment if possible
/// and legal.
-static unsigned getGVAlignmentLog2(const GlobalValue *GV, const TargetData &TD,
+static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &TD,
unsigned InBits = 0) {
unsigned NumBits = 0;
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
@@ -131,9 +131,9 @@ const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
}
-/// getTargetData - Return information about data layout.
-const TargetData &AsmPrinter::getTargetData() const {
- return *TM.getTargetData();
+/// getDataLayout - Return information about data layout.
+const DataLayout &AsmPrinter::getDataLayout() const {
+ return *TM.getDataLayout();
}
/// getCurrentSection() - Return the current section we are emitting to.
@@ -160,7 +160,7 @@ bool AsmPrinter::doInitialization(Module &M) {
const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
.Initialize(OutContext, TM);
- Mang = new Mangler(OutContext, *TM.getTargetData());
+ Mang = new Mangler(OutContext, *TM.getDataLayout());
// Allow the target to emit any magic that it wants at the start of the file.
EmitStartOfAsmFile(M);
@@ -213,16 +213,16 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const {
case GlobalValue::CommonLinkage:
case GlobalValue::LinkOnceAnyLinkage:
case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::LinkOnceODRAutoHideLinkage:
case GlobalValue::WeakAnyLinkage:
case GlobalValue::WeakODRLinkage:
case GlobalValue::LinkerPrivateWeakLinkage:
- case GlobalValue::LinkerPrivateWeakDefAutoLinkage:
if (MAI->getWeakDefDirective() != 0) {
// .globl _foo
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global);
if ((GlobalValue::LinkageTypes)Linkage !=
- GlobalValue::LinkerPrivateWeakDefAutoLinkage)
+ GlobalValue::LinkOnceODRAutoHideLinkage)
// .weak_definition _foo
OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
else
@@ -280,7 +280,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
uint64_t Size = TD->getTypeAllocSize(GV->getType()->getElementType());
// If the alignment is specified, we *must* obey it. Overaligning a global
@@ -312,8 +312,8 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
return;
}
- if (MAI->getLCOMMDirectiveType() != LCOMM::None &&
- (MAI->getLCOMMDirectiveType() != LCOMM::NoAlignment || Align == 1)) {
+ if (Align == 1 ||
+ MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
// .lcomm _foo, 42
OutStreamer.EmitLocalCommonSymbol(GVSym, Size, Align);
return;
@@ -482,9 +482,8 @@ void AsmPrinter::EmitFunctionEntryLabel() {
"' label emitted multiple times to assembly file");
}
-
-/// EmitComments - Pretty-print comments for instructions.
-static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+/// emitComments - Pretty-print comments for instructions.
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetMachine &TM = MF->getTarget();
@@ -519,16 +518,16 @@ static void EmitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
CommentOS << " Reload Reuse\n";
}
-/// EmitImplicitDef - This method emits the specified machine instruction
+/// emitImplicitDef - This method emits the specified machine instruction
/// that is an implicit def.
-static void EmitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
+static void emitImplicitDef(const MachineInstr *MI, AsmPrinter &AP) {
unsigned RegNo = MI->getOperand(0).getReg();
AP.OutStreamer.AddComment(Twine("implicit-def: ") +
AP.TM.getRegisterInfo()->getName(RegNo));
AP.OutStreamer.AddBlankLine();
}
-static void EmitKill(const MachineInstr *MI, AsmPrinter &AP) {
+static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
std::string Str = "kill:";
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &Op = MI->getOperand(i);
@@ -541,10 +540,10 @@ static void EmitKill(const MachineInstr *MI, AsmPrinter &AP) {
AP.OutStreamer.AddBlankLine();
}
-/// EmitDebugValueComment - This method handles the target-independent form
+/// emitDebugValueComment - This method handles the target-independent form
/// of DBG_VALUE, returning true if it was able to do so. A false return
/// means the target will need to handle MI in EmitInstruction.
-static bool EmitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
+static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
// This code handles only the 3-operand target-independent form.
if (MI->getNumOperands() != 3)
return false;
@@ -674,7 +673,7 @@ void AsmPrinter::EmitFunctionBody() {
}
if (isVerbose())
- EmitComments(*II, OutStreamer.GetCommentOS());
+ emitComments(*II, OutStreamer.GetCommentOS());
switch (II->getOpcode()) {
case TargetOpcode::PROLOG_LABEL:
@@ -690,15 +689,15 @@ void AsmPrinter::EmitFunctionBody() {
break;
case TargetOpcode::DBG_VALUE:
if (isVerbose()) {
- if (!EmitDebugValueComment(II, *this))
+ if (!emitDebugValueComment(II, *this))
EmitInstruction(II);
}
break;
case TargetOpcode::IMPLICIT_DEF:
- if (isVerbose()) EmitImplicitDef(II, *this);
+ if (isVerbose()) emitImplicitDef(II, *this);
break;
case TargetOpcode::KILL:
- if (isVerbose()) EmitKill(II, *this);
+ if (isVerbose()) emitKill(II, *this);
break;
default:
if (!TM.hasMCUseLoc())
@@ -992,7 +991,7 @@ void AsmPrinter::EmitConstantPool() {
Kind = SectionKind::getReadOnlyWithRelLocal();
break;
case 0:
- switch (TM.getTargetData()->getTypeAllocSize(CPE.getType())) {
+ switch (TM.getDataLayout()->getTypeAllocSize(CPE.getType())) {
case 4: Kind = SectionKind::getMergeableConst4(); break;
case 8: Kind = SectionKind::getMergeableConst8(); break;
case 16: Kind = SectionKind::getMergeableConst16();break;
@@ -1038,7 +1037,7 @@ void AsmPrinter::EmitConstantPool() {
OutStreamer.EmitFill(NewOffset - Offset, 0/*fillval*/, 0/*addrspace*/);
Type *Ty = CPE.getType();
- Offset = NewOffset + TM.getTargetData()->getTypeAllocSize(Ty);
+ Offset = NewOffset + TM.getDataLayout()->getTypeAllocSize(Ty);
OutStreamer.EmitLabel(GetCPISymbol(CPI));
if (CPE.isMachineConstantPoolEntry())
@@ -1081,7 +1080,12 @@ void AsmPrinter::EmitJumpTableInfo() {
JTInDiffSection = true;
}
- EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getTargetData())));
+ EmitAlignment(Log2_32(MJTI->getEntryAlignment(*TM.getDataLayout())));
+
+ // Jump tables in code sections are marked with a data_region directive
+ // where that's supported.
+ if (!JTInDiffSection)
+ OutStreamer.EmitDataRegion(MCDR_DataRegionJT32);
for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
@@ -1123,6 +1127,8 @@ void AsmPrinter::EmitJumpTableInfo() {
for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii)
EmitJumpTableEntry(MJTI, JTBBs[ii], JTI);
}
+ if (!JTInDiffSection)
+ OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
}
/// EmitJumpTableEntry - Emit a jump table entry for the specified MBB to the
@@ -1190,7 +1196,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
assert(Value && "Unknown entry kind!");
- unsigned EntrySize = MJTI->getEntrySize(*TM.getTargetData());
+ unsigned EntrySize = MJTI->getEntrySize(*TM.getDataLayout());
OutStreamer.EmitValue(Value, EntrySize, /*addrspace*/0);
}
@@ -1292,7 +1298,7 @@ void AsmPrinter::EmitXXStructorList(const Constant *List, bool isCtor) {
}
// Emit the function pointers in the target-specific order
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
unsigned Align = Log2_32(TD->getPointerPrefAlignment());
std::stable_sort(Structors.begin(), Structors.end(), priority_order);
for (unsigned i = 0, e = Structors.size(); i != e; ++i) {
@@ -1408,7 +1414,7 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
// if required for correctness.
//
void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalValue *GV) const {
- if (GV) NumBits = getGVAlignmentLog2(GV, *TM.getTargetData(), NumBits);
+ if (GV) NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(), NumBits);
if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment.
@@ -1422,9 +1428,9 @@ void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalValue *GV) const {
// Constant emission.
//===----------------------------------------------------------------------===//
-/// LowerConstant - Lower the specified LLVM Constant to an MCExpr.
+/// lowerConstant - Lower the specified LLVM Constant to an MCExpr.
///
-static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
+static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
MCContext &Ctx = AP.OutContext;
if (CV->isNullValue() || isa<UndefValue>(CV))
@@ -1447,12 +1453,12 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
switch (CE->getOpcode()) {
default:
// If the code isn't optimized, there may be outstanding folding
- // opportunities. Attempt to fold the expression using TargetData as a
+ // opportunities. Attempt to fold the expression using DataLayout as a
// last resort before giving up.
if (Constant *C =
- ConstantFoldConstantExpression(CE, AP.TM.getTargetData()))
+ ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
if (C != CE)
- return LowerConstant(C, AP);
+ return lowerConstant(C, AP);
// Otherwise report the problem to the user.
{
@@ -1464,21 +1470,20 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
report_fatal_error(OS.str());
}
case Instruction::GetElementPtr: {
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
// Generate a symbolic expression for the byte address
const Constant *PtrVal = CE->getOperand(0);
SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end());
int64_t Offset = TD.getIndexedOffset(PtrVal->getType(), IdxVec);
- const MCExpr *Base = LowerConstant(CE->getOperand(0), AP);
+ const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
if (Offset == 0)
return Base;
// Truncate/sext the offset to the pointer size.
- if (TD.getPointerSizeInBits() != 64) {
- int SExtAmount = 64-TD.getPointerSizeInBits();
- Offset = (Offset << SExtAmount) >> SExtAmount;
- }
+ unsigned Width = TD.getPointerSizeInBits();
+ if (Width < 64)
+ Offset = SignExtend64(Offset, Width);
return MCBinaryExpr::CreateAdd(Base, MCConstantExpr::Create(Offset, Ctx),
Ctx);
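The GetElementPtr case above swaps the manual shift pair for SignExtend64. A small self-contained illustration of the equivalent computation for pointer widths below 64 bits; signExtendTo is a local helper written for this sketch, not an LLVM API:

#include <cassert>
#include <cstdint>

// Sign-extend the low Width bits of Value to a full int64_t, mirroring
// what llvm::SignExtend64(Value, Width) computes, without relying on
// shifts of negative numbers.
static int64_t signExtendTo(uint64_t Value, unsigned Width) {
  assert(Width > 0 && Width < 64 && "only needed for sub-64-bit pointers");
  uint64_t SignBit = uint64_t(1) << (Width - 1);
  uint64_t Mask = (uint64_t(1) << Width) - 1;
  Value &= Mask;
  return int64_t(Value ^ SignBit) - int64_t(SignBit);
}

int main() {
  // On a 32-bit pointer target, a GEP byte offset of 0xFFFFFFFF is -1.
  assert(signExtendTo(0xFFFFFFFFull, 32) == -1);
  // Positive offsets are unchanged.
  assert(signExtendTo(0x7FFFFFFFull, 32) == 0x7FFFFFFF);
  return 0;
}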
@@ -1491,26 +1496,26 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
// is reasonable to treat their delta as a 32-bit value.
// FALL THROUGH.
case Instruction::BitCast:
- return LowerConstant(CE->getOperand(0), AP);
+ return lowerConstant(CE->getOperand(0), AP);
case Instruction::IntToPtr: {
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
// Handle casts to pointers by changing them into casts to the appropriate
// integer type. This promotes constant folding and simplifies this code.
Constant *Op = CE->getOperand(0);
Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()),
false/*ZExt*/);
- return LowerConstant(Op, AP);
+ return lowerConstant(Op, AP);
}
case Instruction::PtrToInt: {
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
// Support only foldable casts to/from pointers that can be eliminated by
// changing the pointer to the appropriately sized integer type.
Constant *Op = CE->getOperand(0);
Type *Ty = CE->getType();
- const MCExpr *OpExpr = LowerConstant(Op, AP);
+ const MCExpr *OpExpr = lowerConstant(Op, AP);
// We can emit the pointer value into this slot if the slot is an
// integer slot equal to the size of the pointer.
@@ -1536,8 +1541,8 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
case Instruction::And:
case Instruction::Or:
case Instruction::Xor: {
- const MCExpr *LHS = LowerConstant(CE->getOperand(0), AP);
- const MCExpr *RHS = LowerConstant(CE->getOperand(1), AP);
+ const MCExpr *LHS = lowerConstant(CE->getOperand(0), AP);
+ const MCExpr *RHS = lowerConstant(CE->getOperand(1), AP);
switch (CE->getOpcode()) {
default: llvm_unreachable("Unknown binary operator constant cast expr");
case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
@@ -1554,7 +1559,7 @@ static const MCExpr *LowerConstant(const Constant *CV, AsmPrinter &AP) {
}
}
-static void EmitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
+static void emitGlobalConstantImpl(const Constant *C, unsigned AddrSpace,
AsmPrinter &AP);
/// isRepeatedByteSequence - Determine whether the given value is
@@ -1578,7 +1583,7 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
if (CI->getBitWidth() > 64) return -1;
- uint64_t Size = TM.getTargetData()->getTypeAllocSize(V->getType());
+ uint64_t Size = TM.getDataLayout()->getTypeAllocSize(V->getType());
uint64_t Value = CI->getZExtValue();
// Make sure the constant is at least 8 bits long and has a power
@@ -1616,13 +1621,13 @@ static int isRepeatedByteSequence(const Value *V, TargetMachine &TM) {
return -1;
}
-static void EmitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
+static void emitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
unsigned AddrSpace,AsmPrinter &AP){
// See if we can aggregate this into a .fill, if so, emit it as such.
int Value = isRepeatedByteSequence(CDS, AP.TM);
if (Value != -1) {
- uint64_t Bytes = AP.TM.getTargetData()->getTypeAllocSize(CDS->getType());
+ uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CDS->getType());
// Don't emit a 1-byte object as a .fill.
if (Bytes > 1)
return AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
@@ -1672,7 +1677,7 @@ static void EmitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
}
}
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
unsigned Size = TD.getTypeAllocSize(CDS->getType());
unsigned EmittedSize = TD.getTypeAllocSize(CDS->getType()->getElementType()) *
CDS->getNumElements();
@@ -1681,28 +1686,28 @@ static void EmitGlobalConstantDataSequential(const ConstantDataSequential *CDS,
}
-static void EmitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
+static void emitGlobalConstantArray(const ConstantArray *CA, unsigned AddrSpace,
AsmPrinter &AP) {
// See if we can aggregate some values. Make sure it can be
// represented as a series of bytes of the constant value.
int Value = isRepeatedByteSequence(CA, AP.TM);
if (Value != -1) {
- uint64_t Bytes = AP.TM.getTargetData()->getTypeAllocSize(CA->getType());
+ uint64_t Bytes = AP.TM.getDataLayout()->getTypeAllocSize(CA->getType());
AP.OutStreamer.EmitFill(Bytes, Value, AddrSpace);
}
else {
for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
- EmitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
+ emitGlobalConstantImpl(CA->getOperand(i), AddrSpace, AP);
}
}
-static void EmitGlobalConstantVector(const ConstantVector *CV,
+static void emitGlobalConstantVector(const ConstantVector *CV,
unsigned AddrSpace, AsmPrinter &AP) {
for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
- EmitGlobalConstantImpl(CV->getOperand(i), AddrSpace, AP);
+ emitGlobalConstantImpl(CV->getOperand(i), AddrSpace, AP);
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
unsigned Size = TD.getTypeAllocSize(CV->getType());
unsigned EmittedSize = TD.getTypeAllocSize(CV->getType()->getElementType()) *
CV->getType()->getNumElements();
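Both emitGlobalConstantDataSequential and emitGlobalConstantArray above call isRepeatedByteSequence so that a constant made of one repeated byte can be emitted as a single .fill directive. A hedged, standalone sketch of that check for a plain integer; the function name and parameters are invented for illustration:

#include <cstdint>

// Return the repeated byte value if every byte of the SizeInBytes-wide
// integer Value is identical, or -1 otherwise; a repeated byte lets the
// printer emit one ".fill <size>, 1, <byte>" instead of many values.
static int repeatedByteOrMinusOne(uint64_t Value, unsigned SizeInBytes) {
  unsigned Byte = unsigned(Value) & 0xFF;
  for (unsigned i = 1; i < SizeInBytes; ++i)
    if (unsigned((Value >> (8 * i)) & 0xFF) != Byte)
      return -1;
  return int(Byte);
}
// Example: repeatedByteOrMinusOne(0xCCCCCCCCCCCCCCCCull, 8) returns 0xCC,
// so an 8-byte constant of that value becomes ".fill 8, 1, 0xcc".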
@@ -1710,10 +1715,10 @@ static void EmitGlobalConstantVector(const ConstantVector *CV,
AP.OutStreamer.EmitZeros(Padding, AddrSpace);
}
-static void EmitGlobalConstantStruct(const ConstantStruct *CS,
+static void emitGlobalConstantStruct(const ConstantStruct *CS,
unsigned AddrSpace, AsmPrinter &AP) {
// Print the fields in successive locations. Pad to align if needed!
- const TargetData *TD = AP.TM.getTargetData();
+ const DataLayout *TD = AP.TM.getDataLayout();
unsigned Size = TD->getTypeAllocSize(CS->getType());
const StructLayout *Layout = TD->getStructLayout(CS->getType());
uint64_t SizeSoFar = 0;
@@ -1727,7 +1732,7 @@ static void EmitGlobalConstantStruct(const ConstantStruct *CS,
SizeSoFar += FieldSize + PadSize;
// Now print the actual field value.
- EmitGlobalConstantImpl(Field, AddrSpace, AP);
+ emitGlobalConstantImpl(Field, AddrSpace, AP);
// Insert padding - this may include padding to increase the size of the
// current field up to the ABI size (if the struct is not packed) as well
@@ -1738,7 +1743,7 @@ static void EmitGlobalConstantStruct(const ConstantStruct *CS,
"Layout of constant struct may be incorrect!");
}
-static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
+static void emitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
AsmPrinter &AP) {
if (CFP->getType()->isHalfTy()) {
if (AP.isVerbose()) {
@@ -1793,7 +1798,7 @@ static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
<< DoubleVal.convertToDouble() << '\n';
}
- if (AP.TM.getTargetData()->isBigEndian()) {
+ if (AP.TM.getDataLayout()->isBigEndian()) {
AP.OutStreamer.EmitIntValue(p[1], 2, AddrSpace);
AP.OutStreamer.EmitIntValue(p[0], 8, AddrSpace);
} else {
@@ -1802,7 +1807,7 @@ static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
}
// Emit the tail padding for the long double.
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
AP.OutStreamer.EmitZeros(TD.getTypeAllocSize(CFP->getType()) -
TD.getTypeStoreSize(CFP->getType()), AddrSpace);
return;
@@ -1814,7 +1819,7 @@ static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
// API needed to prevent premature destruction.
APInt API = CFP->getValueAPF().bitcastToAPInt();
const uint64_t *p = API.getRawData();
- if (AP.TM.getTargetData()->isBigEndian()) {
+ if (AP.TM.getDataLayout()->isBigEndian()) {
AP.OutStreamer.EmitIntValue(p[0], 8, AddrSpace);
AP.OutStreamer.EmitIntValue(p[1], 8, AddrSpace);
} else {
@@ -1823,9 +1828,9 @@ static void EmitGlobalConstantFP(const ConstantFP *CFP, unsigned AddrSpace,
}
}
-static void EmitGlobalConstantLargeInt(const ConstantInt *CI,
+static void emitGlobalConstantLargeInt(const ConstantInt *CI,
unsigned AddrSpace, AsmPrinter &AP) {
- const TargetData *TD = AP.TM.getTargetData();
+ const DataLayout *TD = AP.TM.getDataLayout();
unsigned BitWidth = CI->getBitWidth();
assert((BitWidth & 63) == 0 && "only support multiples of 64-bits");
@@ -1839,9 +1844,9 @@ static void EmitGlobalConstantLargeInt(const ConstantInt *CI,
}
}
-static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
+static void emitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
AsmPrinter &AP) {
- const TargetData *TD = AP.TM.getTargetData();
+ const DataLayout *TD = AP.TM.getDataLayout();
uint64_t Size = TD->getTypeAllocSize(CV->getType());
if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
return AP.OutStreamer.EmitZeros(Size, AddrSpace);
@@ -1858,13 +1863,13 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
AP.OutStreamer.EmitIntValue(CI->getZExtValue(), Size, AddrSpace);
return;
default:
- EmitGlobalConstantLargeInt(CI, AddrSpace, AP);
+ emitGlobalConstantLargeInt(CI, AddrSpace, AP);
return;
}
}
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
- return EmitGlobalConstantFP(CFP, AddrSpace, AP);
+ return emitGlobalConstantFP(CFP, AddrSpace, AP);
if (isa<ConstantPointerNull>(CV)) {
AP.OutStreamer.EmitIntValue(0, Size, AddrSpace);
@@ -1872,19 +1877,19 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
}
if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV))
- return EmitGlobalConstantDataSequential(CDS, AddrSpace, AP);
+ return emitGlobalConstantDataSequential(CDS, AddrSpace, AP);
if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
- return EmitGlobalConstantArray(CVA, AddrSpace, AP);
+ return emitGlobalConstantArray(CVA, AddrSpace, AP);
if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
- return EmitGlobalConstantStruct(CVS, AddrSpace, AP);
+ return emitGlobalConstantStruct(CVS, AddrSpace, AP);
if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
// Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
// vectors).
if (CE->getOpcode() == Instruction::BitCast)
- return EmitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP);
+ return emitGlobalConstantImpl(CE->getOperand(0), AddrSpace, AP);
if (Size > 8) {
// If the constant expression's size is greater than 64-bits, then we have
@@ -1892,23 +1897,23 @@ static void EmitGlobalConstantImpl(const Constant *CV, unsigned AddrSpace,
// that way.
Constant *New = ConstantFoldConstantExpression(CE, TD);
if (New && New != CE)
- return EmitGlobalConstantImpl(New, AddrSpace, AP);
+ return emitGlobalConstantImpl(New, AddrSpace, AP);
}
}
if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
- return EmitGlobalConstantVector(V, AddrSpace, AP);
+ return emitGlobalConstantVector(V, AddrSpace, AP);
// Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it
// through the streamer with EmitValue.
- AP.OutStreamer.EmitValue(LowerConstant(CV, AP), Size, AddrSpace);
+ AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size, AddrSpace);
}
/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
void AsmPrinter::EmitGlobalConstant(const Constant *CV, unsigned AddrSpace) {
- uint64_t Size = TM.getTargetData()->getTypeAllocSize(CV->getType());
+ uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
if (Size)
- EmitGlobalConstantImpl(CV, AddrSpace, *this);
+ emitGlobalConstantImpl(CV, AddrSpace, *this);
else if (MAI->hasSubsectionsViaSymbols()) {
// If the global has zero size, emit a single byte so that two labels don't
// look like they are at the same location.
@@ -2023,8 +2028,8 @@ static void PrintChildLoopComment(raw_ostream &OS, const MachineLoop *Loop,
}
}
-/// EmitBasicBlockLoopComments - Pretty-print comments for basic blocks.
-static void EmitBasicBlockLoopComments(const MachineBasicBlock &MBB,
+/// emitBasicBlockLoopComments - Pretty-print comments for basic blocks.
+static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
const MachineLoopInfo *LI,
const AsmPrinter &AP) {
// Add loop depth information
@@ -2090,7 +2095,7 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock *MBB) const {
if (const BasicBlock *BB = MBB->getBasicBlock())
if (BB->hasName())
OutStreamer.AddComment("%" + BB->getName());
- EmitBasicBlockLoopComments(*MBB, LI, *this);
+ emitBasicBlockLoopComments(*MBB, LI, *this);
}
// Print the main label for the block.
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 90d511cbab0a..d94e1fe61bf7 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -18,7 +18,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -112,7 +112,7 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
switch (Encoding & 0x07) {
default: llvm_unreachable("Invalid encoded value.");
- case dwarf::DW_EH_PE_absptr: return TM.getTargetData()->getPointerSize();
+ case dwarf::DW_EH_PE_absptr: return TM.getDataLayout()->getPointerSize();
case dwarf::DW_EH_PE_udata2: return 2;
case dwarf::DW_EH_PE_udata4: return 4;
case dwarf::DW_EH_PE_udata8: return 8;
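GetSizeOfEncodedValue above now asks DataLayout for the width of DW_EH_PE_absptr; the other encodings are fixed-width. A minimal sketch of that mapping as a free function (encodedValueSize and the PtrSize parameter are assumptions for this example, not the AsmPrinter interface):

#include <cstdlib>

// DWARF EH pointer-encoding constants; the low three bits select the width.
enum {
  DW_EH_PE_absptr = 0x00,
  DW_EH_PE_udata2 = 0x02,
  DW_EH_PE_udata4 = 0x03,
  DW_EH_PE_udata8 = 0x04,
  DW_EH_PE_omit   = 0xff
};

static unsigned encodedValueSize(unsigned Encoding, unsigned PtrSize) {
  if (Encoding == DW_EH_PE_omit)
    return 0;
  switch (Encoding & 0x07) {
  case DW_EH_PE_absptr: return PtrSize; // target pointer width, from DataLayout
  case DW_EH_PE_udata2: return 2;
  case DW_EH_PE_udata4: return 4;
  case DW_EH_PE_udata8: return 8;
  default: abort();                     // unsupported encoding
  }
}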
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index db43b06c70f2..50f0fc30a07c 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -43,10 +43,10 @@ namespace {
};
}
-/// SrcMgrDiagHandler - This callback is invoked when the SourceMgr for an
+/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
/// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo
/// struct above.
-static void SrcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
+static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
assert(DiagInfo && "Diagnostic context not passed down?");
@@ -68,7 +68,8 @@ static void SrcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
}
/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
-void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
+void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
+ InlineAsm::AsmDialect Dialect) const {
assert(!Str.empty() && "Can't emit empty inline asm block");
// Remember if the buffer is nul terminated or not so we can avoid a copy.
@@ -91,12 +92,12 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
LLVMContext &LLVMCtx = MMI->getModule()->getContext();
bool HasDiagHandler = false;
if (LLVMCtx.getInlineAsmDiagnosticHandler() != 0) {
- // If the source manager has an issue, we arrange for SrcMgrDiagHandler
+ // If the source manager has an issue, we arrange for srcMgrDiagHandler
// to be invoked, getting DiagInfo passed into it.
DiagInfo.LocInfo = LocMDNode;
DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
- SrcMgr.setDiagHandler(SrcMgrDiagHandler, &DiagInfo);
+ SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo);
HasDiagHandler = true;
}
@@ -126,6 +127,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
if (!TAP)
report_fatal_error("Inline asm not supported by this streamer because"
" we don't have an asm parser for this target\n");
+ Parser->setAssemblerDialect(Dialect);
Parser->setTargetParser(*TAP.get());
// Don't implicitly switch to the text section before the asm.
@@ -135,71 +137,113 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode) const {
report_fatal_error("Error parsing inline asm\n");
}
+static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, int InlineAsmVariant,
+ AsmPrinter *AP, unsigned LocCookie,
+ raw_ostream &OS) {
+ // Switch to the inline assembly variant.
+ OS << "\t.intel_syntax\n\t";
-/// EmitInlineAsm - This method formats and emits the specified machine
-/// instruction that is an inline asm.
-void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
- assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
-
+ const char *LastEmitted = AsmStr; // One past the last character emitted.
unsigned NumOperands = MI->getNumOperands();
- // Count the number of register definitions to find the asm string.
- unsigned NumDefs = 0;
- for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
- ++NumDefs)
- assert(NumDefs != NumOperands-2 && "No asm string?");
+ while (*LastEmitted) {
+ switch (*LastEmitted) {
+ default: {
+ // Not a special case, emit the string section literally.
+ const char *LiteralEnd = LastEmitted+1;
+ while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+ *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+ ++LiteralEnd;
- assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+ OS.write(LastEmitted, LiteralEnd-LastEmitted);
+ LastEmitted = LiteralEnd;
+ break;
+ }
+ case '\n':
+ ++LastEmitted; // Consume newline character.
+ OS << '\n'; // Indent code with newline.
+ break;
+ case '$': {
+ ++LastEmitted; // Consume '$' character.
+ bool Done = true;
- // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
- const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+ // Handle escapes.
+ switch (*LastEmitted) {
+ default: Done = false; break;
+ case '$':
+ ++LastEmitted; // Consume second '$' character.
+ break;
+ }
+ if (Done) break;
- // If this asmstr is empty, just print the #APP/#NOAPP markers.
- // These are useful to see where empty asm's wound up.
- if (AsmStr[0] == 0) {
- // Don't emit the comments if writing to a .o file.
- if (!OutStreamer.hasRawTextSupport()) return;
+ const char *IDStart = LastEmitted;
+ const char *IDEnd = IDStart;
+ while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
- OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
- MAI->getInlineAsmStart());
- OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
- MAI->getInlineAsmEnd());
- return;
- }
+ unsigned Val;
+ if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+ report_fatal_error("Bad $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ LastEmitted = IDEnd;
- // Emit the #APP start marker. This has to happen even if verbose-asm isn't
- // enabled, so we use EmitRawText.
- if (OutStreamer.hasRawTextSupport())
- OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
- MAI->getInlineAsmStart());
+ if (Val >= NumOperands-1)
+ report_fatal_error("Invalid $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
- // Get the !srcloc metadata node if we have it, and decode the loc cookie from
- // it.
- unsigned LocCookie = 0;
- const MDNode *LocMD = 0;
- for (unsigned i = MI->getNumOperands(); i != 0; --i) {
- if (MI->getOperand(i-1).isMetadata() &&
- (LocMD = MI->getOperand(i-1).getMetadata()) &&
- LocMD->getNumOperands() != 0) {
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
- LocCookie = CI->getZExtValue();
- break;
- }
- }
- }
+ // Okay, we finally have a value number. Ask the target to print this
+ // operand!
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
- // Emit the inline asm to a temporary string so we can emit it through
- // EmitInlineAsm.
- SmallString<256> StringData;
- raw_svector_ostream OS(StringData);
+ bool Error = false;
- OS << '\t';
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands()) break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
- // The variant of the current asmprinter.
- int AsmPrinterVariant = MAI->getAssemblerDialect();
+ // We may have location metadata attached to the end of the
+ // instruction; we should not see metadata at any other point
+ // while processing. It's an error if we do.
+ if (OpNo >= MI->getNumOperands() ||
+ MI->getOperand(OpNo).isMetadata()) {
+ Error = true;
+ } else {
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ if (InlineAsm::isMemKind(OpFlags)) {
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
+ /*Modifier*/ 0, OS);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
+ /*Modifier*/ 0, OS);
+ }
+ }
+ if (Error) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+ MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+ }
+ break;
+ }
+ }
+ }
+ OS << "\n\t.att_syntax\n" << (char)0; // null terminate string.
+}
+static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, int InlineAsmVariant,
+ int AsmPrinterVariant, AsmPrinter *AP,
+ unsigned LocCookie, raw_ostream &OS) {
int CurVariant = -1; // The number of the {.|.|.} region we are in.
const char *LastEmitted = AsmStr; // One past the last character emitted.
+ unsigned NumOperands = MI->getNumOperands();
+
+ OS << '\t';
while (*LastEmitted) {
switch (*LastEmitted) {
@@ -272,7 +316,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
" string: '" + Twine(AsmStr) + "'");
std::string Val(StrStart, StrEnd);
- PrintSpecial(MI, OS, Val.c_str());
+ AP->PrintSpecial(MI, OS, Val.c_str());
LastEmitted = StrEnd+1;
break;
}
@@ -340,13 +384,12 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
// FIXME: What if the operand isn't an MBB, report error?
OS << *MI->getOperand(OpNo).getMBB()->getSymbol();
else {
- AsmPrinter *AP = const_cast<AsmPrinter*>(this);
if (InlineAsm::isMemKind(OpFlags)) {
- Error = AP->PrintAsmMemoryOperand(MI, OpNo, AsmPrinterVariant,
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
Modifier[0] ? Modifier : 0,
OS);
} else {
- Error = AP->PrintAsmOperand(MI, OpNo, AsmPrinterVariant,
+ Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
Modifier[0] ? Modifier : 0, OS);
}
}
@@ -363,7 +406,74 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
}
}
OS << '\n' << (char)0; // null terminate string.
- EmitInlineAsm(OS.str(), LocMD);
+}
+
+/// EmitInlineAsm - This method formats and emits the specified machine
+/// instruction that is an inline asm.
+void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
+ assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
+
+ // Count the number of register definitions to find the asm string.
+ unsigned NumDefs = 0;
+ for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
+ ++NumDefs)
+ assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");
+
+ assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+
+ // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+
+ // If this asmstr is empty, just print the #APP/#NOAPP markers.
+ // These are useful to see where empty asm's wound up.
+ if (AsmStr[0] == 0) {
+ // Don't emit the comments if writing to a .o file.
+ if (!OutStreamer.hasRawTextSupport()) return;
+
+ OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+ MAI->getInlineAsmStart());
+ OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+ MAI->getInlineAsmEnd());
+ return;
+ }
+
+ // Emit the #APP start marker. This has to happen even if verbose-asm isn't
+ // enabled, so we use EmitRawText.
+ if (OutStreamer.hasRawTextSupport())
+ OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+ MAI->getInlineAsmStart());
+
+ // Get the !srcloc metadata node if we have it, and decode the loc cookie from
+ // it.
+ unsigned LocCookie = 0;
+ const MDNode *LocMD = 0;
+ for (unsigned i = MI->getNumOperands(); i != 0; --i) {
+ if (MI->getOperand(i-1).isMetadata() &&
+ (LocMD = MI->getOperand(i-1).getMetadata()) &&
+ LocMD->getNumOperands() != 0) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+ LocCookie = CI->getZExtValue();
+ break;
+ }
+ }
+ }
+
+ // Emit the inline asm to a temporary string so we can emit it through
+ // EmitInlineAsm.
+ SmallString<256> StringData;
+ raw_svector_ostream OS(StringData);
+
+ // The variant of the current asmprinter.
+ int AsmPrinterVariant = MAI->getAssemblerDialect();
+ InlineAsm::AsmDialect InlineAsmVariant = MI->getInlineAsmDialect();
+ AsmPrinter *AP = const_cast<AsmPrinter*>(this);
+ if (InlineAsmVariant == InlineAsm::AD_ATT)
+ EmitGCCInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AsmPrinterVariant,
+ AP, LocCookie, OS);
+ else
+ EmitMSInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AP, LocCookie, OS);
+
+ EmitInlineAsm(OS.str(), LocMD, MI->getInlineAsmDialect());
// Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
// enabled, so we use EmitRawText.
@@ -409,8 +519,8 @@ void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
/// instruction, using the specified assembler variant. Targets should
/// override this to format as appropriate.
bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O) {
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
// Does this asm operand have a single letter operand modifier?
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
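EmitMSInlineAsmStr, added earlier in this file's hunks, walks the asm string by hand, treating "$$" as an escaped dollar sign and "$<digits>" as an operand reference. A standalone sketch of just that scanning loop, collecting operand numbers instead of printing operands; the function name and return type are assumptions made for this example:

#include <cctype>
#include <string>
#include <vector>

// Scan an inline-asm string, expanding "$$" to "$" and recording each
// "$<number>" operand reference; everything else is copied through.
static std::vector<unsigned> scanDollarOperands(const std::string &Asm,
                                                std::string &Expanded) {
  std::vector<unsigned> Refs;
  for (size_t i = 0; i < Asm.size(); ) {
    if (Asm[i] != '$') {
      Expanded += Asm[i++];
      continue;
    }
    ++i;                               // consume '$'
    if (i < Asm.size() && Asm[i] == '$') {
      Expanded += '$';                 // "$$" is a literal dollar sign
      ++i;
      continue;
    }
    unsigned Val = 0;
    bool HasDigits = false;
    while (i < Asm.size() && isdigit(static_cast<unsigned char>(Asm[i]))) {
      Val = Val * 10 + unsigned(Asm[i++] - '0');
      HasDigits = true;
    }
    if (HasDigits)
      Refs.push_back(Val);             // "$3" names operand number 3
    // A bare '$' with no digits is malformed; the real printer reports a
    // fatal error at this point.
  }
  return Refs;
}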
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 3776848e3f47..4d73b3c22261 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -17,7 +17,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -182,6 +182,12 @@ void DIEValue::dump() {
void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
unsigned Size = ~0U;
switch (Form) {
+ case dwarf::DW_FORM_flag_present:
+ // Emit something to keep the lines and comments in sync.
+ // FIXME: Is there a better way to do this?
+ if (Asm->OutStreamer.hasRawTextSupport())
+ Asm->OutStreamer.EmitRawText(StringRef(""));
+ return;
case dwarf::DW_FORM_flag: // Fall thru
case dwarf::DW_FORM_ref1: // Fall thru
case dwarf::DW_FORM_data1: Size = 1; break;
@@ -193,7 +199,8 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
case dwarf::DW_FORM_data8: Size = 8; break;
case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return;
case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return;
- case dwarf::DW_FORM_addr: Size = Asm->getTargetData().getPointerSize(); break;
+ case dwarf::DW_FORM_addr:
+ Size = Asm->getDataLayout().getPointerSize(); break;
default: llvm_unreachable("DIE Value form not supported yet");
}
Asm->OutStreamer.EmitIntValue(Integer, Size, 0/*addrspace*/);
@@ -203,6 +210,7 @@ void DIEInteger::EmitValue(AsmPrinter *Asm, unsigned Form) const {
///
unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
switch (Form) {
+ case dwarf::DW_FORM_flag_present: return 0;
case dwarf::DW_FORM_flag: // Fall thru
case dwarf::DW_FORM_ref1: // Fall thru
case dwarf::DW_FORM_data1: return sizeof(int8_t);
@@ -214,7 +222,7 @@ unsigned DIEInteger::SizeOf(AsmPrinter *AP, unsigned Form) const {
case dwarf::DW_FORM_data8: return sizeof(int64_t);
case dwarf::DW_FORM_udata: return MCAsmInfo::getULEB128Size(Integer);
case dwarf::DW_FORM_sdata: return MCAsmInfo::getSLEB128Size(Integer);
- case dwarf::DW_FORM_addr: return AP->getTargetData().getPointerSize();
+ case dwarf::DW_FORM_addr: return AP->getDataLayout().getPointerSize();
default: llvm_unreachable("DIE Value form not supported yet");
}
}
@@ -241,7 +249,7 @@ void DIELabel::EmitValue(AsmPrinter *AP, unsigned Form) const {
unsigned DIELabel::SizeOf(AsmPrinter *AP, unsigned Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
- return AP->getTargetData().getPointerSize();
+ return AP->getDataLayout().getPointerSize();
}
#ifndef NDEBUG
@@ -265,7 +273,7 @@ void DIEDelta::EmitValue(AsmPrinter *AP, unsigned Form) const {
unsigned DIEDelta::SizeOf(AsmPrinter *AP, unsigned Form) const {
if (Form == dwarf::DW_FORM_data4) return 4;
if (Form == dwarf::DW_FORM_strp) return 4;
- return AP->getTargetData().getPointerSize();
+ return AP->getDataLayout().getPointerSize();
}
#ifndef NDEBUG
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
index f93ea1b045b2..28a96f3b2b65 100644
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ b/lib/CodeGen/AsmPrinter/DIE.h
@@ -214,9 +214,6 @@ namespace llvm {
///
virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const = 0;
- // Implement isa/cast/dyncast.
- static bool classof(const DIEValue *) { return true; }
-
#ifndef NDEBUG
virtual void print(raw_ostream &O) = 0;
void dump();
@@ -257,7 +254,6 @@ namespace llvm {
virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
// Implement isa/cast/dyncast.
- static bool classof(const DIEInteger *) { return true; }
static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
#ifndef NDEBUG
@@ -286,7 +282,6 @@ namespace llvm {
virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
// Implement isa/cast/dyncast.
- static bool classof(const DIELabel *) { return true; }
static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
#ifndef NDEBUG
@@ -313,7 +308,6 @@ namespace llvm {
virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
// Implement isa/cast/dyncast.
- static bool classof(const DIEDelta *) { return true; }
static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
#ifndef NDEBUG
@@ -343,7 +337,6 @@ namespace llvm {
}
// Implement isa/cast/dyncast.
- static bool classof(const DIEEntry *) { return true; }
static bool classof(const DIEValue *E) { return E->getType() == isEntry; }
#ifndef NDEBUG
@@ -383,7 +376,6 @@ namespace llvm {
virtual unsigned SizeOf(AsmPrinter *AP, unsigned Form) const;
// Implement isa/cast/dyncast.
- static bool classof(const DIEBlock *) { return true; }
static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
#ifndef NDEBUG
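The DIE.h hunks above drop the always-true classof(const T *) overloads; with LLVM-style RTTI only the classof(const Base *) form is needed by isa<> and dyn_cast<>. A minimal, self-contained sketch of that pattern outside of LLVM (the Shape/Circle names are invented for illustration):

#include <cassert>

// Each concrete subclass carries a kind tag; the cast machinery only ever
// needs classof(const Base *), so the redundant classof(const Derived *)
// overloads that always returned true can be removed.
class Shape {
public:
  enum ShapeKind { SK_Circle, SK_Square };
  explicit Shape(ShapeKind K) : Kind(K) {}
  ShapeKind getKind() const { return Kind; }
private:
  const ShapeKind Kind;
};

class Circle : public Shape {
public:
  Circle() : Shape(SK_Circle) {}
  static bool classof(const Shape *S) { return S->getKind() == SK_Circle; }
};

int main() {
  Circle C;
  const Shape *S = &C;
  assert(Circle::classof(S)); // what llvm::isa<Circle>(S) would check
  return 0;
}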
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index 454a923c13e0..05e0f2fb63b3 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -133,8 +133,8 @@ void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
}
}
-// Walk through and emit the buckets for the table. This will look
-// like a list of numbers of how many elements are in each bucket.
+// Walk through and emit the buckets for the table. Each index is
+// an offset into the list of hashes.
void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
unsigned index = 0;
for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 963b8cdf3424..92d1bbe4f7e8 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -237,8 +237,8 @@ private:
#endif
};
- DwarfAccelTable(const DwarfAccelTable&); // DO NOT IMPLEMENT
- void operator=(const DwarfAccelTable&); // DO NOT IMPLEMENT
+ DwarfAccelTable(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+ void operator=(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
// Internal Functions
void EmitHeader(AsmPrinter *);
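DwarfAccelTable's copy operations switch from the old "DO NOT IMPLEMENT" comment convention to the LLVM_DELETED_FUNCTION macro. A hedged sketch of how such a macro can be defined and used; the real definition lives in llvm/Support/Compiler.h and may differ:

// Under C++11 the macro expands to "= delete", so any use is a compile-time
// error; on older compilers it can expand to nothing, falling back to the
// "declared private, never defined" link-time convention.
#if __cplusplus >= 201103L
#define MY_DELETED_FUNCTION = delete
#else
#define MY_DELETED_FUNCTION
#endif

class Uncopyable {
public:
  Uncopyable() {}
private:
  Uncopyable(const Uncopyable &) MY_DELETED_FUNCTION;
  void operator=(const Uncopyable &) MY_DELETED_FUNCTION;
};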
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index d975f1f97bea..4fdd5ca25221 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -25,7 +25,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index d30e5bbd8e5a..2b07dda31ffe 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -22,7 +22,7 @@
#include "llvm/Instructions.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -51,6 +51,15 @@ DIEEntry *CompileUnit::createDIEEntry(DIE *Entry) {
return Value;
}
+/// addFlag - Add a flag that is true.
+void CompileUnit::addFlag(DIE *Die, unsigned Attribute) {
+ if (!DD->useDarwinGDBCompat())
+ Die->addValue(Attribute, dwarf::DW_FORM_flag_present,
+ DIEIntegerOne);
+ else
+ addUInt(Die, Attribute, dwarf::DW_FORM_flag, 1);
+}
+
/// addUInt - Add an unsigned integer attribute data and value.
///
void CompileUnit::addUInt(DIE *Die, unsigned Attribute,
@@ -501,7 +510,7 @@ bool CompileUnit::addConstantFPValue(DIE *Die, const MachineOperand &MO) {
const char *FltPtr = (const char*)FltVal.getRawData();
int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
- bool LittleEndian = Asm->getTargetData().isLittleEndian();
+ bool LittleEndian = Asm->getDataLayout().isLittleEndian();
int Incr = (LittleEndian ? 1 : -1);
int Start = (LittleEndian ? 0 : NumBytes - 1);
int Stop = (LittleEndian ? NumBytes : -1);
@@ -543,7 +552,7 @@ bool CompileUnit::addConstantValue(DIE *Die, const ConstantInt *CI,
const uint64_t *Ptr64 = Val.getRawData();
int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
- bool LittleEndian = Asm->getTargetData().isLittleEndian();
+ bool LittleEndian = Asm->getDataLayout().isLittleEndian();
// Output the constant to DWARF one byte at a time.
for (int i = 0; i < NumBytes; i++) {
@@ -794,7 +803,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
(Language == dwarf::DW_LANG_C89 ||
Language == dwarf::DW_LANG_C99 ||
Language == dwarf::DW_LANG_ObjC))
- addUInt(&Buffer, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_prototyped);
}
break;
case dwarf::DW_TAG_structure_type:
@@ -825,15 +834,15 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
addUInt(ElemDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
dwarf::DW_ACCESS_public);
if (SP.isExplicit())
- addUInt(ElemDie, dwarf::DW_AT_explicit, dwarf::DW_FORM_flag, 1);
+ addFlag(ElemDie, dwarf::DW_AT_explicit);
}
else if (Element.isVariable()) {
DIVariable DV(Element);
ElemDie = new DIE(dwarf::DW_TAG_variable);
addString(ElemDie, dwarf::DW_AT_name, DV.getName());
addType(ElemDie, DV.getType());
- addUInt(ElemDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
- addUInt(ElemDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ addFlag(ElemDie, dwarf::DW_AT_declaration);
+ addFlag(ElemDie, dwarf::DW_AT_external);
addSourceLine(ElemDie, DV);
} else if (Element.isDerivedType()) {
DIDerivedType DDTy(Element);
@@ -883,7 +892,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
}
if (CTy.isAppleBlockExtension())
- addUInt(&Buffer, dwarf::DW_AT_APPLE_block, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_APPLE_block);
DICompositeType ContainingType = CTy.getContainingType();
if (DIDescriptor(ContainingType).isCompositeType())
@@ -895,8 +904,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
}
if (CTy.isObjcClassComplete())
- addUInt(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type,
- dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
// Add template parameters to a class, structure or union types.
// FIXME: The support isn't in the metadata for this yet.
@@ -929,7 +937,7 @@ void CompileUnit::constructTypeDIE(DIE &Buffer, DICompositeType CTy) {
// If we're a forward decl, say so.
if (CTy.isForwardDecl())
- addUInt(&Buffer, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_declaration);
// Add source line info if available.
if (!CTy.isForwardDecl())
@@ -1028,8 +1036,10 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
// AT_specification code in order to work around a bug in older
// gdbs that requires the linkage name to resolve multiple template
// functions.
+ // TODO: Remove this set of code when we get rid of the old gdb
+ // compatibility.
StringRef LinkageName = SP.getLinkageName();
- if (!LinkageName.empty())
+ if (!LinkageName.empty() && DD->useDarwinGDBCompat())
addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
getRealLinkageName(LinkageName));
@@ -1043,6 +1053,11 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
return SPDie;
}
+ // Add the linkage name if we have one.
+ if (!LinkageName.empty() && !DD->useDarwinGDBCompat())
+ addString(SPDie, dwarf::DW_AT_MIPS_linkage_name,
+ getRealLinkageName(LinkageName));
+
// Constructors and operators for anonymous aggregates do not have names.
if (!SP.getName().empty())
addString(SPDie, dwarf::DW_AT_name, SP.getName());
@@ -1055,7 +1070,7 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
(Language == dwarf::DW_LANG_C89 ||
Language == dwarf::DW_LANG_C99 ||
Language == dwarf::DW_LANG_ObjC))
- addUInt(SPDie, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_prototyped);
// Add Return Type.
DICompositeType SPTy = SP.getType();
@@ -1079,7 +1094,7 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
}
if (!SP.isDefinition()) {
- addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_declaration);
// Add arguments. Do not add arguments for subprogram definition. They will
// be handled while processing variables.
@@ -1090,22 +1105,22 @@ DIE *CompileUnit::getOrCreateSubprogramDIE(DISubprogram SP) {
if (SPTag == dwarf::DW_TAG_subroutine_type)
for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
- DIType ATy = DIType(DIType(Args.getElement(i)));
+ DIType ATy = DIType(Args.getElement(i));
addType(Arg, ATy);
if (ATy.isArtificial())
- addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+ addFlag(Arg, dwarf::DW_AT_artificial);
SPDie->addChild(Arg);
}
}
if (SP.isArtificial())
- addUInt(SPDie, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_artificial);
if (!SP.isLocalToUnit())
- addUInt(SPDie, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_external);
if (SP.isOptimized())
- addUInt(SPDie, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+ addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
if (unsigned isa = Asm->getISAEncoding()) {
addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
@@ -1168,7 +1183,7 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
// Add scoping info.
if (!GV.isLocalToUnit())
- addUInt(VariableDIE, dwarf::DW_AT_external, dwarf::DW_FORM_flag, 1);
+ addFlag(VariableDIE, dwarf::DW_AT_external);
// Add line number info.
addSourceLine(VariableDIE, GV);
@@ -1193,8 +1208,7 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
addDIEEntry(VariableSpecDIE, dwarf::DW_AT_specification,
dwarf::DW_FORM_ref4, VariableDIE);
addBlock(VariableSpecDIE, dwarf::DW_AT_location, 0, Block);
- addUInt(VariableDIE, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag,
- 1);
+ addFlag(VariableDIE, dwarf::DW_AT_declaration);
addDie(VariableSpecDIE);
} else {
addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
@@ -1213,7 +1227,7 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) {
addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
SmallVector<Value*, 3> Idx(CE->op_begin()+1, CE->op_end());
addUInt(Block, 0, dwarf::DW_FORM_udata,
- Asm->getTargetData().getIndexedOffset(Ptr->getType(), Idx));
+ Asm->getDataLayout().getIndexedOffset(Ptr->getType(), Idx));
addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
addBlock(VariableDIE, dwarf::DW_AT_location, 0, Block);
}
@@ -1260,7 +1274,7 @@ void CompileUnit::constructArrayTypeDIE(DIE &Buffer,
DICompositeType *CTy) {
Buffer.setTag(dwarf::DW_TAG_array_type);
if (CTy->getTag() == dwarf::DW_TAG_vector_type)
- addUInt(&Buffer, dwarf::DW_AT_GNU_vector, dwarf::DW_FORM_flag, 1);
+ addFlag(&Buffer, dwarf::DW_AT_GNU_vector);
// Emit derived type.
addType(&Buffer, CTy->getTypeDerivedFrom());
@@ -1333,8 +1347,7 @@ DIE *CompileUnit::constructVariableDIE(DbgVariable *DV, bool isScopeAbstract) {
}
if (DV->isArtificial())
- addUInt(VariableDie, dwarf::DW_AT_artificial,
- dwarf::DW_FORM_flag, 1);
+ addFlag(VariableDie, dwarf::DW_AT_artificial);
if (isScopeAbstract) {
DV->setDIE(VariableDie);
@@ -1446,7 +1459,7 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) {
Offset -= FieldOffset;
// Maybe we need to work from the other end.
- if (Asm->getTargetData().isLittleEndian())
+ if (Asm->getDataLayout().isLittleEndian())
Offset = FieldSize - (Offset + Size);
addUInt(MemberDie, dwarf::DW_AT_bit_offset, 0, Offset);
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index b4ff9e8d69dd..fad9b6e06684 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -176,6 +176,9 @@ public:
}
public:
+ /// addFlag - Add a flag that is true to the DIE.
+ void addFlag(DIE *Die, unsigned Attribute);
+
/// addUInt - Add an unsigned integer attribute data and value.
///
void addUInt(DIE *Die, unsigned Attribute, unsigned Form, uint64_t Integer);
@@ -280,8 +283,8 @@ public:
/// for the given DITemplateTypeParameter.
DIE *getOrCreateTemplateTypeParameterDIE(DITemplateTypeParameter TP);
- /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create new DIE
- /// for the given DITemplateValueParameter.
+ /// getOrCreateTemplateValueParameterDIE - Find existing DIE or create
+ /// new DIE for the given DITemplateValueParameter.
DIE *getOrCreateTemplateValueParameterDIE(DITemplateValueParameter TVP);
/// createDIEEntry - Creates a new DIEEntry to be a proxy for a debug
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 649684adbf04..367b52307925 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -27,7 +27,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -54,9 +54,29 @@ static cl::opt<bool> UnknownLocations("use-unknown-locations", cl::Hidden,
cl::desc("Make an absence of debug location information explicit."),
cl::init(false));
-static cl::opt<bool> DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
+namespace {
+ enum DefaultOnOff {
+ Default, Enable, Disable
+ };
+}
+
+static cl::opt<DefaultOnOff> DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
cl::desc("Output prototype dwarf accelerator tables."),
- cl::init(false));
+ cl::values(
+ clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"),
+ clEnumValEnd),
+ cl::init(Default));
+
+static cl::opt<DefaultOnOff> DarwinGDBCompat("darwin-gdb-compat", cl::Hidden,
+ cl::desc("Compatibility with Darwin gdb."),
+ cl::values(
+ clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"),
+ clEnumValEnd),
+ cl::init(Default));
namespace {
const char *DWARFGroupName = "DWARF Emission";
@@ -135,10 +155,25 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
DwarfDebugRangeSectionSym = DwarfDebugLocSectionSym = 0;
FunctionBeginSym = FunctionEndSym = 0;
- // Turn on accelerator tables for Darwin.
- if (Triple(M->getTargetTriple()).isOSDarwin())
- DwarfAccelTables = true;
-
+ // Turn on accelerator tables and older gdb compatibility
+ // for Darwin.
+ bool isDarwin = Triple(M->getTargetTriple()).isOSDarwin();
+ if (DarwinGDBCompat == Default) {
+ if (isDarwin)
+ isDarwinGDBCompat = true;
+ else
+ isDarwinGDBCompat = false;
+ } else
+ isDarwinGDBCompat = DarwinGDBCompat == Enable ? true : false;
+
+ if (DwarfAccelTables == Default) {
+ if (isDarwin)
+ hasDwarfAccelTables = true;
+ else
+ hasDwarfAccelTables = false;
+ } else
+ hasDwarfAccelTables = DwarfAccelTables == Enable ? true : false;
+
{
NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
beginModule(M);
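
The two new options above replace a plain boolean with a Default/Enable/Disable tri-state, so the command line can override the platform default instead of only force-enabling a feature. A minimal standalone sketch of that resolution pattern (the enum and helper names are illustrative, not taken from the patch; the patch spells the same logic out with if/else per option):

    #include <cassert>

    enum DefaultOnOff { Default, Enable, Disable };

    // Resolve a tri-state option: an explicit Enable/Disable wins, otherwise
    // fall back to whatever the platform (e.g. Darwin) wants by default.
    static bool resolveTriState(DefaultOnOff Opt, bool PlatformDefault) {
      if (Opt == Default)
        return PlatformDefault;
      return Opt == Enable;
    }

    int main() {
      bool IsDarwin = true;
      assert(resolveTriState(Default, IsDarwin) == true);  // platform default applies
      assert(resolveTriState(Disable, IsDarwin) == false); // explicit flag wins
      assert(resolveTriState(Enable, false) == true);
      return 0;
    }
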
@@ -272,44 +307,51 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
assert(SPDie && "Unable to find subprogram DIE!");
DISubprogram SP(SPNode);
- DISubprogram SPDecl = SP.getFunctionDeclaration();
- if (!SPDecl.isSubprogram()) {
- // There is not any need to generate specification DIE for a function
- // defined at compile unit level. If a function is defined inside another
- // function then gdb prefers the definition at top level and but does not
- // expect specification DIE in parent function. So avoid creating
- // specification DIE for a function defined inside a function.
- if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
- !SP.getContext().isFile() &&
- !isSubprogramContext(SP.getContext())) {
- SPCU->addUInt(SPDie, dwarf::DW_AT_declaration, dwarf::DW_FORM_flag, 1);
-
- // Add arguments.
- DICompositeType SPTy = SP.getType();
- DIArray Args = SPTy.getTypeArray();
- unsigned SPTag = SPTy.getTag();
- if (SPTag == dwarf::DW_TAG_subroutine_type)
- for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
- DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
- DIType ATy = DIType(DIType(Args.getElement(i)));
- SPCU->addType(Arg, ATy);
- if (ATy.isArtificial())
- SPCU->addUInt(Arg, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag, 1);
- SPDie->addChild(Arg);
- }
- DIE *SPDeclDie = SPDie;
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
- SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
- SPDeclDie);
- SPCU->addDie(SPDie);
- }
- }
- // Pick up abstract subprogram DIE.
+ // If we're updating an abstract DIE, then we will be adding the children and
+ // object pointer later on. But what we don't want to do is process the
+ // concrete DIE twice.
if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) {
+ // Pick up abstract subprogram DIE.
SPDie = new DIE(dwarf::DW_TAG_subprogram);
SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
dwarf::DW_FORM_ref4, AbsSPDIE);
SPCU->addDie(SPDie);
+ } else {
+ DISubprogram SPDecl = SP.getFunctionDeclaration();
+ if (!SPDecl.isSubprogram()) {
+ // There is no need to generate a specification DIE for a function
+ // defined at compile unit level. If a function is defined inside another
+ // function then gdb prefers the definition at top level but does not
+ // expect a specification DIE in the parent function. So avoid creating
+ // a specification DIE for a function defined inside a function.
+ if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
+ !SP.getContext().isFile() &&
+ !isSubprogramContext(SP.getContext())) {
+ SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
+
+ // Add arguments.
+ DICompositeType SPTy = SP.getType();
+ DIArray Args = SPTy.getTypeArray();
+ unsigned SPTag = SPTy.getTag();
+ if (SPTag == dwarf::DW_TAG_subroutine_type)
+ for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+ DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+ DIType ATy = DIType(Args.getElement(i));
+ SPCU->addType(Arg, ATy);
+ if (ATy.isArtificial())
+ SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
+ if (ATy.isObjectPointer())
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
+ dwarf::DW_FORM_ref4, Arg);
+ SPDie->addChild(Arg);
+ }
+ DIE *SPDeclDie = SPDie;
+ SPDie = new DIE(dwarf::DW_TAG_subprogram);
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
+ SPDeclDie);
+ SPCU->addDie(SPDie);
+ }
+ }
}
SPCU->addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
@@ -346,7 +388,7 @@ DIE *DwarfDebug::constructLexicalScopeDIE(CompileUnit *TheCU,
// DW_AT_ranges appropriately.
TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
DebugRangeSymbols.size()
- * Asm->getTargetData().getPointerSize());
+ * Asm->getDataLayout().getPointerSize());
for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
RE = Ranges.end(); RI != RE; ++RI) {
DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
@@ -386,7 +428,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
DISubprogram InlinedSP = getDISubprogram(DS);
DIE *OriginDIE = TheCU->getDIE(InlinedSP);
if (!OriginDIE) {
- DEBUG(dbgs() << "Unable to find original DIE for inlined subprogram.");
+ DEBUG(dbgs() << "Unable to find original DIE for an inlined subprogram.");
return NULL;
}
@@ -395,7 +437,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
const MCSymbol *EndLabel = getLabelAfterInsn(RI->second);
if (StartLabel == 0 || EndLabel == 0) {
- llvm_unreachable("Unexpected Start and End labels for a inlined scope!");
+ llvm_unreachable("Unexpected Start and End labels for an inlined scope!");
}
assert(StartLabel->isDefined() &&
"Invalid starting label for an inlined scope!");
@@ -412,7 +454,7 @@ DIE *DwarfDebug::constructInlinedScopeDIE(CompileUnit *TheCU,
// DW_AT_ranges appropriately.
TheCU->addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_data4,
DebugRangeSymbols.size()
- * Asm->getTargetData().getPointerSize());
+ * Asm->getDataLayout().getPointerSize());
for (SmallVector<InsnRange, 4>::const_iterator RI = Ranges.begin(),
RE = Ranges.end(); RI != RE; ++RI) {
DebugRangeSymbols.push_back(getLabelBeforeInsn(RI->first));
@@ -461,21 +503,26 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
return NULL;
SmallVector<DIE *, 8> Children;
+ DIE *ObjectPointer = NULL;
// Collect arguments for current function.
if (LScopes.isCurrentFunctionScope(Scope))
for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
if (DbgVariable *ArgDV = CurrentFnArguments[i])
if (DIE *Arg =
- TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope()))
+ TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope())) {
Children.push_back(Arg);
+ if (ArgDV->isObjectPointer()) ObjectPointer = Arg;
+ }
// Collect lexical scope children first.
const SmallVector<DbgVariable *, 8> &Variables = ScopeVariables.lookup(Scope);
for (unsigned i = 0, N = Variables.size(); i < N; ++i)
if (DIE *Variable =
- TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope()))
+ TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope())) {
Children.push_back(Variable);
+ if (Variables[i]->isObjectPointer()) ObjectPointer = Variable;
+ }
const SmallVector<LexicalScope *, 4> &Scopes = Scope->getChildren();
for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
@@ -509,6 +556,10 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
E = Children.end(); I != E; ++I)
ScopeDIE->addChild(*I);
+ if (DS.isSubprogram() && ObjectPointer != NULL)
+ TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
+ dwarf::DW_FORM_ref4, ObjectPointer);
+
if (DS.isSubprogram())
TheCU->addPubTypes(DISubprogram(DS));
@@ -556,7 +607,8 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
unsigned ID = GetOrCreateSourceID(FN, CompilationDir);
DIE *Die = new DIE(dwarf::DW_TAG_compile_unit);
- CompileUnit *NewCU = new CompileUnit(ID, DIUnit.getLanguage(), Die, Asm, this);
+ CompileUnit *NewCU = new CompileUnit(ID, DIUnit.getLanguage(), Die,
+ Asm, this);
NewCU->addString(Die, dwarf::DW_AT_producer, DIUnit.getProducer());
NewCU->addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
DIUnit.getLanguage());
@@ -575,7 +627,7 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
if (!CompilationDir.empty())
NewCU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
if (DIUnit.isOptimized())
- NewCU->addUInt(Die, dwarf::DW_AT_APPLE_optimized, dwarf::DW_FORM_flag, 1);
+ NewCU->addFlag(Die, dwarf::DW_AT_APPLE_optimized);
StringRef Flags = DIUnit.getFlags();
if (!Flags.empty())
@@ -755,7 +807,7 @@ void DwarfDebug::endModule() {
LexicalScope *Scope =
new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
DeadFnScopeMap[SP] = Scope;
-
+
// Construct subprogram DIE and add variables DIEs.
CompileUnit *SPCU = CUMap.lookup(TheCU);
assert(SPCU && "Unable to find Compile Unit!");
@@ -802,9 +854,9 @@ void DwarfDebug::endModule() {
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("data_end"));
// End text sections.
- for (unsigned i = 1, N = SectionMap.size(); i <= N; ++i) {
- Asm->OutStreamer.SwitchSection(SectionMap[i]);
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", i));
+ for (unsigned I = 0, E = SectionMap.size(); I != E; ++I) {
+ Asm->OutStreamer.SwitchSection(SectionMap[I]);
+ Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("section_end", I+1));
}
// Compute DIE offsets and sizes.
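
The loop just above moves from UniqueVector's 1-based indexing to SetVector's 0-based indexing while keeping the old 1-based numbering of the emitted "section_end" labels, hence the I+1. A standalone illustration of the off-by-one that I+1 preserves (std::vector stands in for the LLVM containers here, purely as an analogy):

    #include <cstdio>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> Sections = {".text", ".text.hot", ".text.unlikely"};

      // 0-based container iteration, 1-based label numbering, matching the
      // labels emitted back when the container itself was 1-based.
      for (unsigned I = 0, E = Sections.size(); I != E; ++I)
        std::printf("section_end%u -> %s\n", I + 1, Sections[I].c_str());
      return 0;
    }
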
@@ -816,8 +868,8 @@ void DwarfDebug::endModule() {
// Corresponding abbreviations into a abbrev section.
emitAbbreviations();
- // Emit info into a dwarf accelerator table sections.
- if (DwarfAccelTables) {
+ // Emit info into the dwarf accelerator table sections.
+ if (useDwarfAccelTables()) {
emitAccelNames();
emitAccelObjC();
emitAccelNamespaces();
@@ -825,7 +877,10 @@ void DwarfDebug::endModule() {
}
// Emit info into a debug pubtypes section.
- emitDebugPubTypes();
+ // TODO: When we don't need the option anymore we can
+ // remove all of the code that adds to the table.
+ if (useDarwinGDBCompat())
+ emitDebugPubTypes();
// Emit info into a debug loc section.
emitDebugLoc();
@@ -840,7 +895,11 @@ void DwarfDebug::endModule() {
emitDebugMacInfo();
// Emit inline info.
- emitDebugInlineInfo();
+ // TODO: When we don't need the option anymore we
+ // can remove all of the code that this section
+ // depends upon.
+ if (useDarwinGDBCompat())
+ emitDebugInlineInfo();
// Emit info into a debug str section.
emitDebugStr();
@@ -1014,7 +1073,7 @@ DwarfDebug::collectVariableInfo(const MachineFunction *MF,
if (AbsVar)
AbsVar->setMInsn(MInsn);
- // Simple ranges that are fully coalesced.
+ // Simplify ranges that are fully coalesced.
if (History.size() <= 1 || (History.size() == 2 &&
MInsn->isIdenticalTo(History.back()))) {
RegVar->setMInsn(MInsn);
@@ -1267,7 +1326,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
// Coalesce identical entries at the end of History.
if (History.size() >= 2 &&
Prev->isIdenticalTo(History[History.size() - 2])) {
- DEBUG(dbgs() << "Coalesce identical DBG_VALUE entries:\n"
+ DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
<< "\t" << *Prev
<< "\t" << *History[History.size() - 2] << "\n");
History.pop_back();
@@ -1283,7 +1342,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
PrevMBB->getLastNonDebugInstr();
if (LastMI == PrevMBB->end()) {
// Drop DBG_VALUE for empty range.
- DEBUG(dbgs() << "Drop DBG_VALUE for empty range:\n"
+ DEBUG(dbgs() << "Dropping DBG_VALUE for empty range:\n"
<< "\t" << *Prev << "\n");
History.pop_back();
}
@@ -1300,9 +1359,10 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
if (!MI->isLabel())
AtBlockEntry = false;
- // First known non DBG_VALUE location marks beginning of function
- // body.
- if (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown())
+ // First known non-DBG_VALUE and non-frame setup location marks
+ // the beginning of the function body.
+ if (!MI->getFlag(MachineInstr::FrameSetup) &&
+ (PrologEndLoc.isUnknown() && !MI->getDebugLoc().isUnknown()))
PrologEndLoc = MI->getDebugLoc();
// Check if the instruction clobbers any registers with debug vars.
@@ -1382,7 +1442,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
MF->getFunction()->getContext());
recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
FnStartDL.getScope(MF->getFunction()->getContext()),
- DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0);
+ 0);
}
}
@@ -1439,8 +1499,7 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
DIE *CurFnDIE = constructScopeDIE(TheCU, FnScope);
if (!MF->getTarget().Options.DisableFramePointerElim(*MF))
- TheCU->addUInt(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr,
- dwarf::DW_FORM_flag, 1);
+ TheCU->addFlag(CurFnDIE, dwarf::DW_AT_APPLE_omit_frame_ptr);
DebugFrames.push_back(FunctionDebugFrameInfo(Asm->getFunctionNumber(),
MMI->getFrameMoves()));
@@ -1710,7 +1769,7 @@ void DwarfDebug::emitDebugInfo() {
Asm->EmitSectionOffset(Asm->GetTempSymbol("abbrev_begin"),
DwarfAbbrevSectionSym);
Asm->OutStreamer.AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->getTargetData().getPointerSize());
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
emitDIE(Die);
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("info_end", TheCU->getID()));
@@ -1756,14 +1815,14 @@ void DwarfDebug::emitEndOfLineMatrix(unsigned SectionEnd) {
Asm->EmitInt8(0);
Asm->OutStreamer.AddComment("Op size");
- Asm->EmitInt8(Asm->getTargetData().getPointerSize() + 1);
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize() + 1);
Asm->OutStreamer.AddComment("DW_LNE_set_address");
Asm->EmitInt8(dwarf::DW_LNE_set_address);
Asm->OutStreamer.AddComment("Section end label");
Asm->OutStreamer.EmitSymbolValue(Asm->GetTempSymbol("section_end",SectionEnd),
- Asm->getTargetData().getPointerSize(),
+ Asm->getDataLayout().getPointerSize(),
0/*AddrSpace*/);
// Mark end of matrix.
@@ -1992,7 +2051,7 @@ void DwarfDebug::emitDebugLoc() {
// Start the dwarf loc section.
Asm->OutStreamer.SwitchSection(
Asm->getObjFileLowering().getDwarfLocSection());
- unsigned char Size = Asm->getTargetData().getPointerSize();
+ unsigned char Size = Asm->getDataLayout().getPointerSize();
Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_loc", 0));
unsigned index = 1;
for (SmallVector<DotDebugLocEntry, 4>::iterator
@@ -2089,7 +2148,7 @@ void DwarfDebug::emitDebugRanges() {
// Start the dwarf ranges section.
Asm->OutStreamer.SwitchSection(
Asm->getObjFileLowering().getDwarfRangesSection());
- unsigned char Size = Asm->getTargetData().getPointerSize();
+ unsigned char Size = Asm->getDataLayout().getPointerSize();
for (SmallVector<const MCSymbol *, 8>::iterator
I = DebugRangeSymbols.begin(), E = DebugRangeSymbols.end();
I != E; ++I) {
@@ -2147,7 +2206,7 @@ void DwarfDebug::emitDebugInlineInfo() {
Asm->OutStreamer.AddComment("Dwarf Version");
Asm->EmitInt16(dwarf::DWARF_VERSION);
Asm->OutStreamer.AddComment("Address Size (in bytes)");
- Asm->EmitInt8(Asm->getTargetData().getPointerSize());
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
for (SmallVector<const MDNode *, 4>::iterator I = InlinedSPNodes.begin(),
E = InlinedSPNodes.end(); I != E; ++I) {
@@ -2178,7 +2237,7 @@ void DwarfDebug::emitDebugInlineInfo() {
if (Asm->isVerbose()) Asm->OutStreamer.AddComment("low_pc");
Asm->OutStreamer.EmitSymbolValue(LI->first,
- Asm->getTargetData().getPointerSize(),0);
+ Asm->getDataLayout().getPointerSize(),0);
}
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index d1d651265507..61d9a51a5279 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -21,9 +21,9 @@
#include "llvm/MC/MachineLocation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/DebugLoc.h"
@@ -96,7 +96,8 @@ typedef struct DotDebugLocEntry {
DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, const ConstantFP *FPtr)
: Begin(B), End(E), Variable(0), Merged(false),
Constant(true) { Constants.CFP = FPtr; EntryKind = E_ConstantFP; }
- DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E, const ConstantInt *IPtr)
+ DotDebugLocEntry(const MCSymbol *B, const MCSymbol *E,
+ const ConstantInt *IPtr)
: Begin(B), End(E), Variable(0), Merged(false),
Constant(true) { Constants.CIP = IPtr; EntryKind = E_ConstantInt; }
@@ -158,11 +159,19 @@ public:
bool isArtificial() const {
if (Var.isArtificial())
return true;
- if (Var.getTag() == dwarf::DW_TAG_arg_variable
- && getType().isArtificial())
+ if (getType().isArtificial())
return true;
return false;
}
+
+ bool isObjectPointer() const {
+ if (Var.isObjectPointer())
+ return true;
+ if (getType().isObjectPointer())
+ return true;
+ return false;
+ }
+
bool variableHasComplexAddress() const {
assert(Var.Verify() && "Invalid complex DbgVariable!");
return Var.hasComplexAddress();
@@ -222,7 +231,7 @@ class DwarfDebug {
/// SectionMap - Provides a unique id per text section.
///
- UniqueVector<const MCSection*> SectionMap;
+ SetVector<const MCSection*> SectionMap;
/// CurrentFnArguments - List of Arguments (DbgValues) for current function.
SmallVector<DbgVariable *, 8> CurrentFnArguments;
@@ -307,6 +316,9 @@ class DwarfDebug {
// table for the same directory as DW_at_comp_dir.
StringRef CompilationDir;
+ // A holder for the DarwinGDBCompat flag so that the compile unit can use it.
+ bool isDarwinGDBCompat;
+ bool hasDwarfAccelTables;
private:
/// assignAbbrevNumber - Define a unique number for the abbreviation.
@@ -520,6 +532,11 @@ public:
/// getStringPoolEntry - returns an entry into the string pool with the given
/// string text.
MCSymbol *getStringPoolEntry(StringRef Str);
+
+ /// useDarwinGDBCompat - returns whether to limit some of our debug
+ /// output to what darwin gdb can handle.
+ bool useDarwinGDBCompat() { return isDarwinGDBCompat; }
+ bool useDwarfAccelTables() { return hasDwarfAccelTables; }
};
} // End of namespace llvm
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.cpp b/lib/CodeGen/AsmPrinter/DwarfException.cpp
index 70cc2e56b3e1..08fb6b3f52c5 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfException.cpp
@@ -24,7 +24,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
@@ -417,7 +417,7 @@ void DwarfException::EmitExceptionTable() {
// that we're omitting that bit.
TTypeEncoding = dwarf::DW_EH_PE_omit;
// dwarf::DW_EH_PE_absptr
- TypeFormatSize = Asm->getTargetData().getPointerSize();
+ TypeFormatSize = Asm->getDataLayout().getPointerSize();
} else {
// Okay, we have actual filters or typeinfos to emit. As such, we need to
// pick a type encoding for them. We're about to emit a list of pointers to
diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h
index 75f6056c449b..fe9e49360951 100644
--- a/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -43,26 +43,6 @@ protected:
/// MMI - Collected machine module information.
MachineModuleInfo *MMI;
- /// EmitExceptionTable - Emit landing pads and actions.
- ///
- /// The general organization of the table is complex, but the basic concepts
- /// are easy. First there is a header which describes the location and
- /// organization of the three components that follow.
- /// 1. The landing pad site information describes the range of code covered
- /// by the try. In our case it's an accumulation of the ranges covered
- /// by the invokes in the try. There is also a reference to the landing
- /// pad that handles the exception once processed. Finally an index into
- /// the actions table.
- /// 2. The action table, in our case, is composed of pairs of type ids
- /// and next action offset. Starting with the action index from the
- /// landing pad site, each type Id is checked for a match to the current
- /// exception. If it matches then the exception and type id are passed
- /// on to the landing pad. Otherwise the next action is looked up. This
- /// chain is terminated with a next action of zero. If no type id is
- /// found the frame is unwound and handling continues.
- /// 3. Type id table contains references to all the C++ typeinfo for all
- /// catches in the function. This tables is reversed indexed base 1.
-
/// SharedTypeIds - How many leading type ids two landing pads have in common.
static unsigned SharedTypeIds(const LandingPadInfo *L,
const LandingPadInfo *R);
@@ -119,6 +99,26 @@ protected:
const RangeMapType &PadMap,
const SmallVectorImpl<const LandingPadInfo *> &LPs,
const SmallVectorImpl<unsigned> &FirstActions);
+
+ /// EmitExceptionTable - Emit landing pads and actions.
+ ///
+ /// The general organization of the table is complex, but the basic concepts
+ /// are easy. First there is a header which describes the location and
+ /// organization of the three components that follow.
+ /// 1. The landing pad site information describes the range of code covered
+ /// by the try. In our case it's an accumulation of the ranges covered
+ /// by the invokes in the try. There is also a reference to the landing
+ /// pad that handles the exception once processed. Finally an index into
+ /// the actions table.
+ /// 2. The action table, in our case, is composed of pairs of type ids
+ /// and next action offset. Starting with the action index from the
+ /// landing pad site, each type Id is checked for a match to the current
+ /// exception. If it matches then the exception and type id are passed
+ /// on to the landing pad. Otherwise the next action is looked up. This
+ /// chain is terminated with a next action of zero. If no type id is
+ /// found the frame is unwound and handling continues.
+ /// 3. Type id table contains references to all the C++ typeinfo for all
+ /// catches in the function. This table is reverse indexed, base 1.
void EmitExceptionTable();
public:
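
The relocated comment above describes the three logical components of the exception table. Purely as a conceptual model (the struct layout and field names below are invented for illustration; the real table is a packed, encoded byte stream, not structs), the lookup the comment describes can be pictured like this:

    #include <vector>

    // Illustrative model only: one call-site record, one action-chain node,
    // and the type table that action entries index into.
    struct CallSite { unsigned Start, Length, LandingPad, FirstAction; };
    struct Action   { int TypeId; unsigned NextAction; }; // NextAction == 0 ends the chain
    struct Lsda {
      std::vector<CallSite> Sites;
      std::vector<Action>   Actions;  // 1-based, per the comment's convention
      std::vector<int>      TypeIds;  // typeinfo references, reverse indexed, base 1
    };

    // Walk the action chain for a call site until a type id matches or the
    // chain terminates with zero (meaning: unwind further).
    static int findMatchingAction(const Lsda &L, const CallSite &CS, int ThrownTypeId) {
      unsigned A = CS.FirstAction;
      while (A != 0) {
        const Action &Act = L.Actions[A - 1];
        if (Act.TypeId == ThrownTypeId)
          return Act.TypeId;
        A = Act.NextAction;
      }
      return 0; // no handler here; keep unwinding
    }

    int main() {
      Lsda L;
      L.TypeIds = {42};
      L.Actions = {{42, 0}};          // one action matching type id 42
      L.Sites   = {{0, 16, 100, 1}};  // call site whose chain starts at action 1
      return findMatchingAction(L, L.Sites[0], 42) == 42 ? 0 : 1;
    }
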
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 115381767751..f7c011968c23 100644
--- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -20,7 +20,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ADT/SmallString.h"
@@ -91,7 +91,7 @@ void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
/// either condition is detected in a function which uses the GC.
///
void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
- unsigned IntPtrSize = AP.TM.getTargetData()->getPointerSize();
+ unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
EmitCamlGlobal(getModule(), AP, "code_end");
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
index b83aa5ae3a1b..70742a8d2e35 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -24,7 +24,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index fb65bb7f3fab..6f4c5a2f667b 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -357,9 +357,8 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
--I2;
while (I2->isDebugValue()) {
- if (I2 == MBB2->begin()) {
+ if (I2 == MBB2->begin())
return TailLen;
- }
--I2;
}
++I2;
@@ -482,21 +481,19 @@ bool
BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
if (getHash() < o.getHash())
return true;
- else if (getHash() > o.getHash())
+ if (getHash() > o.getHash())
return false;
- else if (getBlock()->getNumber() < o.getBlock()->getNumber())
+ if (getBlock()->getNumber() < o.getBlock()->getNumber())
return true;
- else if (getBlock()->getNumber() > o.getBlock()->getNumber())
+ if (getBlock()->getNumber() > o.getBlock()->getNumber())
return false;
- else {
- // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
- // an object with itself.
+ // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
+ // an object with itself.
#ifndef _GLIBCXX_DEBUG
- llvm_unreachable("Predecessor appears twice");
+ llvm_unreachable("Predecessor appears twice");
#else
- return false;
+ return false;
#endif
- }
}
/// CountTerminators - Count the number of terminators in the given
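
The comparator rewrite above is stylistic (early returns instead of else-if chains), but the _GLIBCXX_DEBUG branch it keeps illustrates a real constraint: libstdc++'s debug mode may compare an element with itself while validating strict weak ordering, so a comparator that treats equal keys as unreachable must still return false in that one case. A generic sketch of the same shape (names invented for the example):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Block { unsigned Hash; int Number; };

    // Strict weak ordering: distinct elements order consistently, and
    // comparing an element with itself must return false.
    static bool blockLess(const Block &A, const Block &B) {
      if (A.Hash < B.Hash) return true;
      if (A.Hash > B.Hash) return false;
      if (A.Number < B.Number) return true;
      if (A.Number > B.Number) return false;
      // Equal keys: only reachable when an element is compared with itself
      // (e.g. under _GLIBCXX_DEBUG); "less than" must then be false.
      return false;
    }

    int main() {
      std::vector<Block> Blocks = {{7, 3}, {2, 9}, {7, 1}};
      std::sort(Blocks.begin(), Blocks.end(), blockLess);
      assert(Blocks.front().Hash == 2 && Blocks.back().Number == 3);
      return 0;
    }
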
@@ -574,7 +571,8 @@ static bool ProfitableToMerge(MachineBasicBlock *MBB1,
// instructions that would be deleted in the merge.
MachineFunction *MF = MBB1->getParent();
if (EffectiveTailLen >= 2 &&
- MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+ MF->getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize) &&
(I1 == MBB1->begin() || I2 == MBB2->begin()))
return true;
@@ -1554,8 +1552,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Uses.insert(*AI);
} else {
- if (Uses.count(Reg)) {
- Uses.erase(Reg);
+ if (Uses.erase(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
Uses.erase(*SubRegs); // Use sub-registers to be conservative
}
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 2e189ad7e7d5..fa6d4e16cfe8 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_library(LLVMCodeGen
MachineCopyPropagation.cpp
MachineCSE.cpp
MachineDominators.cpp
+ MachinePostDominators.cpp
MachineFunction.cpp
MachineFunctionAnalysis.cpp
MachineFunctionPass.cpp
@@ -95,12 +96,14 @@ add_llvm_library(LLVMCodeGen
SplitKit.cpp
StackProtector.cpp
StackSlotColoring.cpp
+ StackColoring.cpp
StrongPHIElimination.cpp
TailDuplication.cpp
TargetFrameLoweringImpl.cpp
TargetInstrInfoImpl.cpp
TargetLoweringObjectFileImpl.cpp
TargetOptionsImpl.cpp
+ TargetSchedule.cpp
TwoAddressInstructionPass.cpp
UnreachableBlockElim.cpp
VirtRegMap.cpp
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 939af3f0ccc2..dee339a45863 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -9,7 +9,6 @@
#define DEBUG_TYPE "calcspillweights"
-#include "llvm/Function.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -42,8 +41,7 @@ void CalculateSpillWeights::getAnalysisUsage(AnalysisUsage &au) const {
bool CalculateSpillWeights::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
- << "********** Function: "
- << MF.getFunction()->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
LiveIntervals &LIS = getAnalysis<LiveIntervals>();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -166,7 +164,7 @@ void VirtRegAuxInfo::CalculateWeightAndHint(LiveInterval &li) {
continue;
float hweight = Hint[hint] += weight;
if (TargetRegisterInfo::isPhysicalRegister(hint)) {
- if (hweight > bestPhys && LIS.isAllocatable(hint))
+ if (hweight > bestPhys && mri.isAllocatable(hint))
bestPhys = hweight, hintPhys = hint;
} else {
if (hweight > bestVirt)
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 0b747fd43841..22b91409240b 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -18,7 +18,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetLowering.h"
using namespace llvm;
@@ -50,7 +50,7 @@ void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
if (MinAlign > (int)Align)
Align = MinAlign;
MF.getFrameInfo()->ensureMaxAlignment(Align);
- TM.getTargetLowering()->HandleByVal(this, Size);
+ TM.getTargetLowering()->HandleByVal(this, Size, Align);
unsigned Offset = AllocateStack(Size, Align);
addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
}
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index fb2c2e83f1b8..a53f6f8d0f1b 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -41,6 +41,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineCopyPropagationPass(Registry);
initializeMachineCSEPass(Registry);
initializeMachineDominatorTreePass(Registry);
+ initializeMachinePostDominatorTreePass(Registry);
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
initializeMachineModuleInfoPass(Registry);
@@ -56,6 +57,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeRegisterCoalescerPass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackProtectorPass(Registry);
+ initializeStackColoringPass(Registry);
initializeStackSlotColoringPass(Registry);
initializeStrongPHIEliminationPass(Registry);
initializeTailDuplicatePassPass(Registry);
diff --git a/lib/CodeGen/CodePlacementOpt.cpp b/lib/CodeGen/CodePlacementOpt.cpp
index 99233dfc2e3c..d8e06c33a68e 100644
--- a/lib/CodeGen/CodePlacementOpt.cpp
+++ b/lib/CodeGen/CodePlacementOpt.cpp
@@ -373,7 +373,7 @@ bool CodePlacementOpt::OptimizeIntraLoopEdges(MachineFunction &MF) {
///
bool CodePlacementOpt::AlignLoops(MachineFunction &MF) {
const Function *F = MF.getFunction();
- if (F->hasFnAttr(Attribute::OptimizeForSize))
+ if (F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize))
return false;
unsigned Align = TLI->getPrefLoopAlignment();
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index a9de1c7490f1..377b4712beac 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -527,7 +527,7 @@ BreakAntiDependencies(const std::vector<SUnit>& SUnits,
if (Edge->getKind() == SDep::Anti) {
AntiDepReg = Edge->getReg();
assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
- if (!RegClassInfo.isAllocatable(AntiDepReg))
+ if (!MRI.isAllocatable(AntiDepReg))
// Don't break anti-dependencies on non-allocatable registers.
AntiDepReg = 0;
else if (KeepRegs.test(AntiDepReg))
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index b4394e8d56e9..8964269dde5f 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -33,7 +33,6 @@ namespace {
const MachineRegisterInfo *MRI;
const TargetInstrInfo *TII;
BitVector LivePhysRegs;
- BitVector ReservedRegs;
public:
static char ID; // Pass identification, replacement for typeid
@@ -70,7 +69,7 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
// Don't delete live physreg defs, or any reserved register defs.
- if (LivePhysRegs.test(Reg) || ReservedRegs.test(Reg))
+ if (LivePhysRegs.test(Reg) || MRI->isReserved(Reg))
return false;
} else {
if (!MRI->use_nodbg_empty(Reg))
@@ -90,9 +89,6 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
TRI = MF.getTarget().getRegisterInfo();
TII = MF.getTarget().getInstrInfo();
- // Treat reserved registers as always live.
- ReservedRegs = TRI->getReservedRegs(MF);
-
// Loop over all instructions in all blocks, from bottom to top, so that it's
// more likely that chains of dependent but ultimately dead instructions will
// be cleaned up.
@@ -101,7 +97,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *MBB = &*I;
// Start out assuming that reserved registers are live out of this block.
- LivePhysRegs = ReservedRegs;
+ LivePhysRegs = MRI->getReservedRegs();
// Also add any explicit live-out physregs for this block.
if (!MBB->empty() && MBB->back().isReturn())
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index f9347efdb0e9..d5d84041b69f 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -18,7 +18,6 @@
#define DEBUG_TYPE "early-ifcvt"
#include "MachineTraceMetrics.h"
-#include "llvm/Function.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
@@ -32,9 +31,9 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -775,11 +774,11 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
- << "********** Function: "
- << ((Value*)MF.getFunction())->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
TII = MF.getTarget().getInstrInfo();
TRI = MF.getTarget().getRegisterInfo();
- SchedModel = MF.getTarget().getInstrItineraryData()->SchedModel;
+ SchedModel =
+ MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
MRI = &MF.getRegInfo();
DomTree = &getAnalysis<MachineDominatorTree>();
Loops = getAnalysisIfAvailable<MachineLoopInfo>();
@@ -798,6 +797,5 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
if (tryConvertIf(I->getBlock()))
Changed = true;
- MF.verify(this, "After early if-conversion");
return Changed;
}
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index fee8e47b832c..ed78f1942150 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -626,9 +626,12 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
}
dv->Instrs.push_back(mi);
- // Finally set all defs and non-collapsed uses to dv.
- for (unsigned i = 0, e = mi->getDesc().getNumOperands(); i != e; ++i) {
- MachineOperand &mo = mi->getOperand(i);
+ // Finally set all defs and non-collapsed uses to dv. We must iterate through
+ // all the operands, including imp-def ones.
+ for (MachineInstr::mop_iterator ii = mi->operands_begin(),
+ ee = mi->operands_end();
+ ii != ee; ++ii) {
+ MachineOperand &mo = *ii;
if (!mo.isReg()) continue;
int rx = regIndex(mo.getReg());
if (rx < 0) continue;
@@ -654,7 +657,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
bool anyregs = false;
for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
I != E; ++I)
- if (MF->getRegInfo().isPhysRegOrOverlapUsed(*I)) {
+ if (MF->getRegInfo().isPhysRegUsed(*I)) {
anyregs = true;
break;
}
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 7a17331ba1d6..ffe4b63c1b11 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -14,7 +14,6 @@
#define DEBUG_TYPE "postrapseudos"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -190,8 +189,7 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "Machine Function\n"
<< "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
- << "********** Function: "
- << MF.getFunction()->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
TRI = MF.getTarget().getRegisterInfo();
TII = MF.getTarget().getInstrInfo();
diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
index 506b5cf09457..f4755bb1635c 100644
--- a/lib/CodeGen/GCStrategy.cpp
+++ b/lib/CodeGen/GCStrategy.cpp
@@ -20,6 +20,7 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/Module.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/DominatorInternals.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -387,9 +388,16 @@ void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
const TargetFrameLowering *TFI = TM->getFrameLowering();
assert(TFI && "TargetRegisterInfo not available!");
- for (GCFunctionInfo::roots_iterator RI = FI->roots_begin(),
- RE = FI->roots_end(); RI != RE; ++RI)
- RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
+ for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
+ RI != FI->roots_end();) {
+ // If the root references a dead object, no need to keep it.
+ if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
+ RI = FI->removeStackRoot(RI);
+ } else {
+ RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
+ ++RI;
+ }
+ }
}
bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
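
The hunk above filters dead stack roots while walking the list, relying on removeStackRoot returning the iterator that follows the erased element; only when nothing is removed does the loop advance by hand. The same erase-or-advance shape with the standard library (a plain std::vector stands in for the roots list; this is an analogy, not the GCFunctionInfo API):

    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> Roots = {4, -1, 7, -3, 9};   // negative == "dead" for the demo

      for (std::vector<int>::iterator RI = Roots.begin(); RI != Roots.end();) {
        if (*RI < 0)
          RI = Roots.erase(RI);   // erase returns the next valid iterator
        else
          ++RI;                   // keep this root, move on
      }

      assert(Roots.size() == 3 && Roots[0] == 4 && Roots[2] == 9);
      return 0;
    }
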
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 4214ba124252..31e36f0168cb 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -13,7 +13,6 @@
#define DEBUG_TYPE "ifcvt"
#include "BranchFolding.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
@@ -282,7 +281,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
}
DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
- << MF.getFunction()->getName() << "\'");
+ << MF.getName() << "\'");
if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
DEBUG(dbgs() << " skipped\n");
@@ -997,14 +996,13 @@ static void UpdatePredRedefs(MachineInstr *MI, SmallSet<unsigned,4> &Redefs,
}
for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
unsigned Reg = Defs[i];
- if (Redefs.count(Reg)) {
+ if (!Redefs.insert(Reg)) {
if (AddImpUse)
// Treat predicated update as read + write.
MI->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
true/*IsImp*/,false/*IsKill*/,
false/*IsDead*/,true/*IsUndef*/));
} else {
- Redefs.insert(Reg);
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
Redefs.insert(*SubRegs);
}
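
The IfConversion change above folds a count()-then-insert() pair into a single insert() whose return value says whether the register was already present. With standard containers the same idiom reads the bool out of insert's return value (std::set here is only an analogue for LLVM's SmallSet):

    #include <cassert>
    #include <set>

    int main() {
      std::set<unsigned> Redefs;

      // First definition of register 5: newly inserted, no implicit use needed.
      bool WasNew = Redefs.insert(5).second;
      assert(WasNew);

      // Redefinition of register 5: insert reports it was already present,
      // which is exactly the "treat as read + write" case in the patch.
      WasNew = Redefs.insert(5).second;
      assert(!WasNew);
      return 0;
    }
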
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 07e37af57f3b..37828a70b56f 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -613,7 +613,7 @@ MachineInstr *InlineSpiller::traceSiblingValue(unsigned UseReg, VNInfo *UseVNI,
propagateSiblingValue(SVI);
} while (!WorkList.empty());
- // Look up the value we were looking for. We already did this lokup at the
+ // Look up the value we were looking for. We already did this lookup at the
// top of the function, but SibValues may have been invalidated.
SVI = SibValues.find(UseVNI);
assert(SVI != SibValues.end() && "Didn't compute requested info");
@@ -863,7 +863,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
// If the instruction also writes VirtReg.reg, it had better not require the
// same register for uses and defs.
SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
- MIBundleOperands::RegInfo RI =
+ MIBundleOperands::VirtRegInfo RI =
MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
if (RI.Tied) {
markValueUsed(&VirtReg, ParentVNI);
@@ -1142,7 +1142,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
// Analyze instruction.
SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
- MIBundleOperands::RegInfo RI =
+ MIBundleOperands::VirtRegInfo RI =
MIBundleOperands(MI).analyzeVirtReg(Reg, &Ops);
// Find the slot index where this instruction reads and writes OldLI.
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index 8d2282a679ce..6120ae56b4a7 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -21,7 +21,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
template <class ArgIt>
@@ -457,7 +457,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break; // Strip out annotate intrinsic
case Intrinsic::memcpy: {
- IntegerType *IntPtr = TD.getIntPtrType(Context);
+ Type *IntPtr = TD.getIntPtrType(Context);
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
@@ -468,7 +468,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break;
}
case Intrinsic::memmove: {
- IntegerType *IntPtr = TD.getIntPtrType(Context);
+ Type *IntPtr = TD.getIntPtrType(Context);
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
@@ -479,7 +479,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
break;
}
case Intrinsic::memset: {
- IntegerType *IntPtr = TD.getIntPtrType(Context);
+ Type *IntPtr = TD.getIntPtrType(Context);
Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
/* isSigned */ false);
Value *Ops[3];
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index cac0c83bcac2..24daafaa62e1 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -172,7 +172,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
*Context);
- MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
}
MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
@@ -191,7 +191,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
// emission fails.
MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
STI, *Context);
- MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
if (MCE == 0 || MAB == 0)
return true;
@@ -266,7 +266,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
STI, *Ctx);
- MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
if (MCE == 0 || MAB == 0)
return true;
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index d631726538ed..defc1279ec8c 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -687,8 +687,7 @@ bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
clear();
LS.initialize(mf);
DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
- << ((Value*)mf.getFunction())->getName()
- << " **********\n");
+ << mf.getName() << " **********\n");
bool Changed = collectDebugValues(mf);
computeIntervals();
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 0a795e644cef..8585cbb30dee 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -27,6 +27,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "RegisterCoalescer.h"
#include <algorithm>
using namespace llvm;
@@ -58,8 +59,16 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
return VNI;
}
if (SlotIndex::isSameInstr(Def, I->start)) {
- assert(I->start == Def && "Cannot insert def, already live");
- assert(I->valno->def == Def && "Inconsistent existing value def");
+ assert(I->valno->def == I->start && "Inconsistent existing value def");
+
+ // It is possible to have both normal and early-clobber defs of the same
+ // register on an instruction. It doesn't make a lot of sense, but it is
+ // possible to specify in inline assembly.
+ //
+ // Just convert everything to early-clobber.
+ Def = std::min(Def, I->start);
+ if (Def != I->start)
+ I->start = I->valno->def = Def;
return I->valno;
}
assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
@@ -68,21 +77,6 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
return VNI;
}
-/// killedInRange - Return true if the interval has kills in [Start,End).
-bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const {
- Ranges::const_iterator r =
- std::lower_bound(ranges.begin(), ranges.end(), End);
-
- // Now r points to the first interval with start >= End, or ranges.end().
- if (r == ranges.begin())
- return false;
-
- --r;
- // Now r points to the last interval with end <= End.
- // r->end is the kill point.
- return r->end >= Start && r->end < End;
-}
-
// overlaps - Return true if the intersection of the two live intervals is
// not empty.
//
@@ -142,6 +136,48 @@ bool LiveInterval::overlapsFrom(const LiveInterval& other,
return false;
}
+bool LiveInterval::overlaps(const LiveInterval &Other,
+ const CoalescerPair &CP,
+ const SlotIndexes &Indexes) const {
+ assert(!empty() && "empty interval");
+ if (Other.empty())
+ return false;
+
+ // Use binary searches to find initial positions.
+ const_iterator I = find(Other.beginIndex());
+ const_iterator IE = end();
+ if (I == IE)
+ return false;
+ const_iterator J = Other.find(I->start);
+ const_iterator JE = Other.end();
+ if (J == JE)
+ return false;
+
+ for (;;) {
+ // J has just been advanced to satisfy:
+ assert(J->end >= I->start);
+ // Check for an overlap.
+ if (J->start < I->end) {
+ // I and J are overlapping. Find the later start.
+ SlotIndex Def = std::max(I->start, J->start);
+ // Allow the overlap if Def is a coalescable copy.
+ if (Def.isBlock() ||
+ !CP.isCoalescable(Indexes.getInstructionFromIndex(Def)))
+ return true;
+ }
+ // Advance the iterator that ends first to check for more overlaps.
+ if (J->end > I->end) {
+ std::swap(I, J);
+ std::swap(IE, JE);
+ }
+ // Advance J until J->end >= I->start.
+ do
+ if (++J == JE)
+ return false;
+ while (J->end < I->start);
+ }
+}
+
/// overlaps - Return true if the live interval overlaps a range specified
/// by [Start, End).
bool LiveInterval::overlaps(SlotIndex Start, SlotIndex End) const {
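
The new overlaps(Other, CP, Indexes) above is a two-pointer sweep over two sorted, disjoint range lists, with an escape hatch for overlaps that a coalescable copy explains away. Stripped of the coalescer logic, the core sweep looks like this (half-open integer ranges stand in for SlotIndex ranges; an assumption-laden sketch, not the LiveInterval code):

    #include <cassert>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<int, int> > Ranges; // sorted, disjoint [start, end)

    // Return true if any range in A intersects any range in B.
    static bool rangesOverlap(const Ranges &A, const Ranges &B) {
      std::size_t I = 0, J = 0;
      while (I < A.size() && J < B.size()) {
        if (A[I].second <= B[J].first)
          ++I;               // A[I] ends before B[J] begins
        else if (B[J].second <= A[I].first)
          ++J;               // B[J] ends before A[I] begins
        else
          return true;       // the two ranges intersect
      }
      return false;
    }

    int main() {
      Ranges A = {{0, 4}, {10, 14}};
      Ranges B = {{4, 6}, {13, 20}};
      assert(rangesOverlap(A, B));   // [10,14) and [13,20) intersect

      Ranges C = {{4, 10}, {14, 16}};
      assert(!rangesOverlap(A, C));  // half-open ranges that only touch don't overlap
      return 0;
    }
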
@@ -399,7 +435,7 @@ void LiveInterval::join(LiveInterval &Other,
// If we have to apply a mapping to our base interval assignment, rewrite it
// now.
- if (MustMapCurValNos) {
+ if (MustMapCurValNos && !empty()) {
// Map the first live range.
iterator OutIt = begin();
@@ -673,27 +709,6 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
return V2;
}
-void LiveInterval::Copy(const LiveInterval &RHS,
- MachineRegisterInfo *MRI,
- VNInfo::Allocator &VNInfoAllocator) {
- ranges.clear();
- valnos.clear();
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(RHS.reg);
- MRI->setRegAllocationHint(reg, Hint.first, Hint.second);
-
- weight = RHS.weight;
- for (unsigned i = 0, e = RHS.getNumValNums(); i != e; ++i) {
- const VNInfo *VNI = RHS.getValNumInfo(i);
- createValueCopy(VNI, VNInfoAllocator);
- }
- for (unsigned i = 0, e = RHS.ranges.size(); i != e; ++i) {
- const LiveRange &LR = RHS.ranges[i];
- addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
- }
-
- verify();
-}
-
unsigned LiveInterval::getSize() const {
unsigned Sum = 0;
for (const_iterator I = begin(), E = end(); I != E; ++I)
@@ -705,9 +720,11 @@ raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveRange::dump() const {
dbgs() << *this << "\n";
}
+#endif
void LiveInterval::print(raw_ostream &OS) const {
if (empty())
@@ -740,9 +757,11 @@ void LiveInterval::print(raw_ostream &OS) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveInterval::dump() const {
dbgs() << *this << "\n";
}
+#endif
#ifndef NDEBUG
void LiveInterval::verify() const {
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index d0f8ae1af305..4e75d892e523 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -34,6 +34,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "LiveRangeCalc.h"
+#include "VirtRegMap.h"
#include <algorithm>
#include <limits>
#include <cmath>
@@ -109,8 +110,6 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
DomTree = &getAnalysis<MachineDominatorTree>();
if (!LRCalc)
LRCalc = new LiveRangeCalc();
- AllocatableRegs = TRI->getAllocatableSet(fn);
- ReservedRegs = TRI->getReservedRegs(fn);
// Allocate space for all virtual registers.
VirtRegIntervals.resize(MRI->getNumVirtRegs());
@@ -147,6 +146,11 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
}
+ OS << "RegMasks:";
+ for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
+ OS << ' ' << RegMaskSlots[i];
+ OS << '\n';
+
printInstrs(OS);
}
@@ -155,9 +159,11 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const {
MF->print(OS, Indexes);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveIntervals::dumpInstrs() const {
printInstrs(dbgs());
}
+#endif
static
bool MultipleDefsBySameMI(const MachineInstr &MI, unsigned MOIdx) {
@@ -382,8 +388,7 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
/// which a variable is live
void LiveIntervals::computeIntervals() {
DEBUG(dbgs() << "********** COMPUTING LIVE INTERVALS **********\n"
- << "********** Function: "
- << ((Value*)MF->getFunction())->getName() << '\n');
+ << "********** Function: " << MF->getName() << '\n');
RegMaskBlocks.resize(MF->getNumBlockIDs());
@@ -440,7 +445,7 @@ void LiveIntervals::computeIntervals() {
// Compute the number of register mask instructions in this block.
std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()];
- RMB.second = RegMaskSlots.size() - RMB.first;;
+ RMB.second = RegMaskSlots.size() - RMB.first;
}
// Create empty intervals for registers defined by implicit_def's (except
@@ -497,7 +502,7 @@ void LiveIntervals::computeRegMasks() {
RegMaskBits.push_back(MO->getRegMask());
}
// Compute the number of register mask instructions in this block.
- RMB.second = RegMaskSlots.size() - RMB.first;;
+ RMB.second = RegMaskSlots.size() - RMB.first;
}
}
@@ -540,11 +545,11 @@ void LiveIntervals::computeRegUnitInterval(LiveInterval *LI) {
// Ignore uses of reserved registers. We only track defs of those.
for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
unsigned Root = *Roots;
- if (!isReserved(Root) && !MRI->reg_empty(Root))
+ if (!MRI->isReserved(Root) && !MRI->reg_empty(Root))
LRCalc->extendToUses(LI, Root);
for (MCSuperRegIterator Supers(Root, TRI); Supers.isValid(); ++Supers) {
unsigned Reg = *Supers;
- if (!isReserved(Reg) && !MRI->reg_empty(Reg))
+ if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
LRCalc->extendToUses(LI, Reg);
}
}
@@ -729,17 +734,100 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
return CanSeparate;
}
+void LiveIntervals::extendToIndices(LiveInterval *LI,
+ ArrayRef<SlotIndex> Indices) {
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ for (unsigned i = 0, e = Indices.size(); i != e; ++i)
+ LRCalc->extend(LI, Indices[i]);
+}
+
+void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
+ SmallVectorImpl<SlotIndex> *EndPoints) {
+ LiveRangeQuery LRQ(*LI, Kill);
+ VNInfo *VNI = LRQ.valueOut();
+ if (!VNI)
+ return;
+
+ MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
+ SlotIndex MBBStart, MBBEnd;
+ tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB);
+
+ // If VNI isn't live out from KillMBB, the value is trivially pruned.
+ if (LRQ.endPoint() < MBBEnd) {
+ LI->removeRange(Kill, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ return;
+ }
+
+ // VNI is live out of KillMBB.
+ LI->removeRange(Kill, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+
+ // Find all blocks that are reachable from KillMBB without leaving VNI's live
+ // range. It is possible that KillMBB itself is reachable, so start a DFS
+ // from each successor.
+ typedef SmallPtrSet<MachineBasicBlock*, 9> VisitedTy;
+ VisitedTy Visited;
+ for (MachineBasicBlock::succ_iterator
+ SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end();
+ SuccI != SuccE; ++SuccI) {
+ for (df_ext_iterator<MachineBasicBlock*, VisitedTy>
+ I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited);
+ I != E;) {
+ MachineBasicBlock *MBB = *I;
+
+ // Check if VNI is live in to MBB.
+ tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
+ LiveRangeQuery LRQ(*LI, MBBStart);
+ if (LRQ.valueIn() != VNI) {
+ // This block isn't part of the VNI live range. Prune the search.
+ I.skipChildren();
+ continue;
+ }
+
+ // Prune the search if VNI is killed in MBB.
+ if (LRQ.endPoint() < MBBEnd) {
+ LI->removeRange(MBBStart, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ I.skipChildren();
+ continue;
+ }
+
+ // VNI is live through MBB.
+ LI->removeRange(MBBStart, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+ ++I;
+ }
+ }
+}
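
pruneValue above walks the CFG depth-first from each successor of the killing block, calling skipChildren() to stop descending once it leaves the value's live range. A standalone analogue of that pruned DFS over a small adjacency-list graph (an explicit stack instead of df_ext_iterator; the predicate and graph are invented for the example):

    #include <cassert>
    #include <set>
    #include <vector>

    // Visit every node reachable from Start without descending through a node
    // where Prune(node) is true; pruned nodes are visited but their children
    // are not, mirroring df_ext_iterator::skipChildren().
    static std::set<int> prunedDFS(const std::vector<std::vector<int> > &Succ,
                                   int Start, bool (*Prune)(int)) {
      std::set<int> Visited;
      std::vector<int> Stack(1, Start);
      while (!Stack.empty()) {
        int N = Stack.back();
        Stack.pop_back();
        if (!Visited.insert(N).second)
          continue;                   // already seen (graphs may have cycles)
        if (Prune(N))
          continue;                   // "skipChildren": don't descend further
        for (std::size_t I = 0; I != Succ[N].size(); ++I)
          Stack.push_back(Succ[N][I]);
      }
      return Visited;
    }

    static bool pruneOdd(int N) { return N % 2 != 0; }

    int main() {
      // 0 -> {1, 2}, 1 -> {3}, 2 -> {4}; odd nodes are visited but pruned.
      std::vector<std::vector<int> > Succ = {{1, 2}, {3}, {4}, {}, {}};
      std::set<int> Seen = prunedDFS(Succ, 0, pruneOdd);
      assert(Seen.count(4) == 1);  // reached through the unpruned node 2
      assert(Seen.count(3) == 0);  // unreachable: its only predecessor 1 was pruned
      return 0;
    }
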
//===----------------------------------------------------------------------===//
// Register allocator hooks.
//
-void LiveIntervals::addKillFlags() {
+void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
+ // Keep track of regunit ranges.
+ SmallVector<std::pair<LiveInterval*, LiveInterval::iterator>, 8> RU;
+
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
if (MRI->reg_nodbg_empty(Reg))
continue;
LiveInterval *LI = &getInterval(Reg);
+ if (LI->empty())
+ continue;
+
+ // Find the regunit intervals for the assigned register. They may overlap
+ // the virtual register live range, cancelling any kills.
+ RU.clear();
+ for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
+ ++Units) {
+ LiveInterval *RUInt = &getRegUnit(*Units);
+ if (RUInt->empty())
+ continue;
+ RU.push_back(std::make_pair(RUInt, RUInt->find(LI->begin()->end)));
+ }
// Every instruction that kills Reg corresponds to a live range end point.
for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
@@ -750,7 +838,32 @@ void LiveIntervals::addKillFlags() {
MachineInstr *MI = getInstructionFromIndex(RI->end);
if (!MI)
continue;
- MI->addRegisterKilled(Reg, NULL);
+
+ // Check if any of the regunits are live beyond the end of RI. That could
+ // happen when a physreg is defined as a copy of a virtreg:
+ //
+ // %EAX = COPY %vreg5
+ // FOO %vreg5 <--- MI, cancel kill because %EAX is live.
+ // BAR %EAX<kill>
+ //
+ // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
+ bool CancelKill = false;
+ for (unsigned u = 0, e = RU.size(); u != e; ++u) {
+ LiveInterval *RInt = RU[u].first;
+ LiveInterval::iterator &I = RU[u].second;
+ if (I == RInt->end())
+ continue;
+ I = RInt->advanceTo(I, RI->end);
+ if (I == RInt->end() || I->start >= RI->end)
+ continue;
+ // I is overlapping RI.
+ CancelKill = true;
+ break;
+ }
+ if (CancelKill)
+ MI->clearRegisterKills(Reg, NULL);
+ else
+ MI->addRegisterKilled(Reg, NULL);
}
}
}
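
The kill-flag logic above keeps one cached iterator per regunit interval and only ever moves it forward with advanceTo, so checking each successive kill point costs amortized linear time over the whole interval rather than a fresh search per query. The same monotone-cursor idea in isolation (plain sorted integer ranges, invented names):

    #include <cassert>
    #include <utility>
    #include <vector>

    typedef std::vector<std::pair<int, int> > Ranges; // sorted, disjoint [start, end)

    // Advance the cached cursor It to the first range ending after Point, then
    // report whether Point falls inside it. Queries must arrive in increasing order.
    static bool liveAt(const Ranges &R, Ranges::const_iterator &It, int Point) {
      while (It != R.end() && It->second <= Point)
        ++It;                                 // never rewinds: amortized O(|R|)
      return It != R.end() && It->first <= Point;
    }

    int main() {
      Ranges R = {{0, 5}, {10, 20}};
      Ranges::const_iterator Cursor = R.begin();
      assert(liveAt(R, Cursor, 3));    // inside [0,5)
      assert(!liveAt(R, Cursor, 7));   // in the gap
      assert(liveAt(R, Cursor, 15));   // inside [10,20)
      assert(!liveAt(R, Cursor, 25));  // past the last range
      return 0;
    }
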
@@ -900,497 +1013,321 @@ private:
LiveIntervals& LIS;
const MachineRegisterInfo& MRI;
const TargetRegisterInfo& TRI;
+ SlotIndex OldIdx;
SlotIndex NewIdx;
-
- typedef std::pair<LiveInterval*, LiveRange*> IntRangePair;
- typedef DenseSet<IntRangePair> RangeSet;
-
- struct RegRanges {
- LiveRange* Use;
- LiveRange* EC;
- LiveRange* Dead;
- LiveRange* Def;
- RegRanges() : Use(0), EC(0), Dead(0), Def(0) {}
- };
- typedef DenseMap<unsigned, RegRanges> BundleRanges;
+ SmallPtrSet<LiveInterval*, 8> Updated;
+ bool UpdateFlags;
public:
HMEditor(LiveIntervals& LIS, const MachineRegisterInfo& MRI,
- const TargetRegisterInfo& TRI, SlotIndex NewIdx)
- : LIS(LIS), MRI(MRI), TRI(TRI), NewIdx(NewIdx) {}
-
- // Update intervals for all operands of MI from OldIdx to NewIdx.
- // This assumes that MI used to be at OldIdx, and now resides at
- // NewIdx.
- void moveAllRangesFrom(MachineInstr* MI, SlotIndex OldIdx) {
- assert(NewIdx != OldIdx && "No-op move? That's a bit strange.");
-
- // Collect the operands.
- RangeSet Entering, Internal, Exiting;
- bool hasRegMaskOp = false;
- collectRanges(MI, Entering, Internal, Exiting, hasRegMaskOp, OldIdx);
-
- // To keep the LiveRanges valid within an interval, move the ranges closest
- // to the destination first. This prevents ranges from overlapping, to that
- // APIs like removeRange still work.
- if (NewIdx < OldIdx) {
- moveAllEnteringFrom(OldIdx, Entering);
- moveAllInternalFrom(OldIdx, Internal);
- moveAllExitingFrom(OldIdx, Exiting);
- }
- else {
- moveAllExitingFrom(OldIdx, Exiting);
- moveAllInternalFrom(OldIdx, Internal);
- moveAllEnteringFrom(OldIdx, Entering);
- }
-
- if (hasRegMaskOp)
- updateRegMaskSlots(OldIdx);
-
-#ifndef NDEBUG
- LIValidator validator;
- validator = std::for_each(Entering.begin(), Entering.end(), validator);
- validator = std::for_each(Internal.begin(), Internal.end(), validator);
- validator = std::for_each(Exiting.begin(), Exiting.end(), validator);
- assert(validator.rangesOk() && "moveAllOperandsFrom broke liveness.");
-#endif
-
+ const TargetRegisterInfo& TRI,
+ SlotIndex OldIdx, SlotIndex NewIdx, bool UpdateFlags)
+ : LIS(LIS), MRI(MRI), TRI(TRI), OldIdx(OldIdx), NewIdx(NewIdx),
+ UpdateFlags(UpdateFlags) {}
+
+ // FIXME: UpdateFlags is a workaround that creates live intervals for all
+ // physregs, even those that aren't needed for regalloc, in order to update
+ // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
+ // flags, and postRA passes will use a live register utility instead.
+ LiveInterval *getRegUnitLI(unsigned Unit) {
+ if (UpdateFlags)
+ return &LIS.getRegUnit(Unit);
+ return LIS.getCachedRegUnit(Unit);
}
- // Update intervals for all operands of MI to refer to BundleStart's
- // SlotIndex.
- void moveAllRangesInto(MachineInstr* MI, MachineInstr* BundleStart) {
- if (MI == BundleStart)
- return; // Bundling instr with itself - nothing to do.
-
- SlotIndex OldIdx = LIS.getSlotIndexes()->getInstructionIndex(MI);
- assert(LIS.getSlotIndexes()->getInstructionFromIndex(OldIdx) == MI &&
- "SlotIndex <-> Instruction mapping broken for MI");
-
- // Collect all ranges already in the bundle.
- MachineBasicBlock::instr_iterator BII(BundleStart);
- RangeSet Entering, Internal, Exiting;
- bool hasRegMaskOp = false;
- collectRanges(BII, Entering, Internal, Exiting, hasRegMaskOp, NewIdx);
- assert(!hasRegMaskOp && "Can't have RegMask operand in bundle.");
- for (++BII; &*BII == MI || BII->isInsideBundle(); ++BII) {
- if (&*BII == MI)
+ /// Update all live ranges touched by MI, assuming a move from OldIdx to
+ /// NewIdx.
+ void updateAllRanges(MachineInstr *MI) {
+ DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": " << *MI);
+ bool hasRegMask = false;
+ for (MIOperands MO(MI); MO.isValid(); ++MO) {
+ if (MO->isRegMask())
+ hasRegMask = true;
+ if (!MO->isReg())
continue;
- collectRanges(BII, Entering, Internal, Exiting, hasRegMaskOp, NewIdx);
- assert(!hasRegMaskOp && "Can't have RegMask operand in bundle.");
- }
-
- BundleRanges BR = createBundleRanges(Entering, Internal, Exiting);
-
- Entering.clear();
- Internal.clear();
- Exiting.clear();
- collectRanges(MI, Entering, Internal, Exiting, hasRegMaskOp, OldIdx);
- assert(!hasRegMaskOp && "Can't have RegMask operand in bundle.");
+ // Aggressively clear all kill flags.
+ // They are reinserted by VirtRegRewriter.
+ if (MO->isUse())
+ MO->setIsKill(false);
- DEBUG(dbgs() << "Entering: " << Entering.size() << "\n");
- DEBUG(dbgs() << "Internal: " << Internal.size() << "\n");
- DEBUG(dbgs() << "Exiting: " << Exiting.size() << "\n");
-
- moveAllEnteringFromInto(OldIdx, Entering, BR);
- moveAllInternalFromInto(OldIdx, Internal, BR);
- moveAllExitingFromInto(OldIdx, Exiting, BR);
-
-
-#ifndef NDEBUG
- LIValidator validator;
- validator = std::for_each(Entering.begin(), Entering.end(), validator);
- validator = std::for_each(Internal.begin(), Internal.end(), validator);
- validator = std::for_each(Exiting.begin(), Exiting.end(), validator);
- assert(validator.rangesOk() && "moveAllOperandsInto broke liveness.");
-#endif
- }
-
-private:
-
-#ifndef NDEBUG
- class LIValidator {
- private:
- DenseSet<const LiveInterval*> Checked, Bogus;
- public:
- void operator()(const IntRangePair& P) {
- const LiveInterval* LI = P.first;
- if (Checked.count(LI))
- return;
- Checked.insert(LI);
- if (LI->empty())
- return;
- SlotIndex LastEnd = LI->begin()->start;
- for (LiveInterval::const_iterator LRI = LI->begin(), LRE = LI->end();
- LRI != LRE; ++LRI) {
- const LiveRange& LR = *LRI;
- if (LastEnd > LR.start || LR.start >= LR.end)
- Bogus.insert(LI);
- LastEnd = LR.end;
- }
- }
-
- bool rangesOk() const {
- return Bogus.empty();
- }
- };
-#endif
-
- // Collect IntRangePairs for all operands of MI that may need fixing.
- // Treat's MI's index as OldIdx (regardless of what it is in SlotIndexes'
- // maps).
- void collectRanges(MachineInstr* MI, RangeSet& Entering, RangeSet& Internal,
- RangeSet& Exiting, bool& hasRegMaskOp, SlotIndex OldIdx) {
- hasRegMaskOp = false;
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end();
- MOI != MOE; ++MOI) {
- const MachineOperand& MO = *MOI;
-
- if (MO.isRegMask()) {
- hasRegMaskOp = true;
+ unsigned Reg = MO->getReg();
+ if (!Reg)
continue;
- }
-
- if (!MO.isReg() || MO.getReg() == 0)
- continue;
-
- unsigned Reg = MO.getReg();
-
- // TODO: Currently we're skipping uses that are reserved or have no
- // interval, but we're not updating their kills. This should be
- // fixed.
- if (TargetRegisterInfo::isPhysicalRegister(Reg) && LIS.isReserved(Reg))
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ updateRange(LIS.getInterval(Reg));
continue;
-
- // Collect ranges for register units. These live ranges are computed on
- // demand, so just skip any that haven't been computed yet.
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
- if (LiveInterval *LI = LIS.getCachedRegUnit(*Units))
- collectRanges(MO, LI, Entering, Internal, Exiting, OldIdx);
- } else {
- // Collect ranges for individual virtual registers.
- collectRanges(MO, &LIS.getInterval(Reg),
- Entering, Internal, Exiting, OldIdx);
}
+
+ // For physregs, only update the regunits that actually have a
+ // precomputed live range.
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ if (LiveInterval *LI = getRegUnitLI(*Units))
+ updateRange(*LI);
}
+ if (hasRegMask)
+ updateRegMaskSlots();
}
- void collectRanges(const MachineOperand &MO, LiveInterval *LI,
- RangeSet &Entering, RangeSet &Internal, RangeSet &Exiting,
- SlotIndex OldIdx) {
- if (MO.readsReg()) {
- LiveRange* LR = LI->getLiveRangeContaining(OldIdx);
- if (LR != 0)
- Entering.insert(std::make_pair(LI, LR));
- }
- if (MO.isDef()) {
- LiveRange* LR = LI->getLiveRangeContaining(OldIdx.getRegSlot());
- assert(LR != 0 && "No live range for def?");
- if (LR->end > OldIdx.getDeadSlot())
- Exiting.insert(std::make_pair(LI, LR));
+private:
+ /// Update a single live range, assuming an instruction has been moved from
+ /// OldIdx to NewIdx.
+ void updateRange(LiveInterval &LI) {
+ if (!Updated.insert(&LI))
+ return;
+ DEBUG({
+ dbgs() << " ";
+ if (TargetRegisterInfo::isVirtualRegister(LI.reg))
+ dbgs() << PrintReg(LI.reg);
else
- Internal.insert(std::make_pair(LI, LR));
- }
+ dbgs() << PrintRegUnit(LI.reg, &TRI);
+ dbgs() << ":\t" << LI << '\n';
+ });
+ if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
+ handleMoveDown(LI);
+ else
+ handleMoveUp(LI);
+ DEBUG(dbgs() << " -->\t" << LI << '\n');
+ LI.verify();
}
- BundleRanges createBundleRanges(RangeSet& Entering,
- RangeSet& Internal,
- RangeSet& Exiting) {
- BundleRanges BR;
-
- for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end();
- EI != EE; ++EI) {
- LiveInterval* LI = EI->first;
- LiveRange* LR = EI->second;
- BR[LI->reg].Use = LR;
- }
+ /// Update LI to reflect an instruction has been moved downwards from OldIdx
+ /// to NewIdx.
+ ///
+ /// 1. Live def at OldIdx:
+ /// Move def to NewIdx, assert endpoint after NewIdx.
+ ///
+ /// 2. Live def at OldIdx, killed at NewIdx:
+ /// Change to dead def at NewIdx.
+ /// (Happens when bundling def+kill together).
+ ///
+ /// 3. Dead def at OldIdx:
+ /// Move def to NewIdx, possibly across another live value.
+ ///
+ /// 4. Def at OldIdx AND at NewIdx:
+ /// Remove live range [OldIdx;NewIdx) and value defined at OldIdx.
+ /// (Happens when bundling multiple defs together).
+ ///
+ /// 5. Value read at OldIdx, killed before NewIdx:
+ /// Extend kill to NewIdx.
+ ///
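+ /// As a purely illustrative sketch (hypothetical slot indexes, not taken
+ /// from any real function): with OldIdx = 16 and NewIdx = 48, case 5 would
+ /// extend a live-in segment [0,16r) to [0,48r), while case 1 would move a
+ /// def segment [16r,64r) down to [48r,64r) without touching its end point.
+ ///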
+ void handleMoveDown(LiveInterval &LI) {
+ // First look for a kill at OldIdx.
+ LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
+ LiveInterval::iterator E = LI.end();
+ // Is LI even live at OldIdx?
+ if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
+ return;
- for (RangeSet::iterator II = Internal.begin(), IE = Internal.end();
- II != IE; ++II) {
- LiveInterval* LI = II->first;
- LiveRange* LR = II->second;
- if (LR->end.isDead()) {
- BR[LI->reg].Dead = LR;
- } else {
- BR[LI->reg].EC = LR;
- }
+ // Handle a live-in value.
+ if (!SlotIndex::isSameInstr(I->start, OldIdx)) {
+ bool isKill = SlotIndex::isSameInstr(OldIdx, I->end);
+ // If the live-in value already extends to NewIdx, there is nothing to do.
+ if (!SlotIndex::isEarlierInstr(I->end, NewIdx))
+ return;
+ // Aggressively remove all kill flags from the old kill point.
+ // Kill flags shouldn't be used while live intervals exist; they will be
+ // reinserted by VirtRegRewriter.
+ if (MachineInstr *KillMI = LIS.getInstructionFromIndex(I->end))
+ for (MIBundleOperands MO(KillMI); MO.isValid(); ++MO)
+ if (MO->isReg() && MO->isUse())
+ MO->setIsKill(false);
+ // Adjust I->end to reach NewIdx. This may temporarily make LI invalid by
+ // overlapping ranges. Case 5 above.
+ I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
+ // If this was a kill, there may also be a def. Otherwise we're done.
+ if (!isKill)
+ return;
+ ++I;
}
- for (RangeSet::iterator EI = Exiting.begin(), EE = Exiting.end();
- EI != EE; ++EI) {
- LiveInterval* LI = EI->first;
- LiveRange* LR = EI->second;
- BR[LI->reg].Def = LR;
+ // Check for a def at OldIdx.
+ if (I == E || !SlotIndex::isSameInstr(OldIdx, I->start))
+ return;
+ // We have a def at OldIdx.
+ VNInfo *DefVNI = I->valno;
+ assert(DefVNI->def == I->start && "Inconsistent def");
+ DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
+ // If the defined value extends beyond NewIdx, just move the def down.
+ // This is case 1 above.
+ if (SlotIndex::isEarlierInstr(NewIdx, I->end)) {
+ I->start = DefVNI->def;
+ return;
}
-
- return BR;
- }
-
- void moveKillFlags(unsigned reg, SlotIndex OldIdx, SlotIndex newKillIdx) {
- MachineInstr* OldKillMI = LIS.getInstructionFromIndex(OldIdx);
- if (!OldKillMI->killsRegister(reg))
- return; // Bail out if we don't have kill flags on the old register.
- MachineInstr* NewKillMI = LIS.getInstructionFromIndex(newKillIdx);
- assert(OldKillMI->killsRegister(reg) && "Old 'kill' instr isn't a kill.");
- assert(!NewKillMI->killsRegister(reg) &&
- "New kill instr is already a kill.");
- OldKillMI->clearRegisterKills(reg, &TRI);
- NewKillMI->addRegisterKilled(reg, &TRI);
- }
-
- void updateRegMaskSlots(SlotIndex OldIdx) {
- SmallVectorImpl<SlotIndex>::iterator RI =
- std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
- OldIdx);
- assert(*RI == OldIdx && "No RegMask at OldIdx.");
- *RI = NewIdx;
- assert(*prior(RI) < *RI && *RI < *next(RI) &&
- "RegSlots out of order. Did you move one call across another?");
- }
-
- // Return the last use of reg between NewIdx and OldIdx.
- SlotIndex findLastUseBefore(unsigned Reg, SlotIndex OldIdx) {
- SlotIndex LastUse = NewIdx;
- for (MachineRegisterInfo::use_nodbg_iterator
- UI = MRI.use_nodbg_begin(Reg),
- UE = MRI.use_nodbg_end();
- UI != UE; UI.skipInstruction()) {
- const MachineInstr* MI = &*UI;
- SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
- if (InstSlot > LastUse && InstSlot < OldIdx)
- LastUse = InstSlot;
+ // The remaining possibilities are now:
+ // 2. Live def at OldIdx, killed at NewIdx: isSameInstr(I->end, NewIdx).
+ // 3. Dead def at OldIdx: I->end = OldIdx.getDeadSlot().
+ // In either case, it is possible that there is an existing def at NewIdx.
+ assert((I->end == OldIdx.getDeadSlot() ||
+ SlotIndex::isSameInstr(I->end, NewIdx)) &&
+ "Cannot move def below kill");
+ LiveInterval::iterator NewI = LI.advanceTo(I, NewIdx.getRegSlot());
+ if (NewI != E && SlotIndex::isSameInstr(NewI->start, NewIdx)) {
+ // There is an existing def at NewIdx, case 4 above. The def at OldIdx is
+ // coalesced into that value.
+ assert(NewI->valno != DefVNI && "Multiple defs of value?");
+ LI.removeValNo(DefVNI);
+ return;
}
- return LastUse;
+ // There was no existing def at NewIdx. Turn *I into a dead def at NewIdx.
+ // If the def at OldIdx was dead, we allow it to be moved across other LI
+ // values. The new range should be placed immediately before NewI, move any
+ // intermediate ranges up.
+ assert(NewI != I && "Inconsistent iterators");
+ std::copy(llvm::next(I), NewI, I);
+ *llvm::prior(NewI) = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
}
- void moveEnteringUpFrom(SlotIndex OldIdx, IntRangePair& P) {
- LiveInterval* LI = P.first;
- LiveRange* LR = P.second;
- bool LiveThrough = LR->end > OldIdx.getRegSlot();
- if (LiveThrough)
+ /// Update LI to reflect an instruction has been moved upwards from OldIdx
+ /// to NewIdx.
+ ///
+ /// 1. Live def at OldIdx:
+ /// Hoist def to NewIdx.
+ ///
+ /// 2. Dead def at OldIdx:
+ /// Hoist def+end to NewIdx, possibly move across other values.
+ ///
+ /// 3. Dead def at OldIdx AND existing def at NewIdx:
+ /// Remove value defined at OldIdx, coalescing it with existing value.
+ ///
+ /// 4. Live def at OldIdx AND existing def at NewIdx:
+ /// Remove value defined at NewIdx, hoist OldIdx def to NewIdx.
+ /// (Happens when bundling multiple defs together).
+ ///
+ /// 5. Value killed at OldIdx:
+ /// Hoist kill to NewIdx, then scan for last kill between NewIdx and
+ /// OldIdx.
+ ///
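+ /// As a purely illustrative sketch (hypothetical slot indexes): with
+ /// NewIdx = 16 and OldIdx = 48, case 1 would hoist a def segment
+ /// [48r,96r) to [16r,96r), and case 2 would move a dead def [48r,48d)
+ /// up to [16r,16d).
+ ///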
+ void handleMoveUp(LiveInterval &LI) {
+ // First look for a kill at OldIdx.
+ LiveInterval::iterator I = LI.find(OldIdx.getBaseIndex());
+ LiveInterval::iterator E = LI.end();
+ // Is LI even live at OldIdx?
+ if (I == E || SlotIndex::isEarlierInstr(OldIdx, I->start))
return;
- SlotIndex LastUse = findLastUseBefore(LI->reg, OldIdx);
- if (LastUse != NewIdx)
- moveKillFlags(LI->reg, NewIdx, LastUse);
- LR->end = LastUse.getRegSlot();
- }
- void moveEnteringDownFrom(SlotIndex OldIdx, IntRangePair& P) {
- LiveInterval* LI = P.first;
- LiveRange* LR = P.second;
- // Extend the LiveRange if NewIdx is past the end.
- if (NewIdx > LR->end) {
- // Move kill flags if OldIdx was not originally the end
- // (otherwise LR->end points to an invalid slot).
- if (LR->end.getRegSlot() != OldIdx.getRegSlot()) {
- assert(LR->end > OldIdx && "LiveRange does not cover original slot");
- moveKillFlags(LI->reg, LR->end, NewIdx);
+ // Handle a live-in value.
+ if (!SlotIndex::isSameInstr(I->start, OldIdx)) {
+ // If the live-in value isn't killed here, there is nothing to do.
+ if (!SlotIndex::isSameInstr(OldIdx, I->end))
+ return;
+ // Adjust I->end to end at NewIdx. If we are hoisting a kill above
+ // another use, we need to search for that use. Case 5 above.
+ I->end = NewIdx.getRegSlot(I->end.isEarlyClobber());
+ ++I;
+ // If OldIdx also defines a value, there couldn't have been another use.
+ if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
+ // No def, search for the new kill.
+ // This can never be an early clobber kill since there is no def.
+ llvm::prior(I)->end = findLastUseBefore(LI.reg).getRegSlot();
+ return;
}
- LR->end = NewIdx.getRegSlot();
- }
- }
-
- void moveAllEnteringFrom(SlotIndex OldIdx, RangeSet& Entering) {
- bool GoingUp = NewIdx < OldIdx;
-
- if (GoingUp) {
- for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end();
- EI != EE; ++EI)
- moveEnteringUpFrom(OldIdx, *EI);
- } else {
- for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end();
- EI != EE; ++EI)
- moveEnteringDownFrom(OldIdx, *EI);
}
- }
-
- void moveInternalFrom(SlotIndex OldIdx, IntRangePair& P) {
- LiveInterval* LI = P.first;
- LiveRange* LR = P.second;
- assert(OldIdx < LR->start && LR->start < OldIdx.getDeadSlot() &&
- LR->end <= OldIdx.getDeadSlot() &&
- "Range should be internal to OldIdx.");
- LiveRange Tmp(*LR);
- Tmp.start = NewIdx.getRegSlot(LR->start.isEarlyClobber());
- Tmp.valno->def = Tmp.start;
- Tmp.end = LR->end.isDead() ? NewIdx.getDeadSlot() : NewIdx.getRegSlot();
- LI->removeRange(*LR);
- LI->addRange(Tmp);
- }
-
- void moveAllInternalFrom(SlotIndex OldIdx, RangeSet& Internal) {
- for (RangeSet::iterator II = Internal.begin(), IE = Internal.end();
- II != IE; ++II)
- moveInternalFrom(OldIdx, *II);
- }
-
- void moveExitingFrom(SlotIndex OldIdx, IntRangePair& P) {
- LiveRange* LR = P.second;
- assert(OldIdx < LR->start && LR->start < OldIdx.getDeadSlot() &&
- "Range should start in OldIdx.");
- assert(LR->end > OldIdx.getDeadSlot() && "Range should exit OldIdx.");
- SlotIndex NewStart = NewIdx.getRegSlot(LR->start.isEarlyClobber());
- LR->start = NewStart;
- LR->valno->def = NewStart;
- }
-
- void moveAllExitingFrom(SlotIndex OldIdx, RangeSet& Exiting) {
- for (RangeSet::iterator EI = Exiting.begin(), EE = Exiting.end();
- EI != EE; ++EI)
- moveExitingFrom(OldIdx, *EI);
- }
- void moveEnteringUpFromInto(SlotIndex OldIdx, IntRangePair& P,
- BundleRanges& BR) {
- LiveInterval* LI = P.first;
- LiveRange* LR = P.second;
- bool LiveThrough = LR->end > OldIdx.getRegSlot();
- if (LiveThrough) {
- assert((LR->start < NewIdx || BR[LI->reg].Def == LR) &&
- "Def in bundle should be def range.");
- assert((BR[LI->reg].Use == 0 || BR[LI->reg].Use == LR) &&
- "If bundle has use for this reg it should be LR.");
- BR[LI->reg].Use = LR;
+ // Now deal with the def at OldIdx.
+ assert(I != E && SlotIndex::isSameInstr(I->start, OldIdx) && "No def?");
+ VNInfo *DefVNI = I->valno;
+ assert(DefVNI->def == I->start && "Inconsistent def");
+ DefVNI->def = NewIdx.getRegSlot(I->start.isEarlyClobber());
+
+ // Check for an existing def at NewIdx.
+ LiveInterval::iterator NewI = LI.find(NewIdx.getRegSlot());
+ if (SlotIndex::isSameInstr(NewI->start, NewIdx)) {
+ assert(NewI->valno != DefVNI && "Same value defined more than once?");
+ // There is an existing def at NewIdx.
+ if (I->end.isDead()) {
+ // Case 3: Remove the dead def at OldIdx.
+ LI.removeValNo(DefVNI);
+ return;
+ }
+ // Case 4: Replace def at NewIdx with live def at OldIdx.
+ I->start = DefVNI->def;
+ LI.removeValNo(NewI->valno);
return;
}
- SlotIndex LastUse = findLastUseBefore(LI->reg, OldIdx);
- moveKillFlags(LI->reg, OldIdx, LastUse);
-
- if (LR->start < NewIdx) {
- // Becoming a new entering range.
- assert(BR[LI->reg].Dead == 0 && BR[LI->reg].Def == 0 &&
- "Bundle shouldn't be re-defining reg mid-range.");
- assert((BR[LI->reg].Use == 0 || BR[LI->reg].Use == LR) &&
- "Bundle shouldn't have different use range for same reg.");
- LR->end = LastUse.getRegSlot();
- BR[LI->reg].Use = LR;
- } else {
- // Becoming a new Dead-def.
- assert(LR->start == NewIdx.getRegSlot(LR->start.isEarlyClobber()) &&
- "Live range starting at unexpected slot.");
- assert(BR[LI->reg].Def == LR && "Reg should have def range.");
- assert(BR[LI->reg].Dead == 0 &&
- "Can't have def and dead def of same reg in a bundle.");
- LR->end = LastUse.getDeadSlot();
- BR[LI->reg].Dead = BR[LI->reg].Def;
- BR[LI->reg].Def = 0;
- }
- }
-
- void moveEnteringDownFromInto(SlotIndex OldIdx, IntRangePair& P,
- BundleRanges& BR) {
- LiveInterval* LI = P.first;
- LiveRange* LR = P.second;
- if (NewIdx > LR->end) {
- // Range extended to bundle. Add to bundle uses.
- // Note: Currently adds kill flags to bundle start.
- assert(BR[LI->reg].Use == 0 &&
- "Bundle already has use range for reg.");
- moveKillFlags(LI->reg, LR->end, NewIdx);
- LR->end = NewIdx.getRegSlot();
- BR[LI->reg].Use = LR;
- } else {
- assert(BR[LI->reg].Use != 0 &&
- "Bundle should already have a use range for reg.");
- }
- }
-
- void moveAllEnteringFromInto(SlotIndex OldIdx, RangeSet& Entering,
- BundleRanges& BR) {
- bool GoingUp = NewIdx < OldIdx;
-
- if (GoingUp) {
- for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end();
- EI != EE; ++EI)
- moveEnteringUpFromInto(OldIdx, *EI, BR);
- } else {
- for (RangeSet::iterator EI = Entering.begin(), EE = Entering.end();
- EI != EE; ++EI)
- moveEnteringDownFromInto(OldIdx, *EI, BR);
+ // There is no existing def at NewIdx. Hoist DefVNI.
+ if (!I->end.isDead()) {
+ // Leave the end point of a live def.
+ I->start = DefVNI->def;
+ return;
}
- }
- void moveInternalFromInto(SlotIndex OldIdx, IntRangePair& P,
- BundleRanges& BR) {
- // TODO: Sane rules for moving ranges into bundles.
+ // DefVNI is a dead def. It may have been moved across other values in LI,
+ // so move I up to NewI. Slide [NewI;I) down one position.
+ std::copy_backward(NewI, I, llvm::next(I));
+ *NewI = LiveRange(DefVNI->def, NewIdx.getDeadSlot(), DefVNI);
}
- void moveAllInternalFromInto(SlotIndex OldIdx, RangeSet& Internal,
- BundleRanges& BR) {
- for (RangeSet::iterator II = Internal.begin(), IE = Internal.end();
- II != IE; ++II)
- moveInternalFromInto(OldIdx, *II, BR);
+ void updateRegMaskSlots() {
+ SmallVectorImpl<SlotIndex>::iterator RI =
+ std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
+ OldIdx);
+ assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+ "No RegMask at OldIdx.");
+ *RI = NewIdx.getRegSlot();
+ assert((RI == LIS.RegMaskSlots.begin() ||
+ SlotIndex::isEarlierInstr(*llvm::prior(RI), *RI)) &&
+ "Cannot move regmask instruction above another call");
+ assert((llvm::next(RI) == LIS.RegMaskSlots.end() ||
+ SlotIndex::isEarlierInstr(*RI, *llvm::next(RI))) &&
+ "Cannot move regmask instruction below another call");
}
- void moveExitingFromInto(SlotIndex OldIdx, IntRangePair& P,
- BundleRanges& BR) {
- LiveInterval* LI = P.first;
- LiveRange* LR = P.second;
-
- assert(LR->start.isRegister() &&
- "Don't know how to merge exiting ECs into bundles yet.");
+ // Return the last use of reg between NewIdx and OldIdx.
+ SlotIndex findLastUseBefore(unsigned Reg) {
+ SlotIndex LastUse = NewIdx;
- if (LR->end > NewIdx.getDeadSlot()) {
- // This range is becoming an exiting range on the bundle.
- // If there was an old dead-def of this reg, delete it.
- if (BR[LI->reg].Dead != 0) {
- LI->removeRange(*BR[LI->reg].Dead);
- BR[LI->reg].Dead = 0;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ for (MachineRegisterInfo::use_nodbg_iterator
+ UI = MRI.use_nodbg_begin(Reg),
+ UE = MRI.use_nodbg_end();
+ UI != UE; UI.skipInstruction()) {
+ const MachineInstr* MI = &*UI;
+ SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
+ if (InstSlot > LastUse && InstSlot < OldIdx)
+ LastUse = InstSlot;
}
- assert(BR[LI->reg].Def == 0 &&
- "Can't have two defs for the same variable exiting a bundle.");
- LR->start = NewIdx.getRegSlot();
- LR->valno->def = LR->start;
- BR[LI->reg].Def = LR;
} else {
- // This range is becoming internal to the bundle.
- assert(LR->end == NewIdx.getRegSlot() &&
- "Can't bundle def whose kill is before the bundle");
- if (BR[LI->reg].Dead || BR[LI->reg].Def) {
- // Already have a def for this. Just delete range.
- LI->removeRange(*LR);
- } else {
- // Make range dead, record.
- LR->end = NewIdx.getDeadSlot();
- BR[LI->reg].Dead = LR;
- assert(BR[LI->reg].Use == LR &&
- "Range becoming dead should currently be use.");
+ MachineInstr* MI = LIS.getSlotIndexes()->getInstructionFromIndex(NewIdx);
+ MachineBasicBlock::iterator MII(MI);
+ ++MII;
+ MachineBasicBlock* MBB = MI->getParent();
+ for (; MII != MBB->end() && LIS.getInstructionIndex(MII) < OldIdx; ++MII){
+ for (MachineInstr::mop_iterator MOI = MII->operands_begin(),
+ MOE = MII->operands_end();
+ MOI != MOE; ++MOI) {
+ const MachineOperand& mop = *MOI;
+ if (!mop.isReg() || mop.getReg() == 0 ||
+ TargetRegisterInfo::isVirtualRegister(mop.getReg()))
+ continue;
+
+ if (TRI.hasRegUnit(mop.getReg(), Reg))
+ LastUse = LIS.getInstructionIndex(MII);
+ }
}
- // In both cases the range is no longer a use on the bundle.
- BR[LI->reg].Use = 0;
}
+ return LastUse;
}
-
- void moveAllExitingFromInto(SlotIndex OldIdx, RangeSet& Exiting,
- BundleRanges& BR) {
- for (RangeSet::iterator EI = Exiting.begin(), EE = Exiting.end();
- EI != EE; ++EI)
- moveExitingFromInto(OldIdx, *EI, BR);
- }
-
};
-void LiveIntervals::handleMove(MachineInstr* MI) {
+void LiveIntervals::handleMove(MachineInstr* MI, bool UpdateFlags) {
+ assert(!MI->isBundled() && "Can't handle bundled instructions yet.");
SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
Indexes->removeMachineInstrFromMaps(MI);
- SlotIndex NewIndex = MI->isInsideBundle() ?
- Indexes->getInstructionIndex(MI) :
- Indexes->insertMachineInstrInMaps(MI);
+ SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI);
assert(getMBBStartIdx(MI->getParent()) <= OldIndex &&
OldIndex < getMBBEndIdx(MI->getParent()) &&
"Cannot handle moves across basic block boundaries.");
- assert(!MI->isBundled() && "Can't handle bundled instructions yet.");
- HMEditor HME(*this, *MRI, *TRI, NewIndex);
- HME.moveAllRangesFrom(MI, OldIndex);
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(MI);
}
void LiveIntervals::handleMoveIntoBundle(MachineInstr* MI,
- MachineInstr* BundleStart) {
+ MachineInstr* BundleStart,
+ bool UpdateFlags) {
+ SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart);
- HMEditor HME(*this, *MRI, *TRI, NewIndex);
- HME.moveAllRangesInto(MI, BundleStart);
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(MI);
}
diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h
index cd4e690c3740..4d41fca85ad3 100644
--- a/lib/CodeGen/LiveIntervalUnion.h
+++ b/lib/CodeGen/LiveIntervalUnion.h
@@ -178,8 +178,8 @@ public:
bool checkLoopInterference(MachineLoopRange*);
private:
- Query(const Query&); // DO NOT IMPLEMENT
- void operator=(const Query&); // DO NOT IMPLEMENT
+ Query(const Query&) LLVM_DELETED_FUNCTION;
+ void operator=(const Query&) LLVM_DELETED_FUNCTION;
};
// Array of LiveIntervalUnions.
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index d828f25932e7..c3ff4f1b6d2e 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -65,7 +65,11 @@ void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) {
// Visit all operands that read Reg. This may include partial defs.
for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
E = MRI->reg_nodbg_end(); I != E; ++I) {
- const MachineOperand &MO = I.getOperand();
+ MachineOperand &MO = I.getOperand();
+ // Clear all kill flags. They will be reinserted after register allocation
+ // by LiveIntervalAnalysis::addKillFlags().
+ if (MO.isUse())
+ MO.setIsKill(false);
if (!MO.readsReg())
continue;
// MI is reading Reg. We may have visited MI before if it happens to be
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index b4ce9aa8c12c..f8fbc7ddf0c1 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -87,7 +87,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
// We can't remat physreg uses, unless it is a constant.
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
- if (MRI.isConstantPhysReg(MO.getReg(), VRM->getMachineFunction()))
+ if (MRI.isConstantPhysReg(MO.getReg(), *OrigMI->getParent()->getParent()))
continue;
return false;
}
@@ -96,6 +96,13 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
if (!OVNI)
continue;
+
+ // Don't allow rematerialization immediately after the original def.
+ // It would be incorrect if OrigMI redefines the register.
+ // See PR14098.
+ if (SlotIndex::isSameInstr(OrigIdx, UseIdx))
+ return false;
+
if (OVNI != li.getVNInfoAt(UseIdx))
return false;
}
@@ -249,7 +256,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
unsigned Reg = MOI->getReg();
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
// Check if MI reads any unreserved physregs.
- if (Reg && MOI->readsReg() && !LIS.isReserved(Reg))
+ if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
ReadsPhysRegs = true;
continue;
}
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index cdb17768129c..7f22478d01cd 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -13,6 +13,7 @@
#define DEBUG_TYPE "regalloc"
#include "LiveRegMatrix.h"
+#include "RegisterCoalescer.h"
#include "VirtRegMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -117,8 +118,9 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
unsigned PhysReg) {
if (VirtReg.empty())
return false;
+ CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
- if (VirtReg.overlaps(LIS->getRegUnit(*Units)))
+ if (VirtReg.overlaps(LIS->getRegUnit(*Units), CP, *LIS->getSlotIndexes()))
return true;
return false;
}
diff --git a/lib/CodeGen/LiveRegMatrix.h b/lib/CodeGen/LiveRegMatrix.h
index b3e2d7f4b45b..8f22c24478f4 100644
--- a/lib/CodeGen/LiveRegMatrix.h
+++ b/lib/CodeGen/LiveRegMatrix.h
@@ -15,7 +15,7 @@
// Register units are defined in MCRegisterInfo.h, they represent the smallest
// unit of interference when dealing with overlapping physical registers. The
// LiveRegMatrix is represented as a LiveIntervalUnion per register unit. When
-// a virtual register is assigned to a physicval register, the live range for
+// a virtual register is assigned to a physical register, the live range for
// the virtual register is inserted into the LiveIntervalUnion for each regunit
// in the physreg.
//
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
index 939e795b4a38..f0b522bd7d36 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStackAnalysis.cpp
@@ -25,7 +25,10 @@
using namespace llvm;
char LiveStacks::ID = 0;
-INITIALIZE_PASS(LiveStacks, "livestacks",
+INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks",
+ "Live Stack Slot Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveStacks, "livestacks",
"Live Stack Slot Analysis", false, false)
char &llvm::LiveStacksID = LiveStacks::ID;
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 348ed3a0f932..6ea933d4304b 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -65,6 +65,7 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
}
void LiveVariables::VarInfo::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " Alive in blocks: ";
for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
E = AliveBlocks.end(); I != E; ++I)
@@ -77,6 +78,7 @@ void LiveVariables::VarInfo::dump() const {
dbgs() << "\n #" << i << ": " << *Kills[i];
dbgs() << "\n";
}
+#endif
}
/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
@@ -501,8 +503,6 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
MRI = &mf.getRegInfo();
TRI = MF->getTarget().getRegisterInfo();
- ReservedRegisters = TRI->getReservedRegs(mf);
-
unsigned NumRegs = TRI->getNumRegs();
PhysRegDef = new MachineInstr*[NumRegs];
PhysRegUse = new MachineInstr*[NumRegs];
@@ -586,7 +586,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
unsigned MOReg = UseRegs[i];
if (TargetRegisterInfo::isVirtualRegister(MOReg))
HandleVirtRegUse(MOReg, MBB, MI);
- else if (!ReservedRegisters[MOReg])
+ else if (!MRI->isReserved(MOReg))
HandlePhysRegUse(MOReg, MI);
}
@@ -599,7 +599,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
unsigned MOReg = DefRegs[i];
if (TargetRegisterInfo::isVirtualRegister(MOReg))
HandleVirtRegDef(MOReg, MI);
- else if (!ReservedRegisters[MOReg])
+ else if (!MRI->isReserved(MOReg))
HandlePhysRegDef(MOReg, MI, Defs);
}
UpdatePhysRegDefs(MI, Defs);
@@ -806,18 +806,44 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
MachineBasicBlock *SuccBB) {
const unsigned NumNew = BB->getNumber();
- // All registers used by PHI nodes in SuccBB must be live through BB.
- for (MachineBasicBlock::iterator BBI = SuccBB->begin(),
- BBE = SuccBB->end(); BBI != BBE && BBI->isPHI(); ++BBI)
+ SmallSet<unsigned, 16> Defs, Kills;
+
+ MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
+ for (; BBI != BBE && BBI->isPHI(); ++BBI) {
+ // Record the def of the PHI node.
+ Defs.insert(BBI->getOperand(0).getReg());
+
+ // All registers used by PHI nodes in SuccBB must be live through BB.
for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
if (BBI->getOperand(i+1).getMBB() == BB)
getVarInfo(BBI->getOperand(i).getReg()).AliveBlocks.set(NumNew);
+ }
+
+ // Record all vreg defs and kills of all instructions in SuccBB.
+ for (; BBI != BBE; ++BBI) {
+ for (MachineInstr::mop_iterator I = BBI->operands_begin(),
+ E = BBI->operands_end(); I != E; ++I) {
+ if (I->isReg() && TargetRegisterInfo::isVirtualRegister(I->getReg())) {
+ if (I->isDef())
+ Defs.insert(I->getReg());
+ else if (I->isKill())
+ Kills.insert(I->getReg());
+ }
+ }
+ }
// Update info for all live variables
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+ // If the register is defined in the successor, it can't be live in BB.
+ if (Defs.count(Reg))
+ continue;
+
+ // If the register is either killed in or live through SuccBB it's also live
+ // through BB.
VarInfo &VI = getVarInfo(Reg);
- if (!VI.AliveBlocks.test(NumNew) && VI.isLiveIn(*SuccBB, Reg, *MRI))
+ if (Kills.count(Reg) || VI.AliveBlocks.test(SuccBB->getNumber()))
VI.AliveBlocks.set(NumNew);
}
}
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index fa6b4502c4ab..18d021d521d6 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -21,7 +21,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Assembly/Writer.h"
@@ -145,7 +145,8 @@ MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
instr_iterator I = instr_begin(), E = instr_end();
while (I != E && I->isPHI())
++I;
- assert(!I->isInsideBundle() && "First non-phi MI cannot be inside a bundle!");
+ assert((I == E || !I->isInsideBundle()) &&
+ "First non-phi MI cannot be inside a bundle!");
return I;
}
@@ -156,7 +157,7 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
++I;
// FIXME: This needs to change if we wish to bundle labels / dbg_values
// inside the bundle.
- assert(!I->isInsideBundle() &&
+ assert((I == E || !I->isInsideBundle()) &&
"First non-phi / non-label instruction is inside a bundle!");
return I;
}
@@ -228,9 +229,11 @@ const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const {
return 0;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineBasicBlock::dump() const {
print(dbgs());
}
+#endif
StringRef MachineBasicBlock::getName() const {
if (const BasicBlock *LBB = getBasicBlock())
@@ -243,7 +246,7 @@ StringRef MachineBasicBlock::getName() const {
std::string MachineBasicBlock::getFullName() const {
std::string Name;
if (getParent())
- Name = (getParent()->getFunction()->getName() + ":").str();
+ Name = (getParent()->getName() + ":").str();
if (getBasicBlock())
Name += getBasicBlock()->getName();
else
@@ -942,12 +945,11 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
/// getSuccWeight - Return weight of the edge from this block to MBB.
///
-uint32_t MachineBasicBlock::getSuccWeight(const MachineBasicBlock *succ) const {
+uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const {
if (Weights.empty())
return 0;
- const_succ_iterator I = std::find(Successors.begin(), Successors.end(), succ);
- return *getWeightIterator(I);
+ return *getWeightIterator(Succ);
}
/// getWeightIterator - Return weight iterator corresponding to the I successor
@@ -970,6 +972,80 @@ getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
return Weights.begin() + index;
}
+/// Return whether (physical) register "Reg" has been <def>ined and not <kill>ed
+/// as of just before "MI".
+///
+/// Search is localised to a neighborhood of
+/// Neighborhood instructions before (searching for defs or kills) and N
+/// instructions after (searching just for defs) MI.
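+///
+/// A hypothetical caller (sketch only; the register and neighborhood size
+/// are made up for illustration):
+///
+///   if (MBB->computeRegisterLiveness(TRI, X86::EFLAGS, MI, 10) ==
+///       MachineBasicBlock::LQR_Dead)
+///     ; // Reg is known dead just before MI, so it is safe to clobber here.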
+MachineBasicBlock::LivenessQueryResult
+MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
+ unsigned Reg, MachineInstr *MI,
+ unsigned Neighborhood) {
+
+ unsigned N = Neighborhood;
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Start by searching backwards from MI, looking for kills, reads or defs.
+
+ MachineBasicBlock::iterator I(MI);
+ // If this is the first insn in the block, don't search backwards.
+ if (I != MBB->begin()) {
+ do {
+ --I;
+
+ MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MIOperands(I).analyzePhysReg(Reg, TRI);
+
+ if (Analysis.Kills)
+ // Register killed, so isn't live.
+ return LQR_Dead;
+
+ else if (Analysis.DefinesOverlap || Analysis.ReadsOverlap)
+ // Defined or read without a previous kill - live.
+ return (Analysis.Defines || Analysis.Reads) ?
+ LQR_Live : LQR_OverlappingLive;
+
+ } while (I != MBB->begin() && --N > 0);
+ }
+
+ // Did we get to the start of the block?
+ if (I == MBB->begin()) {
+ // If so, the register's state is definitely defined by the live-in state.
+ for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true);
+ RAI.isValid(); ++RAI) {
+ if (MBB->isLiveIn(*RAI))
+ return (*RAI == Reg) ? LQR_Live : LQR_OverlappingLive;
+ }
+
+ return LQR_Dead;
+ }
+
+ N = Neighborhood;
+
+ // Try searching forwards from MI, looking for reads or defs.
+ I = MachineBasicBlock::iterator(MI);
+ // If this is the last insn in the block, don't search forwards.
+ if (I != MBB->end()) {
+ for (++I; I != MBB->end() && N > 0; ++I, --N) {
+ MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MIOperands(I).analyzePhysReg(Reg, TRI);
+
+ if (Analysis.ReadsOverlap)
+ // Used, therefore must have been live.
+ return (Analysis.Reads) ?
+ LQR_Live : LQR_OverlappingLive;
+
+ else if (Analysis.DefinesOverlap)
+ // Defined (but not read) therefore cannot have been live.
+ return LQR_Dead;
+ }
+ }
+
+ // At this point we have no idea of the liveness of the register.
+ return LQR_Unknown;
+}
+
void llvm::WriteAsOperand(raw_ostream &OS, const MachineBasicBlock *MBB,
bool t) {
OS << "BB#" << MBB->getNumber();
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index c4dca2cd151d..cd3f19944e46 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -500,11 +500,10 @@ void MachineBlockPlacement::buildChain(
assert(BB);
assert(BlockToChain[BB] == &Chain);
assert(*llvm::prior(Chain.end()) == BB);
- MachineBasicBlock *BestSucc = 0;
// Look for the best viable successor if there is one to place immediately
// after this block.
- BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+ MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
// If an immediate successor isn't available, look for the best viable
// block among those we've identified as not violating the loop's CFG at
@@ -1014,7 +1013,8 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
// exclusively on the loop info here so that we can align backedges in
// unnatural CFGs and backedges that were introduced purely because of the
// loop rotations done during this layout pass.
- if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ if (F.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize))
return;
unsigned Align = TLI->getPrefLoopAlignment();
if (!Align)
diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
index 0cc1af07952d..447921147f03 100644
--- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp
+++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -38,7 +38,7 @@ getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const {
Scale = 1;
for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I) {
- uint32_t Weight = getEdgeWeight(MBB, *I);
+ uint32_t Weight = getEdgeWeight(MBB, I);
Sum += Weight;
}
@@ -53,22 +53,30 @@ getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const {
Sum = 0;
for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I) {
- uint32_t Weight = getEdgeWeight(MBB, *I);
+ uint32_t Weight = getEdgeWeight(MBB, I);
Sum += Weight / Scale;
}
assert(Sum <= UINT32_MAX);
return Sum;
}
-uint32_t
-MachineBranchProbabilityInfo::getEdgeWeight(const MachineBasicBlock *Src,
- const MachineBasicBlock *Dst) const {
+uint32_t MachineBranchProbabilityInfo::
+getEdgeWeight(const MachineBasicBlock *Src,
+ MachineBasicBlock::const_succ_iterator Dst) const {
uint32_t Weight = Src->getSuccWeight(Dst);
if (!Weight)
return DEFAULT_WEIGHT;
return Weight;
}
+uint32_t MachineBranchProbabilityInfo::
+getEdgeWeight(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const {
+ // This is a linear search. Try to use the const_succ_iterator version when
+ // possible.
+ return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst));
+}
+
bool MachineBranchProbabilityInfo::isEdgeHot(MachineBasicBlock *Src,
MachineBasicBlock *Dst) const {
// Hot probability is at least 4/5 = 80%
@@ -82,7 +90,7 @@ MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
MachineBasicBlock *MaxSucc = 0;
for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
E = MBB->succ_end(); I != E; ++I) {
- uint32_t Weight = getEdgeWeight(MBB, *I);
+ uint32_t Weight = getEdgeWeight(MBB, I);
if (Weight > MaxWeight) {
MaxWeight = Weight;
MaxSucc = *I;
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 896461fd194b..dbc41defeb5a 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -63,8 +63,6 @@ namespace {
virtual void releaseMemory() {
ScopeMap.clear();
Exps.clear();
- AllocatableRegs.clear();
- ReservedRegs.clear();
}
private:
@@ -78,8 +76,6 @@ namespace {
ScopedHTType VNT;
SmallVector<MachineInstr*, 64> Exps;
unsigned CurrVN;
- BitVector AllocatableRegs;
- BitVector ReservedRegs;
bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB);
bool isPhysDefTriviallyDead(unsigned Reg,
@@ -88,7 +84,8 @@ namespace {
bool hasLivePhysRegDefUses(const MachineInstr *MI,
const MachineBasicBlock *MBB,
SmallSet<unsigned,8> &PhysRefs,
- SmallVector<unsigned,2> &PhysDefs) const;
+ SmallVector<unsigned,2> &PhysDefs,
+ bool &PhysUseDef) const;
bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
SmallSet<unsigned,8> &PhysRefs,
SmallVector<unsigned,2> &PhysDefs,
@@ -198,31 +195,52 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
const MachineBasicBlock *MBB,
SmallSet<unsigned,8> &PhysRefs,
- SmallVector<unsigned,2> &PhysDefs) const{
- MachineBasicBlock::const_iterator I = MI; I = llvm::next(I);
+ SmallVector<unsigned,2> &PhysDefs,
+ bool &PhysUseDef) const{
+ // First, add all uses to PhysRefs.
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg())
+ if (!MO.isReg() || MO.isDef())
continue;
unsigned Reg = MO.getReg();
if (!Reg)
continue;
if (TargetRegisterInfo::isVirtualRegister(Reg))
continue;
- // If the def is dead, it's ok. But the def may not marked "dead". That's
- // common since this pass is run before livevariables. We can scan
- // forward a few instructions and check if it is obviously dead.
- if (MO.isDef() &&
- (MO.isDead() || isPhysDefTriviallyDead(Reg, I, MBB->end())))
- continue;
// Reading constant physregs is ok.
if (!MRI->isConstantPhysReg(Reg, *MBB->getParent()))
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
PhysRefs.insert(*AI);
- if (MO.isDef())
+ }
+
+ // Next, collect all defs into PhysDefs. If any is already in PhysRefs
+ // (which currently contains only uses), set the PhysUseDef flag.
+ PhysUseDef = false;
+ MachineBasicBlock::const_iterator I = MI; I = llvm::next(I);
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ // Check against PhysRefs even if the def is "dead".
+ if (PhysRefs.count(Reg))
+ PhysUseDef = true;
+ // If the def is dead, it's ok. But the def may not be marked "dead". That's
+ // common since this pass is run before livevariables. We can scan
+ // forward a few instructions and check if it is obviously dead.
+ if (!MO.isDead() && !isPhysDefTriviallyDead(Reg, I, MBB->end()))
PhysDefs.push_back(Reg);
}
+ // Finally, add all defs to PhysRefs as well.
+ for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i)
+ for (MCRegAliasIterator AI(PhysDefs[i], TRI, true); AI.isValid(); ++AI)
+ PhysRefs.insert(*AI);
+
return !PhysRefs.empty();
}
@@ -242,7 +260,7 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
return false;
for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) {
- if (AllocatableRegs.test(PhysDefs[i]) || ReservedRegs.test(PhysDefs[i]))
+ if (MRI->isAllocatable(PhysDefs[i]) || MRI->isReserved(PhysDefs[i]))
// Avoid extending live range of physical registers if they are
// allocatable or reserved.
return false;
@@ -411,8 +429,8 @@ void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
assert(SI != ScopeMap.end());
- ScopeMap.erase(SI);
delete SI->second;
+ ScopeMap.erase(SI);
}
bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
@@ -463,16 +481,22 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
bool CrossMBBPhysDef = false;
SmallSet<unsigned, 8> PhysRefs;
SmallVector<unsigned, 2> PhysDefs;
- if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs, PhysDefs)) {
+ bool PhysUseDef = false;
+ if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs,
+ PhysDefs, PhysUseDef)) {
FoundCSE = false;
// ... Unless the CS is local or is in the sole predecessor block
// and it also defines the physical register which is not clobbered
// in between and the physical register uses were not clobbered.
- unsigned CSVN = VNT.lookup(MI);
- MachineInstr *CSMI = Exps[CSVN];
- if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
- FoundCSE = true;
+ // This can never be the case if the instruction both uses and
+ // defines the same physical register, which was detected above.
+ if (!PhysUseDef) {
+ unsigned CSVN = VNT.lookup(MI);
+ MachineInstr *CSMI = Exps[CSVN];
+ if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
+ FoundCSE = true;
+ }
}
if (!FoundCSE) {
@@ -635,7 +659,5 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<MachineDominatorTree>();
- AllocatableRegs = TRI->getAllocatableSet(MF);
- ReservedRegs = TRI->getReservedRegs(MF);
return PerformCSE(DT->getRootNode());
}
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index bac3aa2c155e..4a793281b2cd 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -16,6 +16,7 @@
#include "llvm/Pass.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -32,7 +33,7 @@ STATISTIC(NumDeletes, "Number of dead copies deleted");
namespace {
class MachineCopyPropagation : public MachineFunctionPass {
const TargetRegisterInfo *TRI;
- BitVector ReservedRegs;
+ MachineRegisterInfo *MRI;
public:
static char ID; // Pass identification, replacement for typeid
@@ -146,8 +147,8 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
DenseMap<unsigned, MachineInstr*>::iterator CI = AvailCopyMap.find(Src);
if (CI != AvailCopyMap.end()) {
MachineInstr *CopyMI = CI->second;
- if (!ReservedRegs.test(Def) &&
- (!ReservedRegs.test(Src) || NoInterveningSideEffect(CopyMI, MI)) &&
+ if (!MRI->isReserved(Def) &&
+ (!MRI->isReserved(Src) || NoInterveningSideEffect(CopyMI, MI)) &&
isNopCopy(CopyMI, Def, Src, TRI)) {
// The two copies cancel out and the source of the first copy
// hasn't been overridden, eliminate the second one. e.g.
@@ -259,7 +260,7 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
DI = MaybeDeadCopies.begin(), DE = MaybeDeadCopies.end();
DI != DE; ++DI) {
unsigned Reg = (*DI)->getOperand(0).getReg();
- if (ReservedRegs.test(Reg) || !MaskMO.clobbersPhysReg(Reg))
+ if (MRI->isReserved(Reg) || !MaskMO.clobbersPhysReg(Reg))
continue;
(*DI)->eraseFromParent();
Changed = true;
@@ -296,7 +297,7 @@ bool MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
for (SmallSetVector<MachineInstr*, 8>::iterator
DI = MaybeDeadCopies.begin(), DE = MaybeDeadCopies.end();
DI != DE; ++DI) {
- if (!ReservedRegs.test((*DI)->getOperand(0).getReg())) {
+ if (!MRI->isReserved((*DI)->getOperand(0).getReg())) {
(*DI)->eraseFromParent();
Changed = true;
++NumDeletes;
@@ -311,7 +312,7 @@ bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
TRI = MF.getTarget().getRegisterInfo();
- ReservedRegs = TRI->getReservedRegs(MF);
+ MRI = &MF.getRegInfo();
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
Changed |= CopyPropagateBlock(*I);
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index d4aede8a7efc..91d521185767 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -28,7 +28,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetFrameLowering.h"
@@ -59,13 +59,13 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
RegInfo = 0;
MFInfo = 0;
FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering());
- if (Fn->hasFnAttr(Attribute::StackAlignment))
- FrameInfo->ensureMaxAlignment(Attribute::getStackAlignmentFromAttrs(
- Fn->getAttributes().getFnAttributes()));
- ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData());
+ if (Fn->getFnAttributes().hasAttribute(Attributes::StackAlignment))
+ FrameInfo->ensureMaxAlignment(Fn->getAttributes().
+ getFnAttributes().getStackAlignment());
+ ConstantPool = new (Allocator) MachineConstantPool(TM.getDataLayout());
Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
// FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
- if (!Fn->hasFnAttr(Attribute::OptimizeForSize))
+ if (!Fn->getFnAttributes().hasAttribute(Attributes::OptimizeForSize))
Alignment = std::max(Alignment,
TM.getTargetLowering()->getPrefFunctionAlignment());
FunctionNumber = FunctionNum;
@@ -284,12 +284,19 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
return std::make_pair(Result, Result + Num);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineFunction::dump() const {
print(dbgs());
}
+#endif
+
+StringRef MachineFunction::getName() const {
+ assert(getFunction() && "No function!");
+ return getFunction()->getName();
+}
void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
- OS << "# Machine code for function " << Fn->getName() << ": ";
+ OS << "# Machine code for function " << getName() << ": ";
if (RegInfo) {
OS << (RegInfo->isSSA() ? "SSA" : "Post SSA");
if (!RegInfo->tracksLiveness())
@@ -334,7 +341,7 @@ void MachineFunction::print(raw_ostream &OS, SlotIndexes *Indexes) const {
BB->print(OS, Indexes);
}
- OS << "\n# End machine code for function " << Fn->getName() << ".\n\n";
+ OS << "\n# End machine code for function " << getName() << ".\n\n";
}
namespace llvm {
@@ -344,7 +351,7 @@ namespace llvm {
DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const MachineFunction *F) {
- return "CFG for '" + F->getFunction()->getName().str() + "' function";
+ return "CFG for '" + F->getName().str() + "' function";
}
std::string getNodeLabel(const MachineBasicBlock *Node,
@@ -377,7 +384,7 @@ namespace llvm {
void MachineFunction::viewCFG() const
{
#ifndef NDEBUG
- ViewGraph(this, "mf" + getFunction()->getName());
+ ViewGraph(this, "mf" + getName());
#else
errs() << "MachineFunction::viewCFG is only available in debug builds on "
<< "systems with Graphviz or gv!\n";
@@ -387,7 +394,7 @@ void MachineFunction::viewCFG() const
void MachineFunction::viewCFGOnly() const
{
#ifndef NDEBUG
- ViewGraph(this, "mf" + getFunction()->getName(), true);
+ ViewGraph(this, "mf" + getName(), true);
#else
errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
<< "systems with Graphviz or gv!\n";
@@ -453,7 +460,9 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
unsigned StackAlign = TFI.getStackAlignment();
unsigned Align = MinAlign(SPOffset, StackAlign);
Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
- /*isSS*/false, false));
+ /*isSS*/ false,
+ /*NeedSP*/ false,
+ /*Alloca*/ 0));
return -++NumFixedObjects;
}
@@ -525,16 +534,18 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineFrameInfo::dump(const MachineFunction &MF) const {
print(MF, dbgs());
}
+#endif
//===----------------------------------------------------------------------===//
// MachineJumpTableInfo implementation
//===----------------------------------------------------------------------===//
/// getEntrySize - Return the size of each entry in the jump table.
-unsigned MachineJumpTableInfo::getEntrySize(const TargetData &TD) const {
+unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const {
// The size of a jump table entry is 4 bytes unless the entry is just the
// address of a block, in which case it is the pointer size.
switch (getEntryKind()) {
@@ -553,7 +564,7 @@ unsigned MachineJumpTableInfo::getEntrySize(const TargetData &TD) const {
}
/// getEntryAlignment - Return the alignment of each entry in the jump table.
-unsigned MachineJumpTableInfo::getEntryAlignment(const TargetData &TD) const {
+unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const {
// The alignment of a jump table entry is the alignment of int32 unless the
// entry is just the address of a block, in which case it is the pointer
// alignment.
@@ -622,7 +633,9 @@ void MachineJumpTableInfo::print(raw_ostream &OS) const {
OS << '\n';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineJumpTableInfo::dump() const { print(dbgs()); }
+#endif
//===----------------------------------------------------------------------===//
@@ -657,7 +670,7 @@ MachineConstantPool::~MachineConstantPool() {
/// CanShareConstantPoolEntry - Test whether the given two constants
/// can be allocated the same constant pool entry.
static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
- const TargetData *TD) {
+ const DataLayout *TD) {
// Handle the trivial case quickly.
if (A == B) return true;
@@ -681,7 +694,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
// Try constant folding a bitcast of both instructions to an integer. If we
// get two identical ConstantInt's, then we are good to share them. We use
// the constant folding APIs to do this so that we get the benefit of
- // TargetData.
+ // DataLayout.
if (isa<PointerType>(A->getType()))
A = ConstantFoldInstOperands(Instruction::PtrToInt, IntTy,
const_cast<Constant*>(A), TD);
@@ -749,10 +762,12 @@ void MachineConstantPool::print(raw_ostream &OS) const {
if (Constants[i].isMachineConstantPoolEntry())
Constants[i].Val.MachineCPVal->print(OS);
else
- OS << *(Value*)Constants[i].Val.ConstVal;
+ OS << *(const Value*)Constants[i].Val.ConstVal;
OS << ", align=" << Constants[i].getAlignment();
OS << "\n";
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineConstantPool::dump() const { print(dbgs()); }
+#endif
diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp
index 0102ac708d08..ed94efb93551 100644
--- a/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -51,7 +51,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass {
char MachineFunctionPrinterPass::ID = 0;
}
-char &MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
"Machine Function Printer", false, false)
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index b16684994696..ce8d52000b47 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -111,6 +111,7 @@ void MachineOperand::setIsDef(bool Val) {
/// the specified value. If an operand is known to be an immediate already,
/// the setImm method should be used.
void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
+ assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
// If this operand is currently a register operand, and if this is in a
// function, deregister the operand from the register's use/def list.
if (isReg() && isOnRegUseList())
@@ -136,7 +137,8 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
RegInfo = &MF->getRegInfo();
// If this operand is already a register operand, remove it from the
// register's use/def lists.
- if (RegInfo && isReg())
+ bool WasReg = isReg();
+ if (RegInfo && WasReg)
RegInfo->removeRegOperandFromUseList(this);
// Change this to a register and set the reg#.
@@ -153,6 +155,9 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
IsDebug = isDebug;
// Ensure isOnRegUseList() returns false.
Contents.Reg.Prev = 0;
+ // Preserve the tie when the operand was already a register.
+ if (!WasReg)
+ TiedTo = 0;
// If this operand is embedded in a function, add the operand to the
// register's use/def list.
@@ -193,7 +198,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
return !strcmp(getSymbolName(), Other.getSymbolName()) &&
getOffset() == Other.getOffset();
case MachineOperand::MO_BlockAddress:
- return getBlockAddress() == Other.getBlockAddress();
+ return getBlockAddress() == Other.getBlockAddress() &&
+ getOffset() == Other.getOffset();
case MO_RegisterMask:
return getRegMask() == Other.getRegMask();
case MachineOperand::MO_MCSymbol:
@@ -208,8 +214,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
hash_code llvm::hash_value(const MachineOperand &MO) {
switch (MO.getType()) {
case MachineOperand::MO_Register:
- return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getReg(),
- MO.getSubReg(), MO.isDef());
+ // Register operands don't have target flags.
+ return hash_combine(MO.getType(), MO.getReg(), MO.getSubReg(), MO.isDef());
case MachineOperand::MO_Immediate:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
case MachineOperand::MO_CImmediate:
@@ -234,7 +240,7 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
MO.getOffset());
case MachineOperand::MO_BlockAddress:
return hash_combine(MO.getType(), MO.getTargetFlags(),
- MO.getBlockAddress());
+ MO.getBlockAddress(), MO.getOffset());
case MachineOperand::MO_RegisterMask:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask());
case MachineOperand::MO_Metadata:
@@ -262,7 +268,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
OS << PrintReg(getReg(), TRI, getSubReg());
if (isDef() || isKill() || isDead() || isImplicit() || isUndef() ||
- isInternalRead() || isEarlyClobber()) {
+ isInternalRead() || isEarlyClobber() || isTied()) {
OS << '<';
bool NeedComma = false;
if (isDef()) {
@@ -282,27 +288,32 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
NeedComma = true;
}
- if (isKill() || isDead() || (isUndef() && isUse()) || isInternalRead()) {
+ if (isKill()) {
if (NeedComma) OS << ',';
- NeedComma = false;
- if (isKill()) {
- OS << "kill";
- NeedComma = true;
- }
- if (isDead()) {
- OS << "dead";
- NeedComma = true;
- }
- if (isUndef() && isUse()) {
- if (NeedComma) OS << ',';
- OS << "undef";
- NeedComma = true;
- }
- if (isInternalRead()) {
- if (NeedComma) OS << ',';
- OS << "internal";
- NeedComma = true;
- }
+ OS << "kill";
+ NeedComma = true;
+ }
+ if (isDead()) {
+ if (NeedComma) OS << ',';
+ OS << "dead";
+ NeedComma = true;
+ }
+ if (isUndef() && isUse()) {
+ if (NeedComma) OS << ',';
+ OS << "undef";
+ NeedComma = true;
+ }
+ if (isInternalRead()) {
+ if (NeedComma) OS << ',';
+ OS << "internal";
+ NeedComma = true;
+ }
+ if (isTied()) {
+ if (NeedComma) OS << ',';
+ OS << "tied";
+ if (TiedTo != 15)
+ OS << unsigned(TiedTo - 1);
+ NeedComma = true;
}
OS << '>';
}
@@ -352,6 +363,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
case MachineOperand::MO_BlockAddress:
OS << '<';
WriteAsOperand(OS, getBlockAddress(), /*PrintType=*/false);
+ if (getOffset()) OS << "+" << getOffset();
OS << '>';
break;
case MachineOperand::MO_RegisterMask:
@@ -528,20 +540,6 @@ void MachineInstr::addImplicitDefUseOperands() {
/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
/// implicit operands. It reserves space for the number of operands specified by
/// the MCInstrDesc.
-MachineInstr::MachineInstr(const MCInstrDesc &tid, bool NoImp)
- : MCID(&tid), Flags(0), AsmPrinterFlags(0),
- NumMemRefs(0), MemRefs(0), Parent(0) {
- unsigned NumImplicitOps = 0;
- if (!NoImp)
- NumImplicitOps = MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
- Operands.reserve(NumImplicitOps + MCID->getNumOperands());
- if (!NoImp)
- addImplicitDefUseOperands();
- // Make sure that we get added to a machine basicblock
- LeakDetector::addGarbageObject(this);
-}
-
-/// MachineInstr ctor - As above, but with a DebugLoc.
MachineInstr::MachineInstr(const MCInstrDesc &tid, const DebugLoc dl,
bool NoImp)
: MCID(&tid), Flags(0), AsmPrinterFlags(0),
@@ -559,21 +557,6 @@ MachineInstr::MachineInstr(const MCInstrDesc &tid, const DebugLoc dl,
/// MachineInstr ctor - Work exactly the same as the ctor two above, except
/// that the MachineInstr is created and added to the end of the specified
/// basic block.
-MachineInstr::MachineInstr(MachineBasicBlock *MBB, const MCInstrDesc &tid)
- : MCID(&tid), Flags(0), AsmPrinterFlags(0),
- NumMemRefs(0), MemRefs(0), Parent(0) {
- assert(MBB && "Cannot use inserting ctor with null basic block!");
- unsigned NumImplicitOps =
- MCID->getNumImplicitDefs() + MCID->getNumImplicitUses();
- Operands.reserve(NumImplicitOps + MCID->getNumOperands());
- addImplicitDefUseOperands();
- // Make sure that we get added to a machine basicblock
- LeakDetector::addGarbageObject(this);
- MBB->push_back(this); // Add instruction to end of basic block!
-}
-
-/// MachineInstr ctor - As above, but with a DebugLoc.
-///
MachineInstr::MachineInstr(MachineBasicBlock *MBB, const DebugLoc dl,
const MCInstrDesc &tid)
: MCID(&tid), Flags(0), AsmPrinterFlags(0),
@@ -673,6 +656,7 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
if (!isImpReg && !isInlineAsm()) {
while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
--OpNo;
+ assert(!Operands[OpNo].isTied() && "Cannot move tied operands");
if (RegInfo)
RegInfo->removeRegOperandFromUseList(&Operands[OpNo]);
}
@@ -708,12 +692,25 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
if (Operands[OpNo].isReg()) {
// Ensure isOnRegUseList() returns false, regardless of Op's status.
Operands[OpNo].Contents.Reg.Prev = 0;
+ // Ignore existing ties. This is not a property that can be copied.
+ Operands[OpNo].TiedTo = 0;
// Add the new operand to RegInfo.
if (RegInfo)
RegInfo->addRegOperandToUseList(&Operands[OpNo]);
- // If the register operand is flagged as early, mark the operand as such.
- if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
- Operands[OpNo].setIsEarlyClobber(true);
+ // The MCID operand information isn't accurate until we start adding
+ // explicit operands. The implicit operands are added first, then the
+ // explicits are inserted before them.
+ if (!isImpReg) {
+ // Tie uses to defs as indicated in MCInstrDesc.
+ if (Operands[OpNo].isUse()) {
+ int DefIdx = MCID->getOperandConstraint(OpNo, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ tieOperands(DefIdx, OpNo);
+ }
+ // If the register operand is flagged as early, mark the operand as such.
+ if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
+ Operands[OpNo].setIsEarlyClobber(true);
+ }
}
// Re-add all the implicit ops.
@@ -730,6 +727,7 @@ void MachineInstr::addOperand(const MachineOperand &Op) {
///
void MachineInstr::RemoveOperand(unsigned OpNo) {
assert(OpNo < Operands.size() && "Invalid operand number");
+ untieRegOperand(OpNo);
MachineRegisterInfo *RegInfo = getRegInfo();
// Special case removing the last one.
@@ -752,6 +750,13 @@ void MachineInstr::RemoveOperand(unsigned OpNo) {
}
}
+#ifndef NDEBUG
+ // Moving tied operands would break the ties.
+ for (unsigned i = OpNo + 1, e = Operands.size(); i != e; ++i)
+ if (Operands[i].isReg())
+ assert(!Operands[i].isTied() && "Cannot move tied operands");
+#endif
+
Operands.erase(Operands.begin()+OpNo);
if (RegInfo) {
@@ -935,6 +940,12 @@ bool MachineInstr::isStackAligningInlineAsm() const {
return false;
}
+InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
+ assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ return InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect) != 0);
+}
+
int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
unsigned *GroupNo) const {
assert(isInlineAsm() && "Expected an inline asm instruction");
@@ -1004,9 +1015,10 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
unsigned MachineInstr::getBundleSize() const {
assert(isBundle() && "Expecting a bundle");
- MachineBasicBlock::const_instr_iterator I = *this;
+ const MachineBasicBlock *MBB = getParent();
+ MachineBasicBlock::const_instr_iterator I = *this, E = MBB->instr_end();
unsigned Size = 0;
- while ((++I)->isInsideBundle()) {
+ while ((++I != E) && I->isInsideBundle()) {
++Size;
}
assert(Size > 1 && "Malformed bundle");
@@ -1114,107 +1126,99 @@ int MachineInstr::findFirstPredOperandIdx() const {
return -1;
}
-/// isRegTiedToUseOperand - Given the index of a register def operand,
-/// check if the register def is tied to a source operand, due to either
-/// two-address elimination or inline assembly constraints. Returns the
-/// first tied use operand index by reference is UseOpIdx is not null.
-bool MachineInstr::
-isRegTiedToUseOperand(unsigned DefOpIdx, unsigned *UseOpIdx) const {
- if (isInlineAsm()) {
- assert(DefOpIdx > InlineAsm::MIOp_FirstOperand);
- const MachineOperand &MO = getOperand(DefOpIdx);
- if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
- return false;
- // Determine the actual operand index that corresponds to this index.
- unsigned DefNo = 0;
- int FlagIdx = findInlineAsmFlagIdx(DefOpIdx, &DefNo);
- if (FlagIdx < 0)
- return false;
+// MachineOperand::TiedTo is 4 bits wide.
+const unsigned TiedMax = 15;
- // Which part of the group is DefOpIdx?
- unsigned DefPart = DefOpIdx - (FlagIdx + 1);
-
- for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands();
- i != e; ++i) {
- const MachineOperand &FMO = getOperand(i);
- if (!FMO.isImm())
- continue;
- if (i+1 >= e || !getOperand(i+1).isReg() || !getOperand(i+1).isUse())
- continue;
- unsigned Idx;
- if (InlineAsm::isUseOperandTiedToDef(FMO.getImm(), Idx) &&
- Idx == DefNo) {
- if (UseOpIdx)
- *UseOpIdx = (unsigned)i + 1 + DefPart;
- return true;
- }
- }
- return false;
+/// tieOperands - Mark operands at DefIdx and UseIdx as tied to each other.
+///
+/// Use and def operands can be tied together, indicated by a non-zero TiedTo
+/// field. TiedTo can have these values:
+///
+/// 0: Operand is not tied to anything.
+/// 1 to TiedMax-1: Tied to getOperand(TiedTo-1).
+/// TiedMax: Tied to an operand >= TiedMax-1.
+///
+/// The tied def must be one of the first TiedMax operands on a normal
+/// instruction. INLINEASM instructions allow more tied defs.
+///
+void MachineInstr::tieOperands(unsigned DefIdx, unsigned UseIdx) {
+ MachineOperand &DefMO = getOperand(DefIdx);
+ MachineOperand &UseMO = getOperand(UseIdx);
+ assert(DefMO.isDef() && "DefIdx must be a def operand");
+ assert(UseMO.isUse() && "UseIdx must be a use operand");
+ assert(!DefMO.isTied() && "Def is already tied to another use");
+ assert(!UseMO.isTied() && "Use is already tied to another def");
+
+ if (DefIdx < TiedMax)
+ UseMO.TiedTo = DefIdx + 1;
+ else {
+    // Inline asm can use the group descriptors to find tied operands, but on
+    // a normal instruction, the tied def must be within the first TiedMax
+    // operands.
+ assert(isInlineAsm() && "DefIdx out of range");
+ UseMO.TiedTo = TiedMax;
}
- assert(getOperand(DefOpIdx).isDef() && "DefOpIdx is not a def!");
- const MCInstrDesc &MCID = getDesc();
- for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = getOperand(i);
- if (MO.isReg() && MO.isUse() &&
- MCID.getOperandConstraint(i, MCOI::TIED_TO) == (int)DefOpIdx) {
- if (UseOpIdx)
- *UseOpIdx = (unsigned)i;
- return true;
- }
- }
- return false;
+ // UseIdx can be out of range, we'll search for it in findTiedOperandIdx().
+ DefMO.TiedTo = std::min(UseIdx + 1, TiedMax);
}
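The encoding documented above is compact enough to model in isolation. A rough standalone sketch (Operand and tieUseToDef are illustrative stand-ins, not the MachineOperand/MachineInstr API):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // TiedTo is a 4-bit field, so only the values 0..15 are representable.
    const unsigned TiedMax = 15;

    struct Operand {
      // 0 = not tied; 1..TiedMax-1 = tied to operand (TiedTo - 1);
      // TiedMax = tied to some operand at index >= TiedMax-1 (search required).
      unsigned TiedTo;
      Operand() : TiedTo(0) {}
      bool isTied() const { return TiedTo != 0; }
    };

    // Mirrors the encoding performed by MachineInstr::tieOperands above.
    void tieUseToDef(std::vector<Operand> &Ops, unsigned DefIdx, unsigned UseIdx) {
      assert(DefIdx < TiedMax && "defs past TiedMax need the inline-asm escape");
      Ops[UseIdx].TiedTo = DefIdx + 1;
      Ops[DefIdx].TiedTo = std::min(UseIdx + 1, TiedMax);
    }

    int main() {
      std::vector<Operand> Ops(20);
      tieUseToDef(Ops, /*DefIdx=*/0, /*UseIdx=*/2);
      assert(Ops[2].TiedTo == 1 && "use 2 decodes back to def 0");
      tieUseToDef(Ops, /*DefIdx=*/1, /*UseIdx=*/17);
      assert(Ops[1].TiedTo == TiedMax && "partner index too big; must be searched");
      return 0;
    }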
-/// isRegTiedToDefOperand - Return true if the operand of the specified index
-/// is a register use and it is tied to an def operand. It also returns the def
-/// operand index by reference.
-bool MachineInstr::
-isRegTiedToDefOperand(unsigned UseOpIdx, unsigned *DefOpIdx) const {
- if (isInlineAsm()) {
- const MachineOperand &MO = getOperand(UseOpIdx);
- if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0)
- return false;
+/// Given the index of a tied register operand, find the operand it is tied to.
+/// Defs are tied to uses and vice versa. Returns the index of the tied operand
+/// which must exist.
+unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
+ const MachineOperand &MO = getOperand(OpIdx);
+ assert(MO.isTied() && "Operand isn't tied");
- // Find the flag operand corresponding to UseOpIdx
- int FlagIdx = findInlineAsmFlagIdx(UseOpIdx);
- if (FlagIdx < 0)
- return false;
+ // Normally TiedTo is in range.
+ if (MO.TiedTo < TiedMax)
+ return MO.TiedTo - 1;
- const MachineOperand &UFMO = getOperand(FlagIdx);
- unsigned DefNo;
- if (InlineAsm::isUseOperandTiedToDef(UFMO.getImm(), DefNo)) {
- if (!DefOpIdx)
- return true;
-
- unsigned DefIdx = InlineAsm::MIOp_FirstOperand;
- // Remember to adjust the index. First operand is asm string, second is
- // the HasSideEffects and AlignStack bits, then there is a flag for each.
- while (DefNo) {
- const MachineOperand &FMO = getOperand(DefIdx);
- assert(FMO.isImm());
- // Skip over this def.
- DefIdx += InlineAsm::getNumOperandRegisters(FMO.getImm()) + 1;
- --DefNo;
- }
- *DefOpIdx = DefIdx + UseOpIdx - FlagIdx;
- return true;
+ // Uses on normal instructions can be out of range.
+ if (!isInlineAsm()) {
+ // Normal tied defs must be in the 0..TiedMax-1 range.
+ if (MO.isUse())
+ return TiedMax - 1;
+ // MO is a def. Search for the tied use.
+ for (unsigned i = TiedMax - 1, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &UseMO = getOperand(i);
+ if (UseMO.isReg() && UseMO.isUse() && UseMO.TiedTo == OpIdx + 1)
+ return i;
}
- return false;
+ llvm_unreachable("Can't find tied use");
}
- const MCInstrDesc &MCID = getDesc();
- if (UseOpIdx >= MCID.getNumOperands())
- return false;
- const MachineOperand &MO = getOperand(UseOpIdx);
- if (!MO.isReg() || !MO.isUse())
- return false;
- int DefIdx = MCID.getOperandConstraint(UseOpIdx, MCOI::TIED_TO);
- if (DefIdx == -1)
- return false;
- if (DefOpIdx)
- *DefOpIdx = (unsigned)DefIdx;
- return true;
+ // Now deal with inline asm by parsing the operand group descriptor flags.
+ // Find the beginning of each operand group.
+ SmallVector<unsigned, 8> GroupIdx;
+ unsigned OpIdxGroup = ~0u;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ assert(FlagMO.isImm() && "Invalid tied operand on inline asm");
+ unsigned CurGroup = GroupIdx.size();
+ GroupIdx.push_back(i);
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ // OpIdx belongs to this operand group.
+ if (OpIdx > i && OpIdx < i + NumOps)
+ OpIdxGroup = CurGroup;
+ unsigned TiedGroup;
+ if (!InlineAsm::isUseOperandTiedToDef(FlagMO.getImm(), TiedGroup))
+ continue;
+ // Operands in this group are tied to operands in TiedGroup which must be
+ // earlier. Find the number of operands between the two groups.
+ unsigned Delta = i - GroupIdx[TiedGroup];
+
+ // OpIdx is a use tied to TiedGroup.
+ if (OpIdxGroup == CurGroup)
+ return OpIdx - Delta;
+
+ // OpIdx is a def tied to this use group.
+ if (OpIdxGroup == TiedGroup)
+ return OpIdx + Delta;
+ }
+ llvm_unreachable("Invalid tied operand on inline asm");
}
/// clearKillInfo - Clears kill flags on all operands.
@@ -1292,7 +1296,12 @@ bool MachineInstr::isSafeToMove(const TargetInstrInfo *TII,
AliasAnalysis *AA,
bool &SawStore) const {
// Ignore stuff that we obviously can't move.
- if (mayStore() || isCall()) {
+ //
+ // Treat volatile loads as stores. This is not strictly necessary for
+ // volatiles, but it is required for atomic loads. It is not allowed to move
+ // a load across an atomic load with Ordering > Monotonic.
+ if (mayStore() || isCall() ||
+ (mayLoad() && hasOrderedMemoryRef())) {
SawStore = true;
return false;
}
@@ -1308,8 +1317,8 @@ bool MachineInstr::isSafeToMove(const TargetInstrInfo *TII,
// load.
if (mayLoad() && !isInvariantLoad(AA))
// Otherwise, this is a real load. If there is a store between the load and
- // end of block, or if the load is volatile, we can't move it.
- return !SawStore && !hasVolatileMemoryRef();
+ // end of block, we can't move it.
+ return !SawStore;
return true;
}
@@ -1340,11 +1349,11 @@ bool MachineInstr::isSafeToReMat(const TargetInstrInfo *TII,
return true;
}
-/// hasVolatileMemoryRef - Return true if this instruction may have a
-/// volatile memory reference, or if the information describing the
-/// memory reference is not available. Return false if it is known to
-/// have no volatile memory references.
-bool MachineInstr::hasVolatileMemoryRef() const {
+/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+/// or volatile memory reference, or if the information describing the memory
+/// reference is not available. Return false if it is known to have no ordered
+/// memory references.
+bool MachineInstr::hasOrderedMemoryRef() const {
// An instruction known never to access memory won't have a volatile access.
if (!mayStore() &&
!mayLoad() &&
@@ -1357,9 +1366,9 @@ bool MachineInstr::hasVolatileMemoryRef() const {
if (memoperands_empty())
return true;
- // Check the memory reference information for volatile references.
+ // Check the memory reference information for ordered references.
for (mmo_iterator I = memoperands_begin(), E = memoperands_end(); I != E; ++I)
- if ((*I)->isVolatile())
+ if (!(*I)->isUnordered())
return true;
return false;
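The renamed query is deliberately conservative. A self-contained sketch of the same decision logic (MemOp and the free function are invented stand-ins for MachineMemOperand and the member function):

    #include <cassert>
    #include <vector>

    // Made-up stand-in for MachineMemOperand: an access is "unordered" when it
    // is neither volatile nor an atomic access stronger than monotonic.
    struct MemOp {
      bool Volatile;
      bool AtomicAboveMonotonic;
      bool isUnordered() const { return !Volatile && !AtomicAboveMonotonic; }
    };

    // Conservative query in the spirit of MachineInstr::hasOrderedMemoryRef():
    // with no memory-operand information at all, assume the worst.
    bool hasOrderedMemoryRef(bool MayLoad, bool MayStore,
                             const std::vector<MemOp> &MemOps) {
      if (!MayLoad && !MayStore)
        return false;           // Known never to touch memory.
      if (MemOps.empty())
        return true;            // No information: must be treated as ordered.
      for (unsigned i = 0, e = MemOps.size(); i != e; ++i)
        if (!MemOps[i].isUnordered())
          return true;          // Volatile, or atomic above monotonic.
      return false;
    }

    int main() {
      std::vector<MemOp> Plain(1), Atomic(1), None;
      Plain[0].Volatile = false;  Plain[0].AtomicAboveMonotonic = false;
      Atomic[0].Volatile = false; Atomic[0].AtomicAboveMonotonic = true;
      assert(!hasOrderedMemoryRef(true, false, Plain));
      assert(hasOrderedMemoryRef(true, false, Atomic));
      assert(hasOrderedMemoryRef(true, false, None));
      return 0;
    }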
@@ -1461,7 +1470,9 @@ void MachineInstr::copyImplicitOps(const MachineInstr *MI) {
}
void MachineInstr::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
+#endif
}
static void printDebugLoc(DebugLoc DL, const MachineFunction *MF,
@@ -1540,6 +1551,10 @@ void MachineInstr::print(raw_ostream &OS, const TargetMachine *TM) const {
OS << " [sideeffect]";
if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
OS << " [alignstack]";
+ if (getInlineAsmDialect() == InlineAsm::AD_ATT)
+ OS << " [attdialect]";
+ if (getInlineAsmDialect() == InlineAsm::AD_Intel)
+ OS << " [inteldialect]";
StartOp = AsmDescOp = InlineAsm::MIOp_FirstOperand;
FirstOp = false;
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index b7de7bfb492e..1f7fbfc719b0 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -109,10 +109,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, FirstMI, FirstMI->getDebugLoc(),
TII->get(TargetOpcode::BUNDLE));
- SmallVector<unsigned, 8> LocalDefs;
- SmallSet<unsigned, 8> LocalDefSet;
+ SmallVector<unsigned, 32> LocalDefs;
+ SmallSet<unsigned, 32> LocalDefSet;
SmallSet<unsigned, 8> DeadDefSet;
- SmallSet<unsigned, 8> KilledDefSet;
+ SmallSet<unsigned, 16> KilledDefSet;
SmallVector<unsigned, 8> ExternUses;
SmallSet<unsigned, 8> ExternUseSet;
SmallSet<unsigned, 8> KilledUseSet;
@@ -181,7 +181,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
Defs.clear();
}
- SmallSet<unsigned, 8> Added;
+ SmallSet<unsigned, 32> Added;
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
unsigned Reg = LocalDefs[i];
if (Added.insert(Reg)) {
@@ -248,10 +248,10 @@ bool llvm::finalizeBundles(MachineFunction &MF) {
// MachineOperand iterator
//===----------------------------------------------------------------------===//
-MachineOperandIteratorBase::RegInfo
+MachineOperandIteratorBase::VirtRegInfo
MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg,
SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops) {
- RegInfo RI = { false, false, false };
+ VirtRegInfo RI = { false, false, false };
for(; isValid(); ++*this) {
MachineOperand &MO = deref();
if (!MO.isReg() || MO.getReg() != Reg)
@@ -276,3 +276,53 @@ MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg,
}
return RI;
}
+
+MachineOperandIteratorBase::PhysRegInfo
+MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
+ const TargetRegisterInfo *TRI) {
+ bool AllDefsDead = true;
+ PhysRegInfo PRI = {false, false, false, false, false, false, false};
+
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "analyzePhysReg not given a physical register!");
+ for (; isValid(); ++*this) {
+ MachineOperand &MO = deref();
+
+ if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
+ PRI.Clobbers = true; // Regmask clobbers Reg.
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned MOReg = MO.getReg();
+ if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg))
+ continue;
+
+ bool IsRegOrSuperReg = MOReg == Reg || TRI->isSubRegister(MOReg, Reg);
+ bool IsRegOrOverlapping = MOReg == Reg || TRI->regsOverlap(MOReg, Reg);
+
+ if (IsRegOrSuperReg && MO.readsReg()) {
+ // Reg or a super-reg is read, and perhaps killed also.
+ PRI.Reads = true;
+ PRI.Kills = MO.isKill();
+    }
+    if (IsRegOrOverlapping && MO.readsReg()) {
+      PRI.ReadsOverlap = true; // Reg or an overlapping register is read.
+ }
+
+ if (!MO.isDef())
+ continue;
+
+ if (IsRegOrSuperReg) {
+ PRI.Defines = true; // Reg or a super-register is defined.
+ if (!MO.isDead())
+ AllDefsDead = false;
+ }
+ if (IsRegOrOverlapping)
+ PRI.Clobbers = true; // Reg or an overlapping reg is defined.
+ }
+
+ if (AllDefsDead && PRI.Defines)
+ PRI.DefinesDead = true; // Reg or super-register was defined and was dead.
+
+ return PRI;
+}
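As a rough, self-contained model of how these flags accumulate (plain structs, no TargetRegisterInfo; overlap and sub-register relations are collapsed to simple equality, so this is only the shape of the logic, not a faithful copy):

    #include <cassert>
    #include <vector>

    struct Op {          // Toy operand: Reg == 0 means "not a register operand".
      unsigned Reg;
      bool IsDef;
      bool IsDead;
      bool IsKill;
    };

    struct PhysRegInfo {
      bool Clobbers, Defines, Reads, Kills, DefinesDead;
    };

    PhysRegInfo analyzePhysReg(const std::vector<Op> &Ops, unsigned Reg) {
      PhysRegInfo PRI = {false, false, false, false, false};
      bool AllDefsDead = true;
      for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
        const Op &O = Ops[i];
        if (!O.Reg || O.Reg != Reg)
          continue;
        if (!O.IsDef) {          // A read of Reg.
          PRI.Reads = true;
          PRI.Kills = O.IsKill;
        } else {                 // A def of Reg.
          PRI.Defines = true;
          PRI.Clobbers = true;
          if (!O.IsDead)
            AllDefsDead = false;
        }
      }
      if (PRI.Defines && AllDefsDead)
        PRI.DefinesDead = true;
      return PRI;
    }

    int main() {
      // A dead def of r5 followed by a killing read of r5.
      std::vector<Op> Ops;
      Op Def = {5, true, true, false};
      Op Use = {5, false, false, true};
      Ops.push_back(Def);
      Ops.push_back(Use);
      PhysRegInfo PRI = analyzePhysReg(Ops, 5);
      assert(PRI.Defines && PRI.DefinesDead && PRI.Reads && PRI.Kills);
      return 0;
    }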
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index efec481dab88..169443e03d77 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -334,7 +334,7 @@ bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
else
DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
- DEBUG(dbgs() << MF.getFunction()->getName() << " ********\n");
+ DEBUG(dbgs() << MF.getName() << " ********\n");
if (PreRegAlloc) {
// Estimate register pressure during pre-regalloc pass.
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index 9f3829e3c0f3..27afeec1d973 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -74,6 +74,8 @@ MachineBasicBlock *MachineLoop::getBottomBlock() {
return BotMBB;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineLoop::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index ea98b23c6d57..005bf783e3da 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -25,7 +25,7 @@
using namespace llvm;
using namespace llvm::dwarf;
-// Handle the Pass registration stuff necessary to use TargetData's.
+// Handle the Pass registration stuff necessary to use DataLayout's.
INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
"Machine Module Information", false, false)
char MachineModuleInfo::ID = 0;
diff --git a/lib/CodeGen/MachineModuleInfoImpls.cpp b/lib/CodeGen/MachineModuleInfoImpls.cpp
index 5ab56c09f5f6..a1c7e9f5fb22 100644
--- a/lib/CodeGen/MachineModuleInfoImpls.cpp
+++ b/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -21,8 +21,8 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
// Out of line virtual method.
-void MachineModuleInfoMachO::Anchor() {}
-void MachineModuleInfoELF::Anchor() {}
+void MachineModuleInfoMachO::anchor() {}
+void MachineModuleInfoELF::anchor() {}
static int SortSymbolPair(const void *LHS, const void *RHS) {
typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp
new file mode 100644
index 000000000000..c3f6e9249e7d
--- /dev/null
+++ b/lib/CodeGen/MachinePostDominators.cpp
@@ -0,0 +1,55 @@
+//===- MachinePostDominators.cpp - Machine Post Dominator Calculation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// post dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+using namespace llvm;
+
+char MachinePostDominatorTree::ID = 0;
+
+// Declare initializeMachinePostDominatorTreePass.
+INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
+ "MachinePostDominator Tree Construction", true, true)
+
+MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
+ initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
+  DT = new DominatorTreeBase<MachineBasicBlock>(true); // true indicates
+                                                        // a postdominator tree
+}
+
+FunctionPass *
+MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+ return new MachinePostDominatorTree();
+}
+
+bool
+MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ DT->recalculate(F);
+ return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+ delete DT;
+}
+
+void
+MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void
+MachinePostDominatorTree::print(llvm::raw_ostream &OS, const Module *M) const {
+ DT->print(OS);
+}
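A hedged usage sketch of how a later MachineFunctionPass would pull in this new analysis (the consumer pass is invented; only the addRequired/getAnalysis plumbing is the point, and pass-registration boilerplate is omitted):

    #include "llvm/CodeGen/MachineFunctionPass.h"
    #include "llvm/CodeGen/MachinePostDominators.h"

    using namespace llvm;

    namespace {
    struct PostDomUser : public MachineFunctionPass {
      static char ID;
      PostDomUser() : MachineFunctionPass(ID) {}

      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
        AU.addRequired<MachinePostDominatorTree>(); // Ask for the new pass.
        AU.setPreservesAll();
        MachineFunctionPass::getAnalysisUsage(AU);
      }

      virtual bool runOnMachineFunction(MachineFunction &MF) {
        MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
        (void)PDT; // Queries against the tree would go here.
        return false;
      }
    };
    char PostDomUser::ID = 0;
    } // namespace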
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index 5fb938f3400d..95d7a7dd6897 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -21,7 +21,7 @@ MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
: TRI(&TRI), IsSSA(true), TracksLiveness(true) {
VRegInfo.reserve(256);
RegAllocHints.reserve(256);
- UsedPhysRegs.resize(TRI.getNumRegs());
+ UsedRegUnits.resize(TRI.getNumRegUnits());
UsedPhysRegMask.resize(TRI.getNumRegs());
// Create the physreg use/def lists.
@@ -32,7 +32,7 @@ MachineRegisterInfo::MachineRegisterInfo(const TargetRegisterInfo &TRI)
MachineRegisterInfo::~MachineRegisterInfo() {
#ifndef NDEBUG
clearVirtRegs();
- for (unsigned i = 0, e = UsedPhysRegs.size(); i != e; ++i)
+ for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
assert(!PhysRegUseDefLists[i] &&
"PhysRegUseDefLists has entries after all instructions are deleted");
#endif
@@ -306,22 +306,18 @@ void MachineRegisterInfo::dumpUses(unsigned Reg) const {
void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
ReservedRegs = TRI->getReservedRegs(MF);
+ assert(ReservedRegs.size() == TRI->getNumRegs() &&
+ "Invalid ReservedRegs vector from target");
}
bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg,
const MachineFunction &MF) const {
assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
- // Check if any overlapping register is modified.
+ // Check if any overlapping register is modified, or allocatable so it may be
+ // used later.
for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI)
- if (!def_empty(*AI))
- return false;
-
- // Check if any overlapping register is allocatable so it may be used later.
- if (AllocatableRegs.empty())
- AllocatableRegs = TRI->getAllocatableSet(MF);
- for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI)
- if (AllocatableRegs.test(*AI))
+ if (!def_empty(*AI) || isAllocatable(*AI))
return false;
return true;
}
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index a1dc9481c639..a4817d09c0d3 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -18,11 +18,8 @@
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
-#include "llvm/CodeGen/RegisterPressure.h"
-#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGILP.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -35,10 +32,12 @@
using namespace llvm;
-static cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
- cl::desc("Force top-down list scheduling"));
-static cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
- cl::desc("Force bottom-up list scheduling"));
+namespace llvm {
+cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
+ cl::desc("Force top-down list scheduling"));
+cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
+ cl::desc("Force bottom-up list scheduling"));
+}
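The move from static to namespace-scope linkage looks intended to let other translation units see these knobs. A sketch of the matching declaration such a file would carry (the consumer helper is hypothetical):

    #include "llvm/Support/CommandLine.h"

    // In some other scheduler-related .cpp (illustrative only).
    namespace llvm {
    extern cl::opt<bool> ForceTopDown;
    extern cl::opt<bool> ForceBottomUp;
    }

    // Hypothetical helper honoring the shared flags.
    static bool onlyTopDown() {
      return llvm::ForceTopDown && !llvm::ForceBottomUp;
    }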
#ifndef NDEBUG
static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
@@ -50,6 +49,15 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
static bool ViewMISchedDAGs = false;
#endif // NDEBUG
+// Threshold to very roughly model an out-of-order processor's instruction
+// buffers. If the actual value of this threshold matters much in practice, then
+// it can be specified by the machine model. For now, it's an experimental
+// tuning knob to determine when and if it matters.
+static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
+ cl::desc("Allow expected latency to exceed the critical path by N cycles "
+ "before attempting to balance ILP"),
+ cl::init(10U));
+
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
@@ -221,7 +229,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// The Scheduler may insert instructions during either schedule() or
// exitRegion(), even for empty regions. So the local iterators 'I' and
// 'RegionEnd' are invalid across these calls.
- unsigned RemainingCount = MBB->size();
+ unsigned RemainingInstrs = MBB->size();
for(MachineBasicBlock::iterator RegionEnd = MBB->end();
RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
@@ -230,19 +238,19 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
|| TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
--RegionEnd;
// Count the boundary instruction.
- --RemainingCount;
+ --RemainingInstrs;
}
// The next region starts above the previous region. Look backward in the
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
- for(;I != MBB->begin(); --I, --RemainingCount) {
+ for(;I != MBB->begin(); --I, --RemainingInstrs) {
if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
break;
}
// Notify the scheduler of the region, even if we may skip scheduling
// it. Perhaps it still needs to be bundled.
- Scheduler->enterRegion(MBB, I, RegionEnd, RemainingCount);
+ Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs);
// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
@@ -252,11 +260,11 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
continue;
}
DEBUG(dbgs() << "********** MI Scheduling **********\n");
- DEBUG(dbgs() << MF->getFunction()->getName()
+ DEBUG(dbgs() << MF->getName()
<< ":BB#" << MBB->getNumber() << "\n From: " << *I << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
- dbgs() << " Remaining: " << RemainingCount << "\n");
+ dbgs() << " Remaining: " << RemainingInstrs << "\n");
// Schedule a region: possibly reorder instructions.
// This invalidates 'RegionEnd' and 'I'.
@@ -269,7 +277,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// scheduler for the top of it's scheduled region.
RegionEnd = Scheduler->begin();
}
- assert(RemainingCount == 0 && "Instruction count mismatch!");
+ assert(RemainingInstrs == 0 && "Instruction count mismatch!");
Scheduler->finishBlock();
}
Scheduler->finalizeSchedule();
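The loop above carves each block into scheduling regions bounded by isSchedulingBoundary. A loose standalone sketch of that carving, using plain indices instead of iterators (the IsBoundary flag is a stand-in for TII->isSchedulingBoundary):

    #include <cstdio>
    #include <utility>
    #include <vector>

    // Stand-in instruction: IsBoundary marks calls, labels, and other points
    // the real pass refuses to schedule across.
    struct Instr {
      int Id;
      bool IsBoundary;
    };

    // Split [0, Size) into maximal boundary-free regions, walking backwards the
    // way MachineScheduler::runOnMachineFunction does (it peels one boundary per
    // outer iteration; this sketch peels them in a small inner loop instead).
    std::vector<std::pair<int, int> >
    splitRegions(const std::vector<Instr> &Block) {
      std::vector<std::pair<int, int> > Regions; // half-open [Begin, End) indices
      int RegionEnd = static_cast<int>(Block.size());
      while (RegionEnd > 0) {
        while (RegionEnd > 0 && Block[RegionEnd - 1].IsBoundary)
          --RegionEnd;
        int I = RegionEnd;
        while (I > 0 && !Block[I - 1].IsBoundary)
          --I;
        if (RegionEnd - I > 1) // Empty and single-instruction regions are skipped.
          Regions.push_back(std::make_pair(I, RegionEnd));
        RegionEnd = I;
      }
      return Regions;
    }

    int main() {
      Instr Raw[] = {{0, false}, {1, false}, {2, true},
                     {3, false}, {4, false}, {5, false}};
      std::vector<Instr> Block(Raw, Raw + 6);
      std::vector<std::pair<int, int> > Regions = splitRegions(Block);
      for (unsigned i = 0, e = Regions.size(); i != e; ++i)
        std::printf("region [%d, %d)\n", Regions[i].first, Regions[i].second);
      return 0;
    }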
@@ -281,157 +289,20 @@ void MachineScheduler::print(raw_ostream &O, const Module* m) const {
// unimplemented
}
-//===----------------------------------------------------------------------===//
-// MachineSchedStrategy - Interface to a machine scheduling algorithm.
-//===----------------------------------------------------------------------===//
-
-namespace {
-class ScheduleDAGMI;
-
-/// MachineSchedStrategy - Interface used by ScheduleDAGMI to drive the selected
-/// scheduling algorithm.
-///
-/// If this works well and targets wish to reuse ScheduleDAGMI, we may expose it
-/// in ScheduleDAGInstrs.h
-class MachineSchedStrategy {
-public:
- virtual ~MachineSchedStrategy() {}
-
- /// Initialize the strategy after building the DAG for a new region.
- virtual void initialize(ScheduleDAGMI *DAG) = 0;
-
- /// Pick the next node to schedule, or return NULL. Set IsTopNode to true to
- /// schedule the node at the top of the unscheduled region. Otherwise it will
- /// be scheduled at the bottom.
- virtual SUnit *pickNode(bool &IsTopNode) = 0;
-
- /// Notify MachineSchedStrategy that ScheduleDAGMI has scheduled a node.
- virtual void schedNode(SUnit *SU, bool IsTopNode) = 0;
-
- /// When all predecessor dependencies have been resolved, free this node for
- /// top-down scheduling.
- virtual void releaseTopNode(SUnit *SU) = 0;
- /// When all successor dependencies have been resolved, free this node for
- /// bottom-up scheduling.
- virtual void releaseBottomNode(SUnit *SU) = 0;
-};
-} // namespace
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ReadyQueue::dump() {
+ dbgs() << Name << ": ";
+ for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+ dbgs() << Queue[i]->NodeNum << " ";
+ dbgs() << "\n";
+}
+#endif
//===----------------------------------------------------------------------===//
// ScheduleDAGMI - Base class for MachineInstr scheduling with LiveIntervals
// preservation.
//===----------------------------------------------------------------------===//
-namespace {
-/// ScheduleDAGMI is an implementation of ScheduleDAGInstrs that schedules
-/// machine instructions while updating LiveIntervals.
-class ScheduleDAGMI : public ScheduleDAGInstrs {
- AliasAnalysis *AA;
- RegisterClassInfo *RegClassInfo;
- MachineSchedStrategy *SchedImpl;
-
- MachineBasicBlock::iterator LiveRegionEnd;
-
- /// Register pressure in this region computed by buildSchedGraph.
- IntervalPressure RegPressure;
- RegPressureTracker RPTracker;
-
- /// List of pressure sets that exceed the target's pressure limit before
- /// scheduling, listed in increasing set ID order. Each pressure set is paired
- /// with its max pressure in the currently scheduled regions.
- std::vector<PressureElement> RegionCriticalPSets;
-
- /// The top of the unscheduled zone.
- MachineBasicBlock::iterator CurrentTop;
- IntervalPressure TopPressure;
- RegPressureTracker TopRPTracker;
-
- /// The bottom of the unscheduled zone.
- MachineBasicBlock::iterator CurrentBottom;
- IntervalPressure BotPressure;
- RegPressureTracker BotRPTracker;
-
-#ifndef NDEBUG
- /// The number of instructions scheduled so far. Used to cut off the
- /// scheduler at the point determined by misched-cutoff.
- unsigned NumInstrsScheduled;
-#endif
-public:
- ScheduleDAGMI(MachineSchedContext *C, MachineSchedStrategy *S):
- ScheduleDAGInstrs(*C->MF, *C->MLI, *C->MDT, /*IsPostRA=*/false, C->LIS),
- AA(C->AA), RegClassInfo(C->RegClassInfo), SchedImpl(S),
- RPTracker(RegPressure), CurrentTop(), TopRPTracker(TopPressure),
- CurrentBottom(), BotRPTracker(BotPressure) {
-#ifndef NDEBUG
- NumInstrsScheduled = 0;
-#endif
- }
-
- ~ScheduleDAGMI() {
- delete SchedImpl;
- }
-
- MachineBasicBlock::iterator top() const { return CurrentTop; }
- MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
-
- /// Implement the ScheduleDAGInstrs interface for handling the next scheduling
- /// region. This covers all instructions in a block, while schedule() may only
- /// cover a subset.
- void enterRegion(MachineBasicBlock *bb,
- MachineBasicBlock::iterator begin,
- MachineBasicBlock::iterator end,
- unsigned endcount);
-
- /// Implement ScheduleDAGInstrs interface for scheduling a sequence of
- /// reorderable instructions.
- void schedule();
-
- /// Get current register pressure for the top scheduled instructions.
- const IntervalPressure &getTopPressure() const { return TopPressure; }
- const RegPressureTracker &getTopRPTracker() const { return TopRPTracker; }
-
- /// Get current register pressure for the bottom scheduled instructions.
- const IntervalPressure &getBotPressure() const { return BotPressure; }
- const RegPressureTracker &getBotRPTracker() const { return BotRPTracker; }
-
- /// Get register pressure for the entire scheduling region before scheduling.
- const IntervalPressure &getRegPressure() const { return RegPressure; }
-
- const std::vector<PressureElement> &getRegionCriticalPSets() const {
- return RegionCriticalPSets;
- }
-
- /// getIssueWidth - Return the max instructions per scheduling group.
- unsigned getIssueWidth() const {
- return (InstrItins && InstrItins->SchedModel)
- ? InstrItins->SchedModel->IssueWidth : 1;
- }
-
- /// getNumMicroOps - Return the number of issue slots required for this MI.
- unsigned getNumMicroOps(MachineInstr *MI) const {
- if (!InstrItins) return 1;
- int UOps = InstrItins->getNumMicroOps(MI->getDesc().getSchedClass());
- return (UOps >= 0) ? UOps : TII->getNumMicroOps(InstrItins, MI);
- }
-
-protected:
- void initRegPressure();
- void updateScheduledPressure(std::vector<unsigned> NewMaxPressure);
-
- void moveInstruction(MachineInstr *MI, MachineBasicBlock::iterator InsertPos);
- bool checkSchedLimit();
-
- void releaseRoots();
-
- void releaseSucc(SUnit *SU, SDep *SuccEdge);
- void releaseSuccessors(SUnit *SU);
- void releasePred(SUnit *SU, SDep *PredEdge);
- void releasePredecessors(SUnit *SU);
-
- void placeDebugValues();
-};
-} // namespace
-
/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
/// NumPredsLeft reaches zero, release the successor node.
///
@@ -498,7 +369,7 @@ void ScheduleDAGMI::moveInstruction(MachineInstr *MI,
BB->splice(InsertPos, BB, MI);
// Update LiveIntervals
- LIS->handleMove(MI);
+ LIS->handleMove(MI, /*UpdateFlags=*/true);
// Recede RegionBegin if an instruction moves above the first.
if (RegionBegin == InsertPos)
@@ -565,6 +436,9 @@ void ScheduleDAGMI::initRegPressure() {
std::vector<unsigned> RegionPressure = RPTracker.getPressure().MaxSetPressure;
for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
unsigned Limit = TRI->getRegPressureSetLimit(i);
+    DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+          << " Limit " << Limit
+          << " Actual " << RegionPressure[i] << "\n");
if (RegionPressure[i] > Limit)
RegionCriticalPSets.push_back(PressureElement(i, 0));
}
@@ -587,6 +461,74 @@ updateScheduledPressure(std::vector<unsigned> NewMaxPressure) {
}
}
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+///
+/// This is a skeletal driver, with all the functionality pushed into helpers,
+/// so that it can be easily extended by experimental schedulers. Generally,
+/// implementing MachineSchedStrategy should be sufficient to implement a new
+/// scheduling algorithm. However, if a scheduler further subclasses
+/// ScheduleDAGMI then it will want to override this virtual method in order to
+/// update any specialized state.
+void ScheduleDAGMI::schedule() {
+ buildDAGWithRegPressure();
+
+ postprocessDAG();
+
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+
+ if (ViewMISchedDAGs) viewGraph();
+
+ initQueues();
+
+ bool IsTopNode = false;
+ while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+ assert(!SU->isScheduled && "Node already scheduled");
+ if (!checkSchedLimit())
+ break;
+
+ scheduleMI(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = top()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
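Per the comment above, implementing MachineSchedStrategy should usually be enough for a new scheduler. A skeletal (and deliberately naive) strategy might look roughly like this, assuming the interface removed from this file now lives in llvm/CodeGen/MachineScheduler.h:

    #include "llvm/CodeGen/MachineScheduler.h" // assumed home of the interface
    #include <vector>

    using namespace llvm;

    namespace {
    // Illustrative only: schedule strictly top-down, in whatever order nodes
    // become ready. No hazards, latency, or register pressure are considered.
    class TrivialSchedStrategy : public MachineSchedStrategy {
      std::vector<SUnit*> Ready;
    public:
      virtual void initialize(ScheduleDAGMI *DAG) { Ready.clear(); }
      virtual void registerRoots() {}

      virtual SUnit *pickNode(bool &IsTopNode) {
        if (Ready.empty())
          return 0;               // NULL ends the scheduling loop above.
        IsTopNode = true;         // Always place nodes at the top boundary.
        SUnit *SU = Ready.back();
        Ready.pop_back();
        return SU;
      }

      virtual void schedNode(SUnit *SU, bool IsTopNode) {}

      virtual void releaseTopNode(SUnit *SU) { Ready.push_back(SU); }
      virtual void releaseBottomNode(SUnit *SU) {} // Bottom-up queue unused.
    };
    } // namespace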
+
+/// Build the DAG and setup three register pressure trackers.
+void ScheduleDAGMI::buildDAGWithRegPressure() {
+ // Initialize the register pressure tracker used by buildSchedGraph.
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
+
+  // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ RPTracker.recede();
+
+ // Build the DAG, and compute current register pressure.
+ buildSchedGraph(AA, &RPTracker);
+ if (ViewMISchedDAGs) viewGraph();
+
+ // Initialize top/bottom trackers after computing region pressure.
+ initRegPressure();
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void ScheduleDAGMI::postprocessDAG() {
+ for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
+ Mutations[i]->apply(this);
+ }
+}
+
// Release all DAG roots for scheduling.
void ScheduleDAGMI::releaseRoots() {
SmallVector<SUnit*, 16> BotRoots;
@@ -607,28 +549,10 @@ void ScheduleDAGMI::releaseRoots() {
SchedImpl->releaseBottomNode(*I);
}
-/// schedule - Called back from MachineScheduler::runOnMachineFunction
-/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
-/// only includes instructions that have DAG nodes, not scheduling boundaries.
-void ScheduleDAGMI::schedule() {
- // Initialize the register pressure tracker used by buildSchedGraph.
- RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd);
-
- // Account for liveness generate by the region boundary.
- if (LiveRegionEnd != RegionEnd)
- RPTracker.recede();
-
- // Build the DAG, and compute current register pressure.
- buildSchedGraph(AA, &RPTracker);
-
- // Initialize top/bottom trackers after computing region pressure.
- initRegPressure();
-
- DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
- SUnits[su].dumpAll(this));
-
- if (ViewMISchedDAGs) viewGraph();
+/// Identify DAG roots and setup scheduler queues.
+void ScheduleDAGMI::initQueues() {
+ // Initialize the strategy before modifying the DAG.
SchedImpl->initialize(this);
// Release edges from the special Entry node or to the special Exit node.
@@ -638,61 +562,64 @@ void ScheduleDAGMI::schedule() {
// Release all DAG roots for scheduling.
releaseRoots();
+ SchedImpl->registerRoots();
+
CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
CurrentBottom = RegionEnd;
- bool IsTopNode = false;
- while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
- if (!checkSchedLimit())
- break;
-
- // Move the instruction to its new location in the instruction stream.
- MachineInstr *MI = SU->getInstr();
-
- if (IsTopNode) {
- assert(SU->isTopReady() && "node still has unscheduled dependencies");
- if (&*CurrentTop == MI)
- CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
- else {
- moveInstruction(MI, CurrentTop);
- TopRPTracker.setPos(MI);
- }
+}
- // Update top scheduled pressure.
- TopRPTracker.advance();
- assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
- updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+/// Move an instruction and update register pressure.
+void ScheduleDAGMI::scheduleMI(SUnit *SU, bool IsTopNode) {
+ // Move the instruction to its new location in the instruction stream.
+ MachineInstr *MI = SU->getInstr();
- // Release dependent instructions for scheduling.
- releaseSuccessors(SU);
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else {
+ moveInstruction(MI, CurrentTop);
+ TopRPTracker.setPos(MI);
}
+
+ // Update top scheduled pressure.
+ TopRPTracker.advance();
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ updateScheduledPressure(TopRPTracker.getPressure().MaxSetPressure);
+ }
+ else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
else {
- assert(SU->isBottomReady() && "node still has unscheduled dependencies");
- MachineBasicBlock::iterator priorII =
- priorNonDebug(CurrentBottom, CurrentTop);
- if (&*priorII == MI)
- CurrentBottom = priorII;
- else {
- if (&*CurrentTop == MI) {
- CurrentTop = nextIfDebug(++CurrentTop, priorII);
- TopRPTracker.setPos(CurrentTop);
- }
- moveInstruction(MI, CurrentBottom);
- CurrentBottom = MI;
+ if (&*CurrentTop == MI) {
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ TopRPTracker.setPos(CurrentTop);
}
- // Update bottom scheduled pressure.
- BotRPTracker.recede();
- assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
- updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
-
- // Release dependent instructions for scheduling.
- releasePredecessors(SU);
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
}
- SU->isScheduled = true;
- SchedImpl->schedNode(SU, IsTopNode);
+ // Update bottom scheduled pressure.
+ BotRPTracker.recede();
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ updateScheduledPressure(BotRPTracker.getPressure().MaxSetPressure);
}
- assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+}
- placeDebugValues();
+/// Update scheduler queues after scheduling an instruction.
+void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
+ // Release dependent instructions for scheduling.
+ if (IsTopNode)
+ releaseSuccessors(SU);
+ else
+ releasePredecessors(SU);
+
+ SU->isScheduled = true;
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
}
/// Reinsert any remaining debug_values, just like the PostRA scheduler.
@@ -716,91 +643,146 @@ void ScheduleDAGMI::placeDebugValues() {
FirstDbgValue = NULL;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGMI::dumpSchedule() const {
+ for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
+ if (SUnit *SU = getSUnit(&(*MI)))
+ SU->dump(this);
+ else
+ dbgs() << "Missing SUnit\n";
+ }
+}
+#endif
+
//===----------------------------------------------------------------------===//
// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
//===----------------------------------------------------------------------===//
namespace {
-/// ReadyQueue encapsulates vector of "ready" SUnits with basic convenience
-/// methods for pushing and removing nodes. ReadyQueue's are uniquely identified
-/// by an ID. SUnit::NodeQueueId is a mask of the ReadyQueues the SUnit is in.
-class ReadyQueue {
- unsigned ID;
- std::string Name;
- std::vector<SUnit*> Queue;
-
+/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
+/// the schedule.
+class ConvergingScheduler : public MachineSchedStrategy {
public:
- ReadyQueue(unsigned id, const Twine &name): ID(id), Name(name.str()) {}
-
- unsigned getID() const { return ID; }
-
- StringRef getName() const { return Name; }
-
- // SU is in this queue if it's NodeQueueID is a superset of this ID.
- bool isInQueue(SUnit *SU) const { return (SU->NodeQueueId & ID); }
-
- bool empty() const { return Queue.empty(); }
-
- unsigned size() const { return Queue.size(); }
-
- typedef std::vector<SUnit*>::iterator iterator;
+ /// Represent the type of SchedCandidate found within a single queue.
+ /// pickNodeBidirectional depends on these listed by decreasing priority.
+ enum CandReason {
+ NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
+ BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
+ SingleMax, MultiPressure, NextDefUse, NodeOrder};
- iterator begin() { return Queue.begin(); }
+#ifndef NDEBUG
+ static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+#endif
- iterator end() { return Queue.end(); }
+ /// Policy for scheduling the next instruction in the candidate's zone.
+ struct CandPolicy {
+ bool ReduceLatency;
+ unsigned ReduceResIdx;
+ unsigned DemandResIdx;
- iterator find(SUnit *SU) {
- return std::find(Queue.begin(), Queue.end(), SU);
- }
+ CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
+ };
- void push(SUnit *SU) {
- Queue.push_back(SU);
- SU->NodeQueueId |= ID;
- }
+ /// Status of an instruction's critical resource consumption.
+ struct SchedResourceDelta {
+ // Count critical resources in the scheduled region required by SU.
+ unsigned CritResources;
- void remove(iterator I) {
- (*I)->NodeQueueId &= ~ID;
- *I = Queue.back();
- Queue.pop_back();
- }
+ // Count critical resources from another region consumed by SU.
+ unsigned DemandedResources;
- void dump() {
- dbgs() << Name << ": ";
- for (unsigned i = 0, e = Queue.size(); i < e; ++i)
- dbgs() << Queue[i]->NodeNum << " ";
- dbgs() << "\n";
- }
-};
+ SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
-/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
-/// the schedule.
-class ConvergingScheduler : public MachineSchedStrategy {
+ bool operator==(const SchedResourceDelta &RHS) const {
+ return CritResources == RHS.CritResources
+ && DemandedResources == RHS.DemandedResources;
+ }
+ bool operator!=(const SchedResourceDelta &RHS) const {
+ return !operator==(RHS);
+ }
+ };
/// Store the state used by ConvergingScheduler heuristics, required for the
/// lifetime of one invocation of pickNode().
struct SchedCandidate {
+ CandPolicy Policy;
+
// The best SUnit candidate.
SUnit *SU;
+ // The reason for this candidate.
+ CandReason Reason;
+
// Register pressure values for the best candidate.
RegPressureDelta RPDelta;
- SchedCandidate(): SU(NULL) {}
+ // Critical resource consumption of the best candidate.
+ SchedResourceDelta ResDelta;
+
+ SchedCandidate(const CandPolicy &policy)
+ : Policy(policy), SU(NULL), Reason(NoCand) {}
+
+ bool isValid() const { return SU; }
+
+ // Copy the status of another candidate without changing policy.
+ void setBest(SchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ SU = Best.SU;
+ Reason = Best.Reason;
+ RPDelta = Best.RPDelta;
+ ResDelta = Best.ResDelta;
+ }
+
+ void initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel);
+ };
+
+ /// Summarize the unscheduled region.
+ struct SchedRemainder {
+ // Critical path through the DAG in expected latency.
+ unsigned CriticalPath;
+
+ // Unscheduled resources
+ SmallVector<unsigned, 16> RemainingCounts;
+ // Critical resource for the unscheduled zone.
+ unsigned CritResIdx;
+ // Number of micro-ops left to schedule.
+ unsigned RemainingMicroOps;
+ // Is the unscheduled zone resource limited.
+ bool IsResourceLimited;
+
+ unsigned MaxRemainingCount;
+
+ void reset() {
+ CriticalPath = 0;
+ RemainingCounts.clear();
+ CritResIdx = 0;
+ RemainingMicroOps = 0;
+ IsResourceLimited = false;
+ MaxRemainingCount = 0;
+ }
+
+ SchedRemainder() { reset(); }
+
+ void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
};
- /// Represent the type of SchedCandidate found within a single queue.
- enum CandResult {
- NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure };
/// Each Scheduling boundary is associated with ready queues. It tracks the
- /// current cycle in whichever direction at has moved, and maintains the state
+ /// current cycle in the direction of movement, and maintains the state
/// of "hazards" and other interlocks at the current cycle.
struct SchedBoundary {
ScheduleDAGMI *DAG;
+ const TargetSchedModel *SchedModel;
+ SchedRemainder *Rem;
ReadyQueue Available;
ReadyQueue Pending;
bool CheckPending;
+ // For heuristics, keep a list of the nodes that immediately depend on the
+ // most recently scheduled node.
+ SmallPtrSet<const SUnit*, 8> NextSUs;
+
ScheduleHazardRecognizer *HazardRec;
unsigned CurrCycle;
@@ -809,29 +791,88 @@ class ConvergingScheduler : public MachineSchedStrategy {
/// MinReadyCycle - Cycle of the soonest available instruction.
unsigned MinReadyCycle;
+ // The expected latency of the critical path in this scheduled zone.
+ unsigned ExpectedLatency;
+
+ // Resources used in the scheduled zone beyond this boundary.
+ SmallVector<unsigned, 16> ResourceCounts;
+
+ // Cache the critical resources ID in this scheduled zone.
+ unsigned CritResIdx;
+
+ // Is the scheduled region resource limited vs. latency limited.
+ bool IsResourceLimited;
+
+ unsigned ExpectedCount;
+
+ // Policy flag: attempt to find ILP until expected latency is covered.
+ bool ShouldIncreaseILP;
+
+#ifndef NDEBUG
// Remember the greatest min operand latency.
unsigned MaxMinLatency;
+#endif
+
+ void reset() {
+ Available.clear();
+ Pending.clear();
+ CheckPending = false;
+ NextSUs.clear();
+ HazardRec = 0;
+ CurrCycle = 0;
+ IssueCount = 0;
+ MinReadyCycle = UINT_MAX;
+ ExpectedLatency = 0;
+ ResourceCounts.resize(1);
+ assert(!ResourceCounts[0] && "nonzero count for bad resource");
+ CritResIdx = 0;
+ IsResourceLimited = false;
+ ExpectedCount = 0;
+ ShouldIncreaseILP = false;
+#ifndef NDEBUG
+ MaxMinLatency = 0;
+#endif
+ // Reserve a zero-count for invalid CritResIdx.
+ ResourceCounts.resize(1);
+ }
/// Pending queues extend the ready queues with the same ID and the
/// PendingFlag set.
SchedBoundary(unsigned ID, const Twine &Name):
- DAG(0), Available(ID, Name+".A"),
- Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
- CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0),
- MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+ DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
+ Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P") {
+ reset();
+ }
~SchedBoundary() { delete HazardRec; }
+ void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
+ SchedRemainder *rem);
+
bool isTop() const {
return Available.getID() == ConvergingScheduler::TopQID;
}
+ unsigned getUnscheduledLatency(SUnit *SU) const {
+ if (isTop())
+ return SU->getHeight();
+ return SU->getDepth();
+ }
+
+ unsigned getCriticalCount() const {
+ return ResourceCounts[CritResIdx];
+ }
+
bool checkHazard(SUnit *SU);
+ void checkILPPolicy();
+
void releaseNode(SUnit *SU, unsigned ReadyCycle);
void bumpCycle();
+ void countResource(unsigned PIdx, unsigned Cycles);
+
void bumpNode(SUnit *SU);
void releasePending();
@@ -841,10 +882,13 @@ class ConvergingScheduler : public MachineSchedStrategy {
SUnit *pickOnlyChoice();
};
+private:
ScheduleDAGMI *DAG;
+ const TargetSchedModel *SchedModel;
const TargetRegisterInfo *TRI;
// State of the top and bottom scheduled instruction boundaries.
+ SchedRemainder Rem;
SchedBoundary Top;
SchedBoundary Bot;
@@ -857,7 +901,7 @@ public:
};
ConvergingScheduler():
- DAG(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+ DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
virtual void initialize(ScheduleDAGMI *dag);
@@ -869,28 +913,80 @@ public:
virtual void releaseBottomNode(SUnit *SU);
+ virtual void registerRoots();
+
protected:
- SUnit *pickNodeBidrectional(bool &IsTopNode);
+ void balanceZones(
+ ConvergingScheduler::SchedBoundary &CriticalZone,
+ ConvergingScheduler::SchedCandidate &CriticalCand,
+ ConvergingScheduler::SchedBoundary &OppositeZone,
+ ConvergingScheduler::SchedCandidate &OppositeCand);
+
+ void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand,
+ ConvergingScheduler::SchedCandidate &BotCand);
+
+ void tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker);
+
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ void pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
- CandResult pickNodeFromQueue(ReadyQueue &Q,
- const RegPressureTracker &RPTracker,
- SchedCandidate &Candidate);
#ifndef NDEBUG
- void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
- PressureElement P = PressureElement());
+ void traceCandidate(const SchedCandidate &Cand, const SchedBoundary &Zone);
#endif
};
} // namespace
+void ConvergingScheduler::SchedRemainder::
+init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
+ reset();
+ if (!SchedModel->hasInstrSchedModel())
+ return;
+ RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
+ for (std::vector<SUnit>::iterator
+ I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
+ RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ RemainingCounts[PIdx] += (Factor * PI->Cycles);
+ }
+ }
+}
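For reference, a minimal standalone sketch of the accumulation init() performs
above: each instruction contributes its micro-op count plus, per resource kind,
factor * cycles in normalized units. The resource kinds, factors, and
instruction mix below are invented purely for illustration and are not part of
the patch.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>
#include <vector>

struct FakeUse   { unsigned ResIdx; unsigned Cycles; };   // one resource use
struct FakeInstr { unsigned MicroOps; std::vector<FakeUse> Uses; };

int main() {
  // Hypothetical machine: index 0 is the invalid resource, 1 = ALU, 2 = LSU.
  std::vector<unsigned> Factor = {0, 2, 1};        // assumed resource factors
  std::vector<unsigned> RemainingCounts(3, 0);
  unsigned RemainingMicroOps = 0;

  std::vector<FakeInstr> Region = {
    {1, {{1, 1}}},                    // one ALU op, 1 cycle
    {2, {{1, 1}, {2, 2}}},            // ALU 1 cycle + LSU 2 cycles
  };
  for (const FakeInstr &I : Region) {
    RemainingMicroOps += I.MicroOps;
    for (const FakeUse &U : I.Uses)
      RemainingCounts[U.ResIdx] += Factor[U.ResIdx] * U.Cycles;
  }
  std::printf("uops=%u ALU=%u LSU=%u\n", RemainingMicroOps,
              RemainingCounts[1], RemainingCounts[2]);  // uops=3 ALU=4 LSU=2
  return 0;
}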
+
+void ConvergingScheduler::SchedBoundary::
+init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
+ reset();
+ DAG = dag;
+ SchedModel = smodel;
+ Rem = rem;
+ if (SchedModel->hasInstrSchedModel())
+ ResourceCounts.resize(SchedModel->getNumProcResourceKinds());
+}
+
void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
DAG = dag;
+ SchedModel = DAG->getSchedModel();
TRI = DAG->TRI;
- Top.DAG = dag;
- Bot.DAG = dag;
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ Bot.init(DAG, SchedModel, &Rem);
+
+ // Initialize resource counts.
- // Initialize the HazardRecognizers.
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
const TargetMachine &TM = DAG->MF.getTarget();
- const InstrItineraryData *Itin = TM.getInstrItineraryData();
Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
@@ -905,13 +1001,12 @@ void ConvergingScheduler::releaseTopNode(SUnit *SU) {
for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
I != E; ++I) {
unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
- unsigned Latency =
- DAG->computeOperandLatency(I->getSUnit(), SU, *I, /*FindMin=*/true);
+ unsigned MinLatency = I->getMinLatency();
#ifndef NDEBUG
- Top.MaxMinLatency = std::max(Latency, Top.MaxMinLatency);
+ Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
#endif
- if (SU->TopReadyCycle < PredReadyCycle + Latency)
- SU->TopReadyCycle = PredReadyCycle + Latency;
+ if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+ SU->TopReadyCycle = PredReadyCycle + MinLatency;
}
Top.releaseNode(SU, SU->TopReadyCycle);
}
@@ -925,17 +1020,27 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
I != E; ++I) {
unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
- unsigned Latency =
- DAG->computeOperandLatency(SU, I->getSUnit(), *I, /*FindMin=*/true);
+ unsigned MinLatency = I->getMinLatency();
#ifndef NDEBUG
- Bot.MaxMinLatency = std::max(Latency, Bot.MaxMinLatency);
+ Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
#endif
- if (SU->BotReadyCycle < SuccReadyCycle + Latency)
- SU->BotReadyCycle = SuccReadyCycle + Latency;
+ if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+ SU->BotReadyCycle = SuccReadyCycle + MinLatency;
}
Bot.releaseNode(SU, SU->BotReadyCycle);
}
+void ConvergingScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+ // Some roots may not feed into ExitSU. Check all of them in case.
+ for (std::vector<SUnit*>::const_iterator
+ I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
+ if ((*I)->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = (*I)->getDepth();
+ }
+ DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+}
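A standalone sketch of what registerRoots() computes above: start from ExitSU's
depth and raise it to the largest depth among the bottom-ready roots, since some
roots may not feed into ExitSU. The depths are made-up values.

// Standalone C++ sketch (not part of the patch).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  unsigned ExitDepth = 7;                           // assumed depth of ExitSU
  std::vector<unsigned> BotRootDepths = {5, 9, 3};  // assumed root depths
  unsigned CriticalPath = ExitDepth;
  for (unsigned D : BotRootDepths)
    CriticalPath = std::max(CriticalPath, D);
  std::printf("Critical Path: %u\n", CriticalPath);  // prints 9
  return 0;
}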
+
/// Does this SU have a hazard within the current instruction group.
///
/// The scheduler supports two modes of hazard recognition. The first is the
@@ -953,14 +1058,27 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
if (HazardRec->isEnabled())
return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
- if (IssueCount + DAG->getNumMicroOps(SU->getInstr()) > DAG->getIssueWidth())
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) {
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
+ << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
return true;
-
+ }
return false;
}
+/// If expected latency is covered, disable ILP policy.
+void ConvergingScheduler::SchedBoundary::checkILPPolicy() {
+ if (ShouldIncreaseILP
+ && (IsResourceLimited || ExpectedLatency <= CurrCycle)) {
+ ShouldIncreaseILP = false;
+ DEBUG(dbgs() << "Disable ILP: " << Available.getName() << '\n');
+ }
+}
+
void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
unsigned ReadyCycle) {
+
if (ReadyCycle < MinReadyCycle)
MinReadyCycle = ReadyCycle;
@@ -970,15 +1088,31 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
Pending.push(SU);
else
Available.push(SU);
+
+ // Record this node as an immediate dependent of the scheduled node.
+ NextSUs.insert(SU);
+
+ // If CriticalPath has been computed, then check if the unscheduled nodes
+ // exceed the ILP window. Before registerRoots, CriticalPath==0.
+ if (Rem->CriticalPath && (ExpectedLatency + getUnscheduledLatency(SU)
+ > Rem->CriticalPath + ILPWindow)) {
+ ShouldIncreaseILP = true;
+ DEBUG(dbgs() << "Increase ILP: " << Available.getName() << " "
+ << ExpectedLatency << " + " << getUnscheduledLatency(SU) << '\n');
+ }
}
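The ILP-window test added to releaseNode() above can be shown with a standalone
sketch. ILPWindow is treated here as an assumed 10-cycle slack value and the
latencies are invented; only the shape of the comparison matters.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>

int main() {
  const unsigned ILPWindow = 10;     // assumed slack, in cycles
  unsigned CriticalPath = 20;        // region critical path (0 before registerRoots)
  unsigned ExpectedLatency = 14;     // latency already in this zone
  unsigned UnscheduledLatency = 18;  // height/depth of the released node

  bool ShouldIncreaseILP =
      CriticalPath && (ExpectedLatency + UnscheduledLatency >
                       CriticalPath + ILPWindow);
  std::printf("increase ILP: %s\n", ShouldIncreaseILP ? "yes" : "no");  // yes
  return 0;
}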
/// Move the boundary of scheduled code by one cycle.
void ConvergingScheduler::SchedBoundary::bumpCycle() {
- unsigned Width = DAG->getIssueWidth();
+ unsigned Width = SchedModel->getIssueWidth();
IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+ unsigned NextCycle = CurrCycle + 1;
assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
- unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+ if (MinReadyCycle > NextCycle) {
+ IssueCount = 0;
+ NextCycle = MinReadyCycle;
+ }
if (!HazardRec->isEnabled()) {
// Bypass HazardRec virtual calls.
@@ -994,11 +1128,39 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() {
}
}
CheckPending = true;
+ IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
- DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+ DEBUG(dbgs() << " *** " << Available.getName() << " cycle "
<< CurrCycle << '\n');
}
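A standalone sketch of the micro-op carry handled by the first statement of
bumpCycle(): issue counts above the machine width spill into the next cycle
rather than being dropped. The width and count below are assumptions.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>

int main() {
  unsigned Width = 4;        // assumed issue width
  unsigned IssueCount = 6;   // e.g. a 6-uop instruction was just issued
  IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
  std::printf("uops carried into the next cycle: %u\n", IssueCount);  // 2
  return 0;
}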
+/// Add the given processor resource to this scheduled zone.
+void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx,
+ unsigned Cycles) {
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ DEBUG(dbgs() << " " << SchedModel->getProcResource(PIdx)->Name
+ << " +(" << Cycles << "x" << Factor
+ << ") / " << SchedModel->getLatencyFactor() << '\n');
+
+ unsigned Count = Factor * Cycles;
+ ResourceCounts[PIdx] += Count;
+ assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
+ Rem->RemainingCounts[PIdx] -= Count;
+
+ // Reset MaxRemainingCount for sanity.
+ Rem->MaxRemainingCount = 0;
+
+ // Check if this resource exceeds the current critical resource by a full
+ // cycle. If so, it becomes the critical resource.
+ if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx])
+ >= (int)SchedModel->getLatencyFactor()) {
+ CritResIdx = PIdx;
+ DEBUG(dbgs() << " *** Critical resource "
+ << SchedModel->getProcResource(PIdx)->Name << " x"
+ << ResourceCounts[PIdx] << '\n');
+ }
+}
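A standalone sketch of the bookkeeping in countResource(): counts are kept in
normalized units (factor * cycles), and a resource takes over as the critical
one once it leads the current critical resource by at least one latency factor,
i.e. a full cycle's worth of work. The indices, factors, and cycle counts are
invented.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>
#include <vector>

int main() {
  const unsigned LatencyFactor = 2;           // assumed
  std::vector<unsigned> Factor = {0, 2, 1};   // per-resource factors (assumed)
  std::vector<unsigned> Counts(3, 0);
  unsigned CritResIdx = 0;                    // 0 = "no critical resource"

  auto countResource = [&](unsigned PIdx, unsigned Cycles) {
    Counts[PIdx] += Factor[PIdx] * Cycles;
    if ((int)(Counts[PIdx] - Counts[CritResIdx]) >= (int)LatencyFactor)
      CritResIdx = PIdx;
  };

  countResource(2, 1);  // LSU count 1: not critical yet (needs a 2-unit lead)
  countResource(1, 1);  // ALU count 2: leads resource 0 by 2, becomes critical
  countResource(2, 3);  // LSU count 4: leads the ALU by 2, takes over
  std::printf("critical resource index = %u\n", CritResIdx);  // prints 2
  return 0;
}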
+
/// Move the boundary of scheduled code by one SUnit.
void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
// Update the reservation table.
@@ -1010,11 +1172,38 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
}
HazardRec->EmitInstruction(SU);
}
+ // Update resource counts and critical resource.
+ if (SchedModel->hasInstrSchedModel()) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ countResource(PI->ProcResourceIdx, PI->Cycles);
+ }
+ }
+ if (isTop()) {
+ if (SU->getDepth() > ExpectedLatency)
+ ExpectedLatency = SU->getDepth();
+ }
+ else {
+ if (SU->getHeight() > ExpectedLatency)
+ ExpectedLatency = SU->getHeight();
+ }
+
+ IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
+
// Check the instruction group dispatch limit.
// TODO: Check if this SU must end a dispatch group.
- IssueCount += DAG->getNumMicroOps(SU->getInstr());
- if (IssueCount >= DAG->getIssueWidth()) {
- DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+
+ // checkHazard prevents scheduling multiple instructions per cycle that exceed
+ // issue width. However, we commonly reach the maximum. In this case
+ // opportunistically bump the cycle to avoid uselessly checking everything in
+ // the readyQ. Furthermore, a single instruction may produce more than one
+ // cycle's worth of micro-ops.
+ if (IssueCount >= SchedModel->getIssueWidth()) {
+ DEBUG(dbgs() << " *** Max instrs at cycle " << CurrCycle << '\n');
bumpCycle();
}
}
@@ -1045,6 +1234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() {
Pending.remove(Pending.begin()+i);
--i; --e;
}
+ DEBUG(if (!Pending.empty()) Pending.dump());
CheckPending = false;
}
@@ -1059,12 +1249,23 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
}
/// If this queue only has one ready candidate, return it. As a side effect,
-/// advance the cycle until at least one node is ready. If multiple instructions
-/// are ready, return NULL.
+/// defer any nodes that now hit a hazard, and advance the cycle until at least
+/// one node is ready. If multiple instructions are ready, return NULL.
SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
+ if (IssueCount > 0) {
+ // Defer any ready instrs that now have a hazard.
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+ if (checkHazard(*I)) {
+ Pending.push(*I);
+ I = Available.remove(I);
+ continue;
+ }
+ ++I;
+ }
+ }
for (unsigned i = 0; Available.empty(); ++i) {
assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
"permanent hazard"); (void)i;
@@ -1076,18 +1277,262 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
return NULL;
}
-#ifndef NDEBUG
-void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q,
- SUnit *SU, PressureElement P) {
- dbgs() << Label << " " << Q.getName() << " ";
- if (P.isValid())
- dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
- << " ";
- else
- dbgs() << " ";
- SU->dump(DAG);
+/// Record the candidate policy for opposite zones with different critical
+/// resources.
+///
+/// If the CriticalZone is latency limited, don't force a policy for the
+/// candidates here. Instead, when releasing each candidate, releaseNode
+/// compares the region's critical path to the candidate's height or depth and
+/// the scheduled zone's expected latency, then sets ShouldIncreaseILP.
+void ConvergingScheduler::balanceZones(
+ ConvergingScheduler::SchedBoundary &CriticalZone,
+ ConvergingScheduler::SchedCandidate &CriticalCand,
+ ConvergingScheduler::SchedBoundary &OppositeZone,
+ ConvergingScheduler::SchedCandidate &OppositeCand) {
+
+ if (!CriticalZone.IsResourceLimited)
+ return;
+
+ SchedRemainder *Rem = CriticalZone.Rem;
+
+ // If the critical zone is overconsuming a resource relative to the
+ // remainder, try to reduce it.
+ unsigned RemainingCritCount =
+ Rem->RemainingCounts[CriticalZone.CritResIdx];
+ if ((int)(Rem->MaxRemainingCount - RemainingCritCount)
+ > (int)SchedModel->getLatencyFactor()) {
+ CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx;
+ DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce "
+ << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name
+ << '\n');
+ }
+ // If the other zone is underconsuming a resource relative to the full zone,
+ // try to increase it.
+ unsigned OppositeCount =
+ OppositeZone.ResourceCounts[CriticalZone.CritResIdx];
+ if ((int)(OppositeZone.ExpectedCount - OppositeCount)
+ > (int)SchedModel->getLatencyFactor()) {
+ OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx;
+ DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand "
+ << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name
+ << '\n');
+ }
+}
+
+/// Determine if the scheduled zones exceed resource limits or critical path and
+/// set each candidate's ReduceHeight policy accordingly.
+void ConvergingScheduler::checkResourceLimits(
+ ConvergingScheduler::SchedCandidate &TopCand,
+ ConvergingScheduler::SchedCandidate &BotCand) {
+
+ Bot.checkILPPolicy();
+ Top.checkILPPolicy();
+ if (Bot.ShouldIncreaseILP)
+ BotCand.Policy.ReduceLatency = true;
+ if (Top.ShouldIncreaseILP)
+ TopCand.Policy.ReduceLatency = true;
+
+ // Handle resource-limited regions.
+ if (Top.IsResourceLimited && Bot.IsResourceLimited
+ && Top.CritResIdx == Bot.CritResIdx) {
+ // If the scheduled critical resource in both zones is no longer the
+ // critical remaining resource, attempt to reduce resource height both ways.
+ if (Top.CritResIdx != Rem.CritResIdx) {
+ TopCand.Policy.ReduceResIdx = Top.CritResIdx;
+ BotCand.Policy.ReduceResIdx = Bot.CritResIdx;
+ DEBUG(dbgs() << "Reduce scheduled "
+ << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n');
+ }
+ return;
+ }
+ // Handle latency-limited regions.
+ if (!Top.IsResourceLimited && !Bot.IsResourceLimited) {
+ // If the total scheduled expected latency exceeds the region's critical
+ // path then reduce latency both ways.
+ //
+ // Just because a zone is not resource limited does not mean it is latency
+    // limited. Unbuffered resources, such as max micro-ops, may cause
+    // CurrCycle to exceed the expected latency.
+ if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath)
+ && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) {
+ TopCand.Policy.ReduceLatency = true;
+ BotCand.Policy.ReduceLatency = true;
+ DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency
+ << " + " << Bot.ExpectedLatency << '\n');
+ }
+ return;
+ }
+ // The critical resource is different in each zone, so request balancing.
+
+ // Compute the cost of each zone.
+ Rem.MaxRemainingCount = std::max(
+ Rem.RemainingMicroOps * SchedModel->getMicroOpFactor(),
+ Rem.RemainingCounts[Rem.CritResIdx]);
+ Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle);
+ Top.ExpectedCount = std::max(
+ Top.getCriticalCount(),
+ Top.ExpectedCount * SchedModel->getLatencyFactor());
+ Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle);
+ Bot.ExpectedCount = std::max(
+ Bot.getCriticalCount(),
+ Bot.ExpectedCount * SchedModel->getLatencyFactor());
+
+ balanceZones(Top, TopCand, Bot, BotCand);
+ balanceZones(Bot, BotCand, Top, TopCand);
+}
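A worked standalone sketch of the zone cost computed at the end of
checkResourceLimits(): the expected count is the larger of the zone's latency
scaled into resource units and its critical resource count. The numbers are
made up.

// Standalone C++ sketch (not part of the patch).
#include <algorithm>
#include <cstdio>

int main() {
  const unsigned LatencyFactor = 2;   // assumed resource units per cycle
  unsigned ExpectedLatency = 6, CurrCycle = 4, CriticalCount = 10;

  unsigned ExpectedCount = std::max(ExpectedLatency, CurrCycle);  // 6 cycles
  ExpectedCount = std::max(CriticalCount, ExpectedCount * LatencyFactor);
  // 6 cycles * 2 = 12 latency units vs. 10 resource units: latency dominates.
  std::printf("ExpectedCount = %u\n", ExpectedCount);  // prints 12
  return 0;
}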
+
+void ConvergingScheduler::SchedCandidate::
+initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel) {
+ if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+ return;
+
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (PI->ProcResourceIdx == Policy.ReduceResIdx)
+ ResDelta.CritResources += PI->Cycles;
+ if (PI->ProcResourceIdx == Policy.DemandResIdx)
+ ResDelta.DemandedResources += PI->Cycles;
+ }
+}
+
+/// Return true if this heuristic determines order.
+static bool tryLess(unsigned TryVal, unsigned CandVal,
+ ConvergingScheduler::SchedCandidate &TryCand,
+ ConvergingScheduler::SchedCandidate &Cand,
+ ConvergingScheduler::CandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
+static bool tryGreater(unsigned TryVal, unsigned CandVal,
+ ConvergingScheduler::SchedCandidate &TryCand,
+ ConvergingScheduler::SchedCandidate &Cand,
+ ConvergingScheduler::CandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
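A standalone sketch of how these helpers chain into a tie-breaking cascade:
each heuristic either decides the comparison and records a reason, or returns
false so the next heuristic runs. The candidate type and reason names below are
simplified stand-ins, not the scheduler's real types.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>

// Priority order: lower enum value = more important reason.
enum CandReason { RegExcess, LatencyReduce, NodeOrder, NoCand };

struct Cand { unsigned Excess; unsigned Height; CandReason Reason; };

// Same shape as the patch's tryLess: on a win the challenger records why; on a
// loss the incumbent's reason is capped at R so traces show what decided it.
static bool tryLess(unsigned TryVal, unsigned CandVal,
                    Cand &TryCand, Cand &Best, CandReason R) {
  if (TryVal < CandVal) { TryCand.Reason = R; return true; }
  if (TryVal > CandVal) {
    if (Best.Reason > R) Best.Reason = R;
    return true;
  }
  return false;  // tie: defer to the next heuristic
}

int main() {
  Cand Best = {2, 5, NoCand};
  Cand TryCand = {2, 3, NoCand};
  if (!tryLess(TryCand.Excess, Best.Excess, TryCand, Best, RegExcess))
    tryLess(TryCand.Height, Best.Height, TryCand, Best, LatencyReduce);
  // Excess ties, height decides: TryCand.Reason == LatencyReduce (1).
  std::printf("challenger reason = %d\n", TryCand.Reason);  // prints 1
  return 0;
}

When the incumbent wins every comparison, the challenger's Reason stays NoCand,
which is how pickNodeFromQueue below knows not to call setBest for it.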
+
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
+/// hierarchical. This may be more efficient than a graduated cost model because
+/// we don't need to evaluate all aspects of the model for each node in the
+/// queue. But it's really done to make the heuristics easier to debug and
+/// statistically analyze.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending.
+/// \param RPTracker describes reg pressure within the scheduled zone.
+/// \param TempTracker is a scratch pressure tracker to reuse in queries.
+void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker) {
+
+ // Always initialize TryCand's RPDelta.
+ TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+ // Avoid exceeding the target's limit.
+ if (tryLess(TryCand.RPDelta.Excess.UnitIncrease,
+ Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess))
+ return;
+ if (Cand.Reason == SingleExcess)
+ Cand.Reason = MultiPressure;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease,
+ Cand.RPDelta.CriticalMax.UnitIncrease,
+ TryCand, Cand, SingleCritical))
+ return;
+ if (Cand.Reason == SingleCritical)
+ Cand.Reason = MultiPressure;
+
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency) {
+ if (Zone.isTop()) {
+ if (Cand.SU->getDepth() * SchedModel->getLatencyFactor()
+ > Zone.ExpectedCount) {
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, TopDepthReduce))
+ return;
+ }
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, TopPathReduce))
+ return;
+ }
+ else {
+ if (Cand.SU->getHeight() * SchedModel->getLatencyFactor()
+ > Zone.ExpectedCount) {
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, BotHeightReduce))
+ return;
+ }
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, BotPathReduce))
+ return;
+ }
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease,
+ Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax))
+ return;
+ if (Cand.Reason == SingleMax)
+ Cand.Reason = MultiPressure;
+
+ // Prefer immediate defs/users of the last scheduled instruction. This is a
+ // nice pressure avoidance strategy that also conserves the processor's
+ // register renaming resources and keeps the machine code readable.
+ if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
+ TryCand.Reason = NextDefUse;
+ return;
+ }
+ if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
+ if (Cand.Reason > NextDefUse)
+ Cand.Reason = NextDefUse;
+ return;
+ }
+ // Fall through to original instruction order.
+ if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ }
}
-#endif
/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
/// more desirable than RHS from a scheduling standpoint.
@@ -1098,109 +1543,144 @@ static bool compareRPDelta(const RegPressureDelta &LHS,
// have UnitIncrease==0, so are neutral.
// Avoid increasing the max critical pressure in the scheduled region.
- if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease)
+ if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) {
+ DEBUG(dbgs() << "RP excess top - bot: "
+ << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n');
return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
-
+ }
// Avoid increasing the max critical pressure in the scheduled region.
- if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease)
+ if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) {
+ DEBUG(dbgs() << "RP critical top - bot: "
+ << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease)
+ << '\n');
return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
-
+ }
// Avoid increasing the max pressure of the entire region.
- if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease)
+ if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) {
+ DEBUG(dbgs() << "RP current top - bot: "
+ << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease)
+ << '\n');
return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
-
+ }
return false;
}
+#ifndef NDEBUG
+const char *ConvergingScheduler::getReasonStr(
+ ConvergingScheduler::CandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND ";
+ case SingleExcess: return "REG-EXCESS";
+ case SingleCritical: return "REG-CRIT ";
+ case SingleMax: return "REG-MAX ";
+ case MultiPressure: return "REG-MULTI ";
+ case ResourceReduce: return "RES-REDUCE";
+ case ResourceDemand: return "RES-DEMAND";
+ case TopDepthReduce: return "TOP-DEPTH ";
+ case TopPathReduce: return "TOP-PATH ";
+ case BotHeightReduce:return "BOT-HEIGHT";
+ case BotPathReduce: return "BOT-PATH ";
+ case NextDefUse: return "DEF-USE ";
+ case NodeOrder: return "ORDER ";
+ };
+ llvm_unreachable("Unknown reason!");
+}
+
+void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand,
+ const SchedBoundary &Zone) {
+ const char *Label = getReasonStr(Cand.Reason);
+ PressureElement P;
+ unsigned ResIdx = 0;
+ unsigned Latency = 0;
+ switch (Cand.Reason) {
+ default:
+ break;
+ case SingleExcess:
+ P = Cand.RPDelta.Excess;
+ break;
+ case SingleCritical:
+ P = Cand.RPDelta.CriticalMax;
+ break;
+ case SingleMax:
+ P = Cand.RPDelta.CurrentMax;
+ break;
+ case ResourceReduce:
+ ResIdx = Cand.Policy.ReduceResIdx;
+ break;
+ case ResourceDemand:
+ ResIdx = Cand.Policy.DemandResIdx;
+ break;
+ case TopDepthReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ case TopPathReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotHeightReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotPathReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ }
+ dbgs() << Label << " " << Zone.Available.getName() << " ";
+ if (P.isValid())
+ dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+ << " ";
+ else
+ dbgs() << " ";
+ if (ResIdx)
+ dbgs() << SchedModel->getProcResource(ResIdx)->Name << " ";
+ else
+ dbgs() << " ";
+ if (Latency)
+ dbgs() << Latency << " cycles ";
+ else
+ dbgs() << " ";
+ Cand.SU->dump(DAG);
+}
+#endif
+
/// Pick the best candidate from the top queue.
///
/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
-ConvergingScheduler::CandResult ConvergingScheduler::
-pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
- SchedCandidate &Candidate) {
+void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
+ ReadyQueue &Q = Zone.Available;
+
DEBUG(Q.dump());
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
- // BestSU remains NULL if no top candidates beat the best existing candidate.
- CandResult FoundCandidate = NoCand;
for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
- RegPressureDelta RPDelta;
- TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
- DAG->getRegionCriticalPSets(),
- DAG->getRegPressure().MaxSetPressure);
-
- // Initialize the candidate if needed.
- if (!Candidate.SU) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = NodeOrder;
- continue;
- }
- // Avoid exceeding the target's limit.
- if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) {
- DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = SingleExcess;
- continue;
- }
- if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease)
- continue;
- if (FoundCandidate == SingleExcess)
- FoundCandidate = MultiPressure;
-
- // Avoid increasing the max critical pressure in the scheduled region.
- if (RPDelta.CriticalMax.UnitIncrease
- < Candidate.RPDelta.CriticalMax.UnitIncrease) {
- DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = SingleCritical;
- continue;
- }
- if (RPDelta.CriticalMax.UnitIncrease
- > Candidate.RPDelta.CriticalMax.UnitIncrease)
- continue;
- if (FoundCandidate == SingleCritical)
- FoundCandidate = MultiPressure;
-
- // Avoid increasing the max pressure of the entire region.
- if (RPDelta.CurrentMax.UnitIncrease
- < Candidate.RPDelta.CurrentMax.UnitIncrease) {
- DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = SingleMax;
- continue;
- }
- if (RPDelta.CurrentMax.UnitIncrease
- > Candidate.RPDelta.CurrentMax.UnitIncrease)
- continue;
- if (FoundCandidate == SingleMax)
- FoundCandidate = MultiPressure;
-
- // Fall through to original instruction order.
- // Only consider node order if Candidate was chosen from this Q.
- if (FoundCandidate == NoCand)
- continue;
- if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
- || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
- DEBUG(traceCandidate("NCAND", Q, *I));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = NodeOrder;
+ SchedCandidate TryCand(Cand.Policy);
+ TryCand.SU = *I;
+ tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(DAG, SchedModel);
+ Cand.setBest(TryCand);
+ DEBUG(traceCandidate(Cand, Zone));
}
+ TryCand.SU = *I;
}
- return FoundCandidate;
+}
+
+static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
+ bool IsTop) {
+ DEBUG(dbgs() << "Pick " << (IsTop ? "top" : "bot")
+ << " SU(" << Cand.SU->NodeNum << ") "
+ << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
}
/// Pick the best candidate node from either the top or bottom queue.
-SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
+SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -1211,11 +1691,14 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
IsTopNode = true;
return SU;
}
- SchedCandidate BotCand;
+ CandPolicy NoPolicy;
+ SchedCandidate BotCand(NoPolicy);
+ SchedCandidate TopCand(NoPolicy);
+ checkResourceLimits(TopCand, BotCand);
+
// Prefer bottom scheduling when heuristics are silent.
- CandResult BotResult = pickNodeFromQueue(Bot.Available,
- DAG->getBotRPTracker(), BotCand);
- assert(BotResult != NoCand && "failed to find the first candidate");
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
// If either Q has a single candidate that provides the least increase in
// Excess pressure, we can immediately schedule from that Q.
@@ -1224,37 +1707,41 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
// affects picking from either Q. If scheduling in one direction must
// increase pressure for one of the excess PSets, then schedule in that
// direction first to provide more freedom in the other direction.
- if (BotResult == SingleExcess || BotResult == SingleCritical) {
+ if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) {
IsTopNode = false;
+ tracePick(BotCand, IsTopNode);
return BotCand.SU;
}
// Check if the top Q has a better candidate.
- SchedCandidate TopCand;
- CandResult TopResult = pickNodeFromQueue(Top.Available,
- DAG->getTopRPTracker(), TopCand);
- assert(TopResult != NoCand && "failed to find the first candidate");
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
- if (TopResult == SingleExcess || TopResult == SingleCritical) {
- IsTopNode = true;
- return TopCand.SU;
- }
// If either Q has a single candidate that minimizes pressure above the
// original region's pressure pick it.
- if (BotResult == SingleMax) {
+ if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) {
+ if (TopCand.Reason < BotCand.Reason) {
+ IsTopNode = true;
+ tracePick(TopCand, IsTopNode);
+ return TopCand.SU;
+ }
IsTopNode = false;
+ tracePick(BotCand, IsTopNode);
return BotCand.SU;
}
- if (TopResult == SingleMax) {
+ // Check for a salient pressure difference and pick the best from either side.
+ if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
IsTopNode = true;
+ tracePick(TopCand, IsTopNode);
return TopCand.SU;
}
- // Check for a salient pressure difference and pick the best from either side.
- if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
+ // Otherwise prefer the bottom candidate, in node order if all else failed.
+ if (TopCand.Reason < BotCand.Reason) {
IsTopNode = true;
+ tracePick(TopCand, IsTopNode);
return TopCand.SU;
}
- // Otherwise prefer the bottom candidate in node order.
IsTopNode = false;
+ tracePick(BotCand, IsTopNode);
return BotCand.SU;
}
@@ -1266,33 +1753,34 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
return NULL;
}
SUnit *SU;
- if (ForceTopDown) {
- SU = Top.pickOnlyChoice();
- if (!SU) {
- SchedCandidate TopCand;
- CandResult TopResult =
- pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
- assert(TopResult != NoCand && "failed to find the first candidate");
- (void)TopResult;
- SU = TopCand.SU;
+ do {
+ if (ForceTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ SchedCandidate TopCand(NoPolicy);
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
}
- IsTopNode = true;
- }
- else if (ForceBottomUp) {
- SU = Bot.pickOnlyChoice();
- if (!SU) {
- SchedCandidate BotCand;
- CandResult BotResult =
- pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
- assert(BotResult != NoCand && "failed to find the first candidate");
- (void)BotResult;
- SU = BotCand.SU;
+ else if (ForceBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ SchedCandidate BotCand(NoPolicy);
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
}
- IsTopNode = false;
- }
- else {
- SU = pickNodeBidrectional(IsTopNode);
- }
+ else {
+ SU = pickNodeBidirectional(IsTopNode);
+ }
+ } while (SU->isScheduled);
+
if (SU->isTopReady())
Top.removeReady(SU);
if (SU->isBottomReady())
@@ -1331,6 +1819,86 @@ ConvergingSchedRegistry("converge", "Standard converging scheduler.",
createConvergingSched);
//===----------------------------------------------------------------------===//
+// ILP Scheduler. Currently for experimental analysis of heuristics.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Order nodes by the ILP metric.
+struct ILPOrder {
+ ScheduleDAGILP *ILP;
+ bool MaximizeILP;
+
+ ILPOrder(ScheduleDAGILP *ilp, bool MaxILP): ILP(ilp), MaximizeILP(MaxILP) {}
+
+ /// \brief Apply a less-than relation on node priority.
+ bool operator()(const SUnit *A, const SUnit *B) const {
+ // Return true if A comes after B in the Q.
+ if (MaximizeILP)
+ return ILP->getILP(A) < ILP->getILP(B);
+ else
+ return ILP->getILP(A) > ILP->getILP(B);
+ }
+};
+
+/// \brief Schedule based on the ILP metric.
+class ILPScheduler : public MachineSchedStrategy {
+ ScheduleDAGILP ILP;
+ ILPOrder Cmp;
+
+ std::vector<SUnit*> ReadyQ;
+public:
+ ILPScheduler(bool MaximizeILP)
+ : ILP(/*BottomUp=*/true), Cmp(&ILP, MaximizeILP) {}
+
+ virtual void initialize(ScheduleDAGMI *DAG) {
+ ReadyQ.clear();
+ ILP.resize(DAG->SUnits.size());
+ }
+
+ virtual void registerRoots() {
+ for (std::vector<SUnit*>::const_iterator
+ I = ReadyQ.begin(), E = ReadyQ.end(); I != E; ++I) {
+ ILP.computeILP(*I);
+ }
+ }
+
+ /// Implement MachineSchedStrategy interface.
+ /// -----------------------------------------
+
+ virtual SUnit *pickNode(bool &IsTopNode) {
+ if (ReadyQ.empty()) return NULL;
+    std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ SUnit *SU = ReadyQ.back();
+ ReadyQ.pop_back();
+ IsTopNode = false;
+ DEBUG(dbgs() << "*** Scheduling " << *SU->getInstr()
+ << " ILP: " << ILP.getILP(SU) << '\n');
+ return SU;
+ }
+
+ virtual void schedNode(SUnit *, bool) {}
+
+ virtual void releaseTopNode(SUnit *) { /*only called for top roots*/ }
+
+ virtual void releaseBottomNode(SUnit *SU) {
+ ReadyQ.push_back(SU);
+ std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+};
+} // namespace
+
+static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMI(C, new ILPScheduler(true));
+}
+static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMI(C, new ILPScheduler(false));
+}
+static MachineSchedRegistry ILPMaxRegistry(
+ "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
+static MachineSchedRegistry ILPMinRegistry(
+ "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
+
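A standalone sketch of the ready-queue ordering ILPScheduler relies on: the
less-than comparator keeps the vector arranged as a max-heap, so pop_heap always
surfaces the node with the largest metric (ILPOrder flips the comparison for
ilpmin). Node IDs and metric values are invented. The registered names above
should be selectable through the machine scheduler's -misched=<name> option set
up earlier in this file.

// Standalone C++ sketch (not part of the patch).
#include <algorithm>
#include <cstdio>
#include <vector>

struct Node { unsigned Id; unsigned ILP; };

int main() {
  std::vector<Node> ReadyQ;
  auto Cmp = [](const Node &A, const Node &B) { return A.ILP < B.ILP; };

  for (Node N : {Node{0, 3}, Node{1, 7}, Node{2, 5}}) {
    ReadyQ.push_back(N);
    std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
  }
  while (!ReadyQ.empty()) {
    std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
    Node N = ReadyQ.back();
    ReadyQ.pop_back();
    std::printf("schedule SU(%u) ILP=%u\n", N.Id, N.ILP);  // 7, then 5, then 3
  }
  return 0;
}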
+//===----------------------------------------------------------------------===//
// Machine Instruction Shuffler for Correctness Testing
//===----------------------------------------------------------------------===//
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index bc383cba455c..b117f8c3a206 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -49,7 +49,6 @@ namespace {
MachineDominatorTree *DT; // Machine dominator tree
MachineLoopInfo *LI;
AliasAnalysis *AA;
- BitVector AllocatableSet; // Which physregs are allocatable?
// Remember which edges have been considered for breaking.
SmallSet<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 8>
@@ -229,7 +228,6 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
DT = &getAnalysis<MachineDominatorTree>();
LI = &getAnalysis<MachineLoopInfo>();
AA = &getAnalysis<AliasAnalysis>();
- AllocatableSet = TRI->getAllocatableSet(MF);
bool EverMadeChange = false;
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 1a3aa6091825..9686b0413293 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -14,9 +14,10 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/PostOrderIterator.h"
@@ -50,9 +51,11 @@ bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
MF = &Func;
TII = MF->getTarget().getInstrInfo();
TRI = MF->getTarget().getRegisterInfo();
- ItinData = MF->getTarget().getInstrItineraryData();
MRI = &MF->getRegInfo();
Loops = &getAnalysis<MachineLoopInfo>();
+ const TargetSubtargetInfo &ST =
+ MF->getTarget().getSubtarget<TargetSubtargetInfo>();
+ SchedModel.init(*ST.getSchedModel(), &ST, TII);
BlockInfo.resize(MF->getNumBlockIDs());
return false;
}
@@ -674,7 +677,7 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) {
const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
// Ignore dependencies outside the current trace.
const TraceBlockInfo &DefTBI = BlockInfo[DefMI->getParent()->getNumber()];
- if (!DefTBI.hasValidDepth() || DefTBI.Head != TBI.Head)
+ if (!DefTBI.isEarlierInSameTrace(TBI))
continue;
unsigned Len = LIR.Height + Cycles[DefMI].Depth;
MaxLen = std::max(MaxLen, Len);
@@ -737,16 +740,15 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
const TraceBlockInfo&DepTBI =
BlockInfo[Dep.DefMI->getParent()->getNumber()];
// Ignore dependencies from outside the current trace.
- if (!DepTBI.hasValidDepth() || DepTBI.Head != TBI.Head)
+ if (!DepTBI.isEarlierInSameTrace(TBI))
continue;
assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency");
unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
// Add latency if DefMI is a real instruction. Transients get latency 0.
if (!Dep.DefMI->isTransient())
- DepCycle += MTM.TII->computeOperandLatency(MTM.ItinData,
- Dep.DefMI, Dep.DefOp,
- UseMI, Dep.UseOp,
- /* FindMin = */ false);
+ DepCycle += MTM.SchedModel
+ .computeOperandLatency(Dep.DefMI, Dep.DefOp, UseMI, Dep.UseOp,
+ /* FindMin = */ false);
Cycle = std::max(Cycle, DepCycle);
}
// Remember the instruction depth.
@@ -769,7 +771,7 @@ computeInstrDepths(const MachineBasicBlock *MBB) {
// Height is the issue height computed from virtual register dependencies alone.
static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
SparseSet<LiveRegUnit> &RegUnits,
- const InstrItineraryData *ItinData,
+ const TargetSchedModel &SchedModel,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
SmallVector<unsigned, 8> ReadOps;
@@ -792,14 +794,10 @@ static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height,
unsigned DepHeight = I->Cycle;
if (!MI->isTransient()) {
// We may not know the UseMI of this dependency, if it came from the
- // live-in list.
- if (I->MI)
- DepHeight += TII->computeOperandLatency(ItinData,
- MI, MO.getOperandNo(),
- I->MI, I->Op);
- else
- // No UseMI. Just use the MI latency instead.
- DepHeight += TII->getInstrLatency(ItinData, MI);
+ // live-in list. SchedModel can handle a NULL UseMI.
+ DepHeight += SchedModel
+ .computeOperandLatency(MI, MO.getOperandNo(), I->MI, I->Op,
+ /* FindMin = */ false);
}
Height = std::max(Height, DepHeight);
// This regunit is dead above MI.
@@ -832,12 +830,12 @@ typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
static bool pushDepHeight(const DataDep &Dep,
const MachineInstr *UseMI, unsigned UseHeight,
MIHeightMap &Heights,
- const InstrItineraryData *ItinData,
+ const TargetSchedModel &SchedModel,
const TargetInstrInfo *TII) {
// Adjust height by Dep.DefMI latency.
if (!Dep.DefMI->isTransient())
- UseHeight += TII->computeOperandLatency(ItinData, Dep.DefMI, Dep.DefOp,
- UseMI, Dep.UseOp);
+ UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
+ UseMI, Dep.UseOp, false);
// Update Heights[DefMI] to be the maximum height seen.
MIHeightMap::iterator I;
@@ -852,14 +850,14 @@ static bool pushDepHeight(const DataDep &Dep,
return false;
}
-/// Assuming that DefMI was used by Trace.back(), add it to the live-in lists
-/// of all the blocks in Trace. Stop when reaching the block that contains
-/// DefMI.
+/// Assuming that the virtual register defined by DefMI:DefOp was used by
+/// Trace.back(), add it to the live-in lists of all the blocks in Trace. Stop
+/// when reaching the block that contains DefMI.
void MachineTraceMetrics::Ensemble::
-addLiveIns(const MachineInstr *DefMI,
+addLiveIns(const MachineInstr *DefMI, unsigned DefOp,
ArrayRef<const MachineBasicBlock*> Trace) {
assert(!Trace.empty() && "Trace should contain at least one block");
- unsigned Reg = DefMI->getOperand(0).getReg();
+ unsigned Reg = DefMI->getOperand(DefOp).getReg();
assert(TargetRegisterInfo::isVirtualRegister(Reg));
const MachineBasicBlock *DefMBB = DefMI->getParent();
@@ -951,8 +949,8 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
unsigned Height = TBI.Succ ? Cycles.lookup(PHI).Height : 0;
DEBUG(dbgs() << "pred\t" << Height << '\t' << *PHI);
if (pushDepHeight(Deps.front(), PHI, Height,
- Heights, MTM.ItinData, MTM.TII))
- addLiveIns(Deps.front().DefMI, Stack);
+ Heights, MTM.SchedModel, MTM.TII))
+ addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack);
}
}
}
@@ -980,12 +978,12 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
// There may also be regunit dependencies to include in the height.
if (HasPhysRegs)
Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits,
- MTM.ItinData, MTM.TII, MTM.TRI);
+ MTM.SchedModel, MTM.TII, MTM.TRI);
// Update the required height of any virtual registers read by MI.
for (unsigned i = 0, e = Deps.size(); i != e; ++i)
- if (pushDepHeight(Deps[i], MI, Cycle, Heights, MTM.ItinData, MTM.TII))
- addLiveIns(Deps[i].DefMI, Stack);
+ if (pushDepHeight(Deps[i], MI, Cycle, Heights, MTM.SchedModel, MTM.TII))
+ addLiveIns(Deps[i].DefMI, Deps[i].DefOp, Stack);
InstrCycles &MICycles = Cycles[MI];
MICycles.Height = Cycle;
@@ -1054,10 +1052,8 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const {
unsigned DepCycle = getInstrCycles(Dep.DefMI).Depth;
// Add latency if DefMI is a real instruction. Transients get latency 0.
if (!Dep.DefMI->isTransient())
- DepCycle += TE.MTM.TII->computeOperandLatency(TE.MTM.ItinData,
- Dep.DefMI, Dep.DefOp,
- PHI, Dep.UseOp,
- /* FindMin = */ false);
+ DepCycle += TE.MTM.SchedModel
+ .computeOperandLatency(Dep.DefMI, Dep.DefOp, PHI, Dep.UseOp, false);
return DepCycle;
}
@@ -1068,9 +1064,8 @@ unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
unsigned Instrs = TBI.InstrDepth;
if (Bottom)
Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;
- if (const MCSchedModel *Model = TE.MTM.ItinData->SchedModel)
- if (Model->IssueWidth != 0)
- return Instrs / Model->IssueWidth;
+ if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+ Instrs /= IW;
// Assume issue width 1 without a schedule model.
return Instrs;
}
@@ -1080,9 +1075,8 @@ getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks) const {
unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i)
Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount;
- if (const MCSchedModel *Model = TE.MTM.ItinData->SchedModel)
- if (Model->IssueWidth != 0)
- return Instrs / Model->IssueWidth;
+ if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+ Instrs /= IW;
// Assume issue width 1 without a schedule model.
return Instrs;
}
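A standalone sketch of the issue-width normalization now shared by
getResourceDepth() and getResourceLength(): an instruction count becomes a cycle
estimate by dividing by the issue width, and a machine without a schedule model
is assumed to be 1-wide. Numbers are illustrative.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>

static unsigned resourceCycles(unsigned Instrs, unsigned IssueWidth) {
  if (IssueWidth)
    return Instrs / IssueWidth;
  return Instrs;  // assume issue width 1 without a schedule model
}

int main() {
  std::printf("%u\n", resourceCycles(12, 4));  // 3 cycles on a 4-wide machine
  std::printf("%u\n", resourceCycles(12, 0));  // 12 cycles with no model
  return 0;
}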
diff --git a/lib/CodeGen/MachineTraceMetrics.h b/lib/CodeGen/MachineTraceMetrics.h
index c5b86f31dba8..460730b04059 100644
--- a/lib/CodeGen/MachineTraceMetrics.h
+++ b/lib/CodeGen/MachineTraceMetrics.h
@@ -50,6 +50,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetSchedule.h"
namespace llvm {
@@ -67,9 +68,9 @@ class MachineTraceMetrics : public MachineFunctionPass {
const MachineFunction *MF;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
- const InstrItineraryData *ItinData;
const MachineRegisterInfo *MRI;
const MachineLoopInfo *Loops;
+ TargetSchedModel SchedModel;
public:
class Ensemble;
@@ -164,6 +165,14 @@ public:
/// Invalidate height resources when a block below this one has changed.
void invalidateHeight() { InstrHeight = ~0u; HasValidInstrHeights = false; }
+ /// Determine if this block belongs to the same trace as TBI and comes
+ /// before it in the trace.
+ /// Also returns true when TBI == this.
+ bool isEarlierInSameTrace(const TraceBlockInfo &TBI) const {
+ return hasValidDepth() && TBI.hasValidDepth() &&
+ Head == TBI.Head && InstrDepth <= TBI.InstrDepth;
+ }
+
// Data-dependency-related information. Per-instruction depth and height
// are computed from data dependencies in the current trace, using
// itinerary data.
@@ -270,7 +279,7 @@ public:
unsigned computeCrossBlockCriticalPath(const TraceBlockInfo&);
void computeInstrDepths(const MachineBasicBlock*);
void computeInstrHeights(const MachineBasicBlock*);
- void addLiveIns(const MachineInstr *DefMI,
+ void addLiveIns(const MachineInstr *DefMI, unsigned DefOp,
ArrayRef<const MachineBasicBlock*> Trace);
protected:
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index f745b41c16fe..69a3ae84ec99 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -23,8 +23,9 @@
// the verifier errors.
//===----------------------------------------------------------------------===//
+#include "llvm/BasicBlock.h"
+#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/LiveStackAnalysis.h"
@@ -73,11 +74,12 @@ namespace {
typedef SmallVector<const uint32_t*, 4> RegMaskVector;
typedef DenseSet<unsigned> RegSet;
typedef DenseMap<unsigned, const MachineInstr*> RegMap;
+ typedef SmallPtrSet<const MachineBasicBlock*, 8> BlockSet;
const MachineInstr *FirstTerminator;
+ BlockSet FunctionBlocks;
BitVector regsReserved;
- BitVector regsAllocatable;
RegSet regsLive;
RegVector regsDefined, regsDead, regsKilled;
RegMaskVector regMasks;
@@ -117,6 +119,9 @@ namespace {
// block. This set is disjoint from regsLiveOut.
RegSet vregsRequired;
+ // Set versions of block's predecessor and successor lists.
+ BlockSet Preds, Succs;
+
BBInfo() : reachable(false) {}
// Add register to vregsPassed if it belongs there. Return true if
@@ -180,7 +185,7 @@ namespace {
}
bool isAllocatable(unsigned Reg) {
- return Reg < regsAllocatable.size() && regsAllocatable.test(Reg);
+ return Reg < TRI->getNumRegs() && MRI->isAllocatable(Reg);
}
// Analysis information if available
@@ -208,6 +213,8 @@ namespace {
void report(const char *msg, const MachineBasicBlock *MBB,
const LiveInterval &LI);
+ void verifyInlineAsm(const MachineInstr *MI);
+
void checkLiveness(const MachineOperand *MO, unsigned MONum);
void markReachable(const MachineBasicBlock *MBB);
void calcRegsPassed();
@@ -352,7 +359,7 @@ void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
MF->print(*OS, Indexes);
}
*OS << "*** Bad machine code: " << msg << " ***\n"
- << "- function: " << MF->getFunction()->getName() << "\n";
+ << "- function: " << MF->getName() << "\n";
}
void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
@@ -360,7 +367,7 @@ void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
report(msg, MBB->getParent());
*OS << "- basic block: BB#" << MBB->getNumber()
<< ' ' << MBB->getName()
- << " (" << (void*)MBB << ')';
+ << " (" << (const void*)MBB << ')';
if (Indexes)
*OS << " [" << Indexes->getMBBStartIdx(MBB)
<< ';' << Indexes->getMBBEndIdx(MBB) << ')';
@@ -419,7 +426,7 @@ void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
void MachineVerifier::visitMachineFunctionBefore() {
lastIndex = SlotIndex();
- regsReserved = TRI->getReservedRegs(*MF);
+ regsReserved = MRI->getReservedRegs();
// A sub-register of a reserved register is also reserved
for (int Reg = regsReserved.find_first(); Reg>=0;
@@ -431,9 +438,23 @@ void MachineVerifier::visitMachineFunctionBefore() {
}
}
- regsAllocatable = TRI->getAllocatableSet(*MF);
-
markReachable(&MF->front());
+
+ // Build a set of the basic blocks in the function.
+ FunctionBlocks.clear();
+ for (MachineFunction::const_iterator
+ I = MF->begin(), E = MF->end(); I != E; ++I) {
+ FunctionBlocks.insert(I);
+ BBInfo &MInfo = MBBInfoMap[I];
+
+ MInfo.Preds.insert(I->pred_begin(), I->pred_end());
+ if (MInfo.Preds.size() != I->pred_size())
+ report("MBB has duplicate entries in its predecessor list.", I);
+
+ MInfo.Succs.insert(I->succ_begin(), I->succ_end());
+ if (MInfo.Succs.size() != I->succ_size())
+ report("MBB has duplicate entries in its successor list.", I);
+ }
}
// Does iterator point to a and b as the first two elements?
@@ -470,6 +491,25 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
E = MBB->succ_end(); I != E; ++I) {
if ((*I)->isLandingPad())
LandingPadSuccs.insert(*I);
+ if (!FunctionBlocks.count(*I))
+ report("MBB has successor that isn't part of the function.", MBB);
+ if (!MBBInfoMap[*I].Preds.count(MBB)) {
+ report("Inconsistent CFG", MBB);
+ *OS << "MBB is not in the predecessor list of the successor BB#"
+ << (*I)->getNumber() << ".\n";
+ }
+ }
+
+ // Check the predecessor list.
+ for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
+ E = MBB->pred_end(); I != E; ++I) {
+ if (!FunctionBlocks.count(*I))
+ report("MBB has predecessor that isn't part of the function.", MBB);
+ if (!MBBInfoMap[*I].Succs.count(MBB)) {
+ report("Inconsistent CFG", MBB);
+ *OS << "MBB is not in the successor list of the predecessor BB#"
+ << (*I)->getNumber() << ".\n";
+ }
}
const MCAsmInfo *AsmInfo = TM->getMCAsmInfo();
@@ -540,7 +580,15 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
++MBBI;
if (MBBI == MF->end()) {
report("MBB conditionally falls through out of function!", MBB);
- } if (MBB->succ_size() != 2) {
+ } if (MBB->succ_size() == 1) {
+ // A conditional branch with only one successor is weird, but allowed.
+ if (&*MBBI != TBB)
+ report("MBB exits via conditional branch/fall-through but only has "
+ "one CFG successor!", MBB);
+ else if (TBB != *MBB->succ_begin())
+ report("MBB exits via conditional branch/fall-through but the CFG "
+               "successor doesn't match the actual successor!", MBB);
+ } else if (MBB->succ_size() != 2) {
report("MBB exits via conditional branch/fall-through but doesn't have "
"exactly two CFG successors!", MBB);
} else if (!matchPair(MBB->succ_begin(), TBB, MBBI)) {
@@ -560,7 +608,15 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
} else if (TBB && FBB) {
// Block conditionally branches somewhere, otherwise branches
// somewhere else.
- if (MBB->succ_size() != 2) {
+ if (MBB->succ_size() == 1) {
+ // A conditional branch with only one successor is weird, but allowed.
+ if (FBB != TBB)
+ report("MBB exits via conditional branch/branch through but only has "
+ "one CFG successor!", MBB);
+ else if (TBB != *MBB->succ_begin())
+ report("MBB exits via conditional branch/branch through but the CFG "
+               "successor doesn't match the actual successor!", MBB);
+ } else if (MBB->succ_size() != 2) {
report("MBB exits via conditional branch/branch but doesn't have "
"exactly two CFG successors!", MBB);
} else if (!matchPair(MBB->succ_begin(), TBB, FBB)) {
@@ -639,6 +695,50 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
}
}
+// The operands on an INLINEASM instruction must follow a template.
+// Verify that the flag operands make sense.
+void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
+ // The first two operands on INLINEASM are the asm string and global flags.
+ if (MI->getNumOperands() < 2) {
+ report("Too few operands on inline asm", MI);
+ return;
+ }
+ if (!MI->getOperand(0).isSymbol())
+ report("Asm string must be an external symbol", MI);
+ if (!MI->getOperand(1).isImm())
+ report("Asm flags must be an immediate", MI);
+ // Allowed flags are Extra_HasSideEffects = 1, Extra_IsAlignStack = 2,
+ // Extra_AsmDialect = 4, Extra_MayLoad = 8, and Extra_MayStore = 16.
+ if (!isUInt<5>(MI->getOperand(1).getImm()))
+ report("Unknown asm flags", &MI->getOperand(1), 1);
+
+ assert(InlineAsm::MIOp_FirstOperand == 2 && "Asm format changed");
+
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+ unsigned NumOps;
+ for (unsigned e = MI->getNumOperands(); OpNo < e; OpNo += NumOps) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ // There may be implicit ops after the fixed operands.
+ if (!MO.isImm())
+ break;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(MO.getImm());
+ }
+
+ if (OpNo > MI->getNumOperands())
+ report("Missing operands in last group", MI);
+
+ // An optional MDNode follows the groups.
+ if (OpNo < MI->getNumOperands() && MI->getOperand(OpNo).isMetadata())
+ ++OpNo;
+
+ // All trailing operands must be implicit registers.
+ for (unsigned e = MI->getNumOperands(); OpNo < e; ++OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (!MO.isReg() || !MO.isImplicit())
+ report("Expected implicit register after groups", &MO, OpNo);
+ }
+}
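A standalone sketch of the operand-group walk this check mirrors: after the asm
string and the flag word, operands come in groups whose leading immediate
encodes how many register operands follow, and anything after the groups must be
an implicit register. The toy flag encoding below (count in the low bits) is a
simplification for illustration, not LLVM's actual InlineAsm flag layout.

// Standalone C++ sketch (not part of the patch).
#include <cstdio>
#include <vector>

struct Op { bool IsImm; unsigned Imm; };  // toy operand: either imm or reg

static unsigned numOperandRegisters(unsigned Flag) { return Flag & 0xffff; }

int main() {
  // [asm string][flags][group: flag + 2 regs][group: flag + 1 reg][implicit reg]
  std::vector<Op> Ops = {
    {false, 0}, {true, 0},              // the two fixed operands
    {true, 2}, {false, 0}, {false, 0},  // group with 2 registers
    {true, 1}, {false, 0},              // group with 1 register
    {false, 0},                         // trailing implicit register
  };
  unsigned OpNo = 2;
  while (OpNo < Ops.size() && Ops[OpNo].IsImm)
    OpNo += 1 + numOperandRegisters(Ops[OpNo].Imm);
  std::printf("groups end at operand %u of %zu\n", OpNo, Ops.size());  // 7 of 8
  return 0;
}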
+
void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
const MCInstrDesc &MCID = MI->getDesc();
if (MI->getNumOperands() < MCID.getNumOperands()) {
@@ -647,6 +747,10 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
<< MI->getNumExplicitOperands() << " given.\n";
}
+ // Check the tied operands.
+ if (MI->isInlineAsm())
+ verifyInlineAsm(MI);
+
// Check the MachineMemOperands for basic consistency.
for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
E = MI->memoperands_end(); I != E; ++I) {
@@ -702,6 +806,17 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
if (MO->isImplicit())
report("Explicit operand marked as implicit", MO, MONum);
}
+
+ int TiedTo = MCID.getOperandConstraint(MONum, MCOI::TIED_TO);
+ if (TiedTo != -1) {
+ if (!MO->isReg())
+ report("Tied use must be a register", MO, MONum);
+ else if (!MO->isTied())
+ report("Operand should be tied", MO, MONum);
+ else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
+ report("Tied def doesn't match MCInstrDesc", MO, MONum);
+ } else if (MO->isReg() && MO->isTied())
+ report("Explicit operand should not be tied", MO, MONum);
} else {
// ARM adds %reg0 operands to indicate predicates. We'll allow that.
if (MO->isReg() && !MO->isImplicit() && !MI->isVariadic() && MO->getReg())
@@ -716,6 +831,28 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
if (MRI->tracksLiveness() && !MI->isDebugValue())
checkLiveness(MO, MONum);
+ // Verify the consistency of tied operands.
+ if (MO->isTied()) {
+ unsigned OtherIdx = MI->findTiedOperandIdx(MONum);
+ const MachineOperand &OtherMO = MI->getOperand(OtherIdx);
+ if (!OtherMO.isReg())
+ report("Must be tied to a register", MO, MONum);
+ if (!OtherMO.isTied())
+ report("Missing tie flags on tied operand", MO, MONum);
+ if (MI->findTiedOperandIdx(OtherIdx) != MONum)
+ report("Inconsistent tie links", MO, MONum);
+ if (MONum < MCID.getNumDefs()) {
+ if (OtherIdx < MCID.getNumOperands()) {
+ if (-1 == MCID.getOperandConstraint(OtherIdx, MCOI::TIED_TO))
+ report("Explicit def tied to explicit use without tie constraint",
+ MO, MONum);
+ } else {
+ if (!OtherMO.isImplicit())
+ report("Explicit def should be tied to implicit use", MO, MONum);
+ }
+ }
+ }
+
// Verify two-address constraints after leaving SSA form.
unsigned DefIdx;
if (!MRI->isSSA() && MO->isUse() &&
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index cfa3eecf02ef..4ea21d4ff7bd 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -49,8 +49,8 @@ static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
cl::desc("Disable Stack Slot Coloring"));
static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
cl::desc("Disable Machine Dead Code Elimination"));
-static cl::opt<bool> EnableEarlyIfConversion("enable-early-ifcvt", cl::Hidden,
- cl::desc("Enable Early If-conversion"));
+static cl::opt<bool> DisableEarlyIfConversion("disable-early-ifcvt", cl::Hidden,
+ cl::desc("Disable Early If-conversion"));
static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
cl::desc("Disable Machine LICM"));
static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
@@ -161,7 +161,7 @@ static AnalysisID overridePass(AnalysisID StandardID, AnalysisID TargetID) {
return applyDisable(TargetID, DisableMachineDCE);
if (StandardID == &EarlyIfConverterID)
- return applyDisable(TargetID, !EnableEarlyIfConversion);
+ return applyDisable(TargetID, DisableEarlyIfConversion);
if (StandardID == &MachineLICMID)
return applyDisable(TargetID, DisableMachineLICM);
@@ -447,8 +447,8 @@ void TargetPassConfig::addMachinePasses() {
const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs"));
assert (TPI && IPI && "Pass ID not registered!");
- const char *TID = (char *)(TPI->getTypeInfo());
- const char *IID = (char *)(IPI->getTypeInfo());
+ const char *TID = (const char *)(TPI->getTypeInfo());
+ const char *IID = (const char *)(IPI->getTypeInfo());
insertPass(TID, IID);
}
@@ -456,7 +456,8 @@ void TargetPassConfig::addMachinePasses() {
printAndVerify("After Instruction Selection");
// Expand pseudo-instructions emitted by ISel.
- addPass(&ExpandISelPseudosID);
+ if (addPass(&ExpandISelPseudosID))
+ printAndVerify("After ExpandISelPseudos");
// Add passes that optimize machine instructions in SSA form.
if (getOptLevel() != CodeGenOpt::None) {
@@ -528,6 +529,10 @@ void TargetPassConfig::addMachineSSAOptimization() {
// instructions dead.
addPass(&OptimizePHIsID);
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID);
+
// If the target requests it, assign local variables to stack slots relative
// to one another and simplify frame index references where possible.
addPass(&LocalStackSlotAllocationID);
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 9099862bd312..a795ac8448f5 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -527,6 +527,11 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
SeenMoveImm = true;
} else {
Changed |= optimizeExtInstr(MI, MBB, LocalMIs);
+ // optimizeExtInstr might have created new instructions after MI
+ // and before the already incremented MII. Adjust MII so that the
+ // next iteration sees the new instructions.
+ MII = MI;
+ ++MII;
if (SeenMoveImm)
Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs);
}
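
The two lines added above re-derive the iterator from MI because optimizeExtInstr may have inserted copies immediately after it. A hedged, generic sketch of that re-anchoring pattern follows; the transform callback is purely illustrative and not taken from the patch.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"

// 'transform' stands in for any rewrite that may insert new instructions
// immediately after MI; it returns true when it did.
template <class TransformFn>
static void visitWithReanchoring(llvm::MachineBasicBlock &MBB,
                                 TransformFn transform) {
  for (llvm::MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
       MII != E; ) {
    llvm::MachineInstr *MI = &*MII;
    ++MII;                          // provisionally advance past MI
    if (transform(MI)) {
      // New instructions may now sit between MI and the old MII; rebuild
      // the iterator from MI so the next iteration visits them.
      MII = MI;
      ++MII;
    }
  }
}
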
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 7449ff54609d..d57bc7362de9 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -240,6 +240,7 @@ void SchedulePostRATDList::exitRegion() {
ScheduleDAGInstrs::exitRegion();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dumpSchedule - dump the scheduled Sequence.
void SchedulePostRATDList::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
@@ -249,6 +250,7 @@ void SchedulePostRATDList::dumpSchedule() const {
dbgs() << "**** NOOP ****\n";
}
}
+#endif
bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
TII = Fn.getTarget().getInstrInfo();
@@ -298,7 +300,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
static int bbcnt = 0;
if (bbcnt++ % DebugDiv != DebugMod)
continue;
- dbgs() << "*** DEBUG scheduling " << Fn.getFunction()->getName()
+ dbgs() << "*** DEBUG scheduling " << Fn.getName()
<< ":BB#" << MBB->getNumber() << " ***\n";
}
#endif
@@ -488,7 +490,6 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');
BitVector killedRegs(TRI->getNumRegs());
- BitVector ReservedRegs = TRI->getReservedRegs(MF);
StartBlockForKills(MBB);
@@ -529,7 +530,7 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isUse()) continue;
unsigned Reg = MO.getReg();
- if ((Reg == 0) || ReservedRegs.test(Reg)) continue;
+ if ((Reg == 0) || MRI.isReserved(Reg)) continue;
bool kill = false;
if (!killedRegs.test(Reg)) {
@@ -564,7 +565,7 @@ void SchedulePostRATDList::FixupKills(MachineBasicBlock *MBB) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
unsigned Reg = MO.getReg();
- if ((Reg == 0) || ReservedRegs.test(Reg)) continue;
+ if ((Reg == 0) || MRI.isReserved(Reg)) continue;
LiveRegs.set(Reg);
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index 34d075c23286..e4e18c3bb54b 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -137,8 +137,7 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
- << "********** Function: "
- << ((Value*)MF.getFunction())->getName() << '\n');
+ << "********** Function: " << MF.getName() << '\n');
bool Changed = false;
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index c791ffb28cba..77554d691c26 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -96,7 +96,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
placeCSRSpillsAndRestores(Fn);
// Add the code to save and restore the callee saved registers
- if (!F->hasFnAttr(Attribute::Naked))
+ if (!F->getFnAttributes().hasAttribute(Attributes::Naked))
insertCSRSpillsAndRestores(Fn);
// Allow the target machine to make final modifications to the function
@@ -111,7 +111,7 @@ bool PEI::runOnMachineFunction(MachineFunction &Fn) {
// called functions. Because of this, calculateCalleeSavedRegisters()
// must be called before this function in order to set the AdjustsStack
// and MaxCallFrameSize variables.
- if (!F->hasFnAttr(Attribute::Naked))
+ if (!F->getFnAttributes().hasAttribute(Attributes::Naked))
insertPrologEpilogCode(Fn);
// Replace all MO_FrameIndex operands with physical register references
@@ -221,13 +221,13 @@ void PEI::calculateCalleeSavedRegisters(MachineFunction &Fn) {
return;
// In Naked functions we aren't going to save any registers.
- if (Fn.getFunction()->hasFnAttr(Attribute::Naked))
+ if (Fn.getFunction()->getFnAttributes().hasAttribute(Attributes::Naked))
return;
std::vector<CalleeSavedInfo> CSI;
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
- if (Fn.getRegInfo().isPhysRegOrOverlapUsed(Reg)) {
+ if (Fn.getRegInfo().isPhysRegUsed(Reg)) {
// If the reg is modified, save it!
CSI.push_back(CalleeSavedInfo(Reg));
}
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index 3a03807ebd0e..8a49609552ad 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -20,7 +20,6 @@
#include "VirtRegMap.h"
#include "LiveRegMatrix.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Function.h"
#include "llvm/PassAnalysisSupport.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -273,7 +272,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
bool RABasic::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
<< "********** Function: "
- << ((Value*)mf.getFunction())->getName() << '\n');
+ << mf.getName() << '\n');
MF = &mf;
RegAllocBase::init(getAnalysis<VirtRegMap>(),
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 6b3a48eefd95..88922169b306 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -113,9 +113,11 @@ namespace {
// PhysRegState - One of the RegState enums, or a virtreg.
std::vector<unsigned> PhysRegState;
- // UsedInInstr - BitVector of physregs that are used in the current
- // instruction, and so cannot be allocated.
- BitVector UsedInInstr;
+ typedef SparseSet<unsigned> UsedInInstrSet;
+
+ // UsedInInstr - Set of physregs that are used in the current instruction,
+ // and so cannot be allocated.
+ UsedInInstrSet UsedInInstr;
// SkippedInstrs - Descriptors of instructions whose clobber list was
// ignored because all registers were spilled. It is still necessary to
@@ -173,7 +175,7 @@ namespace {
unsigned VirtReg, unsigned Hint);
LiveRegMap::iterator reloadVirtReg(MachineInstr *MI, unsigned OpNum,
unsigned VirtReg, unsigned Hint);
- void spillAll(MachineInstr *MI);
+ void spillAll(MachineBasicBlock::iterator MI);
bool setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg);
void addRetOperands(MachineBasicBlock *MBB);
};
@@ -312,7 +314,7 @@ void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
}
/// spillAll - Spill all dirty virtregs without killing them.
-void RAFast::spillAll(MachineInstr *MI) {
+void RAFast::spillAll(MachineBasicBlock::iterator MI) {
if (LiveVirtRegs.empty()) return;
isBulkSpilling = true;
// The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
@@ -340,7 +342,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
PhysRegState[PhysReg] = regFree;
// Fall through
case regFree:
- UsedInInstr.set(PhysReg);
+ UsedInInstr.insert(PhysReg);
MO.setIsKill();
return;
default:
@@ -360,13 +362,13 @@ void RAFast::usePhysReg(MachineOperand &MO) {
"Instruction is not using a subregister of a reserved register");
// Leave the superregister in the working set.
PhysRegState[Alias] = regFree;
- UsedInInstr.set(Alias);
+ UsedInInstr.insert(Alias);
MO.getParent()->addRegisterKilled(Alias, TRI, true);
return;
case regFree:
if (TRI->isSuperRegister(PhysReg, Alias)) {
// Leave the superregister in the working set.
- UsedInInstr.set(Alias);
+ UsedInInstr.insert(Alias);
MO.getParent()->addRegisterKilled(Alias, TRI, true);
return;
}
@@ -380,7 +382,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
// All aliases are disabled, bring register into working set.
PhysRegState[PhysReg] = regFree;
- UsedInInstr.set(PhysReg);
+ UsedInInstr.insert(PhysReg);
MO.setIsKill();
}
@@ -389,7 +391,7 @@ void RAFast::usePhysReg(MachineOperand &MO) {
/// reserved instead of allocated.
void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
RegState NewState) {
- UsedInInstr.set(PhysReg);
+ UsedInInstr.insert(PhysReg);
switch (unsigned VirtReg = PhysRegState[PhysReg]) {
case regDisabled:
break;
@@ -429,7 +431,7 @@ void RAFast::definePhysReg(MachineInstr *MI, unsigned PhysReg,
// can be allocated directly.
// Returns spillImpossible when PhysReg or an alias can't be spilled.
unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
- if (UsedInInstr.test(PhysReg)) {
+ if (UsedInInstr.count(PhysReg)) {
DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is already used in instr.\n");
return spillImpossible;
}
@@ -454,7 +456,7 @@ unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
unsigned Cost = 0;
for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
unsigned Alias = *AI;
- if (UsedInInstr.test(Alias))
+ if (UsedInInstr.count(Alias))
return spillImpossible;
switch (unsigned VirtReg = PhysRegState[Alias]) {
case regDisabled:
@@ -509,7 +511,7 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
// Ignore invalid hints.
if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
- !RC->contains(Hint) || !RegClassInfo.isAllocatable(Hint)))
+ !RC->contains(Hint) || !MRI->isAllocatable(Hint)))
Hint = 0;
// Take hint when possible.
@@ -530,7 +532,7 @@ RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr *MI,
// First try to find a completely free register.
for (ArrayRef<unsigned>::iterator I = AO.begin(), E = AO.end(); I != E; ++I) {
unsigned PhysReg = *I;
- if (PhysRegState[PhysReg] == regFree && !UsedInInstr.test(PhysReg)) {
+ if (PhysRegState[PhysReg] == regFree && !UsedInInstr.count(PhysReg)) {
assignVirtToPhysReg(*LRI, PhysReg);
return LRI;
}
@@ -596,7 +598,7 @@ RAFast::defineVirtReg(MachineInstr *MI, unsigned OpNum,
LRI->LastUse = MI;
LRI->LastOpNum = OpNum;
LRI->Dirty = true;
- UsedInInstr.set(LRI->PhysReg);
+ UsedInInstr.insert(LRI->PhysReg);
return LRI;
}
@@ -646,7 +648,7 @@ RAFast::reloadVirtReg(MachineInstr *MI, unsigned OpNum,
assert(LRI->PhysReg && "Register not assigned");
LRI->LastUse = MI;
LRI->LastOpNum = OpNum;
- UsedInInstr.set(LRI->PhysReg);
+ UsedInInstr.insert(LRI->PhysReg);
return LRI;
}
@@ -708,7 +710,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
unsigned Reg = MO.getReg();
if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
- UsedInInstr.set(*AI);
+ UsedInInstr.insert(*AI);
if (ThroughRegs.count(PhysRegState[*AI]))
definePhysReg(MI, *AI, regFree);
}
@@ -756,7 +758,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
}
// Restore UsedInInstr to a state usable for allocating normal virtual uses.
- UsedInInstr.reset();
+ UsedInInstr.clear();
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
@@ -764,12 +766,12 @@ void RAFast::handleThroughOperands(MachineInstr *MI,
if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
DEBUG(dbgs() << "\tSetting " << PrintReg(Reg, TRI)
<< " as used in instr\n");
- UsedInInstr.set(Reg);
+ UsedInInstr.insert(Reg);
}
// Also mark PartialDefs as used to avoid reallocation.
for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i)
- UsedInInstr.set(PartialDefs[i]);
+ UsedInInstr.insert(PartialDefs[i]);
}
/// addRetOperand - ensure that a return instruction has an operand for each
@@ -838,7 +840,7 @@ void RAFast::AllocateBasicBlock() {
// Add live-in registers as live.
for (MachineBasicBlock::livein_iterator I = MBB->livein_begin(),
E = MBB->livein_end(); I != E; ++I)
- if (RegClassInfo.isAllocatable(*I))
+ if (MRI->isAllocatable(*I))
definePhysReg(MII, *I, regReserved);
SmallVector<unsigned, 8> VirtDead;
@@ -942,7 +944,7 @@ void RAFast::AllocateBasicBlock() {
}
// Track registers used by instruction.
- UsedInInstr.reset();
+ UsedInInstr.clear();
// First scan.
// Mark physreg uses and early clobbers as used.
@@ -954,6 +956,11 @@ void RAFast::AllocateBasicBlock() {
bool hasPhysDefs = false;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
+ // Make sure MRI knows about registers clobbered by regmasks.
+ if (MO.isRegMask()) {
+ MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+ continue;
+ }
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (!Reg) continue;
@@ -970,7 +977,7 @@ void RAFast::AllocateBasicBlock() {
}
continue;
}
- if (!RegClassInfo.isAllocatable(Reg)) continue;
+ if (!MRI->isAllocatable(Reg)) continue;
if (MO.isUse()) {
usePhysReg(MO);
} else if (MO.isEarlyClobber()) {
@@ -1016,11 +1023,13 @@ void RAFast::AllocateBasicBlock() {
}
}
- MRI->addPhysRegsUsed(UsedInInstr);
+ for (UsedInInstrSet::iterator
+ I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+ MRI->setPhysRegUsed(*I);
// Track registers defined by instruction - early clobbers and tied uses at
// this point.
- UsedInInstr.reset();
+ UsedInInstr.clear();
if (hasEarlyClobbers) {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
@@ -1030,7 +1039,7 @@ void RAFast::AllocateBasicBlock() {
// Look for physreg defs and tied uses.
if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
- UsedInInstr.set(*AI);
+ UsedInInstr.insert(*AI);
}
}
@@ -1058,7 +1067,7 @@ void RAFast::AllocateBasicBlock() {
unsigned Reg = MO.getReg();
if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
- if (!RegClassInfo.isAllocatable(Reg)) continue;
+ if (!MRI->isAllocatable(Reg)) continue;
definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
regFree : regReserved);
continue;
@@ -1080,7 +1089,9 @@ void RAFast::AllocateBasicBlock() {
killVirtReg(VirtDead[i]);
VirtDead.clear();
- MRI->addPhysRegsUsed(UsedInInstr);
+ for (UsedInInstrSet::iterator
+ I = UsedInInstr.begin(), E = UsedInInstr.end(); I != E; ++I)
+ MRI->setPhysRegUsed(*I);
if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
DEBUG(dbgs() << "-- coalescing: " << *MI);
@@ -1110,8 +1121,7 @@ void RAFast::AllocateBasicBlock() {
///
bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
- << "********** Function: "
- << ((Value*)Fn.getFunction())->getName() << '\n');
+ << "********** Function: " << Fn.getName() << '\n');
MF = &Fn;
MRI = &MF->getRegInfo();
TM = &Fn.getTarget();
@@ -1119,7 +1129,8 @@ bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
TII = TM->getInstrInfo();
MRI->freezeReservedRegs(Fn);
RegClassInfo.runOnMachineFunction(Fn);
- UsedInInstr.resize(TRI->getNumRegs());
+ UsedInInstr.clear();
+ UsedInInstr.setUniverse(TRI->getNumRegs());
assert(!MRI->isSSA() && "regalloc requires leaving SSA");
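
The switch from a BitVector to a SparseSet is what keeps the per-instruction bookkeeping above cheap: clear() costs only as much as the number of registers actually inserted, and iteration visits just those registers, which is why the replacement loop over UsedInInstr can call setPhysRegUsed on each element directly. A small self-contained sketch of the SparseSet operations involved follows; identifiers other than SparseSet's own API are illustrative.

#include "llvm/ADT/SparseSet.h"

// Illustrative only: NumRegs and markUsed stand in for the register-allocator
// state; the SparseSet calls mirror what RAFast now does with UsedInInstr.
void sketchSparseUse(unsigned NumRegs, void (*markUsed)(unsigned)) {
  llvm::SparseSet<unsigned> UsedInInstr;
  UsedInInstr.setUniverse(NumRegs);   // keys must be smaller than NumRegs

  UsedInInstr.insert(5);              // duplicate inserts are harmless
  UsedInInstr.insert(5);

  if (UsedInInstr.count(5)) {
    // Iteration only touches inserted keys, unlike scanning a BitVector
    // sized for every physical register.
    for (llvm::SparseSet<unsigned>::iterator I = UsedInInstr.begin(),
         E = UsedInInstr.end(); I != E; ++I)
      markUsed(*I);
  }

  UsedInInstr.clear();                // O(#inserted), not O(universe)
}
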
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 6ac542860501..06f69c1e0d16 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -24,7 +24,6 @@
#include "VirtRegMap.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Function.h"
#include "llvm/PassAnalysisSupport.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/EdgeBundles.h"
@@ -331,9 +330,9 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
AU.addPreserved<LiveDebugVariables>();
- AU.addRequired<CalculateSpillWeights>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
+ AU.addRequired<CalculateSpillWeights>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -509,7 +508,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
///
/// @param VirtReg Live range that is about to be assigned.
/// @param PhysReg Desired register for assignment.
-/// @prarm IsHint True when PhysReg is VirtReg's preferred register.
+/// @param IsHint True when PhysReg is VirtReg's preferred register.
/// @param MaxCost Only look for cheaper candidates and update with new cost
/// when returning true.
/// @returns True when interference can be evicted cheaper than MaxCost.
@@ -1746,8 +1745,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
- << "********** Function: "
- << ((Value*)mf.getFunction())->getName() << '\n');
+ << "********** Function: " << mf.getName() << '\n');
MF = &mf;
if (VerifyEnabled)
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index d0db26b2089f..02ebce7a11a0 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -118,7 +118,6 @@ private:
typedef std::vector<AllowedSet> AllowedSetMap;
typedef std::pair<unsigned, unsigned> RegPair;
typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;
- typedef std::vector<PBQP::Graph::NodeItr> NodeVector;
typedef std::set<unsigned> RegSet;
@@ -192,7 +191,6 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
const MachineLoopInfo *loopInfo,
const RegSet &vregs) {
- typedef std::vector<const LiveInterval*> LIVector;
LiveIntervals *LIS = const_cast<LiveIntervals*>(lis);
MachineRegisterInfo *mri = &mf->getRegInfo();
const TargetRegisterInfo *tri = mf->getTarget().getRegisterInfo();
@@ -209,8 +207,6 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
mri->setPhysRegUsed(Reg);
}
- BitVector reservedRegs = tri->getReservedRegs(*mf);
-
// Iterate over vregs.
for (RegSet::const_iterator vregItr = vregs.begin(), vregEnd = vregs.end();
vregItr != vregEnd; ++vregItr) {
@@ -219,7 +215,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
LiveInterval *vregLI = &LIS->getInterval(vreg);
// Record any overlaps with regmask operands.
- BitVector regMaskOverlaps(tri->getNumRegs());
+ BitVector regMaskOverlaps;
LIS->checkRegMaskInterference(*vregLI, regMaskOverlaps);
// Compute an initial allowed set for the current vreg.
@@ -228,7 +224,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilder::build(MachineFunction *mf,
ArrayRef<uint16_t> rawOrder = trc->getRawAllocationOrder(*mf);
for (unsigned i = 0; i != rawOrder.size(); ++i) {
unsigned preg = rawOrder[i];
- if (reservedRegs.test(preg))
+ if (mri->isReserved(preg))
continue;
// vregLI crosses a regmask operand that clobbers preg.
@@ -358,7 +354,7 @@ std::auto_ptr<PBQPRAProblem> PBQPBuilderWithCoalescing::build(
loopInfo->getLoopDepth(mbb));
if (cp.isPhys()) {
- if (!lis->isAllocatable(dst)) {
+ if (!mf->getRegInfo().isAllocatable(dst)) {
continue;
}
@@ -433,6 +429,7 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
au.addRequired<SlotIndexes>();
au.addPreserved<SlotIndexes>();
au.addRequired<LiveIntervals>();
+ au.addPreserved<LiveIntervals>();
//au.addRequiredID(SplitCriticalEdgesID);
if (customPassID)
au.addRequiredID(*customPassID);
@@ -444,6 +441,7 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
au.addRequired<MachineLoopInfo>();
au.addPreserved<MachineLoopInfo>();
au.addRequired<VirtRegMap>();
+ au.addPreserved<VirtRegMap>();
MachineFunctionPass::getAnalysisUsage(au);
}
@@ -556,7 +554,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
mri->freezeReservedRegs(MF);
- DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getFunction()->getName() << "\n");
+ DEBUG(dbgs() << "PBQP Register Allocating for " << mf->getName() << "\n");
// Allocator main loop:
//
@@ -570,11 +568,12 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
// Find the vreg intervals in need of allocation.
findVRegIntervalsToAlloc();
+#ifndef NDEBUG
const Function* func = mf->getFunction();
std::string fqn =
func->getParent()->getModuleIdentifier() + "." +
func->getName().str();
- (void)fqn;
+#endif
// If there are non-empty intervals allocate them using pbqp.
if (!vregsToAlloc.empty()) {
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index 652bc3015a3d..805d23567307 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -15,8 +15,9 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "regalloc"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -57,10 +58,11 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
CalleeSaved = CSR;
// Different reserved registers?
- BitVector RR = TRI->getReservedRegs(*MF);
- if (RR != Reserved)
+ const BitVector &RR = MF->getRegInfo().getReservedRegs();
+ if (Reserved.size() != RR.size() || RR != Reserved) {
Update = true;
- Reserved = RR;
+ Reserved = RR;
+ }
// Invalidate cached information from previous function.
if (Update)
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 990633440e0f..2538f10ede59 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -55,6 +55,8 @@ STATISTIC(numCommutes , "Number of instruction commuting performed");
STATISTIC(numExtends , "Number of copies extended");
STATISTIC(NumReMats , "Number of instructions re-materialized");
STATISTIC(NumInflated , "Number of register classes inflated");
+STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
+STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
static cl::opt<bool>
EnableJoining("join-liveintervals",
@@ -123,6 +125,9 @@ namespace {
/// can use this information below to update aliases.
bool joinIntervals(CoalescerPair &CP);
+ /// Attempt joining two virtual registers. Return true on success.
+ bool joinVirtRegs(CoalescerPair &CP);
+
/// Attempt joining with a reserved physreg.
bool joinReservedPhysReg(CoalescerPair &CP);
@@ -193,12 +198,6 @@ INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing",
char RegisterCoalescer::ID = 0;
-static unsigned compose(const TargetRegisterInfo &tri, unsigned a, unsigned b) {
- if (!a) return b;
- if (!b) return a;
- return tri.composeSubRegIndices(a, b);
-}
-
static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
unsigned &Src, unsigned &Dst,
unsigned &SrcSub, unsigned &DstSub) {
@@ -209,8 +208,8 @@ static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
SrcSub = MI->getOperand(1).getSubReg();
} else if (MI->isSubregToReg()) {
Dst = MI->getOperand(0).getReg();
- DstSub = compose(tri, MI->getOperand(0).getSubReg(),
- MI->getOperand(3).getImm());
+ DstSub = tri.composeSubRegIndices(MI->getOperand(0).getSubReg(),
+ MI->getOperand(3).getImm());
Src = MI->getOperand(2).getReg();
SrcSub = MI->getOperand(2).getSubReg();
} else
@@ -349,7 +348,8 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
if (DstReg != Dst)
return false;
// Registers match, do the subregisters line up?
- return compose(TRI, SrcIdx, SrcSub) == compose(TRI, DstIdx, DstSub);
+ return TRI.composeSubRegIndices(SrcIdx, SrcSub) ==
+ TRI.composeSubRegIndices(DstIdx, DstSub);
}
}
@@ -425,7 +425,8 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
// If AValNo is defined as a copy from IntB, we can potentially process this.
// Get the instruction that defines this value number.
MachineInstr *ACopyMI = LIS->getInstructionFromIndex(AValNo->def);
- if (!CP.isCoalescable(ACopyMI))
+ // Don't allow any partial copies, even if isCoalescable() allows them.
+ if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
return false;
// Get the LiveRange in IntB that this value number starts with.
@@ -583,7 +584,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
unsigned NewReg = NewDstMO.getReg();
- if (NewReg != IntB.reg || !NewDstMO.isKill())
+ if (NewReg != IntB.reg || !LiveRangeQuery(IntB, AValNo->def).isKill())
return false;
// Make sure there are no other definitions of IntB that would reach the
@@ -849,8 +850,17 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
// Update LiveDebugVariables.
LDV->renameRegister(SrcReg, DstReg, SubIdx);
+ SmallPtrSet<MachineInstr*, 8> Visited;
for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(SrcReg);
MachineInstr *UseMI = I.skipInstruction();) {
+ // Each instruction can only be rewritten once because sub-register
+ // composition is not always idempotent. When SrcReg != DstReg, rewriting
+ // the UseMI operands removes them from the SrcReg use-def chain, but when
+ // SrcReg is DstReg we could encounter UseMI twice if it has multiple
+ // operands mentioning the virtual register.
+ if (SrcReg == DstReg && !Visited.insert(UseMI))
+ continue;
+
SmallVector<unsigned,8> Ops;
bool Reads, Writes;
tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
@@ -890,7 +900,7 @@ bool RegisterCoalescer::canJoinPhys(CoalescerPair &CP) {
/// Always join simple intervals that are defined by a single copy from a
/// reserved register. This doesn't increase register pressure, so it is
/// always beneficial.
- if (!RegClassInfo.isReserved(CP.getDstReg())) {
+ if (!MRI->isReserved(CP.getDstReg())) {
DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
return false;
}
@@ -1065,7 +1075,7 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
/// Attempt joining with a reserved physreg.
bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
assert(CP.isPhys() && "Must be a physreg copy");
- assert(RegClassInfo.isReserved(CP.getDstReg()) && "Not a reserved register");
+ assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
<< '\n');
@@ -1102,347 +1112,797 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
return true;
}
-/// ComputeUltimateVN - Assuming we are going to join two live intervals,
-/// compute what the resultant value numbers for each value in the input two
-/// ranges will be. This is complicated by copies between the two which can
-/// and will commonly cause multiple value numbers to be merged into one.
-///
-/// VN is the value number that we're trying to resolve. InstDefiningValue
-/// keeps track of the new InstDefiningValue assignment for the result
-/// LiveInterval. ThisFromOther/OtherFromThis are sets that keep track of
-/// whether a value in this or other is a copy from the opposite set.
-/// ThisValNoAssignments/OtherValNoAssignments keep track of value #'s that have
-/// already been assigned.
-///
-/// ThisFromOther[x] - If x is defined as a copy from the other interval, this
-/// contains the value number the copy is from.
-///
-static unsigned ComputeUltimateVN(VNInfo *VNI,
- SmallVector<VNInfo*, 16> &NewVNInfo,
- DenseMap<VNInfo*, VNInfo*> &ThisFromOther,
- DenseMap<VNInfo*, VNInfo*> &OtherFromThis,
- SmallVector<int, 16> &ThisValNoAssignments,
- SmallVector<int, 16> &OtherValNoAssignments) {
- unsigned VN = VNI->id;
-
- // If the VN has already been computed, just return it.
- if (ThisValNoAssignments[VN] >= 0)
- return ThisValNoAssignments[VN];
- assert(ThisValNoAssignments[VN] != -2 && "Cyclic value numbers");
-
- // If this val is not a copy from the other val, then it must be a new value
- // number in the destination.
- DenseMap<VNInfo*, VNInfo*>::iterator I = ThisFromOther.find(VNI);
- if (I == ThisFromOther.end()) {
- NewVNInfo.push_back(VNI);
- return ThisValNoAssignments[VN] = NewVNInfo.size()-1;
- }
- VNInfo *OtherValNo = I->second;
-
- // Otherwise, this *is* a copy from the RHS. If the other side has already
- // been computed, return it.
- if (OtherValNoAssignments[OtherValNo->id] >= 0)
- return ThisValNoAssignments[VN] = OtherValNoAssignments[OtherValNo->id];
-
- // Mark this value number as currently being computed, then ask what the
- // ultimate value # of the other value is.
- ThisValNoAssignments[VN] = -2;
- unsigned UltimateVN =
- ComputeUltimateVN(OtherValNo, NewVNInfo, OtherFromThis, ThisFromOther,
- OtherValNoAssignments, ThisValNoAssignments);
- return ThisValNoAssignments[VN] = UltimateVN;
-}
+//===----------------------------------------------------------------------===//
+// Interference checking and interval joining
+//===----------------------------------------------------------------------===//
+//
+// In the easiest case, the two live ranges being joined are disjoint, and
+// there is no interference to consider. It is quite common, though, to have
+// overlapping live ranges, and we need to check if the interference can be
+// resolved.
+//
+// The live range of a single SSA value forms a sub-tree of the dominator tree.
+// This means that two SSA values overlap if and only if the def of one value
+// is contained in the live range of the other value. As a special case, the
+// overlapping values can be defined at the same index.
+//
+// The interference from an overlapping def can be resolved in these cases:
+//
+// 1. Coalescable copies. The value is defined by a copy that would become an
+// identity copy after joining SrcReg and DstReg. The copy instruction will
+// be removed, and the value will be merged with the source value.
+//
+// There can be several copies back and forth, causing many values to be
+// merged into one. We compute a list of ultimate values in the joined live
+//    range as well as a mapping from the old value numbers.
+//
+// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
+// predecessors have a live out value. It doesn't cause real interference,
+// and can be merged into the value it overlaps. Like a coalescable copy, it
+// can be erased after joining.
+//
+// 3. Copy of external value. The overlapping def may be a copy of a value that
+// is already in the other register. This is like a coalescable copy, but
+// the live range of the source register must be trimmed after erasing the
+// copy instruction:
+//
+// %src = COPY %ext
+// %dst = COPY %ext <-- Remove this COPY, trim the live range of %ext.
+//
+// 4. Clobbering undefined lanes. Vector registers are sometimes built by
+// defining one lane at a time:
+//
+// %dst:ssub0<def,read-undef> = FOO
+// %src = BAR
+// %dst:ssub1<def> = COPY %src
+//
+// The live range of %src overlaps the %dst value defined by FOO, but
+// merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
+// which was undef anyway.
+//
+// The value mapping is more complicated in this case. The final live range
+// will have different value numbers for both FOO and BAR, but there is no
+// simple mapping from old to new values. It may even be necessary to add
+// new PHI values.
+//
+// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
+// is live, but never read. This can happen because we don't compute
+// individual live ranges per lane.
+//
+// %dst<def> = FOO
+// %src = BAR
+// %dst:ssub1<def> = COPY %src
+//
+// This kind of interference is only resolved locally. If the clobbered
+// lane value escapes the block, the join is aborted.
+namespace {
+/// Track information about values in a single virtual register about to be
+/// joined. Objects of this class are always created in pairs - one for each
+/// side of the CoalescerPair.
+class JoinVals {
+ LiveInterval &LI;
+
+ // Location of this register in the final joined register.
+ // Either CP.DstIdx or CP.SrcIdx.
+ unsigned SubIdx;
+
+ // Values that will be present in the final live range.
+ SmallVectorImpl<VNInfo*> &NewVNInfo;
+
+ const CoalescerPair &CP;
+ LiveIntervals *LIS;
+ SlotIndexes *Indexes;
+ const TargetRegisterInfo *TRI;
+
+ // Value number assignments. Maps value numbers in LI to entries in NewVNInfo.
+ // This is suitable for passing to LiveInterval::join().
+ SmallVector<int, 8> Assignments;
+
+ // Conflict resolution for overlapping values.
+ enum ConflictResolution {
+ // No overlap, simply keep this value.
+ CR_Keep,
+
+ // Merge this value into OtherVNI and erase the defining instruction.
+ // Used for IMPLICIT_DEF, coalescable copies, and copies from external
+ // values.
+ CR_Erase,
+
+ // Merge this value into OtherVNI but keep the defining instruction.
+ // This is for the special case where OtherVNI is defined by the same
+ // instruction.
+ CR_Merge,
+
+ // Keep this value, and have it replace OtherVNI where possible. This
+ // complicates value mapping since OtherVNI maps to two different values
+ // before and after this def.
+ // Used when clobbering undefined or dead lanes.
+ CR_Replace,
+
+ // Unresolved conflict. Visit later when all values have been mapped.
+ CR_Unresolved,
+
+ // Unresolvable conflict. Abort the join.
+ CR_Impossible
+ };
-// Find out if we have something like
-// A = X
-// B = X
-// if so, we can pretend this is actually
-// A = X
-// B = A
-// which allows us to coalesce A and B.
-// VNI is the definition of B. LR is the life range of A that includes
-// the slot just before B. If we return true, we add "B = X" to DupCopies.
-// This implies that A dominates B.
-static bool RegistersDefinedFromSameValue(LiveIntervals &li,
- const TargetRegisterInfo &tri,
- CoalescerPair &CP,
- VNInfo *VNI,
- VNInfo *OtherVNI,
- SmallVector<MachineInstr*, 8> &DupCopies) {
- // FIXME: This is very conservative. For example, we don't handle
- // physical registers.
-
- MachineInstr *MI = li.getInstructionFromIndex(VNI->def);
-
- if (!MI || CP.isPartial() || CP.isPhys())
- return false;
+ // Per-value info for LI. The lane bit masks are all relative to the final
+ // joined register, so they can be compared directly between SrcReg and
+ // DstReg.
+ struct Val {
+ ConflictResolution Resolution;
- unsigned A = CP.getDstReg();
- if (!TargetRegisterInfo::isVirtualRegister(A))
- return false;
+ // Lanes written by this def, 0 for unanalyzed values.
+ unsigned WriteLanes;
- unsigned B = CP.getSrcReg();
- if (!TargetRegisterInfo::isVirtualRegister(B))
- return false;
+ // Lanes with defined values in this register. Other lanes are undef and
+ // safe to clobber.
+ unsigned ValidLanes;
- MachineInstr *OtherMI = li.getInstructionFromIndex(OtherVNI->def);
- if (!OtherMI)
- return false;
+ // Value in LI being redefined by this def.
+ VNInfo *RedefVNI;
- if (MI->isImplicitDef()) {
- DupCopies.push_back(MI);
- return true;
- } else {
- if (!MI->isFullCopy())
- return false;
- unsigned Src = MI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Src))
- return false;
- if (!OtherMI->isFullCopy())
- return false;
- unsigned OtherSrc = OtherMI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(OtherSrc))
- return false;
+ // Value in the other live range that overlaps this def, if any.
+ VNInfo *OtherVNI;
- if (Src != OtherSrc)
- return false;
+ // Is this value an IMPLICIT_DEF?
+ bool IsImplicitDef;
- // If the copies use two different value numbers of X, we cannot merge
- // A and B.
- LiveInterval &SrcInt = li.getInterval(Src);
- // getVNInfoBefore returns NULL for undef copies. In this case, the
- // optimization is still safe.
- if (SrcInt.getVNInfoBefore(OtherVNI->def) !=
- SrcInt.getVNInfoBefore(VNI->def))
- return false;
+ // True when the live range of this value will be pruned because of an
+ // overlapping CR_Replace value in the other live range.
+ bool Pruned;
- DupCopies.push_back(MI);
- return true;
- }
-}
+ // True once Pruned above has been computed.
+ bool PrunedComputed;
-/// joinIntervals - Attempt to join these two intervals. On failure, this
-/// returns false.
-bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
- // Handle physreg joins separately.
- if (CP.isPhys())
- return joinReservedPhysReg(CP);
+ Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
+ RedefVNI(0), OtherVNI(0), IsImplicitDef(false), Pruned(false),
+ PrunedComputed(false) {}
- LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
- DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
- << '\n');
+ bool isAnalyzed() const { return WriteLanes != 0; }
+ };
- // Compute the final value assignment, assuming that the live ranges can be
- // coalesced.
- SmallVector<int, 16> LHSValNoAssignments;
- SmallVector<int, 16> RHSValNoAssignments;
- DenseMap<VNInfo*, VNInfo*> LHSValsDefinedFromRHS;
- DenseMap<VNInfo*, VNInfo*> RHSValsDefinedFromLHS;
- SmallVector<VNInfo*, 16> NewVNInfo;
+ // One entry per value number in LI.
+ SmallVector<Val, 8> Vals;
+
+ unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef);
+ VNInfo *stripCopies(VNInfo *VNI);
+ ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+ void computeAssignment(unsigned ValNo, JoinVals &Other);
+ bool taintExtent(unsigned, unsigned, JoinVals&,
+ SmallVectorImpl<std::pair<SlotIndex, unsigned> >&);
+ bool usesLanes(MachineInstr *MI, unsigned, unsigned, unsigned);
+ bool isPrunedValue(unsigned ValNo, JoinVals &Other);
+
+public:
+ JoinVals(LiveInterval &li, unsigned subIdx,
+ SmallVectorImpl<VNInfo*> &newVNInfo,
+ const CoalescerPair &cp,
+ LiveIntervals *lis,
+ const TargetRegisterInfo *tri)
+ : LI(li), SubIdx(subIdx), NewVNInfo(newVNInfo), CP(cp), LIS(lis),
+ Indexes(LIS->getSlotIndexes()), TRI(tri),
+ Assignments(LI.getNumValNums(), -1), Vals(LI.getNumValNums())
+ {}
+
+ /// Analyze defs in LI and compute a value mapping in NewVNInfo.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool mapValues(JoinVals &Other);
+
+ /// Try to resolve conflicts that require all values to be mapped.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool resolveConflicts(JoinVals &Other);
+
+ /// Prune the live range of values in Other.LI where they would conflict with
+ /// CR_Replace values in LI. Collect end points for restoring the live range
+ /// after joining.
+ void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints);
+
+ /// Erase any machine instructions that have been coalesced away.
+ /// Add erased instructions to ErasedInstrs.
+ /// Add foreign virtual registers to ShrinkRegs if their live range ended at
+ /// the erased instrs.
+ void eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+ SmallVectorImpl<unsigned> &ShrinkRegs);
+
+ /// Get the value assignments suitable for passing to LiveInterval::join.
+ const int *getAssignments() const { return Assignments.data(); }
+};
+} // end anonymous namespace
+
+/// Compute the bitmask of lanes actually written by DefMI.
+/// Set Redef if there are any partial register definitions that depend on the
+/// previous value of the register.
+unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) {
+ unsigned L = 0;
+ for (ConstMIOperands MO(DefMI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || MO->getReg() != LI.reg || !MO->isDef())
+ continue;
+ L |= TRI->getSubRegIndexLaneMask(
+ TRI->composeSubRegIndices(SubIdx, MO->getSubReg()));
+ if (MO->readsReg())
+ Redef = true;
+ }
+ return L;
+}
- SmallVector<MachineInstr*, 8> DupCopies;
- SmallVector<MachineInstr*, 8> DeadCopies;
+/// Find the ultimate value that VNI was copied from.
+VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
+ while (!VNI->isPHIDef()) {
+ MachineInstr *MI = Indexes->getInstructionFromIndex(VNI->def);
+ assert(MI && "No defining instruction");
+ if (!MI->isFullCopy())
+ break;
+ unsigned Reg = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ break;
+ LiveRangeQuery LRQ(LIS->getInterval(Reg), VNI->def);
+ if (!LRQ.valueIn())
+ break;
+ VNI = LRQ.valueIn();
+ }
+ return VNI;
+}
- LiveInterval &LHS = LIS->getOrCreateInterval(CP.getDstReg());
- DEBUG(dbgs() << "\t\tLHS = " << PrintReg(CP.getDstReg(), TRI) << ' ' << LHS
- << '\n');
+/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+/// Return a conflict resolution when possible, but leave the hard cases as
+/// CR_Unresolved.
+/// Recursively calls computeAssignment() on this and Other, guaranteeing that
+/// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+/// The recursion always goes upwards in the dominator tree, making loops
+/// impossible.
+JoinVals::ConflictResolution
+JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ assert(!V.isAnalyzed() && "Value has already been analyzed!");
+ VNInfo *VNI = LI.getValNumInfo(ValNo);
+ if (VNI->isUnused()) {
+ V.WriteLanes = ~0u;
+ return CR_Keep;
+ }
- // Loop over the value numbers of the LHS, seeing if any are defined from
- // the RHS.
- for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
- i != e; ++i) {
- VNInfo *VNI = *i;
- if (VNI->isUnused() || VNI->isPHIDef())
- continue;
- MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def);
- assert(MI && "Missing def");
- if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy?
- continue;
+ // Get the instruction defining this value, compute the lanes written.
+ const MachineInstr *DefMI = 0;
+ if (VNI->isPHIDef()) {
+ // Conservatively assume that all lanes in a PHI are valid.
+ V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+ } else {
+ DefMI = Indexes->getInstructionFromIndex(VNI->def);
+ bool Redef = false;
+ V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+
+ // If this is a read-modify-write instruction, there may be more valid
+ // lanes than the ones written by this instruction.
+ // This only covers partial redef operands. DefMI may have normal use
+ // operands reading the register. They don't contribute valid lanes.
+ //
+ // This adds ssub1 to the set of valid lanes in %src:
+ //
+ // %src:ssub1<def> = FOO
+ //
+ // This leaves only ssub1 valid, making any other lanes undef:
+ //
+ // %src:ssub1<def,read-undef> = FOO %src:ssub2
+ //
+ // The <read-undef> flag on the def operand means that old lane values are
+ // not important.
+ if (Redef) {
+ V.RedefVNI = LiveRangeQuery(LI, VNI->def).valueIn();
+ assert(V.RedefVNI && "Instruction is reading nonexistent value");
+ computeAssignment(V.RedefVNI->id, Other);
+ V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+ }
- // Figure out the value # from the RHS.
- VNInfo *OtherVNI = RHS.getVNInfoBefore(VNI->def);
- // The copy could be to an aliased physreg.
- if (!OtherVNI)
- continue;
+ // An IMPLICIT_DEF writes undef values.
+ if (DefMI->isImplicitDef()) {
+ V.IsImplicitDef = true;
+ V.ValidLanes &= ~V.WriteLanes;
+ }
+ }
- // DstReg is known to be a register in the LHS interval. If the src is
- // from the RHS interval, we can use its value #.
- if (CP.isCoalescable(MI))
- DeadCopies.push_back(MI);
- else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI,
- DupCopies))
- continue;
+ // Find the value in Other that overlaps VNI->def, if any.
+ LiveRangeQuery OtherLRQ(Other.LI, VNI->def);
+
+ // It is possible that both values are defined by the same instruction, or
+ // the values are PHIs defined in the same block. When that happens, the two
+ // values should be merged into one, but not into any preceding value.
+ // The first value defined or visited gets CR_Keep, the other gets CR_Merge.
+ if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
+ assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");
+
+ // One value stays, the other is merged. Keep the earlier one, or the first
+ // one we see.
+ if (OtherVNI->def < VNI->def)
+ Other.computeAssignment(OtherVNI->id, *this);
+ else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
+ // This is an early-clobber def overlapping a live-in value in the other
+ // register. Not mergeable.
+ V.OtherVNI = OtherLRQ.valueIn();
+ return CR_Impossible;
+ }
+ V.OtherVNI = OtherVNI;
+ Val &OtherV = Other.Vals[OtherVNI->id];
+ // Keep this value, check for conflicts when analyzing OtherVNI.
+ if (!OtherV.isAnalyzed())
+ return CR_Keep;
+ // Both sides have been analyzed now.
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Merge;
+ if (V.ValidLanes & OtherV.ValidLanes)
+ // Overlapping lanes can't be resolved.
+ return CR_Impossible;
+ else
+ return CR_Merge;
+ }
- LHSValsDefinedFromRHS[VNI] = OtherVNI;
+ // No simultaneous def. Is Other live at the def?
+ V.OtherVNI = OtherLRQ.valueIn();
+ if (!V.OtherVNI)
+ // No overlap, no conflict.
+ return CR_Keep;
+
+ assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");
+
+ // We have overlapping values, or possibly a kill of Other.
+ // Recursively compute assignments up the dominator tree.
+ Other.computeAssignment(V.OtherVNI->id, *this);
+ const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Replace;
+
+ // Check for simple erasable conflicts.
+ if (DefMI->isImplicitDef())
+ return CR_Erase;
+
+ // Include the non-conflict where DefMI is a coalescable copy that kills
+ // OtherVNI. We still want the copy erased and value numbers merged.
+ if (CP.isCoalescable(DefMI)) {
+ // Some of the lanes copied from OtherVNI may be undef, making them undef
+ // here too.
+ V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
+ return CR_Erase;
}
- // Loop over the value numbers of the RHS, seeing if any are defined from
- // the LHS.
- for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end();
- i != e; ++i) {
- VNInfo *VNI = *i;
- if (VNI->isUnused() || VNI->isPHIDef())
- continue;
- MachineInstr *MI = LIS->getInstructionFromIndex(VNI->def);
- assert(MI && "Missing def");
- if (!MI->isCopyLike() && !MI->isImplicitDef()) // Src not defined by a copy?
- continue;
+ // This may not be a real conflict if DefMI simply kills Other and defines
+ // VNI.
+ if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
+ return CR_Keep;
+
+ // Handle the case where VNI and OtherVNI can be proven to be identical:
+ //
+ // %other = COPY %ext
+ // %this = COPY %ext <-- Erase this copy
+ //
+ if (DefMI->isFullCopy() && !CP.isPartial() &&
+ stripCopies(VNI) == stripCopies(V.OtherVNI))
+ return CR_Erase;
+
+ // If the lanes written by this instruction were all undef in OtherVNI, it is
+ // still safe to join the live ranges. This can't be done with a simple value
+ // mapping, though - OtherVNI will map to multiple values:
+ //
+ // 1 %dst:ssub0 = FOO <-- OtherVNI
+ // 2 %src = BAR <-- VNI
+ // 3 %dst:ssub1 = COPY %src<kill> <-- Eliminate this copy.
+ // 4 BAZ %dst<kill>
+ // 5 QUUX %src<kill>
+ //
+ // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
+ // handles this complex value mapping.
+ if ((V.WriteLanes & OtherV.ValidLanes) == 0)
+ return CR_Replace;
+
+ // If the other live range is killed by DefMI and the live ranges are still
+ // overlapping, it must be because we're looking at an early clobber def:
+ //
+ // %dst<def,early-clobber> = ASM %src<kill>
+ //
+ // In this case, it is illegal to merge the two live ranges since the early
+ // clobber def would clobber %src before it was read.
+ if (OtherLRQ.isKill()) {
+ // This case where the def doesn't overlap the kill is handled above.
+ assert(VNI->def.isEarlyClobber() &&
+ "Only early clobber defs can overlap a kill");
+ return CR_Impossible;
+ }
- // Figure out the value # from the LHS.
- VNInfo *OtherVNI = LHS.getVNInfoBefore(VNI->def);
- // The copy could be to an aliased physreg.
- if (!OtherVNI)
- continue;
+ // VNI is clobbering live lanes in OtherVNI, but there is still the
+ // possibility that no instructions actually read the clobbered lanes.
+ // If we're clobbering all the lanes in OtherVNI, at least one must be read.
+ // Otherwise Other.LI wouldn't be live here.
+ if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
+ return CR_Impossible;
+
+ // We need to verify that no instructions are reading the clobbered lanes. To
+ // save compile time, we'll only check that locally. Don't allow the tainted
+ // value to escape the basic block.
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
+ return CR_Impossible;
+
+ // There are still some things that could go wrong besides clobbered lanes
+ // being read, for example OtherVNI may be only partially redefined in MBB,
+ // and some clobbered lanes could escape the block. Save this analysis for
+ // resolveConflicts() when all values have been mapped. We need to know
+ // RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
+ // that now - the recursive analyzeValue() calls must go upwards in the
+ // dominator tree.
+ return CR_Unresolved;
+}
- // DstReg is known to be a register in the RHS interval. If the src is
- // from the LHS interval, we can use its value #.
- if (CP.isCoalescable(MI))
- DeadCopies.push_back(MI);
- else if (!RegistersDefinedFromSameValue(*LIS, *TRI, CP, VNI, OtherVNI,
- DupCopies))
- continue;
+/// Compute the value assignment for ValNo in LI.
+/// This may be called recursively by analyzeValue(), but never for a ValNo on
+/// the stack.
+void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.isAnalyzed()) {
+ // Recursion should always move up the dominator tree, so ValNo is not
+ // supposed to reappear before it has been assigned.
+ assert(Assignments[ValNo] != -1 && "Bad recursion?");
+ return;
+ }
+ switch ((V.Resolution = analyzeValue(ValNo, Other))) {
+ case CR_Erase:
+ case CR_Merge:
+ // Merge this ValNo into OtherVNI.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
+ assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
+ Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
+ DEBUG(dbgs() << "\t\tmerge " << PrintReg(LI.reg) << ':' << ValNo << '@'
+ << LI.getValNumInfo(ValNo)->def << " into "
+ << PrintReg(Other.LI.reg) << ':' << V.OtherVNI->id << '@'
+ << V.OtherVNI->def << " --> @"
+ << NewVNInfo[Assignments[ValNo]]->def << '\n');
+ break;
+ case CR_Replace:
+ case CR_Unresolved:
+ // The other value is going to be pruned if this join is successful.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
+ Other.Vals[V.OtherVNI->id].Pruned = true;
+ // Fall through.
+ default:
+ // This value number needs to go in the final joined live range.
+ Assignments[ValNo] = NewVNInfo.size();
+ NewVNInfo.push_back(LI.getValNumInfo(ValNo));
+ break;
+ }
+}
- RHSValsDefinedFromLHS[VNI] = OtherVNI;
+bool JoinVals::mapValues(JoinVals &Other) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ computeAssignment(i, Other);
+ if (Vals[i].Resolution == CR_Impossible) {
+ DEBUG(dbgs() << "\t\tinterference at " << PrintReg(LI.reg) << ':' << i
+ << '@' << LI.getValNumInfo(i)->def << '\n');
+ return false;
+ }
}
+ return true;
+}
- LHSValNoAssignments.resize(LHS.getNumValNums(), -1);
- RHSValNoAssignments.resize(RHS.getNumValNums(), -1);
- NewVNInfo.reserve(LHS.getNumValNums() + RHS.getNumValNums());
+/// Assuming ValNo is going to clobber some valid lanes in Other.LI, compute
+/// the extent of the tainted lanes in the block.
+///
+/// Multiple values in Other.LI can be affected since partial redefinitions can
+/// preserve previously tainted lanes.
+///
+/// 1 %dst = VLOAD <-- Define all lanes in %dst
+/// 2 %src = FOO <-- ValNo to be joined with %dst:ssub0
+/// 3 %dst:ssub1 = BAR <-- Partial redef doesn't clear taint in ssub0
+/// 4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+///
+/// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+/// entry to TaintedVals.
+///
+/// Returns false if the tainted lanes extend beyond the basic block.
+bool JoinVals::
+taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
+ SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) {
+ VNInfo *VNI = LI.getValNumInfo(ValNo);
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
+
+ // Scan Other.LI from VNI.def to MBBEnd.
+ LiveInterval::iterator OtherI = Other.LI.find(VNI->def);
+ assert(OtherI != Other.LI.end() && "No conflict?");
+ do {
+ // OtherI is pointing to a tainted value. Abort the join if the tainted
+ // lanes escape the block.
+ SlotIndex End = OtherI->end;
+ if (End >= MBBEnd) {
+ DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.LI.reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << '\n');
+ return false;
+ }
+ DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.LI.reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start
+ << " to " << End << '\n');
+ // A dead def is not a problem.
+ if (End.isDead())
+ break;
+ TaintExtent.push_back(std::make_pair(End, TaintedLanes));
+
+ // Check for another def in the MBB.
+ if (++OtherI == Other.LI.end() || OtherI->start >= MBBEnd)
+ break;
+
+ // Lanes written by the new def are no longer tainted.
+ const Val &OV = Other.Vals[OtherI->valno->id];
+ TaintedLanes &= ~OV.WriteLanes;
+ if (!OV.RedefVNI)
+ break;
+ } while (TaintedLanes);
+ return true;
+}
- for (LiveInterval::vni_iterator i = LHS.vni_begin(), e = LHS.vni_end();
- i != e; ++i) {
- VNInfo *VNI = *i;
- unsigned VN = VNI->id;
- if (LHSValNoAssignments[VN] >= 0 || VNI->isUnused())
+/// Return true if MI uses any of the given Lanes from Reg.
+/// This does not include partial redefinitions of Reg.
+bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx,
+ unsigned Lanes) {
+ if (MI->isDebugValue())
+ return false;
+ for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || MO->isDef() || MO->getReg() != Reg)
continue;
- ComputeUltimateVN(VNI, NewVNInfo,
- LHSValsDefinedFromRHS, RHSValsDefinedFromLHS,
- LHSValNoAssignments, RHSValNoAssignments);
- }
- for (LiveInterval::vni_iterator i = RHS.vni_begin(), e = RHS.vni_end();
- i != e; ++i) {
- VNInfo *VNI = *i;
- unsigned VN = VNI->id;
- if (RHSValNoAssignments[VN] >= 0 || VNI->isUnused())
+ if (!MO->readsReg())
continue;
- // If this value number isn't a copy from the LHS, it's a new number.
- if (RHSValsDefinedFromLHS.find(VNI) == RHSValsDefinedFromLHS.end()) {
- NewVNInfo.push_back(VNI);
- RHSValNoAssignments[VN] = NewVNInfo.size()-1;
+ if (Lanes & TRI->getSubRegIndexLaneMask(
+ TRI->composeSubRegIndices(SubIdx, MO->getSubReg())))
+ return true;
+ }
+ return false;
+}
+
+bool JoinVals::resolveConflicts(JoinVals &Other) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ assert(V.Resolution != CR_Impossible && "Unresolvable conflict");
+ if (V.Resolution != CR_Unresolved)
continue;
- }
+ DEBUG(dbgs() << "\t\tconflict at " << PrintReg(LI.reg) << ':' << i
+ << '@' << LI.getValNumInfo(i)->def << '\n');
+ ++NumLaneConflicts;
+ assert(V.OtherVNI && "Inconsistent conflict resolution.");
+ VNInfo *VNI = LI.getValNumInfo(i);
+ const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
+ // join, those lanes will be tainted with a wrong value. Get the extent of
+ // the tainted lanes.
+ unsigned TaintedLanes = V.WriteLanes & OtherV.ValidLanes;
+ SmallVector<std::pair<SlotIndex, unsigned>, 8> TaintExtent;
+ if (!taintExtent(i, TaintedLanes, Other, TaintExtent))
+ // Tainted lanes would extend beyond the basic block.
+ return false;
- ComputeUltimateVN(VNI, NewVNInfo,
- RHSValsDefinedFromLHS, LHSValsDefinedFromRHS,
- RHSValNoAssignments, LHSValNoAssignments);
- }
+ assert(!TaintExtent.empty() && "There should be at least one conflict.");
- // Armed with the mappings of LHS/RHS values to ultimate values, walk the
- // interval lists to see if these intervals are coalescable.
- LiveInterval::const_iterator I = LHS.begin();
- LiveInterval::const_iterator IE = LHS.end();
- LiveInterval::const_iterator J = RHS.begin();
- LiveInterval::const_iterator JE = RHS.end();
-
- // Collect interval end points that will no longer be kills.
- SmallVector<MachineInstr*, 8> LHSOldKills;
- SmallVector<MachineInstr*, 8> RHSOldKills;
-
- // Skip ahead until the first place of potential sharing.
- if (I != IE && J != JE) {
- if (I->start < J->start) {
- I = std::upper_bound(I, IE, J->start);
- if (I != LHS.begin()) --I;
- } else if (J->start < I->start) {
- J = std::upper_bound(J, JE, I->start);
- if (J != RHS.begin()) --J;
+ // Now look at the instructions from VNI->def to TaintExtent (inclusive).
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ MachineBasicBlock::iterator MI = MBB->begin();
+ if (!VNI->isPHIDef()) {
+ MI = Indexes->getInstructionFromIndex(VNI->def);
+ // No need to check the instruction defining VNI for reads.
+ ++MI;
}
- }
-
- while (I != IE && J != JE) {
- // Determine if these two live ranges overlap.
- // If so, check value # info to determine if they are really different.
- if (I->end > J->start && J->end > I->start) {
- // If the live range overlap will map to the same value number in the
- // result liverange, we can still coalesce them. If not, we can't.
- if (LHSValNoAssignments[I->valno->id] !=
- RHSValNoAssignments[J->valno->id])
+ assert(!SlotIndex::isSameInstr(VNI->def, TaintExtent.front().first) &&
+ "Interference ends on VNI->def. Should have been handled earlier");
+ MachineInstr *LastMI =
+ Indexes->getInstructionFromIndex(TaintExtent.front().first);
+ assert(LastMI && "Range must end at a proper instruction");
+ unsigned TaintNum = 0;
+ for (;;) {
+ assert(MI != MBB->end() && "Bad LastMI");
+ if (usesLanes(MI, Other.LI.reg, Other.SubIdx, TaintedLanes)) {
+ DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
return false;
-
- // Extended live ranges should no longer be killed.
- if (!I->end.isBlock() && I->end < J->end)
- if (MachineInstr *MI = LIS->getInstructionFromIndex(I->end))
- LHSOldKills.push_back(MI);
- if (!J->end.isBlock() && J->end < I->end)
- if (MachineInstr *MI = LIS->getInstructionFromIndex(J->end))
- RHSOldKills.push_back(MI);
+ }
+ // LastMI is the last instruction to use the current value.
+ if (&*MI == LastMI) {
+ if (++TaintNum == TaintExtent.size())
+ break;
+ LastMI = Indexes->getInstructionFromIndex(TaintExtent[TaintNum].first);
+ assert(LastMI && "Range must end at a proper instruction");
+ TaintedLanes = TaintExtent[TaintNum].second;
+ }
+ ++MI;
}
- if (I->end < J->end)
- ++I;
- else
- ++J;
- }
-
- // Clear kill flags where live ranges are extended.
- while (!LHSOldKills.empty())
- LHSOldKills.pop_back_val()->clearRegisterKills(LHS.reg, TRI);
- while (!RHSOldKills.empty())
- RHSOldKills.pop_back_val()->clearRegisterKills(RHS.reg, TRI);
-
- if (LHSValNoAssignments.empty())
- LHSValNoAssignments.push_back(-1);
- if (RHSValNoAssignments.empty())
- RHSValNoAssignments.push_back(-1);
-
- // Now erase all the redundant copies.
- for (unsigned i = 0, e = DeadCopies.size(); i != e; ++i) {
- MachineInstr *MI = DeadCopies[i];
- if (!ErasedInstrs.insert(MI))
- continue;
- DEBUG(dbgs() << "\t\terased:\t" << LIS->getInstructionIndex(MI)
- << '\t' << *MI);
- LIS->RemoveMachineInstrFromMaps(MI);
- MI->eraseFromParent();
+ // The tainted lanes are unused.
+ V.Resolution = CR_Replace;
+ ++NumLaneResolves;
}
+ return true;
+}
- SmallVector<unsigned, 8> SourceRegisters;
- for (SmallVector<MachineInstr*, 8>::iterator I = DupCopies.begin(),
- E = DupCopies.end(); I != E; ++I) {
- MachineInstr *MI = *I;
- if (!ErasedInstrs.insert(MI))
- continue;
+// Determine if ValNo is a copy of a value number in LI or Other.LI that will
+// be pruned:
+//
+// %dst = COPY %src
+// %src = COPY %dst <-- This value is to be pruned.
+// %dst = COPY %src <-- This value is a copy of a pruned value.
+//
+bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.Pruned || V.PrunedComputed)
+ return V.Pruned;
+
+ if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
+ return V.Pruned;
+
+ // Follow copies up the dominator tree and check if any intermediate value
+ // has been pruned.
+ V.PrunedComputed = true;
+ V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
+ return V.Pruned;
+}
- // If MI is a copy, then we have pretended that the assignment to B in
- // A = X
- // B = X
- // was actually a copy from A. Now that we decided to coalesce A and B,
- // transform the code into
- // A = X
- // In the case of the implicit_def, we just have to remove it.
- if (!MI->isImplicitDef()) {
- unsigned Src = MI->getOperand(1).getReg();
- SourceRegisters.push_back(Src);
+void JoinVals::pruneValues(JoinVals &Other,
+ SmallVectorImpl<SlotIndex> &EndPoints) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ SlotIndex Def = LI.getValNumInfo(i)->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep:
+ break;
+ case CR_Replace: {
+ // This value takes precedence over the value in Other.LI.
+ LIS->pruneValue(&Other.LI, Def, &EndPoints);
+ // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
+ // instructions are only inserted to provide a live-out value for PHI
+ // predecessors, so the instruction should simply go away once its value
+ // has been replaced.
+ Val &OtherV = Other.Vals[Vals[i].OtherVNI->id];
+ bool EraseImpDef = OtherV.IsImplicitDef && OtherV.Resolution == CR_Keep;
+ if (!Def.isBlock()) {
+ // Remove <def,read-undef> flags. This def is now a partial redef.
+ // Also remove <def,dead> flags since the joined live range will
+ // continue past this instruction.
+ for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
+ MO.isValid(); ++MO)
+ if (MO->isReg() && MO->isDef() && MO->getReg() == LI.reg) {
+ MO->setIsUndef(EraseImpDef);
+ MO->setIsDead(false);
+ }
+ // This value will reach instructions below, but we need to make sure
+ // the live range also reaches the instruction at Def.
+ if (!EraseImpDef)
+ EndPoints.push_back(Def);
+ }
+ DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def
+ << ": " << Other.LI << '\n');
+ break;
+ }
+ case CR_Erase:
+ case CR_Merge:
+ if (isPrunedValue(i, Other)) {
+ // This value is ultimately a copy of a pruned value in LI or Other.LI.
+ // We can no longer trust the value mapping computed by
+ // computeAssignment(); the value that was originally copied could have
+ // been replaced.
+ LIS->pruneValue(&LI, Def, &EndPoints);
+ DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(LI.reg) << " at "
+ << Def << ": " << LI << '\n');
+ }
+ break;
+ case CR_Unresolved:
+ case CR_Impossible:
+ llvm_unreachable("Unresolved conflicts");
}
- LIS->RemoveMachineInstrFromMaps(MI);
- MI->eraseFromParent();
}
+}
- // If B = X was the last use of X in a liverange, we have to shrink it now
- // that B = X is gone.
- for (SmallVector<unsigned, 8>::iterator I = SourceRegisters.begin(),
- E = SourceRegisters.end(); I != E; ++I) {
- LIS->shrinkToUses(&LIS->getInterval(*I));
+void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+ SmallVectorImpl<unsigned> &ShrinkRegs) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ // Get the def location before markUnused() below invalidates it.
+ SlotIndex Def = LI.getValNumInfo(i)->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep:
+ // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
+ // longer. The IMPLICIT_DEF instructions are only inserted by
+ // PHIElimination to guarantee that all PHI predecessors have a value.
+ if (!Vals[i].IsImplicitDef || !Vals[i].Pruned)
+ break;
+ // Remove value number i from LI. Note that this VNInfo is still present
+ // in NewVNInfo, so it will appear as an unused value number in the final
+ // joined interval.
+ LI.getValNumInfo(i)->markUnused();
+ LI.removeValNo(LI.getValNumInfo(i));
+ DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LI << '\n');
+ // FALL THROUGH.
+
+ case CR_Erase: {
+ MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+ assert(MI && "No instruction to erase");
+ if (MI->isCopy()) {
+ unsigned Reg = MI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ Reg != CP.getSrcReg() && Reg != CP.getDstReg())
+ ShrinkRegs.push_back(Reg);
+ }
+ ErasedInstrs.insert(MI);
+ DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ break;
+ }
+ default:
+ break;
+ }
}
+}
+
+bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
+ SmallVector<VNInfo*, 16> NewVNInfo;
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
+ JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
+ JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
+
+ DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
+ << "\n\t\tLHS = " << PrintReg(CP.getDstReg()) << ' ' << LHS
+ << '\n');
+
+ // First compute NewVNInfo and the simple value mappings.
+ // Detect impossible conflicts early.
+ if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+ return false;
+
+ // Some conflicts can only be resolved after all values have been mapped.
+ if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+ return false;
- // If we get here, we know that we can coalesce the live ranges. Ask the
- // intervals to coalesce themselves now.
- LHS.join(RHS, &LHSValNoAssignments[0], &RHSValNoAssignments[0], NewVNInfo,
+ // All clear, the live ranges can be merged.
+
+ // The merging algorithm in LiveInterval::join() can't handle conflicting
+ // value mappings, so we need to remove any live ranges that overlap a
+ // CR_Replace resolution. Collect a set of end points that can be used to
+ // restore the live range after joining.
+ SmallVector<SlotIndex, 8> EndPoints;
+ LHSVals.pruneValues(RHSVals, EndPoints);
+ RHSVals.pruneValues(LHSVals, EndPoints);
+
+ // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
+ // registers to require trimming.
+ SmallVector<unsigned, 8> ShrinkRegs;
+ LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ while (!ShrinkRegs.empty())
+ LIS->shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
+
+ // Join RHS into LHS.
+ LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo,
MRI);
+
+ // Kill flags are going to be wrong if the live ranges were overlapping.
+ // Eventually, we should simply clear all kill flags when computing live
+ // ranges. They are reinserted after register allocation.
+ MRI->clearKillFlags(LHS.reg);
+ MRI->clearKillFlags(RHS.reg);
+
+ if (EndPoints.empty())
+ return true;
+
+ // Recompute the parts of the live range we had to remove because of
+ // CR_Replace conflicts.
+ DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+ << " points: " << LHS << '\n');
+ LIS->extendToIndices(&LHS, EndPoints);
return true;
}
+/// joinIntervals - Attempt to join these two intervals. On failure, this
+/// returns false.
+bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
+ return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
+}
+
namespace {
// DepthMBBCompare - Comparison predicate that sorts first based on the loop
// depth of the basic block (the unsigned), and then on the MBB number.
@@ -1564,8 +2024,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
Loops = &getAnalysis<MachineLoopInfo>();
DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
- << "********** Function: "
- << ((Value*)MF->getFunction())->getName() << '\n');
+ << "********** Function: " << MF->getName() << '\n');
if (VerifyCoalescing)
MF->verify(this, "Before register coalescing");
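
To make the lane-taint bookkeeping in taintExtent()/resolveConflicts() above easier to follow, here is a minimal standalone C++ sketch. Every name in it (Def, resolveLaneConflict, the bitmask layout) is made up for illustration and is not LLVM API: a partial redef clears only the lanes it writes from the tainted set, and the join has to be aborted only if some later instruction reads a lane that is still tainted.

// Hypothetical sketch of lane-taint tracking; not part of the patch above.
#include <cstdint>
#include <cstdio>
#include <vector>

struct Def { uint32_t WrittenLanes; uint32_t ReadLanes; };

// Returns true if the clobbering value is harmless, i.e. no instruction reads
// a lane that is still tainted before that lane gets rewritten.
static bool resolveLaneConflict(uint32_t TaintedLanes,
                                const std::vector<Def> &LaterInstrs) {
  for (const Def &D : LaterInstrs) {
    if (D.ReadLanes & TaintedLanes)
      return false;                  // A tainted lane is actually read: abort.
    TaintedLanes &= ~D.WrittenLanes; // Lanes overwritten by a redef are clean.
    if (!TaintedLanes)
      return true;                   // Every tainted lane has been rewritten.
  }
  return TaintedLanes == 0;
}

int main() {
  // ssub0 = bit 0, ssub1 = bit 1. The clobber taints ssub0 only.
  std::vector<Def> Instrs = {
    {/*WrittenLanes=*/0x2, /*ReadLanes=*/0x0},  // %dst:ssub1 = BAR
    {/*WrittenLanes=*/0x1, /*ReadLanes=*/0x0},  // %dst:ssub0 = COPY %src
  };
  std::printf("conflict resolved: %d\n", resolveLaneConflict(0x1, Instrs));
  return 0;
}

With the two instructions from the comment's example (%dst:ssub1 = BAR, then %dst:ssub0 = COPY %src) the conflict resolves, because ssub0 is rewritten before anything reads it.
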
diff --git a/lib/CodeGen/RegisterCoalescer.h b/lib/CodeGen/RegisterCoalescer.h
index 8a6df988f1bb..47c3df14606d 100644
--- a/lib/CodeGen/RegisterCoalescer.h
+++ b/lib/CodeGen/RegisterCoalescer.h
@@ -63,6 +63,13 @@ namespace llvm {
: TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0),
Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
+ /// Create a CoalescerPair representing a virtreg-to-physreg copy.
+ /// No need to call setRegisters().
+ CoalescerPair(unsigned VirtReg, unsigned PhysReg,
+ const TargetRegisterInfo &tri)
+ : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0),
+ Partial(false), CrossClass(false), Flipped(false), NewRC(0) {}
+
/// setRegisters - set registers to match the copy instruction MI. Return
/// false if MI is not a coalescable copy instruction.
bool setRegisters(const MachineInstr*);
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 43448c850a0b..543c426458d7 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -63,7 +63,8 @@ void RegisterPressure::decrease(const TargetRegisterClass *RC,
decreaseSetPressure(MaxSetPressure, RC, TRI);
}
-void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
dbgs() << "Live In: ";
for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
@@ -78,6 +79,7 @@ void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
<< '\n';
}
}
+#endif
/// Increase the current pressure as impacted by these physical registers and
/// bump the high water mark if needed.
@@ -320,10 +322,8 @@ struct RegisterOperands {
if (findReg(MO.getReg(), isVReg, DeadDefs, TRI) == DeadDefs.end())
DeadDefs.push_back(MO.getReg());
}
- else {
- if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
- Defs.push_back(MO.getReg());
- }
+ else if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
+ Defs.push_back(MO.getReg());
}
}
};
@@ -335,7 +335,7 @@ static void collectOperands(const MachineInstr *MI,
PhysRegOperands &PhysRegOpers,
VirtRegOperands &VirtRegOpers,
const TargetRegisterInfo *TRI,
- const RegisterClassInfo *RCI) {
+ const MachineRegisterInfo *MRI) {
for(ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI) {
const MachineOperand &MO = *OperI;
if (!MO.isReg() || !MO.getReg())
@@ -343,7 +343,7 @@ static void collectOperands(const MachineInstr *MI,
if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
VirtRegOpers.collect(MO, TRI);
- else if (RCI->isAllocatable(MO.getReg()))
+ else if (MRI->isAllocatable(MO.getReg()))
PhysRegOpers.collect(MO, TRI);
}
// Remove redundant physreg dead defs.
@@ -449,7 +449,7 @@ bool RegPressureTracker::recede() {
PhysRegOperands PhysRegOpers;
VirtRegOperands VirtRegOpers;
- collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI);
+ collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, MRI);
// Boost pressure for all dead defs together.
increasePhysRegPressure(PhysRegOpers.DeadDefs);
@@ -522,7 +522,7 @@ bool RegPressureTracker::advance() {
PhysRegOperands PhysRegOpers;
VirtRegOperands VirtRegOpers;
- collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, RCI);
+ collectOperands(CurrPos, PhysRegOpers, VirtRegOpers, TRI, MRI);
// Kill liveness at last uses.
for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
@@ -664,7 +664,7 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
PhysRegOperands PhysRegOpers;
VirtRegOperands VirtRegOpers;
- collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI);
+ collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, MRI);
// Boost max pressure for all dead defs together.
// Since CurrSetPressure and MaxSetPressure
@@ -674,9 +674,16 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
// Kill liveness at live defs.
- decreasePhysRegPressure(PhysRegOpers.Defs);
- decreaseVirtRegPressure(VirtRegOpers.Defs);
-
+ for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) {
+ unsigned Reg = PhysRegOpers.Defs[i];
+ if (!findReg(Reg, false, PhysRegOpers.Uses, TRI))
+ decreasePhysRegPressure(PhysRegOpers.Defs);
+ }
+ for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) {
+ unsigned Reg = VirtRegOpers.Defs[i];
+ if (!findReg(Reg, true, VirtRegOpers.Uses, TRI))
+ decreaseVirtRegPressure(VirtRegOpers.Defs);
+ }
// Generate liveness for uses.
for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
unsigned Reg = PhysRegOpers.Uses[i];
@@ -750,7 +757,7 @@ void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
// Account for register pressure similar to RegPressureTracker::recede().
PhysRegOperands PhysRegOpers;
VirtRegOperands VirtRegOpers;
- collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, RCI);
+ collectOperands(MI, PhysRegOpers, VirtRegOpers, TRI, MRI);
// Kill liveness at last uses. Assume allocatable physregs are single-use
// rather than checking LiveIntervals.
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index d673794e1b93..5ec6564ce398 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -92,9 +92,6 @@ void RegScavenger::enterBasicBlock(MachineBasicBlock *mbb) {
KillRegs.resize(NumPhysRegs);
DefRegs.resize(NumPhysRegs);
- // Create reserved registers bitvector.
- ReservedRegs = TRI->getReservedRegs(MF);
-
// Create callee-saved registers bitvector.
CalleeSavedRegs.resize(NumPhysRegs);
const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF);
@@ -225,9 +222,9 @@ void RegScavenger::getRegsUsed(BitVector &used, bool includeReserved) {
used = RegsAvailable;
used.flip();
if (includeReserved)
- used |= ReservedRegs;
+ used |= MRI->getReservedRegs();
else
- used.reset(ReservedRegs);
+ used.reset(MRI->getReservedRegs());
}
unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 752f8e408042..9a6507100170 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -279,6 +279,7 @@ void SUnit::ComputeHeight() {
} while (!WorkList.empty());
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// SUnit - Scheduling unit. It's a wrapper around either a single SDNode or
/// a group of nodes flagged together.
void SUnit::dump(const ScheduleDAG *G) const {
@@ -336,6 +337,7 @@ void SUnit::dumpAll(const ScheduleDAG *G) const {
}
dbgs() << "\n";
}
+#endif
#ifndef NDEBUG
/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 9c1dba355b48..a4d4a93e6dd5 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDAGILP.h"
#include "llvm/CodeGen/ScheduleDAGInstrs.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetMachine.h"
@@ -30,6 +31,7 @@
#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -44,14 +46,15 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
const MachineDominatorTree &mdt,
bool IsPostRAFlag,
LiveIntervals *lis)
- : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()),
- InstrItins(mf.getTarget().getInstrItineraryData()), LIS(lis),
- IsPostRA(IsPostRAFlag), UnitLatencies(false), CanHandleTerminators(false),
- LoopRegs(MDT), FirstDbgValue(0) {
+ : ScheduleDAG(mf), MLI(mli), MDT(mdt), MFI(mf.getFrameInfo()), LIS(lis),
+ IsPostRA(IsPostRAFlag), CanHandleTerminators(false), FirstDbgValue(0) {
assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
DbgValues.clear();
assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
"Virtual registers must be removed prior to PostRA scheduling");
+
+ const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+ SchedModel.init(*ST.getSchedModel(), &ST, TII);
}
/// getUnderlyingObjectFromInt - This is the function that does the work of
@@ -68,7 +71,7 @@ static const Value *getUnderlyingObjectFromInt(const Value *V) {
// object. We don't have to worry about the case where the
// object address is somehow being computed by the multiply,
// because our callers only care when the result is an
- // identifibale object.
+ // identifiable object.
if (U->getOpcode() != Instruction::Add ||
(!isa<ConstantInt>(U->getOperand(1)) &&
Operator::getOpcode(U->getOperand(1)) != Instruction::Mul))
@@ -135,10 +138,6 @@ static const Value *getUnderlyingObjectForInstr(const MachineInstr *MI,
void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
BB = bb;
- LoopRegs.Deps.clear();
- if (MachineLoop *ML = MLI.getLoopFor(BB))
- if (BB == ML->getLoopLatch())
- LoopRegs.VisitLoop(ML);
}
void ScheduleDAGInstrs::finishBlock() {
@@ -174,9 +173,6 @@ void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
EndIndex = endcount;
MISUnitMap.clear();
- // Check to see if the scheduler cares about latencies.
- UnitLatencies = forceUnitLatencies();
-
ScheduleDAG::clearDAG();
}
@@ -209,7 +205,7 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
if (Reg == 0) continue;
if (TRI->isPhysicalRegister(Reg))
- Uses[Reg].push_back(&ExitSU);
+ Uses[Reg].push_back(PhysRegSUOper(&ExitSU, -1));
else {
assert(!IsPostRA && "Virtual register encountered after regalloc.");
addVRegUseDeps(&ExitSU, i);
@@ -225,59 +221,44 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
E = (*SI)->livein_end(); I != E; ++I) {
unsigned Reg = *I;
if (!Uses.contains(Reg))
- Uses[Reg].push_back(&ExitSU);
+ Uses[Reg].push_back(PhysRegSUOper(&ExitSU, -1));
}
}
}
/// MO is an operand of SU's instruction that defines a physical register. Add
/// data dependencies from SU to any uses of the physical register.
-void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU,
- const MachineOperand &MO) {
+void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
+ const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
assert(MO.isDef() && "expect physreg def");
// Ask the target if address-backscheduling is desirable, and if so how much.
const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
- unsigned SpecialAddressLatency = ST.getSpecialAddressLatency();
- unsigned DataLatency = SU->Latency;
for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
Alias.isValid(); ++Alias) {
if (!Uses.contains(*Alias))
continue;
- std::vector<SUnit*> &UseList = Uses[*Alias];
+ std::vector<PhysRegSUOper> &UseList = Uses[*Alias];
for (unsigned i = 0, e = UseList.size(); i != e; ++i) {
- SUnit *UseSU = UseList[i];
+ SUnit *UseSU = UseList[i].SU;
if (UseSU == SU)
continue;
- unsigned LDataLatency = DataLatency;
- // Optionally add in a special extra latency for nodes that
- // feed addresses.
- // TODO: Perhaps we should get rid of
- // SpecialAddressLatency and just move this into
- // adjustSchedDependency for the targets that care about it.
- if (SpecialAddressLatency != 0 && !UnitLatencies &&
- UseSU != &ExitSU) {
- MachineInstr *UseMI = UseSU->getInstr();
- const MCInstrDesc &UseMCID = UseMI->getDesc();
- int RegUseIndex = UseMI->findRegisterUseOperandIdx(*Alias);
- assert(RegUseIndex >= 0 && "UseMI doesn't use register!");
- if (RegUseIndex >= 0 &&
- (UseMI->mayLoad() || UseMI->mayStore()) &&
- (unsigned)RegUseIndex < UseMCID.getNumOperands() &&
- UseMCID.OpInfo[RegUseIndex].isLookupPtrRegClass())
- LDataLatency += SpecialAddressLatency;
- }
- // Adjust the dependence latency using operand def/use
- // information (if any), and then allow the target to
- // perform its own adjustments.
- SDep dep(SU, SDep::Data, LDataLatency, *Alias);
- if (!UnitLatencies) {
- unsigned Latency = computeOperandLatency(SU, UseSU, dep);
- dep.setLatency(Latency);
-
- ST.adjustSchedDependency(SU, UseSU, dep);
- }
+
+ SDep dep(SU, SDep::Data, *Alias);
+
+ // Adjust the dependence latency using operand def/use information,
+ // then allow the target to perform its own adjustments.
+ int UseOp = UseList[i].OpIdx;
+ MachineInstr *RegUse = UseOp < 0 ? 0 : UseSU->getInstr();
+ dep.setLatency(
+ SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+ RegUse, UseOp, /*FindMin=*/false));
+ dep.setMinLatency(
+ SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+ RegUse, UseOp, /*FindMin=*/true));
+
+ ST.adjustSchedDependency(SU, UseSU, dep);
UseSU->addPred(dep);
}
}
@@ -301,20 +282,23 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
Alias.isValid(); ++Alias) {
if (!Defs.contains(*Alias))
continue;
- std::vector<SUnit *> &DefList = Defs[*Alias];
+ std::vector<PhysRegSUOper> &DefList = Defs[*Alias];
for (unsigned i = 0, e = DefList.size(); i != e; ++i) {
- SUnit *DefSU = DefList[i];
+ SUnit *DefSU = DefList[i].SU;
if (DefSU == &ExitSU)
continue;
if (DefSU != SU &&
(Kind != SDep::Output || !MO.isDead() ||
!DefSU->getInstr()->registerDefIsDead(*Alias))) {
if (Kind == SDep::Anti)
- DefSU->addPred(SDep(SU, Kind, 0, /*Reg=*/*Alias));
+ DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
else {
- unsigned AOLat = TII->getOutputLatency(InstrItins, MI, OperIdx,
- DefSU->getInstr());
- DefSU->addPred(SDep(SU, Kind, AOLat, /*Reg=*/*Alias));
+ SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ unsigned OutLatency =
+ SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
+ Dep.setMinLatency(OutLatency);
+ Dep.setLatency(OutLatency);
+ DefSU->addPred(Dep);
}
}
}
@@ -324,61 +308,14 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
// retrieve the existing SUnits list for this register's uses.
// Push this SUnit on the use list.
- Uses[MO.getReg()].push_back(SU);
+ Uses[MO.getReg()].push_back(PhysRegSUOper(SU, OperIdx));
}
else {
- addPhysRegDataDeps(SU, MO);
+ addPhysRegDataDeps(SU, OperIdx);
// Either insert a new Reg2SUnits entry with an empty SUnits list, or
// retrieve the existing SUnits list for this register's defs.
- std::vector<SUnit *> &DefList = Defs[MO.getReg()];
-
- // If a def is going to wrap back around to the top of the loop,
- // backschedule it.
- if (!UnitLatencies && DefList.empty()) {
- LoopDependencies::LoopDeps::iterator I = LoopRegs.Deps.find(MO.getReg());
- if (I != LoopRegs.Deps.end()) {
- const MachineOperand *UseMO = I->second.first;
- unsigned Count = I->second.second;
- const MachineInstr *UseMI = UseMO->getParent();
- unsigned UseMOIdx = UseMO - &UseMI->getOperand(0);
- const MCInstrDesc &UseMCID = UseMI->getDesc();
- const TargetSubtargetInfo &ST =
- TM.getSubtarget<TargetSubtargetInfo>();
- unsigned SpecialAddressLatency = ST.getSpecialAddressLatency();
- // TODO: If we knew the total depth of the region here, we could
- // handle the case where the whole loop is inside the region but
- // is large enough that the isScheduleHigh trick isn't needed.
- if (UseMOIdx < UseMCID.getNumOperands()) {
- // Currently, we only support scheduling regions consisting of
- // single basic blocks. Check to see if the instruction is in
- // the same region by checking to see if it has the same parent.
- if (UseMI->getParent() != MI->getParent()) {
- unsigned Latency = SU->Latency;
- if (UseMCID.OpInfo[UseMOIdx].isLookupPtrRegClass())
- Latency += SpecialAddressLatency;
- // This is a wild guess as to the portion of the latency which
- // will be overlapped by work done outside the current
- // scheduling region.
- Latency -= std::min(Latency, Count);
- // Add the artificial edge.
- ExitSU.addPred(SDep(SU, SDep::Order, Latency,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
- } else if (SpecialAddressLatency > 0 &&
- UseMCID.OpInfo[UseMOIdx].isLookupPtrRegClass()) {
- // The entire loop body is within the current scheduling region
- // and the latency of this operation is assumed to be greater
- // than the latency of the loop.
- // TODO: Recursively mark data-edge predecessors as
- // isScheduleHigh too.
- SU->isScheduleHigh = true;
- }
- }
- LoopRegs.Deps.erase(I);
- }
- }
+ std::vector<PhysRegSUOper> &DefList = Defs[MO.getReg()];
// clear this register's use list
if (Uses.contains(MO.getReg()))
@@ -393,11 +330,11 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
// the block. Instead, we leave only one call at the back of the
// DefList.
if (SU->isCall) {
- while (!DefList.empty() && DefList.back()->isCall)
+ while (!DefList.empty() && DefList.back().SU->isCall)
DefList.pop_back();
}
// Defs are pushed in the order they are visited and never reordered.
- DefList.push_back(SU);
+ DefList.push_back(PhysRegSUOper(SU, OperIdx));
}
}
@@ -430,9 +367,12 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
else {
SUnit *DefSU = DefI->SU;
if (DefSU != SU && DefSU != &ExitSU) {
- unsigned OutLatency = TII->getOutputLatency(InstrItins, MI, OperIdx,
- DefSU->getInstr());
- DefSU->addPred(SDep(SU, SDep::Output, OutLatency, Reg));
+ SDep Dep(SU, SDep::Output, Reg);
+ unsigned OutLatency =
+ SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
+ Dep.setMinLatency(OutLatency);
+ Dep.setLatency(OutLatency);
+ DefSU->addPred(Dep);
}
DefI->SU = SU;
}
@@ -462,18 +402,17 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
if (DefSU) {
// The reaching Def lives within this scheduling region.
// Create a data dependence.
- //
- // TODO: Handle "special" address latencies cleanly.
- SDep dep(DefSU, SDep::Data, DefSU->Latency, Reg);
- if (!UnitLatencies) {
- // Adjust the dependence latency using operand def/use information, then
- // allow the target to perform its own adjustments.
- unsigned Latency = computeOperandLatency(DefSU, SU, const_cast<SDep &>(dep));
- dep.setLatency(Latency);
-
- const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
- ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
- }
+ SDep dep(DefSU, SDep::Data, Reg);
+ // Adjust the dependence latency using operand def/use information, then
+ // allow the target to perform its own adjustments.
+ int DefOp = Def->findRegisterDefOperandIdx(Reg);
+ dep.setLatency(
+ SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, false));
+ dep.setMinLatency(
+ SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, true));
+
+ const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+ ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
SU->addPred(dep);
}
}
@@ -481,14 +420,14 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
// Add antidependence to the following def of the vreg it uses.
VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
if (DefI != VRegDefs.end() && DefI->SU != SU)
- DefI->SU->addPred(SDep(SU, SDep::Anti, 0, Reg));
+ DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
}
/// Return true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
if (MI->isCall() || MI->hasUnmodeledSideEffects() ||
- (MI->hasVolatileMemoryRef() &&
+ (MI->hasOrderedMemoryRef() &&
(!MI->mayLoad() || !MI->isInvariantLoad(AA))))
return true;
return false;
@@ -621,8 +560,7 @@ iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
// and stop descending.
if (*Depth > 200 ||
MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
- SUb->addPred(SDep(SUa, SDep::Order, /*Latency=*/0, /*Reg=*/0,
- /*isNormalMemory=*/true));
+ SUb->addPred(SDep(SUa, SDep::MayAliasMem));
return *Depth;
}
// Track current depth.
@@ -653,9 +591,9 @@ static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
if (SU == *I)
continue;
if (MIsNeedChainEdge(AA, MFI, SU->getInstr(), (*I)->getInstr())) {
- unsigned Latency = ((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0;
- (*I)->addPred(SDep(SU, SDep::Order, Latency, /*Reg=*/0,
- /*isNormalMemory=*/true));
+ SDep Dep(SU, SDep::MayAliasMem);
+ Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
+ (*I)->addPred(Dep);
}
// Now go through all the chain successors and iterate from them.
// Keep track of visited nodes.
@@ -678,9 +616,11 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
// If this is a false dependency,
// do not add the edge, but remember the rejected node.
if (!EnableAASchedMI ||
- MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr()))
- SUb->addPred(SDep(SUa, SDep::Order, TrueMemOrderLatency, /*Reg=*/0,
- isNormalMemory));
+ MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+ SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
+ Dep.setLatency(TrueMemOrderLatency);
+ SUb->addPred(Dep);
+ }
else {
// Duplicate entries should be ignored.
RejectList.insert(SUb);
@@ -718,10 +658,7 @@ void ScheduleDAGInstrs::initSUnits() {
SU->isCommutable = MI->isCommutable();
// Assign the Latency field of SU using target-provided information.
- if (UnitLatencies)
- SU->Latency = 1;
- else
- computeLatency(SU);
+ SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
}
}
@@ -825,16 +762,19 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// references, even those that are known to not alias.
for (std::map<const Value *, SUnit *>::iterator I =
NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
- I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ I->second->addPred(SDep(SU, SDep::Barrier));
}
for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+ for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
+ SDep Dep(SU, SDep::Barrier);
+ Dep.setLatency(TrueMemOrderLatency);
+ I->second[i]->addPred(Dep);
+ }
}
// Add SU to the barrier chain.
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ BarrierChain->addPred(SDep(SU, SDep::Barrier));
BarrierChain = SU;
// This is a barrier event that acts as a pivotal node in the DAG,
// so it is safe to clear list of exposed nodes.
@@ -922,7 +862,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// SU and barrier _could_ be reordered, they should not. In addition,
// we have lost all RejectMemNodes below barrier.
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ BarrierChain->addPred(SDep(SU, SDep::Barrier));
} else {
// Treat all other stores conservatively.
goto new_alias_chain;
@@ -931,10 +871,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
if (!ExitSU.isPred(SU))
// Push stores up a bit to avoid them getting in between cmp
// and branches.
- ExitSU.addPred(SDep(SU, SDep::Order, 0,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ ExitSU.addPred(SDep(SU, SDep::Artificial));
} else if (MI->mayLoad()) {
bool MayAlias = true;
if (MI->isInvariantLoad(AA)) {
@@ -969,7 +906,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
if (MayAlias && AliasChain)
addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ BarrierChain->addPred(SDep(SU, SDep::Barrier));
}
}
}
@@ -982,34 +919,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
PendingLoads.clear();
}
-void ScheduleDAGInstrs::computeLatency(SUnit *SU) {
- // Compute the latency for the node. We only provide a default for missing
- // itineraries. Empty itineraries still have latency properties.
- if (!InstrItins) {
- SU->Latency = 1;
-
- // Simplistic target-independent heuristic: assume that loads take
- // extra time.
- if (SU->getInstr()->mayLoad())
- SU->Latency += 2;
- } else {
- SU->Latency = TII->getInstrLatency(InstrItins, SU->getInstr());
- }
-}
-
-unsigned ScheduleDAGInstrs::computeOperandLatency(SUnit *Def, SUnit *Use,
- const SDep& dep,
- bool FindMin) const {
- // For a data dependency with a known register...
- if ((dep.getKind() != SDep::Data) || (dep.getReg() == 0))
- return 1;
-
- return TII->computeOperandLatency(InstrItins, TRI, Def->getInstr(),
- Use->getInstr(), dep.getReg(), FindMin);
-}
-
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
SU->getInstr()->dump();
+#endif
}
std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
@@ -1029,3 +942,94 @@ std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
std::string ScheduleDAGInstrs::getDAGName() const {
return "dag." + BB->getFullName();
}
+
+namespace {
+/// \brief Manage the stack used by a reverse depth-first search over the DAG.
+class SchedDAGReverseDFS {
+ std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;
+public:
+ bool isComplete() const { return DFSStack.empty(); }
+
+ void follow(const SUnit *SU) {
+ DFSStack.push_back(std::make_pair(SU, SU->Preds.begin()));
+ }
+ void advance() { ++DFSStack.back().second; }
+
+ void backtrack() { DFSStack.pop_back(); }
+
+ const SUnit *getCurr() const { return DFSStack.back().first; }
+
+ SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
+
+ SUnit::const_pred_iterator getPredEnd() const {
+ return getCurr()->Preds.end();
+ }
+};
+} // anonymous
+
+void ScheduleDAGILP::resize(unsigned NumSUnits) {
+ ILPValues.resize(NumSUnits);
+}
+
+ILPValue ScheduleDAGILP::getILP(const SUnit *SU) {
+ return ILPValues[SU->NodeNum];
+}
+
+// A leaf node has an ILP of 1/1.
+static ILPValue initILP(const SUnit *SU) {
+ unsigned Cnt = SU->getInstr()->isTransient() ? 0 : 1;
+ return ILPValue(Cnt, 1 + SU->getDepth());
+}
+
+/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
+/// search from this root.
+void ScheduleDAGILP::computeILP(const SUnit *Root) {
+ if (!IsBottomUp)
+ llvm_unreachable("Top-down ILP metric is unimplemented");
+
+ SchedDAGReverseDFS DFS;
+ // Mark a node visited by validating it.
+ ILPValues[Root->NodeNum] = initILP(Root);
+ DFS.follow(Root);
+ for (;;) {
+ // Traverse the leftmost path as far as possible.
+ while (DFS.getPred() != DFS.getPredEnd()) {
+ const SUnit *PredSU = DFS.getPred()->getSUnit();
+ DFS.advance();
+ // If the pred is already valid, skip it.
+ if (ILPValues[PredSU->NodeNum].isValid())
+ continue;
+ ILPValues[PredSU->NodeNum] = initILP(PredSU);
+ DFS.follow(PredSU);
+ }
+ // Visit the top of the stack in postorder and backtrack.
+ unsigned PredCount = ILPValues[DFS.getCurr()->NodeNum].InstrCount;
+ DFS.backtrack();
+ if (DFS.isComplete())
+ break;
+ // Add the recently finished predecessor's bottom-up descendant count.
+ ILPValues[DFS.getCurr()->NodeNum].InstrCount += PredCount;
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ILPValue::print(raw_ostream &OS) const {
+ if (!isValid())
+ OS << "BADILP";
+ OS << InstrCount << " / " << Cycles << " = "
+ << format("%g", ((double)InstrCount / Cycles));
+}
+
+void ILPValue::dump() const {
+ dbgs() << *this << '\n';
+}
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+ Val.print(OS);
+ return OS;
+}
+
+} // namespace llvm
+#endif // !NDEBUG || LLVM_ENABLE_DUMP
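
As a rough illustration of the ILP metric introduced by ScheduleDAGILP above, the following self-contained sketch (the Node and countSubDAG names are hypothetical, not the LLVM classes) computes InstrCount as the number of nodes in the sub-DAG reachable through predecessors and reports InstrCount / (Depth + 1), the ratio that ILPValue::print() formats.

// Hypothetical sketch of the bottom-up ILP metric; not part of the patch above.
#include <cstdio>
#include <vector>

struct Node {
  std::vector<int> Preds; // indices of predecessor nodes
  unsigned Depth = 0;     // longest path from an entry node, in cycles
};

// Count the instructions in the sub-DAG reachable via predecessors of N.
static unsigned countSubDAG(const std::vector<Node> &DAG, int N,
                            std::vector<char> &Seen) {
  if (Seen[N])
    return 0;
  Seen[N] = 1;
  unsigned Count = 1; // count this instruction once
  for (int P : DAG[N].Preds)
    Count += countSubDAG(DAG, P, Seen);
  return Count;
}

int main() {
  // A tiny diamond: nodes 0 and 1 feed node 2; node 2 feeds node 3.
  std::vector<Node> DAG(4);
  DAG[2].Preds = {0, 1}; DAG[2].Depth = 1;
  DAG[3].Preds = {2};    DAG[3].Depth = 2;
  std::vector<char> Seen(DAG.size(), 0);
  unsigned InstrCount = countSubDAG(DAG, 3, Seen);
  unsigned Cycles = DAG[3].Depth + 1;
  std::printf("ILP = %u / %u = %g\n", InstrCount, Cycles,
              (double)InstrCount / Cycles);
  return 0;
}

For the diamond this prints 4 / 3, i.e. four instructions are available to fill three cycles of critical path below node 3.
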
diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp
index 38feee95a58e..6e781b199a5f 100644
--- a/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Constants.h"
-#include "llvm/Function.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -35,7 +34,7 @@ namespace llvm {
DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
- return G->MF.getFunction()->getName();
+ return G->MF.getName();
}
static bool renderGraphFromBottomUp() {
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index e6753664850a..2cd84d670aaa 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -89,6 +89,7 @@ void ScoreboardHazardRecognizer::Reset() {
ReservedScoreboard.reset();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScoreboardHazardRecognizer::Scoreboard::dump() const {
dbgs() << "Scoreboard:\n";
@@ -104,6 +105,7 @@ void ScoreboardHazardRecognizer::Scoreboard::dump() const {
dbgs() << '\n';
}
}
+#endif
bool ScoreboardHazardRecognizer::atIssueLimit() const {
if (IssueWidth == 0)
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 4e29879bef19..37d7731aa158 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -194,6 +194,7 @@ namespace {
SDValue visitOR(SDNode *N);
SDValue visitXOR(SDNode *N);
SDValue SimplifyVBinOp(SDNode *N);
+ SDValue SimplifyVUnaryOp(SDNode *N);
SDValue visitSHL(SDNode *N);
SDValue visitSRA(SDNode *N);
SDValue visitSRL(SDNode *N);
@@ -269,6 +270,8 @@ namespace {
SDValue ReduceLoadWidth(SDNode *N);
SDValue ReduceLoadOpStoreWidth(SDNode *N);
SDValue TransformFPLoadStorePair(SDNode *N);
+ SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
+ SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
SDValue GetDemandedBits(SDValue V, const APInt &Mask);
@@ -300,6 +303,11 @@ namespace {
/// looking for a better chain (aliasing node.)
SDValue FindBetterChain(SDNode *N, SDValue Chain);
+ /// Merge consecutive store operations into a wide store.
+ /// This optimization uses wide integers or vectors when possible.
+ /// \return True if some memory operations were changed.
+ bool MergeConsecutiveStores(StoreSDNode *N);
+
public:
DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
: DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
@@ -385,10 +393,6 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
const TargetLowering &TLI,
const TargetOptions *Options,
unsigned Depth = 0) {
- // No compile time optimizations on this type.
- if (Op.getValueType() == MVT::ppcf128)
- return 0;
-
// fneg is removable even if it has multiple uses.
if (Op.getOpcode() == ISD::FNEG) return 2;
@@ -413,7 +417,7 @@ static char isNegatibleForFree(SDValue Op, bool LegalOperations,
!TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType()))
return 0;
- // fold (fsub (fadd A, B)) -> (fsub (fneg A), B)
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
Options, Depth + 1))
return V;
@@ -1643,7 +1647,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return N0.getOperand(0);
// fold C2-(A+C1) -> (C2-C1)-A
if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
- SDValue NewC = DAG.getConstant((N0C->getAPIntValue() - N1C1->getAPIntValue()), VT);
+ SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
+ VT);
return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC,
N1.getOperand(0));
}
@@ -2345,16 +2350,19 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
// we don't want to undo this promotion.
// We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
// on scalars.
- if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
- && Level == AfterLegalizeTypes) {
+ if ((N0.getOpcode() == ISD::BITCAST ||
+ N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
+ Level == AfterLegalizeTypes) {
SDValue In0 = N0.getOperand(0);
SDValue In1 = N1.getOperand(0);
EVT In0Ty = In0.getValueType();
EVT In1Ty = In1.getValueType();
- // If both incoming values are integers, and the original types are the same.
+ DebugLoc DL = N->getDebugLoc();
+ // If both incoming values are integers, and the original types are the
+ // same.
if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
- SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), In0Ty, In0, In1);
- SDValue BC = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, Op);
+ SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
+ SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
AddToWorkList(Op.getNode());
return BC;
}
@@ -2496,8 +2504,18 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
// lanes of the constant together.
EVT VT = Vector->getValueType(0);
unsigned BitWidth = VT.getVectorElementType().getSizeInBits();
+
+ // If the splat value has been compressed to a bitlength lower
+ // than the size of the vector lane, we need to re-expand it to
+ // the lane size.
+ if (BitWidth > SplatBitSize)
+ for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
+ SplatBitSize < BitWidth;
+ SplatBitSize = SplatBitSize * 2)
+ SplatValue |= SplatValue.shl(SplatBitSize);
+
Constant = APInt::getAllOnesValue(BitWidth);
- for (unsigned i = 0, n = VT.getVectorNumElements(); i < n; ++i)
+ for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
}
}
@@ -2984,7 +3002,7 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
SDValue ShAmt = DAG.getConstant(16, getShiftAmountTy(VT));
if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
return DAG.getNode(ISD::ROTL, N->getDebugLoc(), VT, BSwap, ShAmt);
- else if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
+ if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
return DAG.getNode(ISD::ROTR, N->getDebugLoc(), VT, BSwap, ShAmt);
return DAG.getNode(ISD::OR, N->getDebugLoc(), VT,
DAG.getNode(ISD::SHL, N->getDebugLoc(), VT, BSwap, ShAmt),
@@ -3202,11 +3220,8 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
if ((LShVal + RShVal) != OpSizeInBits)
return 0;
- SDValue Rot;
- if (HasROTL)
- Rot = DAG.getNode(ISD::ROTL, DL, VT, LHSShiftArg, LHSShiftAmt);
- else
- Rot = DAG.getNode(ISD::ROTR, DL, VT, LHSShiftArg, RHSShiftAmt);
+ SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+ LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
@@ -3239,12 +3254,8 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
if (ConstantSDNode *SUBC =
dyn_cast<ConstantSDNode>(RHSShiftAmt.getOperand(0))) {
if (SUBC->getAPIntValue() == OpSizeInBits) {
- if (HasROTL)
- return DAG.getNode(ISD::ROTL, DL, VT,
- LHSShiftArg, LHSShiftAmt).getNode();
- else
- return DAG.getNode(ISD::ROTR, DL, VT,
- LHSShiftArg, RHSShiftAmt).getNode();
+ return DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+ HasROTL ? LHSShiftAmt : RHSShiftAmt).getNode();
}
}
}
@@ -3256,25 +3267,21 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, DebugLoc DL) {
if (ConstantSDNode *SUBC =
dyn_cast<ConstantSDNode>(LHSShiftAmt.getOperand(0))) {
if (SUBC->getAPIntValue() == OpSizeInBits) {
- if (HasROTR)
- return DAG.getNode(ISD::ROTR, DL, VT,
- LHSShiftArg, RHSShiftAmt).getNode();
- else
- return DAG.getNode(ISD::ROTL, DL, VT,
- LHSShiftArg, LHSShiftAmt).getNode();
+ return DAG.getNode(HasROTR ? ISD::ROTR : ISD::ROTL, DL, VT, LHSShiftArg,
+ HasROTR ? RHSShiftAmt : LHSShiftAmt).getNode();
}
}
}
// Look for sign/zext/any-extended or truncate cases:
- if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
- || LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
- || LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
- || LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
- (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND
- || RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND
- || RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND
- || RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
+ if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
+ (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
SDValue LExtOp0 = LHSShiftAmt.getOperand(0);
SDValue RExtOp0 = RHSShiftAmt.getOperand(0);
if (RExtOp0.getOpcode() == ISD::SUB &&
@@ -4046,7 +4053,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (VT.isInteger() &&
(VT0 == MVT::i1 ||
(VT0.isInteger() &&
- TLI.getBooleanContents(false) == TargetLowering::ZeroOrOneBooleanContent)) &&
+ TLI.getBooleanContents(false) ==
+ TargetLowering::ZeroOrOneBooleanContent)) &&
N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
SDValue XORNode;
if (VT == VT0)
@@ -4412,20 +4420,18 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
// truncate/sign extend
- else {
- EVT MatchingElementType =
- EVT::getIntegerVT(*DAG.getContext(),
- N0VT.getScalarType().getSizeInBits());
- EVT MatchingVectorType =
- EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
- N0VT.getVectorNumElements());
+ EVT MatchingElementType =
+ EVT::getIntegerVT(*DAG.getContext(),
+ N0VT.getScalarType().getSizeInBits());
+ EVT MatchingVectorType =
+ EVT::getVectorVT(*DAG.getContext(), MatchingElementType,
+ N0VT.getVectorNumElements());
- if (SVT == MatchingVectorType) {
- SDValue VsetCC = DAG.getSetCC(N->getDebugLoc(), MatchingVectorType,
- N0.getOperand(0), N0.getOperand(1),
- cast<CondCodeSDNode>(N0.getOperand(2))->get());
- return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
- }
+ if (SVT == MatchingVectorType) {
+ SDValue VsetCC = DAG.getSetCC(N->getDebugLoc(), MatchingVectorType,
+ N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getSExtOrTrunc(VsetCC, N->getDebugLoc(), VT);
}
}
@@ -5235,13 +5241,12 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// if the source is smaller than the dest, we still need an extend
return DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT,
N0.getOperand(0));
- else if (N0.getOperand(0).getValueType().bitsGT(VT))
+ if (N0.getOperand(0).getValueType().bitsGT(VT))
// if the source is larger than the dest, then we just need the truncate
return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, N0.getOperand(0));
- else
- // if the source and dest are the same type, we can drop both the extend
- // and the truncate.
- return N0.getOperand(0);
+ // if the source and dest are the same type, we can drop both the extend
+ // and the truncate.
+ return N0.getOperand(0);
}
// Fold extract-and-trunc into a narrow extract. For example:
@@ -5301,6 +5306,48 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
if (Reduced.getNode())
return Reduced;
}
+ // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
+ // where ... are all 'undef'.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
+ SmallVector<EVT, 8> VTs;
+ SDValue V;
+ unsigned Idx = 0;
+ unsigned NumDefs = 0;
+
+ for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+ SDValue X = N0.getOperand(i);
+ if (X.getOpcode() != ISD::UNDEF) {
+ V = X;
+ Idx = i;
+ NumDefs++;
+ }
+ // Stop if more than one member is non-undef.
+ if (NumDefs > 1)
+ break;
+ VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ X.getValueType().getVectorNumElements()));
+ }
+
+ if (NumDefs == 0)
+ return DAG.getUNDEF(VT);
+
+ if (NumDefs == 1) {
+ assert(V.getNode() && "The single defined operand is empty!");
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ if (i != Idx) {
+ Opnds.push_back(DAG.getUNDEF(VTs[i]));
+ continue;
+ }
+ SDValue NV = DAG.getNode(ISD::TRUNCATE, V.getDebugLoc(), VTs[i], V);
+ AddToWorkList(NV.getNode());
+ Opnds.push_back(NV);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+ &Opnds[0], Opnds.size());
+ }
+ }
// Simplify the operands using demanded-bits information.
if (!VT.isVector() &&
@@ -5338,7 +5385,7 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
!LD2->isVolatile() &&
DAG.isConsecutiveLoad(LD2, LD1, LD1VT.getSizeInBits()/8, 1)) {
unsigned Align = LD1->getAlignment();
- unsigned NewAlign = TLI.getTargetData()->
+ unsigned NewAlign = TLI.getDataLayout()->
getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign <= Align &&
@@ -5407,7 +5454,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
!cast<LoadSDNode>(N0)->isVolatile() &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- unsigned Align = TLI.getTargetData()->
+ unsigned Align = TLI.getDataLayout()->
getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
unsigned OrigAlign = LN0->getAlignment();
@@ -5430,7 +5477,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
// This often reduces constant pool loads.
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(VT)) ||
(N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(VT))) &&
- N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector()) {
+ N0.getNode()->hasOneUse() && VT.isInteger() &&
+ !VT.isVector() && !N0.getValueType().isVector()) {
SDValue NewConv = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(), VT,
N0.getOperand(0));
AddToWorkList(NewConv.getNode());
@@ -5653,7 +5701,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
}
// fold (fadd c1, c2) -> c1 + c2
- if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ if (N0CFP && N1CFP)
return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, N1);
// canonicalize constant to RHS
if (N0CFP && !N1CFP)
@@ -5664,12 +5712,12 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
return N0;
// fold (fadd A, (fneg B)) -> (fsub A, B)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+ isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0,
GetNegatedExpression(N1, DAG, LegalOperations));
// fold (fadd (fneg A), B) -> (fsub B, A)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+ isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1,
GetNegatedExpression(N0, DAG, LegalOperations));
@@ -5681,6 +5729,139 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
N0.getOperand(1), N1));
+ // If allowed, fold (fadd (fneg x), x) -> 0.0
+ if (DAG.getTarget().Options.UnsafeFPMath &&
+ N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1) {
+ return DAG.getConstantFP(0.0, VT);
+ }
+
+ // If allowed, fold (fadd x, (fneg x)) -> 0.0
+ if (DAG.getTarget().Options.UnsafeFPMath &&
+ N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0) {
+ return DAG.getConstantFP(0.0, VT);
+ }
+
+ // In unsafe math mode, we can fold chains of FADD's of the same value
+ // into multiplications. This transform is not safe in general because
+ // we are reducing the number of rounding steps.
+ if (DAG.getTarget().Options.UnsafeFPMath &&
+ TLI.isOperationLegalOrCustom(ISD::FMUL, VT) &&
+ !N0CFP && !N1CFP) {
+ if (N0.getOpcode() == ISD::FMUL) {
+ ConstantFPSDNode *CFP00 = dyn_cast<ConstantFPSDNode>(N0.getOperand(0));
+ ConstantFPSDNode *CFP01 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+
+ // (fadd (fmul c, x), x) -> (fmul c+1, x)
+ if (CFP00 && !CFP01 && N0.getOperand(1) == N1) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP00, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N1, NewCFP);
+ }
+
+ // (fadd (fmul x, c), x) -> (fmul c+1, x)
+ if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP01, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N1, NewCFP);
+ }
+
+ // (fadd (fadd x, x), x) -> (fmul 3.0, x)
+ if (!CFP00 && !CFP01 && N0.getOperand(0) == N0.getOperand(1) &&
+ N0.getOperand(0) == N1) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N1, DAG.getConstantFP(3.0, VT));
+ }
+
+ // (fadd (fmul c, x), (fadd x, x)) -> (fmul c+2, x)
+ if (CFP00 && !CFP01 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(1) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP00, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(1), NewCFP);
+ }
+
+ // (fadd (fmul x, c), (fadd x, x)) -> (fmul c+2, x)
+ if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP01, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0), NewCFP);
+ }
+ }
+
+ if (N1.getOpcode() == ISD::FMUL) {
+ ConstantFPSDNode *CFP10 = dyn_cast<ConstantFPSDNode>(N1.getOperand(0));
+ ConstantFPSDNode *CFP11 = dyn_cast<ConstantFPSDNode>(N1.getOperand(1));
+
+ // (fadd x, (fmul c, x)) -> (fmul c+1, x)
+ if (CFP10 && !CFP11 && N1.getOperand(1) == N0) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP10, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0, NewCFP);
+ }
+
+ // (fadd x, (fmul x, c)) -> (fmul c+1, x)
+ if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP11, 0),
+ DAG.getConstantFP(1.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0, NewCFP);
+ }
+
+ // (fadd x, (fadd x, x)) -> (fmul 3.0, x)
+ if (!CFP10 && !CFP11 && N1.getOperand(0) == N1.getOperand(1) &&
+ N1.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0, DAG.getConstantFP(3.0, VT));
+ }
+
+ // (fadd (fadd x, x), (fmul c, x)) -> (fmul c+2, x)
+ if (CFP10 && !CFP11 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(1) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP10, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(1), NewCFP);
+ }
+
+ // (fadd (fadd x, x), (fmul x, c)) -> (fmul c+2, x)
+ if (CFP11 && !CFP10 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, N->getDebugLoc(), VT,
+ SDValue(CFP11, 0),
+ DAG.getConstantFP(2.0, VT));
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0), NewCFP);
+ }
+ }
+
+ // (fadd (fadd x, x), (fadd x, x)) -> (fmul 4.0, x)
+ if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
+ N0.getOperand(0) == N0.getOperand(1) &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0),
+ DAG.getConstantFP(4.0, VT));
+ }
+ }
+
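
The folds above reassociate floating-point additions, which is only assumed to be acceptable under unsafe FP math. A minimal standalone illustration of the identities being applied (ordinary C++ with values chosen so no rounding occurs, not DAGCombiner code):

    // Illustration only: with these values no rounding occurs, so the
    // reassociated forms compare equal; in general this reassociation is
    // only valid under unsafe FP math.
    #include <cassert>

    int main() {
      float x = 3.0f, c = 5.0f;
      assert(c * x + x == (c + 1.0f) * x);      // (fadd (fmul c, x), x) -> (fmul c+1, x)
      assert((x + x) + x == 3.0f * x);          // (fadd (fadd x, x), x) -> (fmul 3.0, x)
      assert((x + x) + (x + x) == 4.0f * x);    // (fadd (fadd x, x), (fadd x, x)) -> (fmul 4.0, x)
      assert(-x + x == 0.0f);                   // (fadd (fneg x), x) -> 0.0
      return 0;
    }
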
// FADD -> FMA combines:
if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
DAG.getTarget().Options.UnsafeFPMath) &&
@@ -5692,8 +5873,8 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
N0.getOperand(0), N0.getOperand(1), N1);
}
-
- // fold (fadd x, (fmul y, z)) -> (fma x, y, z)
+
+ // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
// Note: Commutes FADD operands.
if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
@@ -5719,7 +5900,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
}
// fold (fsub c1, c2) -> c1-c2
- if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ if (N0CFP && N1CFP)
return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0, N1);
// fold (fsub A, 0) -> A
if (DAG.getTarget().Options.UnsafeFPMath &&
@@ -5811,7 +5992,7 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
}
// fold (fmul c1, c2) -> c1*c2
- if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ if (N0CFP && N1CFP)
return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT, N0, N1);
// canonicalize constant to RHS
if (N0CFP && !N1CFP)
@@ -5867,7 +6048,14 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
+ if (DAG.getTarget().Options.UnsafeFPMath) {
+ if (N0CFP && N0CFP->isZero())
+ return N2;
+ if (N1CFP && N1CFP->isZero())
+ return N2;
+ }
if (N0CFP && N0CFP->isExactlyValue(1.0))
return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N1, N2);
if (N1CFP && N1CFP->isExactlyValue(1.0))
@@ -5877,6 +6065,58 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
if (N0CFP && !N1CFP)
return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, N1, N0, N2);
+ // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
+ if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+ N2.getOpcode() == ISD::FMUL &&
+ N0 == N2.getOperand(0) &&
+ N2.getOperand(1).getOpcode() == ISD::ConstantFP) {
+ return DAG.getNode(ISD::FMUL, dl, VT, N0,
+ DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1)));
+ }
+
+
+ // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
+ if (DAG.getTarget().Options.UnsafeFPMath &&
+ N0.getOpcode() == ISD::FMUL && N1CFP &&
+ N0.getOperand(1).getOpcode() == ISD::ConstantFP) {
+ return DAG.getNode(ISD::FMA, dl, VT,
+ N0.getOperand(0),
+ DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1)),
+ N2);
+ }
+
+ // (fma x, 1, y) -> (fadd x, y)
+ // (fma x, -1, y) -> (fadd (fneg x), y)
+ if (N1CFP) {
+ if (N1CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, dl, VT, N0, N2);
+
+ if (N1CFP->isExactlyValue(-1.0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
+ SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0);
+ AddToWorkList(RHSNeg.getNode());
+ return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg);
+ }
+ }
+
+ // (fma x, c, x) -> (fmul x, (c+1))
+ if (DAG.getTarget().Options.UnsafeFPMath && N1CFP && N0 == N2) {
+ return DAG.getNode(ISD::FMUL, dl, VT,
+ N0,
+ DAG.getNode(ISD::FADD, dl, VT,
+ N1, DAG.getConstantFP(1.0, VT)));
+ }
+
+ // (fma x, c, (fneg x)) -> (fmul x, (c-1))
+ if (DAG.getTarget().Options.UnsafeFPMath && N1CFP &&
+ N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, dl, VT,
+ N0,
+ DAG.getNode(ISD::FADD, dl, VT,
+ N1, DAG.getConstantFP(-1.0, VT)));
+ }
+
+
return SDValue();
}
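
The new FMA folds rely on similar identities. A small standalone sketch using std::fma, with constants chosen so the results are exact (illustration only; the combiner applies these to ConstantFP operands under unsafe FP math where noted above):

    // Illustration only: values chosen so the results are exact.
    #include <cassert>
    #include <cmath>

    int main() {
      double x = 2.0, y = 7.0, c = 3.0;
      assert(std::fma(x, 1.0, y) == x + y);         // (fma x, 1, y)        -> (fadd x, y)
      assert(std::fma(x, c, x) == x * (c + 1.0));   // (fma x, c, x)        -> (fmul x, c+1)
      assert(std::fma(x, c, -x) == x * (c - 1.0));  // (fma x, c, (fneg x)) -> (fmul x, c-1)
      return 0;
    }
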
@@ -5895,11 +6135,11 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
}
// fold (fdiv c1, c2) -> c1/c2
- if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ if (N0CFP && N1CFP)
return DAG.getNode(ISD::FDIV, N->getDebugLoc(), VT, N0, N1);
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
- if (N1CFP && VT != MVT::ppcf128 && DAG.getTarget().Options.UnsafeFPMath) {
+ if (N1CFP && DAG.getTarget().Options.UnsafeFPMath) {
// Compute the reciprocal 1.0 / c2.
APFloat N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
@@ -5942,7 +6182,7 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (frem c1, c2) -> fmod(c1,c2)
- if (N0CFP && N1CFP && VT != MVT::ppcf128)
+ if (N0CFP && N1CFP)
return DAG.getNode(ISD::FREM, N->getDebugLoc(), VT, N0, N1);
return SDValue();
@@ -5955,7 +6195,7 @@ SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
- if (N0CFP && N1CFP && VT != MVT::ppcf128) // Constant fold
+ if (N0CFP && N1CFP) // Constant fold
return DAG.getNode(ISD::FCOPYSIGN, N->getDebugLoc(), VT, N0, N1);
if (N1CFP) {
@@ -6005,7 +6245,7 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
EVT OpVT = N0.getValueType();
// fold (sint_to_fp c1) -> c1fp
- if (N0C && OpVT != MVT::ppcf128 &&
+ if (N0C &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
@@ -6062,7 +6302,7 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
EVT OpVT = N0.getValueType();
// fold (uint_to_fp c1) -> c1fp
- if (N0C && OpVT != MVT::ppcf128 &&
+ if (N0C &&
// ...but only if the target supports immediate floating-point values
(!LegalOperations ||
TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
@@ -6117,7 +6357,7 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (fp_to_uint c1fp) -> c1
- if (N0CFP && VT != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FP_TO_UINT, N->getDebugLoc(), VT, N0);
return SDValue();
@@ -6130,7 +6370,7 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (fp_round c1fp) -> c1fp
- if (N0CFP && N0.getValueType() != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FP_ROUND, N->getDebugLoc(), VT, N0, N1);
// fold (fp_round (fp_extend x)) -> x
@@ -6184,7 +6424,7 @@ SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
return SDValue();
// fold (fp_extend c1fp) -> c1fp
- if (N0CFP && VT != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FP_EXTEND, N->getDebugLoc(), VT, N0);
// Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
@@ -6225,6 +6465,11 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVUnaryOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
&DAG.getTarget().Options))
return GetNegatedExpression(N0, DAG, LegalOperations);
@@ -6246,6 +6491,17 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
}
}
+ // (fneg (fmul c, x)) -> (fmul -c, x)
+ if (N0.getOpcode() == ISD::FMUL) {
+ ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+ if (CFP1) {
+ return DAG.getNode(ISD::FMUL, N->getDebugLoc(), VT,
+ N0.getOperand(0),
+ DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT,
+ N0.getOperand(1)));
+ }
+ }
+
return SDValue();
}
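
The (fneg (fmul c, x)) -> (fmul -c, x) fold needs no unsafe-math flag: IEEE-754 negation only flips the sign bit, so it commutes with multiplication. A one-line standalone check (illustration only):

    // Illustration only: negating a product equals multiplying by the
    // negated constant.
    #include <cassert>

    int main() {
      float c = 2.5f, x = -7.0f;
      assert(-(c * x) == (-c) * x);   // (fneg (fmul c, x)) -> (fmul -c, x)
      return 0;
    }
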
@@ -6255,7 +6511,7 @@ SDValue DAGCombiner::visitFCEIL(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (fceil c1) -> fceil(c1)
- if (N0CFP && VT != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FCEIL, N->getDebugLoc(), VT, N0);
return SDValue();
@@ -6267,7 +6523,7 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (ftrunc c1) -> ftrunc(c1)
- if (N0CFP && VT != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FTRUNC, N->getDebugLoc(), VT, N0);
return SDValue();
@@ -6279,7 +6535,7 @@ SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
EVT VT = N->getValueType(0);
// fold (ffloor c1) -> ffloor(c1)
- if (N0CFP && VT != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FFLOOR, N->getDebugLoc(), VT, N0);
return SDValue();
@@ -6290,8 +6546,13 @@ SDValue DAGCombiner::visitFABS(SDNode *N) {
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
EVT VT = N->getValueType(0);
+ if (VT.isVector()) {
+ SDValue FoldedVOp = SimplifyVUnaryOp(N);
+ if (FoldedVOp.getNode()) return FoldedVOp;
+ }
+
// fold (fabs c1) -> fabs(c1)
- if (N0CFP && VT != MVT::ppcf128)
+ if (N0CFP)
return DAG.getNode(ISD::FABS, N->getDebugLoc(), VT, N0);
// fold (fabs (fabs x)) -> (fabs x)
if (N0.getOpcode() == ISD::FABS)
@@ -6511,7 +6772,7 @@ static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
} else
return false;
- TargetLowering::AddrMode AM;
+ AddrMode AM;
if (N->getOpcode() == ISD::ADD) {
ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (Offset)
@@ -7138,7 +7399,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
- if (NewAlign < TLI.getTargetData()->getABITypeAlignment(NewVTTy))
+ if (NewAlign < TLI.getDataLayout()->getABITypeAlignment(NewVTTy))
return SDValue();
SDValue NewPtr = DAG.getNode(ISD::ADD, LD->getDebugLoc(),
@@ -7200,7 +7461,7 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
unsigned LDAlign = LD->getAlignment();
unsigned STAlign = ST->getAlignment();
Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlign = TLI.getTargetData()->getABITypeAlignment(IntVTTy);
+ unsigned ABIAlign = TLI.getDataLayout()->getABITypeAlignment(IntVTTy);
if (LDAlign < ABIAlign || STAlign < ABIAlign)
return SDValue();
@@ -7225,6 +7486,433 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
return SDValue();
}
+/// Returns the base pointer and the constant byte offset from that base.
+static std::pair<SDValue, int64_t> GetPointerBaseAndOffset(SDValue Ptr) {
+ if (Ptr->getOpcode() == ISD::ADD && isa<ConstantSDNode>(Ptr->getOperand(1))) {
+ int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
+ SDValue Base = Ptr->getOperand(0);
+ return std::make_pair(Base, Offset);
+ }
+
+ return std::make_pair(Ptr, 0);
+}
+
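
A rough standalone sketch of the decomposition GetPointerBaseAndOffset performs, using a toy expression type instead of SDValues (all names hypothetical):

    // Toy sketch: "base + constant" decomposes into (base, offset);
    // anything else decomposes into (itself, 0).
    #include <cassert>
    #include <utility>

    struct Expr {
      const Expr *Base = nullptr;   // non-null when this expression is Base + Offset
      long long Offset = 0;
    };

    static std::pair<const Expr *, long long> getBaseAndOffset(const Expr &P) {
      if (P.Base)
        return {P.Base, P.Offset};
      return {&P, 0};
    }

    int main() {
      Expr Base;
      Expr PlusEight{&Base, 8};
      auto R = getBaseAndOffset(PlusEight);
      assert(R.first == &Base && R.second == 8);
      assert(getBaseAndOffset(Base).first == &Base);
      return 0;
    }
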
+/// Holds a pointer to an LSBaseSDNode as well as information on where it
+/// is located in a sequence of memory operations connected by a chain.
+struct MemOpLink {
+ MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
+ MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+ // Ptr to the mem node.
+ LSBaseSDNode *MemNode;
+ // Offset from the base ptr.
+ int64_t OffsetFromBase;
+ // Sequence number of this mem node; the lowest mem operand in the
+ // DAG starts at zero.
+ unsigned SequenceNum;
+};
+
+/// Sorts store nodes in a chain according to their offset from a shared
+/// base pointer.
+struct ConsecutiveMemoryChainSorter {
+ bool operator()(MemOpLink LHS, MemOpLink RHS) {
+ return LHS.OffsetFromBase < RHS.OffsetFromBase;
+ }
+};
+
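
The comparator above is handed to std::sort to order the discovered stores by their byte offset from the common base. A minimal standalone sketch with a hypothetical stand-in for MemOpLink:

    // Hypothetical stand-in for MemOpLink: an id and a byte offset from a
    // shared base pointer, sorted the same way ConsecutiveMemoryChainSorter
    // sorts the real nodes.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    struct Link { unsigned Id; long long OffsetFromBase; };

    int main() {
      // Stores as discovered while walking the chain (chain order).
      std::vector<Link> Stores = {{0, 8}, {1, 0}, {2, 4}};
      std::sort(Stores.begin(), Stores.end(),
                [](const Link &L, const Link &R) {
                  return L.OffsetFromBase < R.OffsetFromBase;
                });
      assert(Stores[0].OffsetFromBase == 0 && Stores[2].OffsetFromBase == 8);
      return 0;
    }
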
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+ EVT MemVT = St->getMemoryVT();
+ int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
+
+ // Don't merge vectors into wider inputs.
+ if (MemVT.isVector() || !MemVT.isSimple())
+ return false;
+
+ // Perform an early exit check. Do not bother looking at stored values that
+ // are not constants or loads.
+ SDValue StoredVal = St->getValue();
+ bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
+ if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
+ !IsLoadSrc)
+ return false;
+
+ // Only look at ends of store sequences.
+ SDValue Chain = SDValue(St, 1);
+ if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
+ return false;
+
+ // This holds the base pointer and the offset in bytes from the base pointer.
+ std::pair<SDValue, int64_t> BasePtr =
+ GetPointerBaseAndOffset(St->getBasePtr());
+
+ // We must have a base and an offset.
+ if (!BasePtr.first.getNode())
+ return false;
+
+ // Do not handle stores to undef base pointers.
+ if (BasePtr.first.getOpcode() == ISD::UNDEF)
+ return false;
+
+ SmallVector<MemOpLink, 8> StoreNodes;
+ // Walk up the chain and look for nodes with offsets from the same
+ // base pointer. Stop when reaching an instruction of a different kind
+ // or one which has a different base pointer.
+ unsigned Seq = 0;
+ StoreSDNode *Index = St;
+ while (Index) {
+ // If the chain has more than one use, then we can't reorder the mem ops.
+ if (Index != St && !SDValue(Index, 1)->hasOneUse())
+ break;
+
+ // Find the base pointer and offset for this memory node.
+ std::pair<SDValue, int64_t> Ptr =
+ GetPointerBaseAndOffset(Index->getBasePtr());
+
+ // Check that the base pointer is the same as the original one.
+ if (Ptr.first.getNode() != BasePtr.first.getNode())
+ break;
+
+ // Check that the alignment is the same.
+ if (Index->getAlignment() != St->getAlignment())
+ break;
+
+ // The memory operands must not be volatile.
+ if (Index->isVolatile() || Index->isIndexed())
+ break;
+
+ // No truncation.
+ if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
+ if (St->isTruncatingStore())
+ break;
+
+ // The stored memory type must be the same.
+ if (Index->getMemoryVT() != MemVT)
+ break;
+
+ // We do not allow unaligned stores because we want to prevent overriding
+ // stores.
+ if (Index->getAlignment()*8 != MemVT.getSizeInBits())
+ break;
+
+ // We found a potential memory operand to merge.
+ StoreNodes.push_back(MemOpLink(Index, Ptr.second, Seq++));
+
+ // Move up the chain to the next memory operation.
+ Index = dyn_cast<StoreSDNode>(Index->getChain().getNode());
+ }
+
+ // Check if there is anything to merge.
+ if (StoreNodes.size() < 2)
+ return false;
+
+ // Sort the memory operands according to their distance from the base pointer.
+ std::sort(StoreNodes.begin(), StoreNodes.end(),
+ ConsecutiveMemoryChainSorter());
+
+ // Scan the memory operations on the chain and find the first non-consecutive
+ // store memory address.
+ unsigned LastConsecutiveStore = 0;
+ int64_t StartAddress = StoreNodes[0].OffsetFromBase;
+ for (unsigned i=1; i<StoreNodes.size(); ++i) {
+ int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+
+ // Mark this node as useful.
+ LastConsecutiveStore = i;
+ }
+
+ // The node with the lowest store address.
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+
+ // Store the constants into memory as one consecutive store.
+ if (!IsLoadSrc) {
+ unsigned LastLegalType = 0;
+ unsigned LastLegalVectorType = 0;
+ bool NonZero = false;
+ for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = St->getValue();
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
+ NonZero |= !C->isNullValue();
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
+ NonZero |= !C->getConstantFPValue()->isNullValue();
+ } else {
+ // Non constant.
+ break;
+ }
+
+ // Find a legal type for the constant store.
+ unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+ if (TLI.isTypeLegal(StoreTy))
+ LastLegalType = i+1;
+
+ // Find a legal type for the vector store.
+ EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+ if (TLI.isTypeLegal(Ty))
+ LastLegalVectorType = i + 1;
+ }
+
+ // We only use vectors if the constant is known to be zero.
+ if (NonZero)
+ LastLegalVectorType = 0;
+
+ // Check if we found a legal integer type to store.
+ if (LastLegalType == 0 && LastLegalVectorType == 0)
+ return false;
+
+ bool UseVector = LastLegalVectorType > LastLegalType;
+ unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
+
+ // Make sure we have something to merge.
+ if (NumElem < 2)
+ return false;
+
+ unsigned EarliestNodeUsed = 0;
+ for (unsigned i=0; i < NumElem; ++i) {
+ // Find a chain for the new wide-store operand. Notice that some
+ // of the store nodes that we found may not be selected for inclusion
+ // in the wide store. The chain we use needs to be the chain of the
+ // earliest store node which is *used* and replaced by the wide store.
+ if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+ EarliestNodeUsed = i;
+ }
+
+ // The earliest Node in the DAG.
+ LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+ DebugLoc DL = StoreNodes[0].MemNode->getDebugLoc();
+
+ SDValue StoredVal;
+ if (UseVector) {
+ // Find a legal type for the vector store.
+ EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+ assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+ StoredVal = DAG.getConstant(0, Ty);
+ } else {
+ unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+ APInt StoreInt(StoreBW, 0);
+
+ // Construct a single integer constant which is made of the smaller
+ // constant inputs.
+ bool IsLE = TLI.isLittleEndian();
+ for (unsigned i = 0; i < NumElem ; ++i) {
+ unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+ SDValue Val = St->getValue();
+ StoreInt <<= ElementSizeBytes * 8;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+ StoreInt |= C->getAPIntValue().zext(StoreBW);
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+ StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
+ } else {
+ assert(false && "Invalid constant element type");
+ }
+ }
+
+ // Create the new Load and Store operations.
+ EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+ StoredVal = DAG.getConstant(StoreInt, StoreTy);
+ }
+
+ SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(),
+ false, false,
+ FirstInChain->getAlignment());
+
+ // Replace the first store with the new store
+ CombineTo(EarliestOp, NewStore);
+ // Erase all other stores.
+ for (unsigned i = 0; i < NumElem ; ++i) {
+ if (StoreNodes[i].MemNode == EarliestOp)
+ continue;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ // ReplaceAllUsesWith will replace all uses that existed when it was
+ // called, but graph optimizations may cause new ones to appear. For
+ // example, the case in pr14333 looks like
+ //
+ // St's chain -> St -> another store -> X
+ //
+ // And the only difference from St to the other store is the chain.
+ // When we change its chain to be St's chain they become identical,
+ // get CSEed and the net result is that X is now a use of St.
+ // Since we know that St is redundant, just iterate.
+ while (!St->use_empty())
+ DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
+ removeFromWorkList(St);
+ DAG.DeleteNode(St);
+ }
+
+ return true;
+ }
+
+ // Below we handle the case of multiple consecutive stores that
+ // come from multiple consecutive loads. We merge them into a single
+ // wide load and a single wide store.
+
+ // Look for load nodes which are used by the stored values.
+ SmallVector<MemOpLink, 8> LoadNodes;
+
+ // Find acceptable loads. Loads need to have the same chain (token factor),
+ // must not be zext, volatile, indexed, and they must be consecutive.
+ SDValue LdBasePtr;
+ for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+ if (!Ld) break;
+
+ // Loads must only have one use.
+ if (!Ld->hasNUsesOfValue(1, 0))
+ break;
+
+ // Check that the alignment is the same as the stores.
+ if (Ld->getAlignment() != St->getAlignment())
+ break;
+
+ // The memory operands must not be volatile.
+ if (Ld->isVolatile() || Ld->isIndexed())
+ break;
+
+ // We do not accept ext loads.
+ if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ break;
+
+ // The stored memory type must be the same.
+ if (Ld->getMemoryVT() != MemVT)
+ break;
+
+ std::pair<SDValue, int64_t> LdPtr =
+ GetPointerBaseAndOffset(Ld->getBasePtr());
+
+ // If this is not the first ptr that we check.
+ if (LdBasePtr.getNode()) {
+ // The base ptr must be the same.
+ if (LdPtr.first != LdBasePtr)
+ break;
+ } else {
+ // Check that all other base pointers are the same as this one.
+ LdBasePtr = LdPtr.first;
+ }
+
+ // We found a potential memory operand to merge.
+ LoadNodes.push_back(MemOpLink(Ld, LdPtr.second, 0));
+ }
+
+ if (LoadNodes.size() < 2)
+ return false;
+
+ // Scan the memory operations on the chain and find the first non-consecutive
+ // load memory address. These variables hold the index in the store node
+ // array.
+ unsigned LastConsecutiveLoad = 0;
+ // This variable refers to the size and not index in the array.
+ unsigned LastLegalVectorType = 0;
+ unsigned LastLegalIntegerType = 0;
+ StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue FirstChain = LoadNodes[0].MemNode->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != FirstChain)
+ break;
+
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
+
+ // Find a legal type for the vector store.
+ EVT StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+ if (TLI.isTypeLegal(StoreTy))
+ LastLegalVectorType = i + 1;
+
+ // Find a legal type for the integer store.
+ unsigned StoreBW = (i+1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+ if (TLI.isTypeLegal(StoreTy))
+ LastLegalIntegerType = i + 1;
+ }
+
+ // Only use vector types if the vector type is larger than the integer type.
+ // If they are the same, use integers.
+ bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType;
+ unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType);
+
+ // We add +1 here because the LastXXX variables refer to the last index
+ // in the array, while NumElem refers to a count of elements.
+ unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
+ NumElem = std::min(LastLegalType, NumElem);
+
+ if (NumElem < 2)
+ return false;
+
+ // The earliest Node in the DAG.
+ unsigned EarliestNodeUsed = 0;
+ LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+ for (unsigned i=1; i<NumElem; ++i) {
+ // Find a chain for the new wide-store operand. Notice that some
+ // of the store nodes that we found may not be selected for inclusion
+ // in the wide store. The chain we use needs to be the chain of the
+ // earliest store node which is *used* and replaced by the wide store.
+ if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+ EarliestNodeUsed = i;
+ }
+
+ // Find if it is better to use vectors or integers to load and store
+ // to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+ } else {
+ unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+ }
+
+ DebugLoc LoadDL = LoadNodes[0].MemNode->getDebugLoc();
+ DebugLoc StoreDL = StoreNodes[0].MemNode->getDebugLoc();
+
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+ SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
+ FirstLoad->getChain(),
+ FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(),
+ false, false, false,
+ FirstLoad->getAlignment());
+
+ SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), false, false,
+ FirstInChain->getAlignment());
+
+ // Replace one of the loads with the new load.
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[0].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+
+ // Remove the rest of the load chains.
+ for (unsigned i = 1; i < NumElem ; ++i) {
+ // Replace all chain users of the old load nodes with the chain of the new
+ // load node.
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Ld->getChain());
+ }
+
+ // Replace the first store with the new store.
+ CombineTo(EarliestOp, NewStore);
+ // Erase all other stores.
+ for (unsigned i = 0; i < NumElem ; ++i) {
+ // Remove all Store nodes.
+ if (StoreNodes[i].MemNode == EarliestOp)
+ continue;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
+ removeFromWorkList(St);
+ DAG.DeleteNode(St);
+ }
+
+ return true;
+}
+
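
For the constant-store case, the key step is packing the small constants into one wide integer; on a little-endian target the highest-offset value is visited first and shifted left, so the lowest-offset store lands in the least significant byte. A standalone illustration (hypothetical values; the final checks assume a little-endian host):

    // Hypothetical values; the final checks assume a little-endian host.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      // Four consecutive one-byte constant stores to Buf[0..3].
      const unsigned char Vals[4] = {0x11, 0x22, 0x33, 0x44};

      // Pack them the way the combiner does on a little-endian target.
      uint32_t Merged = 0;
      for (int i = 0; i < 4; ++i) {
        unsigned Idx = 3 - i;        // IsLE ? (NumElem - 1 - i) : i
        Merged <<= 8;
        Merged |= Vals[Idx];
      }
      assert(Merged == 0x44332211u);

      // One wide store reproduces the four narrow stores.
      unsigned char Buf[4];
      std::memcpy(Buf, &Merged, 4);
      assert(Buf[0] == 0x11 && Buf[1] == 0x22 && Buf[2] == 0x33 && Buf[3] == 0x44);
      return 0;
    }
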
SDValue DAGCombiner::visitSTORE(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Chain = ST->getChain();
@@ -7237,7 +7925,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
ST->isUnindexed()) {
unsigned OrigAlign = ST->getAlignment();
EVT SVT = Value.getOperand(0).getValueType();
- unsigned Align = TLI.getTargetData()->
+ unsigned Align = TLI.getDataLayout()->
getABITypeAlignment(SVT.getTypeForEVT(*DAG.getContext()));
if (Align <= OrigAlign &&
((!LegalOperations && !ST->isVolatile()) ||
@@ -7426,6 +8114,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
ST->getAlignment());
}
+ // Only perform this optimization before the types are legal, because we
+ // don't want to perform this optimization on every DAGCombine invocation.
+ if (!LegalTypes && MergeConsecutiveStores(ST))
+ return SDValue(N, 0);
+
return ReduceLoadOpStoreWidth(N);
}
@@ -7504,9 +8197,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
// We only perform this optimization before the op legalization phase because
- // we may introduce new vector instructions which are not backed by TD patterns.
- // For example on AVX, extracting elements from a wide vector without using
- // extract_subvector.
+ // we may introduce new vector instructions which are not backed by TD
+ // patterns. For example on AVX, extracting elements from a wide vector
+ // without using extract_subvector.
if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE
&& ConstEltNo && !LegalOperations) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
@@ -7625,7 +8318,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// Check the resultant load doesn't need a higher alignment than the
// original load.
unsigned NewAlign =
- TLI.getTargetData()
+ TLI.getDataLayout()
->getABITypeAlignment(LVT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, LVT))
@@ -7690,15 +8383,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
return SDValue();
}
-SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+// Simplify (build_vec (ext )) to (bitcast (build_vec ))
+SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
+ // We perform this optimization post type-legalization because
+ // the type-legalizer often scalarizes integer-promoted vectors.
+ // Performing this optimization before may create bit-casts which
+ // will be type-legalized to complex code sequences.
+ // We perform this optimization only before the operation legalizer because we
+ // may introduce illegal operations.
+ if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
+ return SDValue();
+
unsigned NumInScalars = N->getNumOperands();
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
- // A vector built entirely of undefs is undef.
- if (ISD::allOperandsUndef(N))
- return DAG.getUNDEF(VT);
-
// Check to see if this is a BUILD_VECTOR of a bunch of values
// which come from any_extend or zero_extend nodes. If so, we can create
// a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
@@ -7741,64 +8440,141 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
// In order to have valid types, all of the inputs must be extended from the
// same source type and all of the inputs must be any or zero extend.
// Scalar sizes must be a power of two.
- EVT OutScalarTy = N->getValueType(0).getScalarType();
+ EVT OutScalarTy = VT.getScalarType();
bool ValidTypes = SourceType != MVT::Other &&
isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
isPowerOf2_32(SourceType.getSizeInBits());
- // We perform this optimization post type-legalization because
- // the type-legalizer often scalarizes integer-promoted vectors.
- // Performing this optimization before may create bit-casts which
- // will be type-legalized to complex code sequences.
- // We perform this optimization only before the operation legalizer because we
- // may introduce illegal operations.
// Create a new simpler BUILD_VECTOR sequence which other optimizations can
// turn into a single shuffle instruction.
- if ((Level == AfterLegalizeVectorOps || Level == AfterLegalizeTypes) &&
- ValidTypes) {
- bool isLE = TLI.isLittleEndian();
- unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
- assert(ElemRatio > 1 && "Invalid element size ratio");
- SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
- DAG.getConstant(0, SourceType);
-
- unsigned NewBVElems = ElemRatio * N->getValueType(0).getVectorNumElements();
- SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
-
- // Populate the new build_vector
- for (unsigned i=0; i < N->getNumOperands(); ++i) {
- SDValue Cast = N->getOperand(i);
- assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
- Cast.getOpcode() == ISD::ZERO_EXTEND ||
- Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode");
- SDValue In;
- if (Cast.getOpcode() == ISD::UNDEF)
- In = DAG.getUNDEF(SourceType);
- else
- In = Cast->getOperand(0);
- unsigned Index = isLE ? (i * ElemRatio) :
- (i * ElemRatio + (ElemRatio - 1));
+ if (!ValidTypes)
+ return SDValue();
+
+ bool isLE = TLI.isLittleEndian();
+ unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
+ assert(ElemRatio > 1 && "Invalid element size ratio");
+ SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
+ DAG.getConstant(0, SourceType);
+
+ unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
+
+ // Populate the new build_vector
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue Cast = N->getOperand(i);
+ assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
+ Cast.getOpcode() == ISD::ZERO_EXTEND ||
+ Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode");
+ SDValue In;
+ if (Cast.getOpcode() == ISD::UNDEF)
+ In = DAG.getUNDEF(SourceType);
+ else
+ In = Cast->getOperand(0);
+ unsigned Index = isLE ? (i * ElemRatio) :
+ (i * ElemRatio + (ElemRatio - 1));
+
+ assert(Index < Ops.size() && "Invalid index");
+ Ops[Index] = In;
+ }
+
+ // The type of the new BUILD_VECTOR node.
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
+ assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Invalid vector size");
+ // Check if the new vector type is legal.
+ if (!isTypeLegal(VecVT)) return SDValue();
+
+ // Make the new BUILD_VECTOR.
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], Ops.size());
+
+ // The new BUILD_VECTOR node has the potential to be further optimized.
+ AddToWorkList(BV.getNode());
+ // Bitcast to the desired type.
+ return DAG.getNode(ISD::BITCAST, dl, VT, BV);
+}
+
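
The transformation above replaces a BUILD_VECTOR of zero/any-extended scalars with a wider BUILD_VECTOR of the narrow sources plus filler lanes, followed by a bitcast. A standalone illustration of why the bit pattern is unchanged (little-endian host assumed, values hypothetical):

    // Little-endian host assumed; values hypothetical.
    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint8_t a = 0xAB, b = 0xCD;

      // BUILD_VECTOR of zero-extended values: <2 x i32> <zext a, zext b>.
      uint32_t Wide[2] = {a, b};

      // The rewritten form: <8 x i8> with the zero "filler" in the extra
      // lanes, then bitcast back to <2 x i32>.
      uint8_t Narrow[8] = {a, 0, 0, 0, b, 0, 0, 0};

      assert(std::memcmp(Wide, Narrow, 8) == 0);   // same bits, so the bitcast is free
      return 0;
    }
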
+SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ unsigned NumInScalars = N->getNumOperands();
+ DebugLoc dl = N->getDebugLoc();
+
+ EVT SrcVT = MVT::Other;
+ unsigned Opcode = ISD::DELETED_NODE;
+ unsigned NumDefs = 0;
- assert(Index < Ops.size() && "Invalid index");
- Ops[Index] = In;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = N->getOperand(i);
+ unsigned Opc = In.getOpcode();
+
+ if (Opc == ISD::UNDEF)
+ continue;
+
+ // All defined scalars must be floats converted from integers (uint/sint_to_fp).
+ if (Opcode == ISD::DELETED_NODE &&
+ (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
+ Opcode = Opc;
+ // If not supported by target, bail out.
+ if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal &&
+ TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom)
+ return SDValue();
}
+ if (Opc != Opcode)
+ return SDValue();
- // The type of the new BUILD_VECTOR node.
- EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
- assert(VecVT.getSizeInBits() == N->getValueType(0).getSizeInBits() &&
- "Invalid vector size");
- // Check if the new vector type is legal.
- if (!isTypeLegal(VecVT)) return SDValue();
+ EVT InVT = In.getOperand(0).getValueType();
- // Make the new BUILD_VECTOR.
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
- VecVT, &Ops[0], Ops.size());
+ // All scalar source values must have the same type; otherwise bail out.
+ // This keeps the handling of integer BUILD_VECTORs simple.
+ if (SrcVT == MVT::Other)
+ SrcVT = InVT;
+ if (SrcVT != InVT)
+ return SDValue();
+ NumDefs++;
+ }
+
+ // If the vector has just one element defined, it's not worth folding it
+ // into a vectorized one.
+ if (NumDefs < 2)
+ return SDValue();
- // The new BUILD_VECTOR node has the potential to be further optimized.
- AddToWorkList(BV.getNode());
- // Bitcast to the desired type.
- return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), BV);
+ assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
+ && "Should only handle conversion from integer to float.");
+ assert(SrcVT != MVT::Other && "Cannot determine source type!");
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = N->getOperand(i);
+
+ if (In.getOpcode() == ISD::UNDEF)
+ Opnds.push_back(DAG.getUNDEF(SrcVT));
+ else
+ Opnds.push_back(In.getOperand(0));
}
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT,
+ &Opnds[0], Opnds.size());
+ AddToWorkList(BV.getNode());
+
+ return DAG.getNode(Opcode, dl, VT, BV);
+}
+
+SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+ unsigned NumInScalars = N->getNumOperands();
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ // A vector built entirely of undefs is undef.
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(VT);
+
+ SDValue V = reduceBuildVecExtToExtBuildVec(N);
+ if (V.getNode())
+ return V;
+
+ V = reduceBuildVecConvertToConvertBuildVec(N);
+ if (V.getNode())
+ return V;
// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
// operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from
@@ -7876,15 +8652,22 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
return SDValue();
+ // If the input vector type has a different base type to the output
+ // vector type, bail out.
+ if (VecIn1.getValueType().getVectorElementType() !=
+ VT.getVectorElementType())
+ return SDValue();
+
// Widen the input vector by adding undef values.
- VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT,
+ VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
}
// If VecIn2 is unused then change it to undef.
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
- // Check that we were able to transform all incoming values to the same type.
+ // Check that we were able to transform all incoming values to the same
+ // type.
if (VecIn2.getValueType() != VecIn1.getValueType() ||
VecIn1.getValueType() != VT)
return SDValue();
@@ -7897,7 +8680,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
SDValue Ops[2];
Ops[0] = VecIn1;
Ops[1] = VecIn2;
- return DAG.getVectorShuffle(VT, N->getDebugLoc(), Ops[0], Ops[1], &Mask[0]);
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], &Mask[0]);
}
return SDValue();
@@ -7933,8 +8716,8 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
return SDValue();
// Only handle cases where both indexes are constants with the same type.
- ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
- ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
+ ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
if (InsIdx && ExtIdx &&
InsIdx->getValueType(0).getSizeInBits() <= 64 &&
@@ -7951,6 +8734,21 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
}
}
+ if (V->getOpcode() == ISD::CONCAT_VECTORS) {
+ // Combine:
+ // (extract_subvec (concat V1, V2, ...), i)
+ // Into:
+ // Vi if possible
+ // Only operand 0 is checked, since 'concat' requires all inputs to have the same type.
+ if (V->getOperand(0).getValueType() != NVT)
+ return SDValue();
+ unsigned Idx = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ unsigned NumElems = NVT.getVectorNumElements();
+ assert((Idx % NumElems) == 0 &&
+ "IDX in concat is not a multiple of the result vector length.");
+ return V->getOperand(Idx / NumElems);
+ }
+
return SDValue();
}
@@ -8266,6 +9064,44 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
return SDValue();
}
+/// SimplifyVUnaryOp - Visit a unary vector operation, like FABS/FNEG.
+SDValue DAGCombiner::SimplifyVUnaryOp(SDNode *N) {
+ // After legalize, the target may be depending on adds and other
+ // binary ops to provide legal ways to construct constants or other
+ // things. Simplifying them may result in a loss of legality.
+ if (LegalOperations) return SDValue();
+
+ assert(N->getValueType(0).isVector() &&
+ "SimplifyVUnaryOp only works on vectors!");
+
+ SDValue N0 = N->getOperand(0);
+
+ if (N0.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ // Operand is a BUILD_VECTOR node, see if we can constant fold it.
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+ SDValue Op = N0.getOperand(i);
+ if (Op.getOpcode() != ISD::UNDEF &&
+ Op.getOpcode() != ISD::ConstantFP)
+ break;
+ EVT EltVT = Op.getValueType();
+ SDValue FoldOp = DAG.getNode(N->getOpcode(), N0.getDebugLoc(), EltVT, Op);
+ if (FoldOp.getOpcode() != ISD::UNDEF &&
+ FoldOp.getOpcode() != ISD::ConstantFP)
+ break;
+ Ops.push_back(FoldOp);
+ AddToWorkList(FoldOp.getNode());
+ }
+
+ if (Ops.size() != N0.getNumOperands())
+ return SDValue();
+
+ return DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(),
+ N0.getValueType(), &Ops[0], Ops.size());
+}
+
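
SimplifyVUnaryOp constant-folds a unary FP operation element by element across a BUILD_VECTOR whose operands are all ConstantFP or undef. A standalone sketch of the element-wise folding (illustration only):

    // Illustration only: fold fneg across each constant element.
    #include <cassert>
    #include <cstddef>

    int main() {
      const double BuildVec[4] = {1.5, -2.0, 0.0, 8.25};
      double Folded[4];
      for (std::size_t i = 0; i < 4; ++i)
        Folded[i] = -BuildVec[i];
      assert(Folded[1] == 2.0 && Folded[3] == -8.25);
      return 0;
    }
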
SDValue DAGCombiner::SimplifySelect(DebugLoc DL, SDValue N0,
SDValue N1, SDValue N2){
assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");
@@ -8349,6 +9185,10 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
(RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
return false;
+ // The loads must not depend on one another.
+ if (LLD->isPredecessorOf(RLD) ||
+ RLD->isPredecessorOf(LLD))
+ return false;
Addr = DAG.getNode(ISD::SELECT, TheSelect->getDebugLoc(),
LLD->getBasePtr().getValueType(),
TheSelect->getOperand(0), LLD->getBasePtr(),
@@ -8468,7 +9308,7 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
const_cast<ConstantFP*>(TV->getConstantFPValue())
};
Type *FPTy = Elts[0]->getType();
- const TargetData &TD = *TLI.getTargetData();
+ const DataLayout &TD = *TLI.getDataLayout();
// Create a ConstantArray of the two constants.
Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
@@ -8583,34 +9423,38 @@ SDValue DAGCombiner::SimplifySelectCC(DebugLoc DL, SDValue N0, SDValue N1,
return SDValue();
// Get a SetCC of the condition
- // FIXME: Should probably make sure that setcc is legal if we ever have a
- // target where it isn't.
- SDValue Temp, SCC;
- // cast from setcc result type to select result type
- if (LegalTypes) {
- SCC = DAG.getSetCC(DL, TLI.getSetCCResultType(N0.getValueType()),
- N0, N1, CC);
- if (N2.getValueType().bitsLT(SCC.getValueType()))
- Temp = DAG.getZeroExtendInReg(SCC, N2.getDebugLoc(), N2.getValueType());
- else
+ // NOTE: Don't create a SETCC if it's not legal on this target.
+ if (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SETCC,
+ LegalTypes ? TLI.getSetCCResultType(N0.getValueType()) : MVT::i1)) {
+ SDValue Temp, SCC;
+ // cast from setcc result type to select result type
+ if (LegalTypes) {
+ SCC = DAG.getSetCC(DL, TLI.getSetCCResultType(N0.getValueType()),
+ N0, N1, CC);
+ if (N2.getValueType().bitsLT(SCC.getValueType()))
+ Temp = DAG.getZeroExtendInReg(SCC, N2.getDebugLoc(),
+ N2.getValueType());
+ else
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
+ N2.getValueType(), SCC);
+ } else {
+ SCC = DAG.getSetCC(N0.getDebugLoc(), MVT::i1, N0, N1, CC);
Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
N2.getValueType(), SCC);
- } else {
- SCC = DAG.getSetCC(N0.getDebugLoc(), MVT::i1, N0, N1, CC);
- Temp = DAG.getNode(ISD::ZERO_EXTEND, N2.getDebugLoc(),
- N2.getValueType(), SCC);
- }
+ }
- AddToWorkList(SCC.getNode());
- AddToWorkList(Temp.getNode());
+ AddToWorkList(SCC.getNode());
+ AddToWorkList(Temp.getNode());
- if (N2C->getAPIntValue() == 1)
- return Temp;
+ if (N2C->getAPIntValue() == 1)
+ return Temp;
- // shl setcc result by log2 n2c
- return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
- DAG.getConstant(N2C->getAPIntValue().logBase2(),
- getShiftAmountTy(Temp.getValueType())));
+ // shl setcc result by log2 n2c
+ return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
+ DAG.getConstant(N2C->getAPIntValue().logBase2(),
+ getShiftAmountTy(Temp.getValueType())));
+ }
}
// Check to see if this is the equivalent of setcc
@@ -8729,7 +9573,7 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {
// to alias with anything but itself. Provides base object and offset as
// results.
static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
- const GlobalValue *&GV, void *&CV) {
+ const GlobalValue *&GV, const void *&CV) {
// Assume it is a primitive operation.
Base = Ptr; Offset = 0; GV = 0; CV = 0;
@@ -8754,8 +9598,8 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
// for ConstantSDNodes since the same constant pool entry may be represented
// by multiple nodes with different offsets.
if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) {
- CV = C->isMachineConstantPoolEntry() ? (void *)C->getMachineCPVal()
- : (void *)C->getConstVal();
+ CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal()
+ : (const void *)C->getConstVal();
Offset += C->getOffset();
return false;
}
@@ -8780,7 +9624,7 @@ bool DAGCombiner::isAlias(SDValue Ptr1, int64_t Size1,
SDValue Base1, Base2;
int64_t Offset1, Offset2;
const GlobalValue *GV1, *GV2;
- void *CV1, *CV2;
+ const void *CV1, *CV2;
bool isFrameIndex1 = FindBaseOffset(Ptr1, Base1, Offset1, GV1, CV1);
bool isFrameIndex2 = FindBaseOffset(Ptr2, Base2, Offset2, GV2, CV2);
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 683fac6744f8..4854cf7b261f 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -53,7 +53,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -1059,7 +1059,7 @@ FastISel::FastISel(FunctionLoweringInfo &funcInfo,
MFI(*FuncInfo.MF->getFrameInfo()),
MCP(*FuncInfo.MF->getConstantPool()),
TM(FuncInfo.MF->getTarget()),
- TD(*TM.getTargetData()),
+ TD(*TM.getDataLayout()),
TII(*TM.getInstrInfo()),
TLI(*TM.getTargetLowering()),
TRI(*TM.getRegisterInfo()),
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 3e18ea7ac95b..a4182906cbf4 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -29,7 +29,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
@@ -80,9 +80,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(I))
if (const ConstantInt *CUI = dyn_cast<ConstantInt>(AI->getArraySize())) {
Type *Ty = AI->getAllocatedType();
- uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+ uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
unsigned Align =
- std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+ std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
AI->getAlignment());
TySize *= CUI->getZExtValue(); // Get total allocated size.
@@ -97,7 +97,7 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf) {
cast<ArrayType>(Ty)->getElementType()->isIntegerTy(8)));
StaticAllocaMap[AI] =
MF->getFrameInfo()->CreateStackObject(TySize, Align, false,
- MayNeedSP);
+ MayNeedSP, AI);
}
for (; BB != EB; ++BB)
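
The alloca sizing above multiplies the type's alloc size by the array count and takes the larger of the preferred and explicitly requested alignments. A standalone sketch with hypothetical numbers:

    // Hypothetical numbers only.
    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t TypeAllocSize = 4;    // e.g. an i32 element
      uint64_t ArraySize = 10;       // alloca i32, i32 10
      unsigned PrefAlign = 4;        // preferred alignment from the data layout
      unsigned RequestedAlign = 16;  // explicit alignment on the alloca

      uint64_t TySize = TypeAllocSize * ArraySize;          // total allocated size
      unsigned Align = std::max(PrefAlign, RequestedAlign); // stack object alignment

      assert(TySize == 40 && Align == 16);
      return 0;
    }
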
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 4488d2790bbb..a8381b25ba12 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -55,7 +55,8 @@ unsigned InstrEmitter::CountResults(SDNode *Node) {
///
/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding
/// the chain and glue. These operands may be implicit on the machine instr.
-static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) {
+static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
+ unsigned &NumImpUses) {
unsigned N = Node->getNumOperands();
while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
--N;
@@ -63,7 +64,8 @@ static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) {
--N; // Ignore chain if it exists.
// Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses.
- for (unsigned I = N; I; --I) {
+ NumImpUses = N - NumExpUses;
+ for (unsigned I = N; I > NumExpUses; --I) {
if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
continue;
if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
@@ -312,8 +314,6 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
const TargetRegisterClass *DstRC = 0;
if (IIOpNum < II->getNumOperands())
DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF));
- assert((DstRC || (MI->isVariadic() && IIOpNum >= MCID.getNumOperands())) &&
- "Don't have operand info for this instruction!");
if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
@@ -390,10 +390,10 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
Type *Type = CP->getType();
// MachineConstantPool wants an explicit alignment.
if (Align == 0) {
- Align = TM->getTargetData()->getPrefTypeAlignment(Type);
+ Align = TM->getDataLayout()->getPrefTypeAlignment(Type);
if (Align == 0) {
// Alignment of vector types. FIXME!
- Align = TM->getTargetData()->getTypeAllocSize(Type);
+ Align = TM->getDataLayout()->getTypeAllocSize(Type);
}
}
@@ -410,6 +410,7 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
ES->getTargetFlags()));
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
MI->addOperand(MachineOperand::CreateBA(BA->getBlockAddress(),
+ BA->getOffset(),
BA->getTargetFlags()));
} else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) {
MI->addOperand(MachineOperand::CreateTargetIndex(TI->getIndex(),
@@ -720,7 +721,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
const MCInstrDesc &II = TII->get(Opc);
unsigned NumResults = CountResults(Node);
unsigned NumImpUses = 0;
- unsigned NodeOperands = countOperands(Node, NumImpUses);
+ unsigned NodeOperands =
+ countOperands(Node, II.getNumOperands() - II.getNumDefs(), NumImpUses);
bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0;
#ifndef NDEBUG
unsigned NumMIOperands = NodeOperands + NumResults;
@@ -870,6 +872,17 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
break;
}
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END: {
+ unsigned TarOp = (Node->getOpcode() == ISD::LIFETIME_START) ?
+ TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END;
+
+ FrameIndexSDNode *FI = cast<FrameIndexSDNode>(Node->getOperand(1));
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp))
+ .addFrameIndex(FI->getIndex());
+ break;
+ }
+
case ISD::INLINEASM: {
unsigned NumOps = Node->getNumOperands();
if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
@@ -884,25 +897,30 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
const char *AsmStr = cast<ExternalSymbolSDNode>(AsmStrV)->getSymbol();
MI->addOperand(MachineOperand::CreateES(AsmStr));
- // Add the HasSideEffect and isAlignStack bits.
+ // Add the HasSideEffect, isAlignStack, AsmDialect, MayLoad and MayStore
+ // bits.
int64_t ExtraInfo =
cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))->
getZExtValue();
MI->addOperand(MachineOperand::CreateImm(ExtraInfo));
+ // Remember the operand index of the group flags.
+ SmallVector<unsigned, 8> GroupIdx;
+
// Add all of the operand registers to the instruction.
for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
unsigned Flags =
cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
- unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ const unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+ GroupIdx.push_back(MI->getNumOperands());
MI->addOperand(MachineOperand::CreateImm(Flags));
++i; // Skip the ID value.
switch (InlineAsm::getKind(Flags)) {
default: llvm_unreachable("Bad flags!");
case InlineAsm::Kind_RegDef:
- for (; NumVals; --NumVals, ++i) {
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
// FIXME: Add dead flags for physical and virtual registers defined.
// For now, mark physical register defs as implicit to help fast
@@ -913,7 +931,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
break;
case InlineAsm::Kind_RegDefEarlyClobber:
case InlineAsm::Kind_Clobber:
- for (; NumVals; --NumVals, ++i) {
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
MI->addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/ true,
/*isImp=*/ TargetRegisterInfo::isPhysicalRegister(Reg),
@@ -928,9 +946,20 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
case InlineAsm::Kind_Mem: // Addressing mode.
// The addressing mode has been selected, just add all of the
// operands to the machine instruction.
- for (; NumVals; --NumVals, ++i)
+ for (unsigned j = 0; j != NumVals; ++j, ++i)
AddOperand(MI, Node->getOperand(i), 0, 0, VRBaseMap,
/*IsDebug=*/false, IsClone, IsCloned);
+
+ // Manually set isTied bits.
+ if (InlineAsm::getKind(Flags) == InlineAsm::Kind_RegUse) {
+ unsigned DefGroup = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
+ unsigned DefIdx = GroupIdx[DefGroup] + 1;
+ unsigned UseIdx = GroupIdx.back() + 1;
+ for (unsigned j = 0; j != NumVals; ++j)
+ MI->tieOperands(DefIdx + j, UseIdx + j);
+ }
+ }
break;
}
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 908ebb948647..abf40b77a18f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -718,7 +718,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// expand it.
if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
if (ST->getAlignment() < ABIAlignment)
ExpandUnalignedStore(cast<StoreSDNode>(Node),
DAG, TLI, this);
@@ -824,7 +824,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// expand it.
if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty);
+ unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
if (ST->getAlignment() < ABIAlignment)
ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
}
@@ -869,25 +869,24 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
default: llvm_unreachable("This action is not supported yet!");
case TargetLowering::Legal:
- // If this is an unaligned load and the target doesn't support it,
- // expand it.
- if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
- Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment =
- TLI.getTargetData()->getABITypeAlignment(Ty);
- if (LD->getAlignment() < ABIAlignment){
- ExpandUnalignedLoad(cast<LoadSDNode>(Node),
- DAG, TLI, RVal, RChain);
- }
- }
- break;
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+ Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment =
+ TLI.getDataLayout()->getABITypeAlignment(Ty);
+ if (LD->getAlignment() < ABIAlignment){
+ ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
+ }
+ }
+ break;
case TargetLowering::Custom: {
- SDValue Res = TLI.LowerOperation(RVal, DAG);
- if (Res.getNode()) {
- RVal = Res;
- RChain = Res.getValue(1);
- }
- break;
+ SDValue Res = TLI.LowerOperation(RVal, DAG);
+ if (Res.getNode()) {
+ RVal = Res;
+ RChain = Res.getValue(1);
+ }
+ break;
}
case TargetLowering::Promote: {
// Only promote a load of vector type to another.
@@ -1060,7 +1059,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Type *Ty =
LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
unsigned ABIAlignment =
- TLI.getTargetData()->getABITypeAlignment(Ty);
+ TLI.getDataLayout()->getABITypeAlignment(Ty);
if (LD->getAlignment() < ABIAlignment){
ExpandUnalignedLoad(cast<LoadSDNode>(Node),
DAG, TLI, Value, Chain);
@@ -1241,6 +1240,19 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
if (Action == TargetLowering::Legal)
Action = TargetLowering::Custom;
break;
+ case ISD::DEBUGTRAP:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Expand) {
+ // replace ISD::DEBUGTRAP with ISD::TRAP
+ SDValue NewVal;
+ NewVal = DAG.getNode(ISD::TRAP, Node->getDebugLoc(), Node->getVTList(),
+ Node->getOperand(0));
+ ReplaceNode(Node, NewVal.getNode());
+ LegalizeOp(NewVal.getNode());
+ return;
+ }
+ break;
+
default:
if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
Action = TargetLowering::Legal;
@@ -1588,26 +1600,71 @@ void SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT,
break;
case TargetLowering::Expand: {
ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+ ISD::CondCode InvCC = ISD::SETCC_INVALID;
unsigned Opc = 0;
switch (CCCode) {
default: llvm_unreachable("Don't know how to expand this condition!");
- case ISD::SETOEQ: CC1 = ISD::SETEQ; CC2 = ISD::SETO; Opc = ISD::AND; break;
- case ISD::SETOGT: CC1 = ISD::SETGT; CC2 = ISD::SETO; Opc = ISD::AND; break;
- case ISD::SETOGE: CC1 = ISD::SETGE; CC2 = ISD::SETO; Opc = ISD::AND; break;
- case ISD::SETOLT: CC1 = ISD::SETLT; CC2 = ISD::SETO; Opc = ISD::AND; break;
- case ISD::SETOLE: CC1 = ISD::SETLE; CC2 = ISD::SETO; Opc = ISD::AND; break;
- case ISD::SETONE: CC1 = ISD::SETNE; CC2 = ISD::SETO; Opc = ISD::AND; break;
- case ISD::SETUEQ: CC1 = ISD::SETEQ; CC2 = ISD::SETUO; Opc = ISD::OR; break;
- case ISD::SETUGT: CC1 = ISD::SETGT; CC2 = ISD::SETUO; Opc = ISD::OR; break;
- case ISD::SETUGE: CC1 = ISD::SETGE; CC2 = ISD::SETUO; Opc = ISD::OR; break;
- case ISD::SETULT: CC1 = ISD::SETLT; CC2 = ISD::SETUO; Opc = ISD::OR; break;
- case ISD::SETULE: CC1 = ISD::SETLE; CC2 = ISD::SETUO; Opc = ISD::OR; break;
- case ISD::SETUNE: CC1 = ISD::SETNE; CC2 = ISD::SETUO; Opc = ISD::OR; break;
- // FIXME: Implement more expansions.
- }
-
- SDValue SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
- SDValue SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
+ case ISD::SETO:
+ assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
+ == TargetLowering::Legal
+ && "If SETO is expanded, SETOEQ must be legal!");
+ CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break;
+ case ISD::SETUO:
+ assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
+ == TargetLowering::Legal
+ && "If SETUO is expanded, SETUNE must be legal!");
+ CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR; break;
+ case ISD::SETOEQ:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETONE:
+ case ISD::SETUEQ:
+ case ISD::SETUNE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ // If the operands are floating point, assign and break; otherwise fall through.
+ if (!OpVT.isInteger()) {
+ // We can use the 4th bit to tell if we are the unordered
+ // or ordered version of the opcode.
+ CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+ Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND;
+ CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
+ break;
+ }
+ // Fall through if this is an unsigned integer comparison.
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETLT:
+ case ISD::SETNE:
+ case ISD::SETEQ:
+ InvCC = ISD::getSetCCSwappedOperands(CCCode);
+ if (TLI.getCondCodeAction(InvCC, OpVT) == TargetLowering::Expand) {
+ // We only support using the inverted operation and not a
+ // different manner of expanding these cases.
+ llvm_unreachable("Don't know how to expand this condition!");
+ }
+ LHS = DAG.getSetCC(dl, VT, RHS, LHS, InvCC);
+ RHS = SDValue();
+ CC = SDValue();
+ return;
+ }
+
+ SDValue SetCC1, SetCC2;
+ if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
+ // If we aren't the ordered or unordered operation,
+ // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
+ SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
+ } else {
+ // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1);
+ SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2);
+ }
LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
RHS = SDValue();
CC = SDValue();
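
A worked, self-contained example of the bit trick the comments above rely on (values mirror the ISD::CondCode encoding, where bit 3 selects the unordered variant and bit 4 the ordering-agnostic codes): SETOLT expands to (SETLT AND SETO), while SETULT expands to (SETLT OR SETUO).

#include <cstdio>

enum CondCode { SETOLT = 4, SETO = 7, SETUO = 8, SETULT = 12, SETLT = 20 };

int main() {
  const CondCode Cases[] = {SETOLT, SETULT};
  for (int CC : Cases) {
    int CC1 = (CC & 0x7) | 0x10;          // drop ordering: both map to SETLT
    int CC2 = (CC & 0x8) ? SETUO : SETO;  // which ordered-ness check to add
    const char *Opc = (CC & 0x8) ? "OR" : "AND";
    std::printf("CC=%d -> (LHS cc%d RHS) %s (LHS cc%d RHS)\n", CC, CC1, Opc, CC2);
  }
  return 0;
}
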
@@ -1626,7 +1683,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
DebugLoc dl) {
// Create the stack frame object.
unsigned SrcAlign =
- TLI.getTargetData()->getPrefTypeAlignment(SrcOp.getValueType().
+ TLI.getDataLayout()->getPrefTypeAlignment(SrcOp.getValueType().
getTypeForEVT(*DAG.getContext()));
SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
@@ -1638,7 +1695,7 @@ SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp,
unsigned SlotSize = SlotVT.getSizeInBits();
unsigned DestSize = DestVT.getSizeInBits();
Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
- unsigned DestAlign = TLI.getTargetData()->getPrefTypeAlignment(DestType);
+ unsigned DestAlign = TLI.getDataLayout()->getPrefTypeAlignment(DestType);
// Emit a store to the stack slot. Use a truncstore if the input value is
// larger than DestVT.
@@ -2042,7 +2099,7 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned,
SDValue Op0,
EVT DestVT,
DebugLoc dl) {
- if (Op0.getValueType() == MVT::i32) {
+ if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
// simple 32-bit [signed|unsigned] integer to float/double expansion
// Get the stack frame index of an 8-byte buffer.
@@ -2787,7 +2844,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
// Increment the pointer, VAList, to the next vaarg
Tmp3 = DAG.getNode(ISD::ADD, dl, TLI.getPointerTy(), VAList,
- DAG.getConstant(TLI.getTargetData()->
+ DAG.getConstant(TLI.getDataLayout()->
getTypeAllocSize(VT.getTypeForEVT(*DAG.getContext())),
TLI.getPointerTy()));
// Store the incremented VAList to the legalized pointer
@@ -3109,6 +3166,8 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Tmp3 = Node->getOperand(1);
if (TLI.isOperationLegalOrCustom(DivRemOpc, VT) ||
(isDivRemLibcallAvailable(Node, isSigned, TLI) &&
+ // If div is legal, it's better to do the normal expansion
+ !TLI.isOperationLegalOrCustom(DivOpc, Node->getValueType(0)) &&
useDivRem(Node, isSigned, false))) {
Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
} else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
@@ -3366,7 +3425,7 @@ void SelectionDAGLegalize::ExpandNode(SDNode *Node) {
EVT PTy = TLI.getPointerTy();
- const TargetData &TD = *TLI.getTargetData();
+ const DataLayout &TD = *TLI.getDataLayout();
unsigned EntrySize =
DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index e3938968b205..92dc5a9831b6 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1245,32 +1245,30 @@ bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
SDValue Res = SDValue();
- if (TLI.getOperationAction(N->getOpcode(), N->getOperand(OpNo).getValueType())
- == TargetLowering::Custom)
- Res = TLI.LowerOperation(SDValue(N, 0), DAG);
-
- if (Res.getNode() == 0) {
- switch (N->getOpcode()) {
- default:
- #ifndef NDEBUG
- dbgs() << "ExpandFloatOperand Op #" << OpNo << ": ";
- N->dump(&DAG); dbgs() << "\n";
- #endif
- llvm_unreachable("Do not know how to expand this operator's operand!");
-
- case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break;
- case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
- case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
-
- case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break;
- case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break;
- case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
- case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
- case ISD::SELECT_CC: Res = ExpandFloatOp_SELECT_CC(N); break;
- case ISD::SETCC: Res = ExpandFloatOp_SETCC(N); break;
- case ISD::STORE: Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
- OpNo); break;
- }
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to expand this operator's operand!");
+
+ case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+
+ case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break;
+ case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break;
+ case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
+ case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
+ case ISD::SELECT_CC: Res = ExpandFloatOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = ExpandFloatOp_SETCC(N); break;
+ case ISD::STORE: Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
}
// If the result is null, the sub-method took care of registering results etc.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e8e968aaef31..a370faeb2399 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -644,8 +644,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
EVT SmallVT = LHS.getValueType();
// To determine if the result overflowed in a larger type, we extend the
- // input to the larger type, do the multiply, then check the high bits of
- // the result to see if the overflow happened.
+ // input to the larger type, do the multiply (checking if it overflows),
+ // then also check the high bits of the result to see if overflow happened
+ // there.
if (N->getOpcode() == ISD::SMULO) {
LHS = SExtPromotedInteger(LHS);
RHS = SExtPromotedInteger(RHS);
@@ -653,24 +654,31 @@ SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
LHS = ZExtPromotedInteger(LHS);
RHS = ZExtPromotedInteger(RHS);
}
- SDValue Mul = DAG.getNode(ISD::MUL, DL, LHS.getValueType(), LHS, RHS);
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1));
+ SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS);
- // Overflow occurred iff the high part of the result does not
- // zero/sign-extend the low part.
+ // Overflow occurred if it occurred in the larger type, or if the high part
+ // of the result does not zero/sign-extend the low part. Check this second
+ // possibility first.
SDValue Overflow;
if (N->getOpcode() == ISD::UMULO) {
- // Unsigned overflow occurred iff the high part is non-zero.
+ // Unsigned overflow occurred if the high part is non-zero.
SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul,
DAG.getIntPtrConstant(SmallVT.getSizeInBits()));
Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi,
DAG.getConstant(0, Hi.getValueType()), ISD::SETNE);
} else {
- // Signed overflow occurred iff the high part does not sign extend the low.
+ // Signed overflow occurred if the high part does not sign extend the low.
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(),
Mul, DAG.getValueType(SmallVT));
Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE);
}
+ // The only other way for overflow to occur is if the multiplication in the
+ // larger type itself overflowed.
+ Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow,
+ SDValue(Mul.getNode(), 1));
+
// Use the calculated overflow everywhere.
ReplaceValueWith(SDValue(N, 1), Overflow);
return Mul;
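
A quick numeric check (standalone, not part of the patch) of the promoted-overflow logic above, using i8 umul.with.overflow promoted to i16: 200 * 2 = 400 fits in 16 bits, but its high byte is non-zero, so the original 8-bit multiply overflowed.

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t a = 200, b = 2;
  uint16_t wide = uint16_t(uint16_t(a) * uint16_t(b)); // the promoted multiply
  bool hiOverflow = (wide >> 8) != 0;                  // high part non-zero?
  std::printf("wide=%u overflow=%d\n", (unsigned)wide, (int)hiOverflow); // 400 1
  return 0;
}
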
@@ -2253,32 +2261,35 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT VT = N->getValueType(0);
- Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
- EVT PtrVT = TLI.getPointerTy();
- Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
DebugLoc dl = N->getDebugLoc();
// A divide for UMULO should be faster than a function call.
if (N->getOpcode() == ISD::UMULO) {
SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
- SDValue MUL = DAG.getNode(ISD::MUL, DL, LHS.getValueType(), LHS, RHS);
+ SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS);
SplitInteger(MUL, Lo, Hi);
// A divide for UMULO will be faster than a function call. Select to
// make sure we aren't using 0.
SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
- RHS, DAG.getConstant(0, VT), ISD::SETNE);
+ RHS, DAG.getConstant(0, VT), ISD::SETEQ);
SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero,
DAG.getConstant(1, VT), RHS);
- SDValue DIV = DAG.getNode(ISD::UDIV, DL, LHS.getValueType(), MUL, NotZero);
- SDValue Overflow;
- Overflow = DAG.getSetCC(DL, N->getValueType(1), DIV, LHS, ISD::SETNE);
+ SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
+ SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
+ ISD::SETNE);
+ Overflow = DAG.getNode(ISD::SELECT, dl, N->getValueType(1), isZero,
+ DAG.getConstant(0, N->getValueType(1)),
+ Overflow);
ReplaceValueWith(SDValue(N, 1), Overflow);
return;
}
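
A standalone sketch (not part of the patch) of the divide-based overflow test used in this branch: for b != 0, unsigned a*b overflows exactly when (a*b)/b != a, and the added select forces the overflow bit to 0 when b == 0.

#include <cstdint>
#include <cstdio>

static bool umulOverflows(uint32_t a, uint32_t b) {
  uint32_t mul = a * b;                    // wrapping multiply
  uint32_t notZero = (b == 0) ? 1u : b;    // avoid dividing by zero
  bool ovf = (mul / notZero) != a;
  return (b == 0) ? false : ovf;           // multiplying by 0 never overflows
}

int main() {
  std::printf("%d %d\n", (int)umulOverflows(0x10000u, 0x10000u), // 1: 2^32 wraps
                         (int)umulOverflows(7u, 0u));            // 0
  return 0;
}
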
+ Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+ EVT PtrVT = TLI.getPointerTy();
+ Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
+
// Replace this with a libcall that will check overflow.
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
if (VT == MVT::i32)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 39337fff5079..644e36e35e21 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -15,7 +15,7 @@
#include "LegalizeTypes.h"
#include "llvm/CallingConv.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 94fc9761ecbd..20b7ce6b15ba 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -625,6 +625,7 @@ private:
SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
SDValue WidenVecRes_VSETCC(SDNode* N);
+ SDValue WidenVecRes_Ternary(SDNode *N);
SDValue WidenVecRes_Binary(SDNode *N);
SDValue WidenVecRes_Convert(SDNode *N);
SDValue WidenVecRes_POWI(SDNode *N);
@@ -633,7 +634,7 @@ private:
SDValue WidenVecRes_InregOp(SDNode *N);
// Widen Vector Operand.
- bool WidenVectorOperand(SDNode *N, unsigned ResNo);
+ bool WidenVectorOperand(SDNode *N, unsigned OpNo);
SDValue WidenVecOp_BITCAST(SDNode *N);
SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 06f6bd63b671..6bcb3b25e98e 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -20,7 +20,7 @@
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -94,14 +94,48 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
if (InVT.isVector() && OutVT.isInteger()) {
// Handle cases like i64 = BITCAST v1i64 on x86, where the operand
// is legal but the result is not.
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), NOutVT, 2);
+ unsigned NumElems = 2;
+ EVT ElemVT = NOutVT;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+
+ // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
+ while (!isTypeLegal(NVT)) {
+ unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
+ // If the element size is smaller than byte, bail.
+ if (NewSizeInBits < 8)
+ break;
+ NumElems *= 2;
+ ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
+ NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+ }
if (isTypeLegal(NVT)) {
SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
- Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
- DAG.getIntPtrConstant(0));
- Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
- DAG.getIntPtrConstant(1));
+
+ SmallVector<SDValue, 8> Vals;
+ for (unsigned i = 0; i < NumElems; ++i)
+ Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
+ CastInOp, DAG.getIntPtrConstant(i)));
+
+ // Build Lo, Hi pair by pairing extracted elements if needed.
+ unsigned Slot = 0;
+ for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
+ // Each iteration will BUILD_PAIR two nodes and append the result until
+ // there are only two nodes left, i.e. Lo and Hi.
+ SDValue LHS = Vals[Slot];
+ SDValue RHS = Vals[Slot + 1];
+
+ if (TLI.isBigEndian())
+ std::swap(LHS, RHS);
+
+ Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
+ EVT::getIntegerVT(
+ *DAG.getContext(),
+ LHS.getValueType().getSizeInBits() << 1),
+ LHS, RHS));
+ }
+ Lo = Vals[Slot++];
+ Hi = Vals[Slot++];
if (TLI.isBigEndian())
std::swap(Lo, Hi);
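
A minimal sketch (with a made-up legality rule standing in for isTypeLegal) of the search loop above: starting from a two-element vector of the promoted output type, halve the element width and double the element count until the vector type is legal or the elements would drop below a byte.

#include <cstdio>

// Hypothetical target rule: only 128-bit vectors with elements of at most
// 32 bits are legal.
static bool isLegal(unsigned elemBits, unsigned numElems) {
  return elemBits * numElems == 128 && elemBits <= 32;
}

int main() {
  unsigned elemBits = 64, numElems = 2;                  // start at <2 x i64>
  while (!isLegal(elemBits, numElems) && elemBits / 2 >= 8) {
    elemBits /= 2;
    numElems *= 2;
  }
  std::printf("chose <%u x i%u>\n", numElems, elemBits); // chose <4 x i32>
  return 0;
}
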
@@ -116,7 +150,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Create the stack frame object. Make sure it is aligned for both
// the source and expanded destination types.
unsigned Alignment =
- TLI.getTargetData()->getPrefTypeAlignment(NOutVT.
+ TLI.getDataLayout()->getPrefTypeAlignment(NOutVT.
getTypeForEVT(*DAG.getContext()));
SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 704f99bcf0e1..22f8d51ab2a9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -64,6 +64,7 @@ class VectorLegalizer {
// Implement vselect in terms of XOR, AND, OR when blend is not supported
// by the target.
SDValue ExpandVSELECT(SDValue Op);
+ SDValue ExpandSELECT(SDValue Op);
SDValue ExpandLoad(SDValue Op);
SDValue ExpandStore(SDValue Op);
SDValue ExpandFNEG(SDValue Op);
@@ -220,6 +221,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FRINT:
case ISD::FNEARBYINT:
case ISD::FFLOOR:
+ case ISD::FMA:
case ISD::SIGN_EXTEND_INREG:
QueryType = Node->getValueType(0);
break;
@@ -260,6 +262,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case TargetLowering::Expand:
if (Node->getOpcode() == ISD::VSELECT)
Result = ExpandVSELECT(Op);
+ else if (Node->getOpcode() == ISD::SELECT)
+ Result = ExpandSELECT(Op);
else if (Node->getOpcode() == ISD::UINT_TO_FP)
Result = ExpandUINT_TO_FLOAT(Op);
else if (Node->getOpcode() == ISD::FNEG)
@@ -435,6 +439,66 @@ SDValue VectorLegalizer::ExpandStore(SDValue Op) {
return TF;
}
+SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
+ // Lower a select instruction where the condition is a scalar and the
+ // operands are vectors. Lower this select to VSELECT and implement it
+ // using XOR, AND, OR. The selector bit is broadcast.
+ EVT VT = Op.getValueType();
+ DebugLoc DL = Op.getDebugLoc();
+
+ SDValue Mask = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ assert(VT.isVector() && !Mask.getValueType().isVector()
+ && Op1.getValueType() == Op2.getValueType() && "Invalid type");
+
+ unsigned NumElem = VT.getVectorNumElements();
+
+ // If we can't even use the basic vector operations of
+ // AND,OR,XOR, we will have to scalarize the op.
+ // Notice that the operation may be 'promoted' which means that it is
+ // 'bitcasted' to another type which is handled.
+ // Also, we need to be able to construct a splat vector using BUILD_VECTOR.
+ if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::BUILD_VECTOR, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // Generate a mask operand.
+ EVT MaskTy = TLI.getSetCCResultType(VT);
+ assert(MaskTy.isVector() && "Invalid CC type");
+ assert(MaskTy.getSizeInBits() == Op1.getValueType().getSizeInBits()
+ && "Invalid mask size");
+
+ // The size of each element in the vector mask.
+ EVT BitTy = MaskTy.getScalarType();
+
+ Mask = DAG.getNode(ISD::SELECT, DL, BitTy, Mask,
+ DAG.getConstant(APInt::getAllOnesValue(BitTy.getSizeInBits()), BitTy),
+ DAG.getConstant(0, BitTy));
+
+ // Broadcast the mask so that the entire vector is all ones or all zeros.
+ SmallVector<SDValue, 8> Ops(NumElem, Mask);
+ Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, &Ops[0], Ops.size());
+
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op2);
+
+ SDValue AllOnes = DAG.getConstant(
+ APInt::getAllOnesValue(BitTy.getSizeInBits()), MaskTy);
+ SDValue NotMask = DAG.getNode(ISD::XOR, DL, MaskTy, Mask, AllOnes);
+
+ Op1 = DAG.getNode(ISD::AND, DL, MaskTy, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, MaskTy, Op2, NotMask);
+ SDValue Val = DAG.getNode(ISD::OR, DL, MaskTy, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
+}
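
A worked scalar-code analogue (standalone, four integer lanes) of the expansion implemented above: the scalar condition is turned into an all-ones or all-zeros mask, broadcast to every lane, and (Op1 & Mask) | (Op2 & ~Mask) performs the select.

#include <cstdint>
#include <cstdio>

int main() {
  bool cond = false;
  uint32_t mask = cond ? 0xFFFFFFFFu : 0u;          // the splatted mask lane
  uint32_t op1[4] = {1, 2, 3, 4}, op2[4] = {5, 6, 7, 8}, res[4];
  for (int i = 0; i < 4; ++i)
    res[i] = (op1[i] & mask) | (op2[i] & ~mask);    // AND/OR blend; ~mask plays
                                                    // the XOR-with-all-ones role
  for (int i = 0; i < 4; ++i)
    std::printf("%u ", res[i]);                     // prints 5 6 7 8
  std::printf("\n");
  return 0;
}
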
+
SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
// Implement VSELECT in terms of XOR, AND, OR
// on platforms which do not support blend natively.
@@ -449,12 +513,17 @@ SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
// AND,OR,XOR, we will have to scalarize the op.
// Notice that the operation may be 'promoted' which means that it is
// 'bitcasted' to another type which is handled.
+ // This operation also isn't safe with AND, OR, XOR when the boolean
+ // type is 0/1 as we need an all ones vector constant to mask with.
+ // FIXME: Sign extend 1 to all ones if that's legal on the target.
if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
- TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand)
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getBooleanContents(true) !=
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
return DAG.UnrollVectorOp(Op.getNode());
- assert(VT.getSizeInBits() == Op.getOperand(1).getValueType().getSizeInBits()
+ assert(VT.getSizeInBits() == Op1.getValueType().getSizeInBits()
&& "Invalid mask size");
// Bitcast the operands to be the same type as the mask.
// This is needed when we select between FP types because
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 470920296575..d51a6eb192ee 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -21,7 +21,7 @@
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -749,7 +749,7 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
unsigned Alignment =
- TLI.getTargetData()->getPrefTypeAlignment(VecType);
+ TLI.getDataLayout()->getPrefTypeAlignment(VecType);
Store = DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT,
false, false, 0);
@@ -1366,6 +1366,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FTRUNC:
Res = WidenVecRes_Unary(N);
break;
+ case ISD::FMA:
+ Res = WidenVecRes_Ternary(N);
+ break;
}
// If Res is null, the sub-method took care of registering the result.
@@ -1373,6 +1376,16 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
SetWidenedVector(SDValue(N, ResNo), Res);
}
+SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
+ // Ternary op widening.
+ DebugLoc dl = N->getDebugLoc();
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp3 = GetWidenedVector(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
+}
+
SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
// Binary op widening.
unsigned Opcode = N->getOpcode();
@@ -2069,16 +2082,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
//===----------------------------------------------------------------------===//
// Widen Vector Operand
//===----------------------------------------------------------------------===//
-bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned ResNo) {
- DEBUG(dbgs() << "Widen node operand " << ResNo << ": ";
+bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Widen node operand " << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
+ // See if the target wants to custom widen this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
switch (N->getOpcode()) {
default:
#ifndef NDEBUG
- dbgs() << "WidenVectorOperand op #" << ResNo << ": ";
+ dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
N->dump(&DAG);
dbgs() << "\n";
#endif
diff --git a/lib/CodeGen/SelectionDAG/SDNodeOrdering.h b/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
index f88b26d5c422..d2269f8accf1 100644
--- a/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
+++ b/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
@@ -28,8 +28,8 @@ class SDNode;
class SDNodeOrdering {
DenseMap<const SDNode*, unsigned> OrderMap;
- void operator=(const SDNodeOrdering&); // Do not implement.
- SDNodeOrdering(const SDNodeOrdering&); // Do not implement.
+ void operator=(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
+ SDNodeOrdering(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
public:
SDNodeOrdering() {}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index b7ce48a48929..2ecdd8941551 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -13,11 +13,12 @@
#define DEBUG_TYPE "pre-RA-sched"
#include "ScheduleDAGSDNodes.h"
+#include "InstrEmitter.h"
#include "llvm/InlineAsm.h"
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/ADT/SmallSet.h"
@@ -34,6 +35,10 @@ STATISTIC(NumPRCopies, "Number of physical copies");
static RegisterScheduler
fastDAGScheduler("fast", "Fast suboptimal list scheduling",
createFastDAGScheduler);
+static RegisterScheduler
+ linearizeDAGScheduler("linearize", "Linearize DAG, no scheduling",
+ createDAGLinearizer);
+
namespace {
/// FastPriorityQueue - A degenerate priority queue that considers
@@ -331,7 +336,9 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
}
}
if (isNewLoad) {
- AddPred(NewSU, SDep(LoadSU, SDep::Order, LoadSU->Latency));
+ SDep D(LoadSU, SDep::Barrier);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
}
++NumUnfolds;
@@ -407,9 +414,12 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
RemovePred(DelDeps[i].first, DelDeps[i].second);
}
-
- AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
- AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
Copies.push_back(CopyFromSU);
Copies.push_back(CopyToSU);
@@ -586,18 +596,14 @@ void ScheduleDAGFast::ListScheduleBottomUp() {
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
<< " to SU #" << Copies.front()->NodeNum << "\n");
- AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true));
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
NewDef = Copies.back();
}
DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
<< " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
- AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true));
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
TrySU->isAvailable = false;
CurSU = NewDef;
}
@@ -629,6 +635,155 @@ void ScheduleDAGFast::ListScheduleBottomUp() {
#endif
}
+
+namespace {
+//===----------------------------------------------------------------------===//
+// ScheduleDAGLinearize - A no-scheduling scheduler; it simply linearizes the
+// DAG in topological order.
+// IMPORTANT: this may not work for targets with physreg dependencies.
+//
+class ScheduleDAGLinearize : public ScheduleDAGSDNodes {
+public:
+ ScheduleDAGLinearize(MachineFunction &mf) : ScheduleDAGSDNodes(mf) {}
+
+ void Schedule();
+
+ MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos);
+
+private:
+ std::vector<SDNode*> Sequence;
+ DenseMap<SDNode*, SDNode*> GluedMap; // Cache glue to its user
+
+ void ScheduleNode(SDNode *N);
+};
+} // end anonymous namespace
+
+void ScheduleDAGLinearize::ScheduleNode(SDNode *N) {
+ if (N->getNodeId() != 0)
+ llvm_unreachable(0);
+
+ if (!N->isMachineOpcode() &&
+ (N->getOpcode() == ISD::EntryToken || isPassiveNode(N)))
+ // These nodes do not need to be translated into MIs.
+ return;
+
+ DEBUG(dbgs() << "\n*** Scheduling: ");
+ DEBUG(N->dump(DAG));
+ Sequence.push_back(N);
+
+ unsigned NumOps = N->getNumOperands();
+ if (unsigned NumLeft = NumOps) {
+ SDNode *GluedOpN = 0;
+ do {
+ const SDValue &Op = N->getOperand(NumLeft-1);
+ SDNode *OpN = Op.getNode();
+
+ if (NumLeft == NumOps && Op.getValueType() == MVT::Glue) {
+ // Schedule glue operand right above N.
+ GluedOpN = OpN;
+ assert(OpN->getNodeId() != 0 && "Glue operand not ready?");
+ OpN->setNodeId(0);
+ ScheduleNode(OpN);
+ continue;
+ }
+
+ if (OpN == GluedOpN)
+ // Glue operand is already scheduled.
+ continue;
+
+ DenseMap<SDNode*, SDNode*>::iterator DI = GluedMap.find(OpN);
+ if (DI != GluedMap.end() && DI->second != N)
+ // Users of glues are counted against the glued users.
+ OpN = DI->second;
+
+ unsigned Degree = OpN->getNodeId();
+ assert(Degree > 0 && "Predecessor over-released!");
+ OpN->setNodeId(--Degree);
+ if (Degree == 0)
+ ScheduleNode(OpN);
+ } while (--NumLeft);
+ }
+}
+
+/// findGluedUser - Find the representative use of a glue value by walking
+/// the use chain.
+static SDNode *findGluedUser(SDNode *N) {
+ while (SDNode *Glued = N->getGluedUser())
+ N = Glued;
+ return N;
+}
+
+void ScheduleDAGLinearize::Schedule() {
+ DEBUG(dbgs() << "********** DAG Linearization **********\n");
+
+ SmallVector<SDNode*, 8> Glues;
+ unsigned DAGSize = 0;
+ for (SelectionDAG::allnodes_iterator I = DAG->allnodes_begin(),
+ E = DAG->allnodes_end(); I != E; ++I) {
+ SDNode *N = I;
+
+ // Use node id to record degree.
+ unsigned Degree = N->use_size();
+ N->setNodeId(Degree);
+ unsigned NumVals = N->getNumValues();
+ if (NumVals && N->getValueType(NumVals-1) == MVT::Glue &&
+ N->hasAnyUseOfValue(NumVals-1)) {
+ SDNode *User = findGluedUser(N);
+ if (User) {
+ Glues.push_back(N);
+ GluedMap.insert(std::make_pair(N, User));
+ }
+ }
+
+ if (N->isMachineOpcode() ||
+ (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N)))
+ ++DAGSize;
+ }
+
+ for (unsigned i = 0, e = Glues.size(); i != e; ++i) {
+ SDNode *Glue = Glues[i];
+ SDNode *GUser = GluedMap[Glue];
+ unsigned Degree = Glue->getNodeId();
+ unsigned UDegree = GUser->getNodeId();
+
+ // Glue user must be scheduled together with the glue operand. So other
+ // users of the glue operand must be treated as its users.
+ SDNode *ImmGUser = Glue->getGluedUser();
+ for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end();
+ ui != ue; ++ui)
+ if (*ui == ImmGUser)
+ --Degree;
+ GUser->setNodeId(UDegree + Degree);
+ Glue->setNodeId(1);
+ }
+
+ Sequence.reserve(DAGSize);
+ ScheduleNode(DAG->getRoot().getNode());
+}
+
+MachineBasicBlock*
+ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+ InstrEmitter Emitter(BB, InsertPos);
+ DenseMap<SDValue, unsigned> VRBaseMap;
+
+ DEBUG({
+ dbgs() << "\n*** Final schedule ***\n";
+ });
+
+ // FIXME: Handle dbg_values.
+ unsigned NumNodes = Sequence.size();
+ for (unsigned i = 0; i != NumNodes; ++i) {
+ SDNode *N = Sequence[NumNodes-i-1];
+ DEBUG(N->dump(DAG));
+ Emitter.EmitNode(N, false, false, VRBaseMap);
+ }
+
+ DEBUG(dbgs() << '\n');
+
+ InsertPos = Emitter.getInsertPos();
+ return Emitter.getBlock();
+}
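
A self-contained sketch (plain C++, toy three-node chain) of the linearization strategy implemented above: each node's id holds its remaining use count, scheduling starts at the root and releases an operand once its last user has been scheduled, and the recorded sequence is emitted in reverse so that definitions precede uses.

#include <cstdio>
#include <vector>

struct Node {
  const char *name;
  std::vector<Node *> operands;
  unsigned degree;                      // users not yet scheduled
};

static std::vector<Node *> sequence;

static void scheduleNode(Node *n) {
  sequence.push_back(n);
  for (Node *op : n->operands)
    if (--op->degree == 0)              // last user scheduled: release operand
      scheduleNode(op);
}

int main() {
  Node a = {"load", {}, 1}, b = {"add", {}, 1}, c = {"store", {}, 0};
  b.operands.push_back(&a);
  c.operands.push_back(&b);
  scheduleNode(&c);                     // start at the root
  for (auto it = sequence.rbegin(); it != sequence.rend(); ++it)
    std::printf("%s\n", (*it)->name);   // load, add, store
  return 0;
}
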
+
//===----------------------------------------------------------------------===//
// Public Constructor Functions
//===----------------------------------------------------------------------===//
@@ -637,3 +792,8 @@ llvm::ScheduleDAGSDNodes *
llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
return new ScheduleDAGFast(*IS->MF);
}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createDAGLinearizer(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGLinearize(*IS->MF);
+}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index bf0a43785b70..c55456902c87 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
@@ -656,6 +656,8 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) {
break;
case ISD::MERGE_VALUES:
case ISD::TokenFactor:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
case ISD::CopyToReg:
case ISD::CopyFromReg:
case ISD::EH_LABEL:
@@ -1056,7 +1058,9 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
// Add a data dependency to reflect that NewSU reads the value defined
// by LoadSU.
- AddPred(NewSU, SDep(LoadSU, SDep::Data, LoadSU->Latency));
+ SDep D(LoadSU, SDep::Data, 0);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
if (isNewLoad)
AvailableQueue->addNode(LoadSU);
@@ -1138,17 +1142,18 @@ void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
// Avoid scheduling the def-side copy before other successors. Otherwise
// we could introduce another physreg interference on the copy and
// continue inserting copies indefinitely.
- SDep D(CopyFromSU, SDep::Order, /*Latency=*/0,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true);
- AddPred(SuccSU, D);
+ AddPred(SuccSU, SDep(CopyFromSU, SDep::Artificial));
}
}
for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
RemovePred(DelDeps[i].first, DelDeps[i].second);
- AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
- AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
AvailableQueue->updateNode(SU);
AvailableQueue->addNode(CopyFromSU);
@@ -1357,9 +1362,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
if (!BtSU->isPending)
AvailableQueue->remove(BtSU);
}
- AddPred(TrySU, SDep(BtSU, SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true));
+ AddPred(TrySU, SDep(BtSU, SDep::Artificial));
// If one or more successors have been unscheduled, then the current
// node is no longer available. Schedule a successor that's now
@@ -1411,20 +1414,14 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
<< " to SU #" << Copies.front()->NodeNum << "\n");
- AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
NewDef = Copies.back();
}
DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
<< " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
- AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
TrySU->isAvailable = false;
CurSU = NewDef;
}
@@ -1756,6 +1753,7 @@ public:
return V;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(ScheduleDAG *DAG) const {
// Emulate pop() without clobbering NodeQueueIds.
std::vector<SUnit*> DumpQueue = Queue;
@@ -1766,6 +1764,7 @@ public:
SU->dump(DAG);
}
}
+#endif
};
typedef RegReductionPriorityQueue<bu_ls_rr_sort>
@@ -1893,6 +1892,7 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
//===----------------------------------------------------------------------===//
void RegReductionPQBase::dumpRegPressure() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
E = TRI->regclass_end(); I != E; ++I) {
const TargetRegisterClass *RC = *I;
@@ -1902,6 +1902,7 @@ void RegReductionPQBase::dumpRegPressure() const {
DEBUG(dbgs() << RC->getName() << ": " << RP << " / " << RegLimit[Id]
<< '\n');
}
+#endif
}
bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
@@ -2930,10 +2931,7 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
!scheduleDAG->IsReachable(SuccSU, SU)) {
DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
<< SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
- scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Order, /*Latency=*/0,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Artificial));
}
}
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 748668cdf674..a197fcbfa593 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -485,14 +485,15 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
if(isChain && OpN->getOpcode() == ISD::TokenFactor)
OpLatency = 0;
- const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
- OpLatency, PhysReg);
+ SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
+ : SDep(OpSU, SDep::Data, PhysReg);
+ Dep.setLatency(OpLatency);
if (!isChain && !UnitLatencies) {
- computeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
- ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
+ computeOperandLatency(OpN, N, i, Dep);
+ ST.adjustSchedDependency(OpSU, SU, Dep);
}
- if (!SU->addPred(dep) && !dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
+ if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
// Multiple register uses are combined in the same SUnit. For example,
// we could have a set of glued nodes with all their defs consumed by
// another set of glued nodes. Register pressure tracking sees this as
@@ -643,6 +644,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
}
void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (!SU->getNode()) {
dbgs() << "PHYS REG COPY\n";
return;
@@ -659,8 +661,10 @@ void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
dbgs() << "\n";
GluedNodes.pop_back();
}
+#endif
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScheduleDAGSDNodes::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
if (SUnit *SU = Sequence[i])
@@ -669,6 +673,7 @@ void ScheduleDAGSDNodes::dumpSchedule() const {
dbgs() << "**** NOOP ****\n";
}
}
+#endif
#ifndef NDEBUG
/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
@@ -827,8 +832,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
}
SmallVector<SDNode *, 4> GluedNodes;
- for (SDNode *N = SU->getNode()->getGluedNode(); N;
- N = N->getGluedNode())
+ for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
GluedNodes.push_back(N);
while (!GluedNodes.empty()) {
SDNode *N = GluedNodes.back();
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index 84e41fc4a1ba..907356fd212c 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -114,7 +114,8 @@ namespace llvm {
/// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock
/// according to the order specified in Sequence.
///
- MachineBasicBlock *EmitSchedule(MachineBasicBlock::iterator &InsertPos);
+ virtual MachineBasicBlock*
+ EmitSchedule(MachineBasicBlock::iterator &InsertPos);
virtual void dumpNode(const SUnit *SU) const;
@@ -158,6 +159,12 @@ namespace llvm {
void InitNodeNumDefs();
};
+ protected:
+ /// ForceUnitLatencies - Return true if all scheduling edges should be given
+ /// a latency value of one. The default is to return false; schedulers may
+ /// override this as needed.
+ virtual bool forceUnitLatencies() const { return false; }
+
private:
/// ClusterNeighboringLoads - Cluster loads from "near" addresses into
/// combined SUnits.
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index c8512914c1e2..30f03ac737b9 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/SchedulerRegistry.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index f4fe8927f696..f000ce38d367 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -29,7 +29,7 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetSelectionDAGInfo.h"
#include "llvm/Target/TargetOptions.h"
@@ -91,11 +91,6 @@ bool ConstantFPSDNode::isValueValidForType(EVT VT,
const APFloat& Val) {
assert(VT.isFloatingPoint() && "Can only convert between FP types");
- // PPC long double cannot be converted to any other type.
- if (VT == MVT::ppcf128 ||
- &Val.getSemantics() == &APFloat::PPCDoubleDouble)
- return false;
-
// convert modifies in place, so make a copy.
APFloat Val2 = APFloat(Val);
bool losesInfo;
@@ -136,13 +131,11 @@ bool ISD::isBuildVectorAllOnes(const SDNode *N) {
// constants are.
SDValue NotZero = N->getOperand(i);
unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
- if (isa<ConstantSDNode>(NotZero)) {
- if (cast<ConstantSDNode>(NotZero)->getAPIntValue().countTrailingOnes() <
- EltSize)
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
+ if (CN->getAPIntValue().countTrailingOnes() < EltSize)
return false;
- } else if (isa<ConstantFPSDNode>(NotZero)) {
- if (cast<ConstantFPSDNode>(NotZero)->getValueAPF()
- .bitcastToAPInt().countTrailingOnes() < EltSize)
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
+ if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
return false;
} else
return false;
@@ -179,11 +172,11 @@ bool ISD::isBuildVectorAllZeros(const SDNode *N) {
// Do not accept build_vectors that aren't all constants or which have non-0
// elements.
SDValue Zero = N->getOperand(i);
- if (isa<ConstantSDNode>(Zero)) {
- if (!cast<ConstantSDNode>(Zero)->isNullValue())
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Zero)) {
+ if (!CN->isNullValue())
return false;
- } else if (isa<ConstantFPSDNode>(Zero)) {
- if (!cast<ConstantFPSDNode>(Zero)->getValueAPF().isPosZero())
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Zero)) {
+ if (!CFPN->getValueAPF().isPosZero())
return false;
} else
return false;
@@ -494,8 +487,10 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
}
case ISD::TargetBlockAddress:
case ISD::BlockAddress: {
- ID.AddPointer(cast<BlockAddressSDNode>(N)->getBlockAddress());
- ID.AddInteger(cast<BlockAddressSDNode>(N)->getTargetFlags());
+ const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
+ ID.AddPointer(BA->getBlockAddress());
+ ID.AddInteger(BA->getOffset());
+ ID.AddInteger(BA->getTargetFlags());
break;
}
} // end switch (N->getOpcode())
@@ -883,7 +878,7 @@ unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
PointerType::get(Type::getInt8Ty(*getContext()), 0) :
VT.getTypeForEVT(*getContext());
- return TLI.getTargetData()->getABITypeAlignment(Ty);
+ return TLI.getDataLayout()->getABITypeAlignment(Ty);
}
// EntryNode could meaningfully have debug info if we can find it...
@@ -1097,10 +1092,9 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL,
"Cannot set target flags on target-independent globals");
// Truncate (with sign-extension) the offset value to the pointer size.
- EVT PTy = TLI.getPointerTy();
- unsigned BitWidth = PTy.getSizeInBits();
+ unsigned BitWidth = TLI.getPointerTy().getSizeInBits();
if (BitWidth < 64)
- Offset = (Offset << (64 - BitWidth) >> (64 - BitWidth));
+ Offset = SignExtend64(Offset, BitWidth);
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar) {
@@ -1174,7 +1168,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
- Alignment = TLI.getTargetData()->getPrefTypeAlignment(C->getType());
+ Alignment = TLI.getDataLayout()->getPrefTypeAlignment(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
@@ -1201,7 +1195,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
- Alignment = TLI.getTargetData()->getPrefTypeAlignment(C->getType());
+ Alignment = TLI.getDataLayout()->getPrefTypeAlignment(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
@@ -1471,6 +1465,7 @@ SDValue SelectionDAG::getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label) {
SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+ int64_t Offset,
bool isTarget,
unsigned char TargetFlags) {
unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
@@ -1478,12 +1473,14 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
ID.AddPointer(BA);
+ ID.AddInteger(Offset);
ID.AddInteger(TargetFlags);
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, TargetFlags);
+ SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset,
+ TargetFlags);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -1542,7 +1539,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
unsigned ByteSize = VT.getStoreSize();
Type *Ty = VT.getTypeForEVT(*getContext());
unsigned StackAlign =
- std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty), minAlign);
+ std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty), minAlign);
int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
return getFrameIndex(FrameIdx, TLI.getPointerTy());
@@ -1555,7 +1552,7 @@ SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
VT2.getStoreSizeInBits())/8;
Type *Ty1 = VT1.getTypeForEVT(*getContext());
Type *Ty2 = VT2.getTypeForEVT(*getContext());
- const TargetData *TD = TLI.getTargetData();
+ const DataLayout *TD = TLI.getDataLayout();
unsigned Align = std::max(TD->getPrefTypeAlignment(Ty1),
TD->getPrefTypeAlignment(Ty2));
@@ -1610,10 +1607,6 @@ SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1,
}
if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2.getNode())) {
- // No compile time operations on this type yet.
- if (N1C->getValueType(0) == MVT::ppcf128)
- return SDValue();
-
APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
switch (Cond) {
default: break;
@@ -2445,8 +2438,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), VT);
case ISD::UINT_TO_FP:
case ISD::SINT_TO_FP: {
- // No compile time operations on ppcf128.
- if (VT == MVT::ppcf128) break;
APFloat apf(APInt::getNullValue(VT.getSizeInBits()));
(void)apf.convertFromAPInt(Val,
Opcode==ISD::SINT_TO_FP,
@@ -2455,9 +2446,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
}
case ISD::BITCAST:
if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
- return getConstantFP(Val.bitsToFloat(), VT);
+ return getConstantFP(APFloat(Val), VT);
else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
- return getConstantFP(Val.bitsToDouble(), VT);
+ return getConstantFP(APFloat(Val), VT);
break;
case ISD::BSWAP:
return getConstant(Val.byteSwap(), VT);
@@ -2475,61 +2466,59 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
// Constant fold unary operations with a floating point constant operand.
if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand.getNode())) {
APFloat V = C->getValueAPF(); // make copy
- if (VT != MVT::ppcf128 && Operand.getValueType() != MVT::ppcf128) {
- switch (Opcode) {
- case ISD::FNEG:
- V.changeSign();
+ switch (Opcode) {
+ case ISD::FNEG:
+ V.changeSign();
+ return getConstantFP(V, VT);
+ case ISD::FABS:
+ V.clearSign();
+ return getConstantFP(V, VT);
+ case ISD::FCEIL: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, VT);
- case ISD::FABS:
- V.clearSign();
+ break;
+ }
+ case ISD::FTRUNC: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, VT);
- case ISD::FCEIL: {
- APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
- if (fs == APFloat::opOK || fs == APFloat::opInexact)
- return getConstantFP(V, VT);
- break;
- }
- case ISD::FTRUNC: {
- APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
- if (fs == APFloat::opOK || fs == APFloat::opInexact)
- return getConstantFP(V, VT);
- break;
- }
- case ISD::FFLOOR: {
- APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
- if (fs == APFloat::opOK || fs == APFloat::opInexact)
- return getConstantFP(V, VT);
- break;
- }
- case ISD::FP_EXTEND: {
- bool ignored;
- // This can return overflow, underflow, or inexact; we don't care.
- // FIXME need to be more flexible about rounding mode.
- (void)V.convert(*EVTToAPFloatSemantics(VT),
- APFloat::rmNearestTiesToEven, &ignored);
+ break;
+ }
+ case ISD::FFLOOR: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
return getConstantFP(V, VT);
- }
- case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: {
- integerPart x[2];
- bool ignored;
- assert(integerPartWidth >= 64);
- // FIXME need to be more flexible about rounding mode.
- APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
- Opcode==ISD::FP_TO_SINT,
- APFloat::rmTowardZero, &ignored);
- if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
- break;
- APInt api(VT.getSizeInBits(), x);
- return getConstant(api, VT);
- }
- case ISD::BITCAST:
- if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
- return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
- else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
- return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
+ break;
+ }
+ case ISD::FP_EXTEND: {
+ bool ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(*EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &ignored);
+ return getConstantFP(V, VT);
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ integerPart x[2];
+ bool ignored;
+ assert(integerPartWidth >= 64);
+ // FIXME need to be more flexible about rounding mode.
+ APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
+ Opcode==ISD::FP_TO_SINT,
+ APFloat::rmTowardZero, &ignored);
+ if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
break;
- }
+ APInt api(VT.getSizeInBits(), x);
+ return getConstant(api, VT);
+ }
+ case ISD::BITCAST:
+ if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+ return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
+ else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+ return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
+ break;
}
}
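
A quick numeric illustration (standalone) of the rounding-based folds handled above: FCEIL, FTRUNC and FFLOOR each round the constant to an integral value in their respective directions.

#include <cmath>
#include <cstdio>

int main() {
  double v = -2.7;
  std::printf("ceil=%g trunc=%g floor=%g\n",
              std::ceil(v), std::trunc(v), std::floor(v)); // -2 -2 -3
  return 0;
}
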
@@ -2817,6 +2806,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N2))
if (CFP->getValueAPF().isZero())
return N1;
+ } else if (Opcode == ISD::FMUL) {
+ ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1);
+ SDValue V = N2;
+
+ // If the first operand isn't the constant, try the second
+ if (!CFP) {
+ CFP = dyn_cast<ConstantFPSDNode>(N2);
+ V = N1;
+ }
+
+ if (CFP) {
+ // 0*x --> 0
+ if (CFP->isZero())
+ return SDValue(CFP,0);
+ // 1*x --> x
+ if (CFP->isExactlyValue(1.0))
+ return V;
+ }
}
}
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
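
A small check (standalone) of the two constant folds added above: x * 1.0 is always exactly x, while the 0.0 * x fold does not inspect x, so it implicitly assumes an ordinary finite operand and ignores the sign of zero.

#include <cstdio>

int main() {
  double x = 3.5;
  std::printf("%g %g\n", x * 1.0, x * 0.0); // 3.5 0
  return 0;
}
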
@@ -2935,17 +2942,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
// expanding large vector constants.
if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Elt = N1.getOperand(N2C->getZExtValue());
- EVT VEltTy = N1.getValueType().getVectorElementType();
- if (Elt.getValueType() != VEltTy) {
+
+ if (VT != Elt.getValueType())
// If the vector element type is not legal, the BUILD_VECTOR operands
- // are promoted and implicitly truncated. Make that explicit here.
- Elt = getNode(ISD::TRUNCATE, DL, VEltTy, Elt);
- }
- if (VT != VEltTy) {
- // If the vector element type is not legal, the EXTRACT_VECTOR_ELT
- // result is implicitly extended.
- Elt = getNode(ISD::ANY_EXTEND, DL, VT, Elt);
- }
+ // are promoted and implicitly truncated, and the result implicitly
+ // extended. Make that explicit here.
+ Elt = getAnyExtOrTrunc(Elt, DL, VT);
+
return Elt;
}
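
The separate truncate and any-extend branches above collapse into one call because the helper picks the conversion from the relative bit widths. A rough sketch of that selection on plain widths (assumed behaviour of a getAnyExtOrTrunc-style helper, not the SelectionDAG API itself):

    #include <cstdio>

    enum Action { Keep, Truncate, AnyExtend };

    // Sketch: choose the conversion the way an any-extend-or-truncate helper would.
    static Action anyExtOrTrunc(unsigned FromBits, unsigned ToBits) {
      if (FromBits == ToBits) return Keep;
      return FromBits > ToBits ? Truncate : AnyExtend;
    }

    int main() {
      // e.g. a BUILD_VECTOR element promoted to i32, extracted as i16 / as i64.
      std::printf("%d %d\n", anyExtOrTrunc(32, 16), anyExtOrTrunc(32, 64)); // 1 2
    }
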
@@ -3036,7 +3039,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT,
// Canonicalize constant to RHS if commutative
std::swap(N1CFP, N2CFP);
std::swap(N1, N2);
- } else if (N2CFP && VT != MVT::ppcf128) {
+ } else if (N2CFP) {
APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
APFloat::opStatus s;
switch (Opcode) {
@@ -3435,7 +3438,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
DAG.getMachineFunction());
if (VT == MVT::Other) {
- if (DstAlign >= TLI.getTargetData()->getPointerPrefAlignment() ||
+ if (DstAlign >= TLI.getDataLayout()->getPointerPrefAlignment() ||
TLI.allowsUnalignedMemoryAccesses(VT)) {
VT = TLI.getPointerTy();
} else {
@@ -3503,7 +3506,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- bool OptSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+ bool OptSize =
+ MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize);
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
@@ -3523,7 +3528,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+ unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
@@ -3596,7 +3601,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- bool OptSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+ bool OptSize = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize);
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
@@ -3612,7 +3618,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl,
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+ unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
@@ -3674,7 +3680,8 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
bool DstAlignCanChange = false;
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
- bool OptSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+ bool OptSize = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize);
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
@@ -3687,7 +3694,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl,
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned) TLI.getTargetData()->getABITypeAlignment(Ty);
+ unsigned NewAlign = (unsigned) TLI.getDataLayout()->getABITypeAlignment(Ty);
if (NewAlign > Align) {
// Give the stack frame object a larger alignment if needed.
if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
@@ -3781,7 +3788,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst,
// Emit a library call.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- Entry.Ty = TLI.getTargetData()->getIntPtrType(*getContext());
+ Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
@@ -3836,7 +3843,7 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst,
// Emit a library call.
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
- Entry.Ty = TLI.getTargetData()->getIntPtrType(*getContext());
+ Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext());
Entry.Node = Dst; Args.push_back(Entry);
Entry.Node = Src; Args.push_back(Entry);
Entry.Node = Size; Args.push_back(Entry);
@@ -3885,7 +3892,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst,
return Result;
// Emit a library call.
- Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*getContext());
+ Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst; Entry.Ty = IntPtrTy;
@@ -3923,17 +3930,21 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
SDValue Swp, MachinePointerInfo PtrInfo,
unsigned Alignment,
AtomicOrdering Ordering,
- SynchronizationScope SynchScope) {
+ SynchronizationScope SynchScope) {
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- unsigned Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ // All atomics are load and store, except for ATOMIC_LOAD and ATOMIC_STORE.
// For now, atomics are considered to be volatile always.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- Flags |= MachineMemOperand::MOVolatile;
+ unsigned Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
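
This getAtomic overload, and the two further down, now derive the memory-operand flags the same way: always volatile for the time being, MOLoad unless the node is an atomic store, and MOStore unless it is an atomic load. A standalone sketch of that flag derivation (the flag values here are illustrative, not the real MachineMemOperand constants):

    #include <cstdio>

    enum Opcode { AtomicLoad, AtomicStore, AtomicRMW };
    enum : unsigned { MOLoad = 1, MOStore = 2, MOVolatile = 4 };

    static unsigned atomicMemFlags(Opcode Opc) {
      unsigned Flags = MOVolatile;               // atomics treated as volatile for now
      if (Opc != AtomicStore) Flags |= MOLoad;   // everything but a store loads
      if (Opc != AtomicLoad)  Flags |= MOStore;  // everything but a load stores
      return Flags;
    }

    int main() {
      std::printf("rmw=%u load=%u store=%u\n",
                  atomicMemFlags(AtomicRMW),     // 7: load|store|volatile
                  atomicMemFlags(AtomicLoad),    // 5: load|volatile
                  atomicMemFlags(AtomicStore));  // 6: store|volatile
    }
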
@@ -3983,17 +3994,17 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- // A monotonic store does not load; a release store "loads" in the sense
- // that other stores cannot be sunk past it.
+ // An atomic store does not load. An atomic load does not store.
// (An atomicrmw obviously both loads and stores.)
- unsigned Flags = MachineMemOperand::MOStore;
- if (Opcode != ISD::ATOMIC_STORE || Ordering > Monotonic)
- Flags |= MachineMemOperand::MOLoad;
-
- // For now, atomics are considered to be volatile always.
+ // For now, atomics are considered to be volatile always, and they are
+ // chained as such.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- Flags |= MachineMemOperand::MOVolatile;
+ unsigned Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
@@ -4056,16 +4067,17 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT,
Alignment = getEVTAlignment(MemVT);
MachineFunction &MF = getMachineFunction();
- // A monotonic load does not store; an acquire load "stores" in the sense
- // that other loads cannot be hoisted past it.
- unsigned Flags = MachineMemOperand::MOLoad;
- if (Ordering > Monotonic)
- Flags |= MachineMemOperand::MOStore;
-
- // For now, atomics are considered to be volatile always.
+ // An atomic store does not load. An atomic load does not store.
+ // (An atomicrmw obviously both loads and stores.)
+ // For now, atomics are considered to be volatile always, and they are
+ // chained as such.
// FIXME: Volatile isn't really correct; we should keep track of atomic
// orderings in the memoperand.
- Flags |= MachineMemOperand::MOVolatile;
+ unsigned Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
@@ -4157,6 +4169,8 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList,
assert((Opcode == ISD::INTRINSIC_VOID ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::PREFETCH ||
+ Opcode == ISD::LIFETIME_START ||
+ Opcode == ISD::LIFETIME_END ||
(Opcode <= INT_MAX &&
(int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
"Opcode is not a memory-accessing opcode!");
@@ -4226,7 +4240,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
bool isVolatile, bool isNonTemporal, bool isInvariant,
unsigned Alignment, const MDNode *TBAAInfo,
const MDNode *Ranges) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(VT);
@@ -4284,7 +4298,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
AddNodeIDNode(ID, ISD::LOAD, VTs, Ops, 3);
ID.AddInteger(MemVT.getRawBits());
ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(),
- MMO->isNonTemporal(),
+ MMO->isNonTemporal(),
MMO->isInvariant()));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = 0;
@@ -4303,7 +4317,7 @@ SDValue SelectionDAG::getLoad(EVT VT, DebugLoc dl,
SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo,
bool isVolatile, bool isNonTemporal,
- bool isInvariant, unsigned Alignment,
+ bool isInvariant, unsigned Alignment,
const MDNode *TBAAInfo,
const MDNode *Ranges) {
SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -4332,7 +4346,7 @@ SelectionDAG::getIndexedLoad(SDValue OrigLoad, DebugLoc dl, SDValue Base,
"Load is already a indexed load!");
return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
LD->getChain(), Base, Offset, LD->getPointerInfo(),
- LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
+ LD->getMemoryVT(), LD->isVolatile(), LD->isNonTemporal(),
false, LD->getAlignment());
}
@@ -4340,7 +4354,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
bool isVolatile, bool isNonTemporal,
unsigned Alignment, const MDNode *TBAAInfo) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(Val.getValueType());
@@ -4365,7 +4379,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val,
SDValue Ptr, MachineMemOperand *MMO) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
EVT VT = Val.getValueType();
SDVTList VTs = getVTList(MVT::Other);
@@ -4394,7 +4408,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
EVT SVT,bool isVolatile, bool isNonTemporal,
unsigned Alignment,
const MDNode *TBAAInfo) {
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (Alignment == 0) // Ensure that codegen never sees alignment 0
Alignment = getEVTAlignment(SVT);
@@ -4421,7 +4435,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val,
MachineMemOperand *MMO) {
EVT VT = Val.getValueType();
- assert(Chain.getValueType() == MVT::Other &&
+ assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
if (VT == SVT)
return getStore(Chain, dl, Val, Ptr, MMO);
@@ -6074,7 +6088,7 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
unsigned PtrWidth = TLI.getPointerTy().getSizeInBits();
APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
llvm::ComputeMaskedBits(const_cast<GlobalValue*>(GV), KnownZero, KnownOne,
- TLI.getTargetData());
+ TLI.getDataLayout());
unsigned AlignBits = KnownZero.countTrailingOnes();
unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
if (Align)
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index ba5bd79722ce..3fbf7c2fe66b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Constants.h"
#include "llvm/CallingConv.h"
#include "llvm/DebugInfo.h"
@@ -43,7 +44,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -88,7 +89,7 @@ static const unsigned MaxParallelChains = 64;
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
const SDValue *Parts, unsigned NumParts,
- EVT PartVT, EVT ValueVT);
+ EVT PartVT, EVT ValueVT, const Value *V);
/// getCopyFromParts - Create a value that contains the specified legal parts
/// combined into the value they represent. If the parts combine to a type
@@ -98,9 +99,11 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
const SDValue *Parts,
unsigned NumParts, EVT PartVT, EVT ValueVT,
+ const Value *V,
ISD::NodeType AssertOp = ISD::DELETED_NODE) {
if (ValueVT.isVector())
- return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT);
+ return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
+ PartVT, ValueVT, V);
assert(NumParts > 0 && "No parts to assemble!");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -124,9 +127,9 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
if (RoundParts > 2) {
Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
- PartVT, HalfVT);
+ PartVT, HalfVT, V);
Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
- RoundParts / 2, PartVT, HalfVT);
+ RoundParts / 2, PartVT, HalfVT, V);
} else {
Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
@@ -142,7 +145,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
unsigned OddParts = NumParts - RoundParts;
EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
Hi = getCopyFromParts(DAG, DL,
- Parts + RoundParts, OddParts, PartVT, OddVT);
+ Parts + RoundParts, OddParts, PartVT, OddVT, V);
// Combine the round and odd parts.
Lo = Val;
@@ -171,7 +174,7 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
!PartVT.isVector() && "Unexpected split");
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
- Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT);
+ Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V);
}
}
@@ -209,14 +212,14 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, DebugLoc DL,
llvm_unreachable("Unknown mismatch!");
}
-/// getCopyFromParts - Create a value that contains the specified legal parts
-/// combined into the value they represent. If the parts combine to a type
-/// larger then ValueVT then AssertOp can be used to specify whether the extra
-/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
-/// (ISD::AssertSext).
+/// getCopyFromPartsVector - Create a value that contains the specified legal
+/// parts combined into the value they represent. If the parts combine to a
+/// type larger than ValueVT then AssertOp can be used to specify whether the
+/// extra bits are known to be zero (ISD::AssertZext) or sign extended from
+/// ValueVT (ISD::AssertSext).
static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
const SDValue *Parts, unsigned NumParts,
- EVT PartVT, EVT ValueVT) {
+ EVT PartVT, EVT ValueVT, const Value *V) {
assert(ValueVT.isVector() && "Not a vector value");
assert(NumParts > 0 && "No parts to assemble!");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -242,7 +245,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
// as appropriate.
for (unsigned i = 0; i != NumParts; ++i)
Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
- PartVT, IntermediateVT);
+ PartVT, IntermediateVT, V);
} else if (NumParts > 0) {
// If the intermediate type was expanded, build the intermediate
// operands from the parts.
@@ -251,7 +254,7 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
unsigned Factor = NumParts / NumIntermediates;
for (unsigned i = 0; i != NumIntermediates; ++i)
Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
- PartVT, IntermediateVT);
+ PartVT, IntermediateVT, V);
}
// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
@@ -299,8 +302,19 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
// Handle cases such as i8 -> <1 x i1>
- assert(ValueVT.getVectorNumElements() == 1 &&
- "Only trivial scalar-to-vector conversions should get here!");
+ if (ValueVT.getVectorNumElements() != 1) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Twine ErrMsg("non-trivial scalar-to-vector conversion");
+ if (const Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (isa<InlineAsm>(CI->getCalledValue()))
+ ErrMsg = ErrMsg + ", possible invalid constraint for vector type";
+ Ctx.emitError(I, ErrMsg);
+ } else {
+ Ctx.emitError(ErrMsg);
+ }
+ report_fatal_error("Cannot handle scalar-to-vector conversion!");
+ }
if (ValueVT.getVectorNumElements() == 1 &&
ValueVT.getVectorElementType() != PartVT) {
@@ -312,25 +326,22 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, DebugLoc DL,
return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
}
-
-
-
static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc dl,
SDValue Val, SDValue *Parts, unsigned NumParts,
- EVT PartVT);
+ EVT PartVT, const Value *V);
/// getCopyToParts - Create a series of nodes that contain the specified value
/// split into legal parts. If the parts contain more bits than Val, then, for
/// integers, ExtendKind can be used to specify how to generate the extra bits.
static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
SDValue Val, SDValue *Parts, unsigned NumParts,
- EVT PartVT,
+ EVT PartVT, const Value *V,
ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
EVT ValueVT = Val.getValueType();
// Handle the vector case separately.
if (ValueVT.isVector())
- return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT);
+ return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned PartBits = PartVT.getSizeInBits();
@@ -382,7 +393,19 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
"Failed to tile the value with PartVT!");
if (NumParts == 1) {
- assert(PartVT == ValueVT && "Type conversion failed!");
+ if (PartVT != ValueVT) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Twine ErrMsg("scalar-to-vector conversion failed");
+ if (const Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (isa<InlineAsm>(CI->getCalledValue()))
+ ErrMsg = ErrMsg + ", possible invalid constraint for vector type";
+ Ctx.emitError(I, ErrMsg);
+ } else {
+ Ctx.emitError(ErrMsg);
+ }
+ }
+
Parts[0] = Val;
return;
}
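
Both new diagnostics share one shape: start from a base message, append a hint when the offending value is a call to inline asm, and report against the instruction when one is available. A rough standalone sketch of that pattern (the LLVMContext::emitError plumbing is assumed and simplified to stderr here):

    #include <cstdio>
    #include <string>

    // Illustrative stand-in for an Instruction that may be a call to inline asm.
    struct Inst { const char *Name; bool IsInlineAsmCall; };

    static void emitConversionError(const Inst *I, std::string Msg) {
      if (I) {
        if (I->IsInlineAsmCall)
          Msg += ", possible invalid constraint for vector type";
        std::fprintf(stderr, "error in '%s': %s\n", I->Name, Msg.c_str());
      } else {
        std::fprintf(stderr, "error: %s\n", Msg.c_str());
      }
    }

    int main() {
      Inst AsmCall = { "call-asm", true };
      emitConversionError(&AsmCall, "scalar-to-vector conversion failed");
      emitConversionError(0, "scalar-to-vector conversion failed");
    }
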
@@ -397,7 +420,7 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
unsigned OddParts = NumParts - RoundParts;
SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
DAG.getIntPtrConstant(RoundBits));
- getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT);
+ getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V);
if (TLI.isBigEndian())
// The odd parts were reversed by getCopyToParts - unreverse them.
@@ -443,7 +466,7 @@ static void getCopyToParts(SelectionDAG &DAG, DebugLoc DL,
/// value split into legal parts.
static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
SDValue Val, SDValue *Parts, unsigned NumParts,
- EVT PartVT) {
+ EVT PartVT, const Value *V) {
EVT ValueVT = Val.getValueType();
assert(ValueVT.isVector() && "Not a vector");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -529,7 +552,7 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
// If the register was not expanded, promote or copy the value,
// as appropriate.
for (unsigned i = 0; i != NumParts; ++i)
- getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT);
+ getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V);
} else if (NumParts > 0) {
// If the intermediate type was expanded, split each the value into
// legal parts.
@@ -537,13 +560,10 @@ static void getCopyToPartsVector(SelectionDAG &DAG, DebugLoc DL,
"Must expand into a divisible number of parts!");
unsigned Factor = NumParts / NumIntermediates;
for (unsigned i = 0; i != NumIntermediates; ++i)
- getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT);
+ getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT, V);
}
}
-
-
-
namespace {
/// RegsForValue - This struct represents the registers (physical or virtual)
/// that a particular set of values is assigned, and the type information
@@ -621,14 +641,15 @@ namespace {
/// If the Flag pointer is NULL, no flag is used.
SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
DebugLoc dl,
- SDValue &Chain, SDValue *Flag) const;
+ SDValue &Chain, SDValue *Flag,
+ const Value *V = 0) const;
/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
/// specified value into the registers specified by this object. This uses
/// Chain/Flag as the input and updates them for the output Chain/Flag.
/// If the Flag pointer is NULL, no flag is used.
void getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
- SDValue &Chain, SDValue *Flag) const;
+ SDValue &Chain, SDValue *Flag, const Value *V) const;
/// AddInlineAsmOperands - Add this value to the specified inlineasm node
/// operand list. This adds the code marker, matching input operand index
@@ -647,7 +668,8 @@ namespace {
SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
FunctionLoweringInfo &FuncInfo,
DebugLoc dl,
- SDValue &Chain, SDValue *Flag) const {
+ SDValue &Chain, SDValue *Flag,
+ const Value *V) const {
// A Value with type {} or [0 x %t] needs no registers.
if (ValueVTs.empty())
return SDValue();
@@ -721,7 +743,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
}
Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(),
- NumRegs, RegisterVT, ValueVT);
+ NumRegs, RegisterVT, ValueVT, V);
Part += NumRegs;
Parts.clear();
}
@@ -736,7 +758,8 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
/// Chain/Flag as the input and updates them for the output Chain/Flag.
/// If the Flag pointer is NULL, no flag is used.
void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
- SDValue &Chain, SDValue *Flag) const {
+ SDValue &Chain, SDValue *Flag,
+ const Value *V) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Get the list of the values's legal parts.
@@ -748,7 +771,7 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG, DebugLoc dl,
EVT RegisterVT = RegVTs[Value];
getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value),
- &Parts[Part], NumParts, RegisterVT);
+ &Parts[Part], NumParts, RegisterVT, V);
Part += NumParts;
}
@@ -824,7 +847,8 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
AA = &aa;
GFI = gfi;
LibInfo = li;
- TD = DAG.getTarget().getTargetData();
+ TD = DAG.getTarget().getDataLayout();
+ Context = DAG.getContext();
LPadToCallSiteMap.clear();
}
@@ -992,7 +1016,7 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
unsigned InReg = It->second;
RegsForValue RFV(*DAG.getContext(), TLI, InReg, V->getType());
SDValue Chain = DAG.getEntryNode();
- N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL);
+ N = RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL, V);
resolveDanglingDebugInfo(V, N);
return N;
}
@@ -1147,7 +1171,7 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
RegsForValue RFV(*DAG.getContext(), TLI, InReg, Inst->getType());
SDValue Chain = DAG.getEntryNode();
- return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL);
+ return RFV.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(), Chain, NULL, V);
}
llvm_unreachable("Can't get register for value!");
@@ -1203,9 +1227,9 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
const Function *F = I.getParent()->getParent();
- if (F->paramHasAttr(0, Attribute::SExt))
+ if (F->getRetAttributes().hasAttribute(Attributes::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (F->paramHasAttr(0, Attribute::ZExt))
+ else if (F->getRetAttributes().hasAttribute(Attributes::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
@@ -1216,11 +1240,11 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
SmallVector<SDValue, 4> Parts(NumParts);
getCopyToParts(DAG, getCurDebugLoc(),
SDValue(RetOp.getNode(), RetOp.getResNo() + j),
- &Parts[0], NumParts, PartVT, ExtendKind);
+ &Parts[0], NumParts, PartVT, &I, ExtendKind);
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
- if (F->paramHasAttr(0, Attribute::InReg))
+ if (F->getRetAttributes().hasAttribute(Attributes::InReg))
Flags.setInReg();
// Propagate extension type if any
@@ -1231,7 +1255,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
for (unsigned i = 0; i < NumParts; ++i) {
Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
- /*isfixed=*/true));
+ /*isfixed=*/true, 0, 0));
OutVals.push_back(Parts[i]);
}
}
@@ -1601,7 +1625,10 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
// Update successor info
addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight);
- addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight);
+ // TrueBB and FalseBB are always different unless the incoming IR is
+ // degenerate. This only happens when running llc on weird IR.
+ if (CB.TrueBB != CB.FalseBB)
+ addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight);
// Set NextBlock to be the MBB immediately after the current one, if any.
// This is used to avoid emitting unnecessary branches to the next block.
@@ -1762,6 +1789,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
/// visitBitTestCase - this function produces one "bit test"
void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
MachineBasicBlock* NextMBB,
+ uint32_t BranchWeightToNext,
unsigned Reg,
BitTestCase &B,
MachineBasicBlock *SwitchBB) {
@@ -1799,8 +1827,10 @@ void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
ISD::SETNE);
}
- addSuccessorWithWeight(SwitchBB, B.TargetBB);
- addSuccessorWithWeight(SwitchBB, NextMBB);
+ // The branch weight from SwitchBB to B.TargetBB is B.ExtraWeight.
+ addSuccessorWithWeight(SwitchBB, B.TargetBB, B.ExtraWeight);
+ // The branch weight from SwitchBB to NextMBB is BranchWeightToNext.
+ addSuccessorWithWeight(SwitchBB, NextMBB, BranchWeightToNext);
SDValue BrAnd = DAG.getNode(ISD::BRCOND, getCurDebugLoc(),
MVT::Other, getControlRoot(),
@@ -1923,6 +1953,7 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
if (++BBI != FuncInfo.MF->end())
NextBlock = BBI;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
// If any two of the cases has the same destination, and if one value
// is the same as the other, but has one bit unset that the other has set,
// use bit manipulation to do two compares at once. For example:
@@ -1956,8 +1987,12 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
ISD::SETEQ);
// Update successor info.
- addSuccessorWithWeight(SwitchBB, Small.BB);
- addSuccessorWithWeight(SwitchBB, Default);
+ // Both Small and Big will jump to Small.BB, so we sum up the weights.
+ addSuccessorWithWeight(SwitchBB, Small.BB,
+ Small.ExtraWeight + Big.ExtraWeight);
+ addSuccessorWithWeight(SwitchBB, Default,
+ // The default destination is the first successor in IR.
+ BPI ? BPI->getEdgeWeight(SwitchBB->getBasicBlock(), (unsigned)0) : 0);
// Insert the true branch.
SDValue BrCond = DAG.getNode(ISD::BRCOND, DL, MVT::Other,
@@ -1975,14 +2010,13 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
}
// Order cases by weight so the most likely case will be checked first.
- BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ uint32_t UnhandledWeights = 0;
if (BPI) {
for (CaseItr I = CR.Range.first, IE = CR.Range.second; I != IE; ++I) {
- uint32_t IWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(),
- I->BB->getBasicBlock());
+ uint32_t IWeight = I->ExtraWeight;
+ UnhandledWeights += IWeight;
for (CaseItr J = CR.Range.first; J < I; ++J) {
- uint32_t JWeight = BPI->getEdgeWeight(SwitchBB->getBasicBlock(),
- J->BB->getBasicBlock());
+ uint32_t JWeight = J->ExtraWeight;
if (IWeight > JWeight)
std::swap(*I, *J);
}
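
Case ordering now uses the cached ExtraWeight directly, and the loop below keeps a running UnhandledWeights total so each emitted compare can carry truebb = this case's weight and falsebb = everything not yet tested. A small standalone sketch of that bookkeeping with simplified types:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Case { const char *BB; uint32_t ExtraWeight; };

    int main() {
      std::vector<Case> Cases = { {"cold", 1}, {"hot", 80}, {"warm", 19} };

      uint32_t Unhandled = 0;
      for (const Case &C : Cases) Unhandled += C.ExtraWeight;          // 100 total

      // Most likely case first, mirroring the weight-based ordering above.
      std::sort(Cases.begin(), Cases.end(),
                [](const Case &A, const Case &B) { return A.ExtraWeight > B.ExtraWeight; });

      for (const Case &C : Cases) {
        Unhandled -= C.ExtraWeight;      // false edge = all cases still untested
        std::printf("%s: true=%u false=%u\n", C.BB, C.ExtraWeight, Unhandled);
      }
    }
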
@@ -2031,10 +2065,12 @@ bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
LHS = I->Low; MHS = SV; RHS = I->High;
}
- uint32_t ExtraWeight = I->ExtraWeight;
+ // The false weight is the sum of all unhandled cases.
+ UnhandledWeights -= I->ExtraWeight;
CaseBlock CB(CC, LHS, RHS, MHS, /* truebb */ I->BB, /* falsebb */ FallThrough,
/* me */ CurBlock,
- /* trueweight */ ExtraWeight / 2, /* falseweight */ ExtraWeight / 2);
+ /* trueweight */ I->ExtraWeight,
+ /* falseweight */ UnhandledWeights);
// If emitting the first comparison, just call visitSwitchCase to emit the
// code into the current block. Otherwise, push the CaseBlock onto the
@@ -2079,7 +2115,7 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I)
TSize += I->size();
- if (!areJTsAllowed(TLI) || TSize.ult(4))
+ if (!areJTsAllowed(TLI) || TSize.ult(TLI.getMinimumJumpTableEntries()))
return false;
APInt Range = ComputeRange(First, Last);
@@ -2134,13 +2170,28 @@ bool SelectionDAGBuilder::handleJTSwitchCase(CaseRec &CR,
}
}
+ // Calculate weight for each unique destination in CR.
+ DenseMap<MachineBasicBlock*, uint32_t> DestWeights;
+ if (FuncInfo.BPI)
+ for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
+ DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
+ DestWeights.find(I->BB);
+ if (Itr != DestWeights.end())
+ Itr->second += I->ExtraWeight;
+ else
+ DestWeights[I->BB] = I->ExtraWeight;
+ }
+
// Update successor info. Add one edge to each unique successor.
BitVector SuccsHandled(CR.CaseBB->getParent()->getNumBlockIDs());
for (std::vector<MachineBasicBlock*>::iterator I = DestBBs.begin(),
E = DestBBs.end(); I != E; ++I) {
if (!SuccsHandled[(*I)->getNumber()]) {
SuccsHandled[(*I)->getNumber()] = true;
- addSuccessorWithWeight(JumpTableBB, *I);
+ DenseMap<MachineBasicBlock*, uint32_t>::iterator Itr =
+ DestWeights.find(*I);
+ addSuccessorWithWeight(JumpTableBB, *I,
+ Itr != DestWeights.end() ? Itr->second : 0);
}
}
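
For the jump-table path, the weights of all cases that share a destination are summed before the single successor edge is added. The same accumulation in plain std::map form (a sketch; the real code keys a DenseMap by MachineBasicBlock*):

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    struct Case { std::string DestBB; uint32_t ExtraWeight; };

    int main() {
      std::vector<Case> Range = { {"bb1", 10}, {"bb2", 5}, {"bb1", 7} };

      std::map<std::string, uint32_t> DestWeights;
      for (const Case &C : Range)
        DestWeights[C.DestBB] += C.ExtraWeight;   // one edge per unique successor

      for (const auto &DW : DestWeights)
        std::printf("%s -> weight %u\n", DW.first.c_str(), DW.second);  // bb1:17 bb2:5
    }
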
@@ -2371,7 +2422,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
if (i == count) {
assert((count < 3) && "Too many destinations to test!");
- CasesBits.push_back(CaseBits(0, Dest, 0));
+ CasesBits.push_back(CaseBits(0, Dest, 0, 0/*Weight*/));
count++;
}
@@ -2380,6 +2431,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
uint64_t lo = (lowValue - lowBound).getZExtValue();
uint64_t hi = (highValue - lowBound).getZExtValue();
+ CasesBits[i].ExtraWeight += I->ExtraWeight;
for (uint64_t j = lo; j <= hi; j++) {
CasesBits[i].Mask |= 1ULL << j;
@@ -2407,7 +2459,7 @@ bool SelectionDAGBuilder::handleBitTestsSwitchCase(CaseRec& CR,
CurMF->insert(BBI, CaseBB);
BTC.push_back(BitTestCase(CasesBits[i].Mask,
CaseBB,
- CasesBits[i].BB));
+ CasesBits[i].BB, CasesBits[i].ExtraWeight));
// Put SV in a virtual register to make it available from the new blocks.
ExportFromCurrentBlock(SV);
@@ -2435,30 +2487,25 @@ size_t SelectionDAGBuilder::Clusterify(CaseVector& Cases,
Clusterifier TheClusterifier;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
// Start with "simple" cases
for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
i != e; ++i) {
const BasicBlock *SuccBB = i.getCaseSuccessor();
MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
- TheClusterifier.add(i.getCaseValueEx(), SMBB);
+ TheClusterifier.add(i.getCaseValueEx(), SMBB,
+ BPI ? BPI->getEdgeWeight(SI.getParent(), i.getSuccessorIndex()) : 0);
}
TheClusterifier.optimize();
- BranchProbabilityInfo *BPI = FuncInfo.BPI;
size_t numCmps = 0;
for (Clusterifier::RangeIterator i = TheClusterifier.begin(),
e = TheClusterifier.end(); i != e; ++i, ++numCmps) {
Clusterifier::Cluster &C = *i;
- unsigned W = 0;
- if (BPI) {
- W = BPI->getEdgeWeight(SI.getParent(), C.second->getBasicBlock());
- if (!W)
- W = 16;
- W *= C.first.Weight;
- BPI->setEdgeWeight(SI.getParent(), C.second->getBasicBlock(), W);
- }
+ // Update edge weight for the cluster.
+ unsigned W = C.first.Weight;
// FIXME: Currently work with ConstantInt based numbers.
// Changing it to APInt based is a pretty heavy for this commit.
@@ -2540,9 +2587,10 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
if (handleSmallSwitchRange(CR, WorkList, SV, Default, SwitchMBB))
continue;
- // If the switch has more than 5 blocks, and at least 40% dense, and the
+ // If the switch has more than N blocks, and is at least 40% dense, and the
// target supports indirect branches, then emit a jump table rather than
// lowering the switch to a binary tree of conditional branches.
+ // N defaults to 4 and is controlled via TLI.getMinimumJumpTableEntries().
if (handleJTSwitchCase(CR, WorkList, SV, Default, SwitchMBB))
continue;
@@ -2556,14 +2604,14 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
// Update machine-CFG edges with unique successors.
- SmallVector<BasicBlock*, 32> succs;
- succs.reserve(I.getNumSuccessors());
- for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i)
- succs.push_back(I.getSuccessor(i));
- array_pod_sort(succs.begin(), succs.end());
- succs.erase(std::unique(succs.begin(), succs.end()), succs.end());
- for (unsigned i = 0, e = succs.size(); i != e; ++i) {
- MachineBasicBlock *Succ = FuncInfo.MBBMap[succs[i]];
+ SmallSet<BasicBlock*, 32> Done;
+ for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
+ BasicBlock *BB = I.getSuccessor(i);
+ bool Inserted = Done.insert(BB);
+ if (!Inserted)
+ continue;
+
+ MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
addSuccessorWithWeight(IndirectBrMBB, Succ);
}
@@ -3160,9 +3208,9 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
return; // getValue will auto-populate this.
Type *Ty = I.getAllocatedType();
- uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
+ uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
unsigned Align =
- std::max((unsigned)TLI.getTargetData()->getPrefTypeAlignment(Ty),
+ std::max((unsigned)TLI.getDataLayout()->getPrefTypeAlignment(Ty),
I.getAlignment());
SDValue AllocSize = getValue(I.getArraySize());
@@ -3460,7 +3508,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
SDValue InChain = getRoot();
- EVT VT = EVT::getEVT(I.getType());
+ EVT VT = TLI.getValueType(I.getType());
if (I.getAlignment() * 8 < VT.getSizeInBits())
report_fatal_error("Cannot generate unaligned atomic load");
@@ -3490,7 +3538,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
SDValue InChain = getRoot();
- EVT VT = EVT::getEVT(I.getValueOperand()->getType());
+ EVT VT = TLI.getValueType(I.getValueOperand()->getType());
if (I.getAlignment() * 8 < VT.getSizeInBits())
report_fatal_error("Cannot generate unaligned atomic store");
@@ -4352,7 +4400,7 @@ static SDValue ExpandPowI(DebugLoc DL, SDValue LHS, SDValue RHS,
return DAG.getConstantFP(1.0, LHS.getValueType());
const Function *F = DAG.getMachineFunction().getFunction();
- if (!F->hasFnAttr(Attribute::OptimizeForSize) ||
+ if (!F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize) ||
// If optimizing for size, don't insert too many multiplies. This
// inserts up to 5 multiplies.
CountPopulation_32(Val)+Log2_32(Val) < 7) {
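
The popcount-plus-log2 bound approximates how many multiplies the square-and-multiply expansion of powi would emit: for Val = 21 (0b10101) it is 3 + 4 = 7, so the inline expansion is skipped under OptimizeForSize, while Val = 8 gives 1 + 3 = 4 and is still expanded. A one-function sketch of the check (GCC/Clang builtins stand in for CountPopulation_32 and Log2_32):

    #include <cstdint>
    #include <cstdio>

    static unsigned popcount32(uint32_t V)  { return __builtin_popcount(V); }
    static unsigned log2floor32(uint32_t V) { return 31 - __builtin_clz(V); }  // V > 0

    // Sketch of the size heuristic: expand powi inline only if the
    // square-and-multiply sequence stays small (roughly five multiplies or fewer).
    static bool shouldExpandPowi(uint32_t Val, bool OptForSize) {
      return !OptForSize || popcount32(Val) + log2floor32(Val) < 7;
    }

    int main() {
      std::printf("%d %d\n", shouldExpandPowi(8, true),    // 1: 1 + 3 = 4 < 7
                             shouldExpandPowi(21, true));  // 0: 3 + 4 = 7
    }
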
@@ -4850,7 +4898,21 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, DestVT,
getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)),
- DAG.getConstant(Idx, MVT::i32));
+ DAG.getIntPtrConstant(Idx));
+ setValue(&I, Res);
+ return 0;
+ }
+ case Intrinsic::x86_avx_vextractf128_pd_256:
+ case Intrinsic::x86_avx_vextractf128_ps_256:
+ case Intrinsic::x86_avx_vextractf128_si_256:
+ case Intrinsic::x86_avx2_vextracti128: {
+ DebugLoc dl = getCurDebugLoc();
+ EVT DestVT = TLI.getValueType(I.getType());
+ uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
+ DestVT.getVectorNumElements();
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT,
+ getValue(I.getArgOperand(0)),
+ DAG.getIntPtrConstant(Idx));
setValue(&I, Res);
return 0;
}
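
The new vextractf128/vextracti128 lowering converts the 128-bit lane immediate into a starting element index, (imm & 1) times the number of result elements; extracting the upper lane of a <8 x float> as <4 x float> therefore starts at element 4. A tiny sketch of the index math:

    #include <cstdio>

    // Sketch: lane immediate -> starting element index of the extracted 128-bit half.
    static unsigned extractLaneStart(unsigned Imm, unsigned ResultNumElts) {
      return (Imm & 1) * ResultNumElts;
    }

    int main() {
      std::printf("%u %u\n", extractLaneStart(0, 4), extractLaneStart(1, 4)); // 0 4
    }
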
@@ -5113,10 +5175,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
return 0;
}
+ case Intrinsic::debugtrap:
case Intrinsic::trap: {
StringRef TrapFuncName = TM.Options.getTrapFunctionName();
if (TrapFuncName.empty()) {
- DAG.setRoot(DAG.getNode(ISD::TRAP, dl,MVT::Other, getRoot()));
+ ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
+ ISD::TRAP : ISD::DEBUGTRAP;
+ DAG.setRoot(DAG.getNode(Op, dl,MVT::Other, getRoot()));
return 0;
}
TargetLowering::ArgListTy Args;
@@ -5131,10 +5196,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
DAG.setRoot(Result.second);
return 0;
}
- case Intrinsic::debugtrap: {
- DAG.setRoot(DAG.getNode(ISD::DEBUGTRAP, dl,MVT::Other, getRoot()));
- return 0;
- }
+
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
case Intrinsic::usub_with_overflow:
@@ -5177,14 +5239,40 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
rw==1)); /* write */
return 0;
}
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
+ // Stack coloring is not enabled in O0, discard region information.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return 0;
+
+ SmallVector<Value *, 4> Allocas;
+ GetUnderlyingObjects(I.getArgOperand(1), Allocas, TD);
+
+ for (SmallVector<Value*, 4>::iterator Object = Allocas.begin(),
+ E = Allocas.end(); Object != E; ++Object) {
+ AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(*Object);
+ // Could not find an Alloca.
+ if (!LifetimeObject)
+ continue;
+
+ int FI = FuncInfo.StaticAllocaMap[LifetimeObject];
+
+ SDValue Ops[2];
+ Ops[0] = getRoot();
+ Ops[1] = DAG.getFrameIndex(FI, TLI.getPointerTy(), true);
+ unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
+
+ Res = DAG.getNode(Opcode, dl, MVT::Other, Ops, 2);
+ DAG.setRoot(Res);
+ }
+ }
case Intrinsic::invariant_start:
- case Intrinsic::lifetime_start:
// Discard region information.
setValue(&I, DAG.getUNDEF(TLI.getPointerTy()));
return 0;
case Intrinsic::invariant_end:
- case Intrinsic::lifetime_end:
// Discard region information.
return 0;
case Intrinsic::donothing:
@@ -5220,9 +5308,9 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
int DemoteStackIdx = -100;
if (!CanLowerReturn) {
- uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(
+ uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(
FTy->getReturnType());
- unsigned Align = TLI.getTargetData()->getPrefTypeAlignment(
+ unsigned Align = TLI.getDataLayout()->getPrefTypeAlignment(
FTy->getReturnType());
MachineFunction &MF = DAG.getMachineFunction();
DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
@@ -5254,12 +5342,12 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
Entry.Node = ArgNode; Entry.Ty = V->getType();
unsigned attrInd = i - CS.arg_begin() + 1;
- Entry.isSExt = CS.paramHasAttr(attrInd, Attribute::SExt);
- Entry.isZExt = CS.paramHasAttr(attrInd, Attribute::ZExt);
- Entry.isInReg = CS.paramHasAttr(attrInd, Attribute::InReg);
- Entry.isSRet = CS.paramHasAttr(attrInd, Attribute::StructRet);
- Entry.isNest = CS.paramHasAttr(attrInd, Attribute::Nest);
- Entry.isByVal = CS.paramHasAttr(attrInd, Attribute::ByVal);
+ Entry.isSExt = CS.paramHasAttr(attrInd, Attributes::SExt);
+ Entry.isZExt = CS.paramHasAttr(attrInd, Attributes::ZExt);
+ Entry.isInReg = CS.paramHasAttr(attrInd, Attributes::InReg);
+ Entry.isSRet = CS.paramHasAttr(attrInd, Attributes::StructRet);
+ Entry.isNest = CS.paramHasAttr(attrInd, Attributes::Nest);
+ Entry.isByVal = CS.paramHasAttr(attrInd, Attributes::ByVal);
Entry.Alignment = CS.getParamAlignment(attrInd);
Args.push_back(Entry);
}
@@ -5687,7 +5775,7 @@ public:
/// MVT::Other.
EVT getCallOperandValEVT(LLVMContext &Context,
const TargetLowering &TLI,
- const TargetData *TD) const {
+ const DataLayout *TD) const {
if (CallOperandVal == 0) return MVT::Other;
if (isa<BasicBlock>(CallOperandVal))
@@ -5991,8 +6079,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Otherwise, create a stack slot and emit a store to it before the
// asm.
Type *Ty = OpVal->getType();
- uint64_t TySize = TLI.getTargetData()->getTypeAllocSize(Ty);
- unsigned Align = TLI.getTargetData()->getPrefTypeAlignment(Ty);
+ uint64_t TySize = TLI.getDataLayout()->getTypeAllocSize(Ty);
+ unsigned Align = TLI.getDataLayout()->getPrefTypeAlignment(Ty);
MachineFunction &MF = DAG.getMachineFunction();
int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getPointerTy());
@@ -6040,12 +6128,36 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
- // Remember the HasSideEffect and AlignStack bits as operand 3.
+ // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
+ // bits as operand 3.
unsigned ExtraInfo = 0;
if (IA->hasSideEffects())
ExtraInfo |= InlineAsm::Extra_HasSideEffects;
if (IA->isAlignStack())
ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ // Set the asm dialect.
+ ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+
+ // Determine if this InlineAsm MayLoad or MayStore based on the constraints.
+ for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+ TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+ // Ideally, we would only check against memory constraints. However, the
+ // meaning of an 'other' constraint can be target-specific and we can't easily
+ // reason about it. Therefore, be conservative and set MayLoad/MayStore
+ // for 'other' constraints as well.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.Type == InlineAsm::isInput)
+ ExtraInfo |= InlineAsm::Extra_MayLoad;
+ else if (OpInfo.Type == InlineAsm::isOutput)
+ ExtraInfo |= InlineAsm::Extra_MayStore;
+ }
+ }
+
AsmNodeOperands.push_back(DAG.getTargetConstant(ExtraInfo,
TLI.getPointerTy()));
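
Operand 3 of the inline-asm node now carries five facts: side effects, align-stack, the asm dialect, and conservative MayLoad/MayStore bits derived from memory and 'other' constraints. A standalone sketch of the packing (bit positions are illustrative, not the real InlineAsm::Extra_* values):

    #include <cstdio>

    enum : unsigned {
      ExtraHasSideEffects = 1, ExtraIsAlignStack = 2,
      ExtraAsmDialect = 4,                       // dialect occupies one bit position
      ExtraMayLoad = 8, ExtraMayStore = 16
    };

    struct Constraint { bool IsMemoryOrOther; bool IsInput; };

    static unsigned packExtraInfo(bool SideEffects, bool AlignStack, unsigned Dialect,
                                  const Constraint *C, unsigned N) {
      unsigned Extra = 0;
      if (SideEffects) Extra |= ExtraHasSideEffects;
      if (AlignStack)  Extra |= ExtraIsAlignStack;
      Extra |= Dialect * ExtraAsmDialect;        // 0 = AT&T, 1 = Intel
      for (unsigned i = 0; i != N; ++i)
        if (C[i].IsMemoryOrOther)                // conservative: might touch memory
          Extra |= C[i].IsInput ? ExtraMayLoad : ExtraMayStore;
      return Extra;
    }

    int main() {
      Constraint Cs[] = { { true, true }, { true, false } };
      std::printf("0x%x\n", packExtraInfo(true, false, 1, Cs, 2));  // 0x1d
    }
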
@@ -6155,7 +6267,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Use the produced MatchedRegs object to
MatchedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
- Chain, &Flag);
+ Chain, &Flag, CS.getInstruction());
MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
true, OpInfo.getMatchedOperand(),
DAG, AsmNodeOperands);
@@ -6237,7 +6349,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
}
OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, getCurDebugLoc(),
- Chain, &Flag);
+ Chain, &Flag, CS.getInstruction());
OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
DAG, AsmNodeOperands);
@@ -6268,7 +6380,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// and set it as the value of the call.
if (!RetValRegs.Regs.empty()) {
SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(),
- Chain, &Flag);
+ Chain, &Flag, CS.getInstruction());
// FIXME: Why don't we do this for inline asms with MRVs?
if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
@@ -6308,7 +6420,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
const Value *Ptr = IndirectStoresToEmit[i].second;
SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurDebugLoc(),
- Chain, &Flag);
+ Chain, &Flag, IA);
StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
}
@@ -6338,7 +6450,7 @@ void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
}
void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
- const TargetData &TD = *TLI.getTargetData();
+ const DataLayout &TD = *TLI.getDataLayout();
SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(),
getRoot(), getValue(I.getOperand(0)),
DAG.getSrcValue(I.getOperand(0)),
@@ -6384,7 +6496,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Args[i].Node.getResNo() + Value);
ISD::ArgFlagsTy Flags;
unsigned OriginalAlignment =
- getTargetData()->getABITypeAlignment(ArgTy);
+ getDataLayout()->getABITypeAlignment(ArgTy);
if (Args[i].isZExt)
Flags.setZExt();
@@ -6398,7 +6510,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Flags.setByVal();
PointerType *Ty = cast<PointerType>(Args[i].Ty);
Type *ElementTy = Ty->getElementType();
- Flags.setByValSize(getTargetData()->getTypeAllocSize(ElementTy));
+ Flags.setByValSize(getDataLayout()->getTypeAllocSize(ElementTy));
// For ByVal, alignment should come from FE. BE will guess if this
// info is not there but there are cases it cannot get right.
unsigned FrameAlign;
@@ -6423,12 +6535,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
ExtendKind = ISD::ZERO_EXTEND;
getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts,
- PartVT, ExtendKind);
+ PartVT, CLI.CS ? CLI.CS->getInstruction() : 0, ExtendKind);
for (unsigned j = 0; j != NumParts; ++j) {
// if it isn't first piece, alignment must be 1
ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(),
- i < CLI.NumFixedArgs);
+ i < CLI.NumFixedArgs,
+ i, j*Parts[j].getValueType().getStoreSize());
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
else if (j != 0)
@@ -6504,7 +6617,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
- NumRegs, RegisterVT, VT,
+ NumRegs, RegisterVT, VT, NULL,
AssertOp));
CurReg += NumRegs;
}
@@ -6543,7 +6656,7 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
RegsForValue RFV(V->getContext(), TLI, Reg, V->getType());
SDValue Chain = DAG.getEntryNode();
- RFV.getCopyToRegs(Op, DAG, getCurDebugLoc(), Chain, 0);
+ RFV.getCopyToRegs(Op, DAG, getCurDebugLoc(), Chain, 0, V);
PendingExports.push_back(Chain);
}
@@ -6573,7 +6686,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
const Function &F = *LLVMBB->getParent();
SelectionDAG &DAG = SDB->DAG;
DebugLoc dl = SDB->getCurDebugLoc();
- const TargetData *TD = TLI.getTargetData();
+ const DataLayout *TD = TLI.getDataLayout();
SmallVector<ISD::InputArg, 16> Ins;
// Check whether the function can return without sret-demotion.
@@ -6591,7 +6704,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
ISD::ArgFlagsTy Flags;
Flags.setSRet();
EVT RegisterVT = TLI.getRegisterType(*DAG.getContext(), ValueVTs[0]);
- ISD::InputArg RetArg(Flags, RegisterVT, true);
+ ISD::InputArg RetArg(Flags, RegisterVT, true, 0, 0);
Ins.push_back(RetArg);
}
@@ -6610,15 +6723,15 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
unsigned OriginalAlignment =
TD->getABITypeAlignment(ArgTy);
- if (F.paramHasAttr(Idx, Attribute::ZExt))
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::ZExt))
Flags.setZExt();
- if (F.paramHasAttr(Idx, Attribute::SExt))
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::SExt))
Flags.setSExt();
- if (F.paramHasAttr(Idx, Attribute::InReg))
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
Flags.setInReg();
- if (F.paramHasAttr(Idx, Attribute::StructRet))
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::StructRet))
Flags.setSRet();
- if (F.paramHasAttr(Idx, Attribute::ByVal)) {
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::ByVal)) {
Flags.setByVal();
PointerType *Ty = cast<PointerType>(I->getType());
Type *ElementTy = Ty->getElementType();
@@ -6632,14 +6745,15 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
FrameAlign = TLI.getByValTypeAlignment(ElementTy);
Flags.setByValAlign(FrameAlign);
}
- if (F.paramHasAttr(Idx, Attribute::Nest))
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::Nest))
Flags.setNest();
Flags.setOrigAlign(OriginalAlignment);
EVT RegisterVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
unsigned NumRegs = TLI.getNumRegisters(*CurDAG->getContext(), VT);
for (unsigned i = 0; i != NumRegs; ++i) {
- ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed);
+ ISD::InputArg MyFlags(Flags, RegisterVT, isArgValueUsed,
+ Idx-1, i*RegisterVT.getStoreSize());
if (NumRegs > 1 && i == 0)
MyFlags.Flags.setSplit();
// if it isn't first piece, alignment must be 1
@@ -6685,7 +6799,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
EVT RegVT = TLI.getRegisterType(*CurDAG->getContext(), VT);
ISD::NodeType AssertOp = ISD::DELETED_NODE;
SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
- RegVT, VT, AssertOp);
+ RegVT, VT, NULL, AssertOp);
MachineFunction& MF = SDB->DAG.getMachineFunction();
MachineRegisterInfo& RegInfo = MF.getRegInfo();
@@ -6719,14 +6833,14 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) {
if (!I->use_empty()) {
ISD::NodeType AssertOp = ISD::DELETED_NODE;
- if (F.paramHasAttr(Idx, Attribute::SExt))
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::SExt))
AssertOp = ISD::AssertSext;
- else if (F.paramHasAttr(Idx, Attribute::ZExt))
+ else if (F.getParamAttributes(Idx).hasAttribute(Attributes::ZExt))
AssertOp = ISD::AssertZext;
ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
NumParts, PartVT, VT,
- AssertOp));
+ NULL, AssertOp));
}
i += NumParts;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 40900023140e..9e46d9664f96 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -66,7 +66,7 @@ class ShuffleVectorInst;
class SIToFPInst;
class StoreInst;
class SwitchInst;
-class TargetData;
+class DataLayout;
class TargetLibraryInfo;
class TargetLowering;
class TruncInst;
@@ -150,9 +150,11 @@ private:
uint64_t Mask;
MachineBasicBlock* BB;
unsigned Bits;
+ uint32_t ExtraWeight;
- CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits):
- Mask(mask), BB(bb), Bits(bits) { }
+ CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits,
+ uint32_t Weight):
+ Mask(mask), BB(bb), Bits(bits), ExtraWeight(Weight) { }
};
typedef std::vector<Case> CaseVector;
@@ -247,11 +249,13 @@ private:
typedef std::pair<JumpTableHeader, JumpTable> JumpTableBlock;
struct BitTestCase {
- BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr):
- Mask(M), ThisBB(T), TargetBB(Tr) { }
+ BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr,
+ uint32_t Weight):
+ Mask(M), ThisBB(T), TargetBB(Tr), ExtraWeight(Weight) { }
uint64_t Mask;
MachineBasicBlock *ThisBB;
MachineBasicBlock *TargetBB;
+ uint32_t ExtraWeight;
};
typedef SmallVector<BitTestCase, 3> BitTestInfo;
@@ -281,7 +285,7 @@ public:
const TargetMachine &TM;
const TargetLowering &TLI;
SelectionDAG &DAG;
- const TargetData *TD;
+ const DataLayout *TD;
AliasAnalysis *AA;
const TargetLibraryInfo *LibInfo;
@@ -325,7 +329,7 @@ public:
CodeGenOpt::Level ol)
: SDNodeOrder(0), TM(dag.getTarget()), TLI(dag.getTargetLoweringInfo()),
DAG(dag), FuncInfo(funcinfo), OptLevel(ol),
- HasTailCall(false), Context(dag.getContext()) {
+ HasTailCall(false) {
}
void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
@@ -452,6 +456,7 @@ public:
void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
void visitBitTestCase(BitTestBlock &BB,
MachineBasicBlock* NextMBB,
+ uint32_t BranchWeightToNext,
unsigned Reg,
BitTestCase &B,
MachineBasicBlock *SwitchBB);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 13cd011c2b8c..6f3ce7a44bc4 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -267,6 +267,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::STACKRESTORE: return "stackrestore";
case ISD::TRAP: return "trap";
case ISD::DEBUGTRAP: return "debugtrap";
+ case ISD::LIFETIME_START: return "lifetime.start";
+ case ISD::LIFETIME_END: return "lifetime.end";
// Bit manipulation
case ISD::BSWAP: return "bswap";
@@ -331,7 +333,7 @@ void SDNode::dump(const SelectionDAG *G) const {
}
void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
- OS << (void*)this << ": ";
+ OS << (const void*)this << ": ";
for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
if (i) OS << ",";
@@ -473,11 +475,16 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << "<" << *M->getMemOperand() << ">";
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(this)) {
+ int64_t offset = BA->getOffset();
OS << "<";
WriteAsOperand(OS, BA->getBlockAddress()->getFunction(), false);
OS << ", ";
WriteAsOperand(OS, BA->getBlockAddress()->getBasicBlock(), false);
OS << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
if (unsigned int TF = BA->getTargetFlags())
OS << " [TF=" << TF << ']';
}
@@ -559,7 +566,7 @@ static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
child->printr(OS, G);
once.insert(child);
} else { // Just the address. FIXME: also print the child's opcode.
- OS << (void*)child;
+ OS << (const void*)child;
if (unsigned RN = N->getOperand(i).getResNo())
OS << ":" << RN;
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 4e5e3bae62ca..c314fa5b5118 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -474,6 +474,11 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
MRI.replaceRegWith(From, To);
}
+ // Freeze the set of reserved registers now that MachineFrameInfo has been
+ // set up. All the information required by getReservedRegs() should be
+ // available now.
+ MRI.freezeReservedRegs(*MF);
+
// Release function-specific state. SDB and CurDAG are already cleared
// at this point.
FuncInfo->clear();
@@ -554,7 +559,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
#endif
{
BlockNumber = FuncInfo->MBB->getNumber();
- BlockName = MF->getFunction()->getName().str() + ":" +
+ BlockName = MF->getName().str() + ":" +
FuncInfo->MBB->getBasicBlock()->getName().str();
}
DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
@@ -1209,7 +1214,12 @@ SelectionDAGISel::FinishBasicBlock() {
CodeGenAndEmitDAG();
}
+ uint32_t UnhandledWeight = 0;
+ for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j)
+ UnhandledWeight += SDB->BitTestCases[i].Cases[j].ExtraWeight;
+
for (unsigned j = 0, ej = SDB->BitTestCases[i].Cases.size(); j != ej; ++j) {
+ UnhandledWeight -= SDB->BitTestCases[i].Cases[j].ExtraWeight;
// Set the current basic block to the mbb we wish to insert the code into
FuncInfo->MBB = SDB->BitTestCases[i].Cases[j].ThisBB;
FuncInfo->InsertPt = FuncInfo->MBB->end();
@@ -1217,12 +1227,14 @@ SelectionDAGISel::FinishBasicBlock() {
if (j+1 != ej)
SDB->visitBitTestCase(SDB->BitTestCases[i],
SDB->BitTestCases[i].Cases[j+1].ThisBB,
+ UnhandledWeight,
SDB->BitTestCases[i].Reg,
SDB->BitTestCases[i].Cases[j],
FuncInfo->MBB);
else
SDB->visitBitTestCase(SDB->BitTestCases[i],
SDB->BitTestCases[i].Default,
+ UnhandledWeight,
SDB->BitTestCases[i].Reg,
SDB->BitTestCases[i].Cases[j],
FuncInfo->MBB);
@@ -1794,10 +1806,13 @@ WalkChainUsers(const SDNode *ChainedNode,
User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
continue;
- if (User->getOpcode() == ISD::CopyToReg ||
- User->getOpcode() == ISD::CopyFromReg ||
- User->getOpcode() == ISD::INLINEASM ||
- User->getOpcode() == ISD::EH_LABEL) {
+ unsigned UserOpcode = User->getOpcode();
+ if (UserOpcode == ISD::CopyToReg ||
+ UserOpcode == ISD::CopyFromReg ||
+ UserOpcode == ISD::INLINEASM ||
+ UserOpcode == ISD::EH_LABEL ||
+ UserOpcode == ISD::LIFETIME_START ||
+ UserOpcode == ISD::LIFETIME_END) {
// If their node ID got reset to -1 then they've already been selected.
// Treat them like a MachineOpcode.
if (User->getNodeId() == -1)
@@ -1994,7 +2009,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
return Res;
}
-/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
+/// CheckSame - Implements OP_CheckSame.
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
SDValue N,
@@ -2213,6 +2228,8 @@ SelectCodeCommon(SDNode *NodeToMatch, const unsigned char *MatcherTable,
case ISD::CopyFromReg:
case ISD::CopyToReg:
case ISD::EH_LABEL:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
NodeToMatch->setNodeId(-1); // Mark selected.
return 0;
case ISD::AssertSext:
@@ -2981,7 +2998,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) {
N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
N->getOpcode() != ISD::INTRINSIC_VOID) {
N->printrFull(Msg, CurDAG);
- Msg << "\nIn function: " << MF->getFunction()->getName();
+ Msg << "\nIn function: " << MF->getName();
} else {
bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
unsigned iid =
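For illustration only, not part of the patch above: the FinishBasicBlock() change first sums the ExtraWeight of every bit-test case and then, before emitting each case, subtracts that case's weight, so the value handed to visitBitTestCase() as BranchWeightToNext is the weight of everything still unhandled. A minimal standalone sketch of that bookkeeping; the Case type and the weights are hypothetical.

#include <cstdint>
#include <iostream>
#include <vector>

struct Case { uint32_t ExtraWeight; };

int main() {
  // Hypothetical per-case branch weights.
  std::vector<Case> Cases = {{10}, {30}, {60}};

  // First pass: total weight of all cases, as in the patch.
  uint32_t UnhandledWeight = 0;
  for (const Case &C : Cases)
    UnhandledWeight += C.ExtraWeight;

  // Second pass: before each case is emitted, drop its weight; the remainder
  // is what the patch forwards as BranchWeightToNext.
  for (const Case &C : Cases) {
    UnhandledWeight -= C.ExtraWeight;
    std::cout << "emit case, weight to next block = " << UnhandledWeight << "\n";
  }
  return 0;
}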
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index 173ffac329c4..39216356522f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -14,7 +14,6 @@
#include "ScheduleDAGSDNodes.h"
#include "llvm/Constants.h"
#include "llvm/DebugInfo.h"
-#include "llvm/Function.h"
#include "llvm/Assembly/Writer.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -50,7 +49,7 @@ namespace llvm {
template<typename EdgeIter>
static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
- return itostr(I - SDNodeIterator::begin((SDNode *) Node));
+ return itostr(I - SDNodeIterator::begin((const SDNode *) Node));
}
/// edgeTargetsEdgeSource - This method returns true if this outgoing edge
@@ -73,7 +72,7 @@ namespace llvm {
}
static std::string getGraphName(const SelectionDAG *G) {
- return G->getMachineFunction().getFunction()->getName();
+ return G->getMachineFunction().getName();
}
static bool renderGraphFromBottomUp() {
@@ -146,7 +145,7 @@ std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
void SelectionDAG::viewGraph(const std::string &Title) {
// This code is only for debugging!
#ifndef NDEBUG
- ViewGraph(this, "dag." + getMachineFunction().getFunction()->getName(),
+ ViewGraph(this, "dag." + getMachineFunction().getName(),
false, Title);
#else
errs() << "SelectionDAG::viewGraph is only available in debug builds on "
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6820175c1bed..49f55e2fc608 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -14,7 +14,7 @@
#include "llvm/Target/TargetLowering.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -515,7 +515,7 @@ static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
/// NOTE: The constructor takes ownership of TLOF.
TargetLowering::TargetLowering(const TargetMachine &tm,
const TargetLoweringObjectFile *tlof)
- : TM(tm), TD(TM.getTargetData()), TLOF(*tlof) {
+ : TM(tm), TD(TM.getDataLayout()), TLOF(*tlof) {
// All operations default to being supported.
memset(OpActions, 0, sizeof(OpActions));
memset(LoadExtActions, 0, sizeof(LoadExtActions));
@@ -583,8 +583,13 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
// Default ISD::TRAP to expand (which turns it into abort).
setOperationAction(ISD::TRAP, MVT::Other, Expand);
+ // On most systems, DEBUGTRAP and TRAP behave identically. The "Expand"
+ // here is to inform the DAG legalizer to replace DEBUGTRAP with TRAP.
+ //
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
+
IsLittleEndian = TD->isLittleEndian();
- PointerTy = MVT::getIntegerVT(8*TD->getPointerSize());
+ PointerTy = MVT::getIntegerVT(8*TD->getPointerSize(0));
memset(RegClassForVT, 0,MVT::LAST_VALUETYPE*sizeof(TargetRegisterClass*));
memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
maxStoresPerMemset = maxStoresPerMemcpy = maxStoresPerMemmove = 8;
@@ -613,6 +618,7 @@ TargetLowering::TargetLowering(const TargetMachine &tm,
ShouldFoldAtomicFences = false;
InsertFencesForAtomic = false;
SupportJumpTables = true;
+ MinimumJumpTableEntries = 4;
InitLibcallNames(LibcallRoutineNames);
InitCmpLibcallCCs(CmpLibcallCCs);
@@ -624,7 +630,7 @@ TargetLowering::~TargetLowering() {
}
MVT TargetLowering::getShiftAmountTy(EVT LHSTy) const {
- return MVT::getIntegerVT(8*TD->getPointerSize());
+ return MVT::getIntegerVT(8*TD->getPointerSize(0));
}
/// canOpTrap - Returns true if the operation can trap for the value type.
@@ -772,7 +778,7 @@ void TargetLowering::computeRegisterProperties() {
LegalIntReg = IntReg;
} else {
RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
- (MVT::SimpleValueType)LegalIntReg;
+ (const MVT::SimpleValueType)LegalIntReg;
ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
}
}
@@ -898,10 +904,9 @@ const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
return NULL;
}
-
EVT TargetLowering::getSetCCResultType(EVT VT) const {
assert(!VT.isVector() && "No default SetCC type for vectors!");
- return PointerTy.SimpleTy;
+ return getPointerTy(0).SimpleTy;
}
MVT::SimpleValueType TargetLowering::getCmpLibcallReturnType() const {
@@ -997,9 +1002,9 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
EVT VT = ValueVTs[j];
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (attr & Attribute::SExt)
+ if (attr.hasAttribute(Attributes::SExt))
ExtendKind = ISD::SIGN_EXTEND;
- else if (attr & Attribute::ZExt)
+ else if (attr.hasAttribute(Attributes::ZExt))
ExtendKind = ISD::ZERO_EXTEND;
// FIXME: C calling convention requires the return type to be promoted to
@@ -1017,18 +1022,17 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
- if (attr & Attribute::InReg)
+ if (attr.hasAttribute(Attributes::InReg))
Flags.setInReg();
// Propagate extension type if any
- if (attr & Attribute::SExt)
+ if (attr.hasAttribute(Attributes::SExt))
Flags.setSExt();
- else if (attr & Attribute::ZExt)
+ else if (attr.hasAttribute(Attributes::ZExt))
Flags.setZExt();
- for (unsigned i = 0; i < NumParts; ++i) {
- Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true));
- }
+ for (unsigned i = 0; i < NumParts; ++i)
+ Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true, 0, 0));
}
}
@@ -1062,7 +1066,7 @@ SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
if ((JTEncoding == MachineJumpTableInfo::EK_GPRel64BlockAddress) ||
(JTEncoding == MachineJumpTableInfo::EK_GPRel32BlockAddress))
- return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy());
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(0));
return Table;
}
@@ -2441,7 +2445,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
if (N0 == N1) {
// The sext(setcc()) => setcc() optimization relies on the appropriate
// constant being emitted.
- uint64_t EqVal;
+ uint64_t EqVal = 0;
switch (getBooleanContents(N0.getValueType().isVector())) {
case UndefinedBooleanContent:
case ZeroOrOneBooleanContent:
@@ -2954,8 +2958,9 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
EVT::getEVT(IntegerType::get(OpTy->getContext(), BitSize), true);
break;
}
- } else if (dyn_cast<PointerType>(OpTy)) {
- OpInfo.ConstraintVT = MVT::getIntegerVT(8*TD->getPointerSize());
+ } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
+ OpInfo.ConstraintVT = MVT::getIntegerVT(
+ 8*TD->getPointerSize(PT->getAddressSpace()));
} else {
OpInfo.ConstraintVT = EVT::getEVT(OpTy, true);
}
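For illustration only, not part of the patch above: the TargetLowering changes stop assuming a single global pointer width and instead ask for the pointer size of a specific address space (getPointerSize(0) for the default, or the address space of the operand's pointer type, as in the ParseConstraints hunk). A standalone sketch of the idea, with a hypothetical FakeDataLayout standing in for DataLayout.

#include <iostream>
#include <map>

// Hypothetical stand-in for DataLayout: pointer size in bytes per address
// space, falling back to address space 0.
struct FakeDataLayout {
  std::map<unsigned, unsigned> PointerSizes{{0, 8}, {1, 4}};
  unsigned getPointerSize(unsigned AS = 0) const {
    auto It = PointerSizes.find(AS);
    return It != PointerSizes.end() ? It->second : PointerSizes.at(0);
  }
};

int main() {
  FakeDataLayout TD;
  // Before the patch: one pointer width, address space 0 implied.
  unsigned DefaultBits = 8 * TD.getPointerSize(0);
  // After the patch: the width can differ for an operand whose pointer type
  // lives in another address space.
  unsigned AS1Bits = 8 * TD.getPointerSize(1);
  std::cout << DefaultBits << " bits vs " << AS1Bits << " bits\n";
  return 0;
}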
diff --git a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
index a081e3cd493f..f769b44efbb3 100644
--- a/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetSelectionDAGInfo.cpp
@@ -16,7 +16,7 @@
using namespace llvm;
TargetSelectionDAGInfo::TargetSelectionDAGInfo(const TargetMachine &TM)
- : TD(TM.getTargetData()) {
+ : TD(TM.getDataLayout()) {
}
TargetSelectionDAGInfo::~TargetSelectionDAGInfo() {
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
index 21ae2f5e56eb..4fbe1b360577 100644
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ b/lib/CodeGen/ShrinkWrapping.cpp
@@ -159,7 +159,7 @@ void PEI::initShrinkWrappingInfo() {
// via --shrink-wrap-func=<funcname>.
#ifndef NDEBUG
if (ShrinkWrapFunc != "") {
- std::string MFName = MF->getFunction()->getName().str();
+ std::string MFName = MF->getName().str();
ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc);
}
#endif
@@ -187,7 +187,7 @@ void PEI::placeCSRSpillsAndRestores(MachineFunction &Fn) {
DEBUG(if (ShrinkWrapThisFunction) {
dbgs() << "Place CSR spills/restores for "
- << MF->getFunction()->getName() << "\n";
+ << MF->getName() << "\n";
});
if (calculateSets(Fn))
@@ -364,7 +364,7 @@ bool PEI::calculateSets(MachineFunction &Fn) {
// If no CSRs used, we are done.
if (CSI.empty()) {
DEBUG(if (ShrinkWrapThisFunction)
- dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ dbgs() << "DISABLED: " << Fn.getName()
<< ": uses no callee-saved registers\n");
return false;
}
@@ -384,7 +384,7 @@ bool PEI::calculateSets(MachineFunction &Fn) {
// implementation to functions with <= 500 MBBs.
if (Fn.size() > 500) {
DEBUG(if (ShrinkWrapThisFunction)
- dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ dbgs() << "DISABLED: " << Fn.getName()
<< ": too large (" << Fn.size() << " MBBs)\n");
ShrinkWrapThisFunction = false;
}
@@ -466,7 +466,7 @@ bool PEI::calculateSets(MachineFunction &Fn) {
}
if (allCSRUsesInEntryBlock) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ DEBUG(dbgs() << "DISABLED: " << Fn.getName()
<< ": all CSRs used in EntryBlock\n");
ShrinkWrapThisFunction = false;
} else {
@@ -478,7 +478,7 @@ bool PEI::calculateSets(MachineFunction &Fn) {
allCSRsUsedInEntryFanout = false;
}
if (allCSRsUsedInEntryFanout) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ DEBUG(dbgs() << "DISABLED: " << Fn.getName()
<< ": all CSRs used in imm successors of EntryBlock\n");
ShrinkWrapThisFunction = false;
}
@@ -505,7 +505,7 @@ bool PEI::calculateSets(MachineFunction &Fn) {
if (dominatesExitNodes) {
CSRUsedInChokePoints |= CSRUsed[MBB];
if (CSRUsedInChokePoints == UsedCSRegs) {
- DEBUG(dbgs() << "DISABLED: " << Fn.getFunction()->getName()
+ DEBUG(dbgs() << "DISABLED: " << Fn.getName()
<< ": all CSRs used in choke point(s) at "
<< getBasicBlockName(MBB) << "\n");
ShrinkWrapThisFunction = false;
@@ -521,7 +521,7 @@ bool PEI::calculateSets(MachineFunction &Fn) {
return false;
DEBUG({
- dbgs() << "ENABLED: " << Fn.getFunction()->getName();
+ dbgs() << "ENABLED: " << Fn.getName();
if (HasFastExitPath)
dbgs() << " (fast exit path)";
dbgs() << "\n";
@@ -861,7 +861,7 @@ void PEI::placeSpillsAndRestores(MachineFunction &Fn) {
DEBUG(if (ShrinkWrapDebugging >= BasicInfo) {
dbgs() << "-----------------------------------------------------------\n";
dbgs() << "total iterations = " << iterations << " ( "
- << Fn.getFunction()->getName()
+ << Fn.getName()
<< " " << numSRReducedThisFunc
<< " " << Fn.size()
<< " )\n";
@@ -984,7 +984,7 @@ void PEI::verifySpillRestorePlacement() {
if (isReturnBlock(SBB) || SBB->succ_size() == 0) {
if (restored != spilled) {
CSRegSet notRestored = (spilled - restored);
- DEBUG(dbgs() << MF->getFunction()->getName() << ": "
+ DEBUG(dbgs() << MF->getName() << ": "
<< stringifyCSRegSet(notRestored)
<< " spilled at " << getBasicBlockName(MBB)
<< " are never restored on path to return "
@@ -1032,7 +1032,7 @@ void PEI::verifySpillRestorePlacement() {
}
if (spilled != restored) {
CSRegSet notSpilled = (restored - spilled);
- DEBUG(dbgs() << MF->getFunction()->getName() << ": "
+ DEBUG(dbgs() << MF->getName() << ": "
<< stringifyCSRegSet(notSpilled)
<< " restored at " << getBasicBlockName(MBB)
<< " are never spilled\n");
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 980bd7414ccb..4b566fcba931 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -30,7 +30,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -191,58 +191,43 @@ setupFunctionContext(Function &F, ArrayRef<LandingPadInst*> LPads) {
// that needs to be restored on all exits from the function. This is an alloca
// because the value needs to be added to the global context list.
unsigned Align =
- TLI->getTargetData()->getPrefTypeAlignment(FunctionContextTy);
+ TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
FuncCtx =
new AllocaInst(FunctionContextTy, 0, Align, "fn_context", EntryBB->begin());
// Fill in the function context structure.
- Type *Int32Ty = Type::getInt32Ty(F.getContext());
- Value *Zero = ConstantInt::get(Int32Ty, 0);
- Value *One = ConstantInt::get(Int32Ty, 1);
- Value *Two = ConstantInt::get(Int32Ty, 2);
- Value *Three = ConstantInt::get(Int32Ty, 3);
- Value *Four = ConstantInt::get(Int32Ty, 4);
-
- Value *Idxs[2] = { Zero, 0 };
-
for (unsigned I = 0, E = LPads.size(); I != E; ++I) {
LandingPadInst *LPI = LPads[I];
IRBuilder<> Builder(LPI->getParent()->getFirstInsertionPt());
// Reference the __data field.
- Idxs[1] = Two;
- Value *FCData = Builder.CreateGEP(FuncCtx, Idxs, "__data");
+ Value *FCData = Builder.CreateConstGEP2_32(FuncCtx, 0, 2, "__data");
// The exception values come back in context->__data[0].
- Idxs[1] = Zero;
- Value *ExceptionAddr = Builder.CreateGEP(FCData, Idxs, "exception_gep");
+ Value *ExceptionAddr = Builder.CreateConstGEP2_32(FCData, 0, 0,
+ "exception_gep");
Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
- ExnVal = Builder.CreateIntToPtr(ExnVal, Type::getInt8PtrTy(F.getContext()));
+ ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
- Idxs[1] = One;
- Value *SelectorAddr = Builder.CreateGEP(FCData, Idxs, "exn_selector_gep");
+ Value *SelectorAddr = Builder.CreateConstGEP2_32(FCData, 0, 1,
+ "exn_selector_gep");
Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
substituteLPadValues(LPI, ExnVal, SelVal);
}
// Personality function
- Idxs[1] = Three;
+ IRBuilder<> Builder(EntryBB->getTerminator());
if (!PersonalityFn)
PersonalityFn = LPads[0]->getPersonalityFn();
- Value *PersonalityFieldPtr =
- GetElementPtrInst::Create(FuncCtx, Idxs, "pers_fn_gep",
- EntryBB->getTerminator());
- new StoreInst(PersonalityFn, PersonalityFieldPtr, true,
- EntryBB->getTerminator());
+ Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 3,
+ "pers_fn_gep");
+ Builder.CreateStore(PersonalityFn, PersonalityFieldPtr, /*isVolatile=*/true);
// LSDA address
- Value *LSDA = CallInst::Create(LSDAAddrFn, "lsda_addr",
- EntryBB->getTerminator());
- Idxs[1] = Four;
- Value *LSDAFieldPtr = GetElementPtrInst::Create(FuncCtx, Idxs, "lsda_gep",
- EntryBB->getTerminator());
- new StoreInst(LSDA, LSDAFieldPtr, true, EntryBB->getTerminator());
+ Value *LSDA = Builder.CreateCall(LSDAAddrFn, "lsda_addr");
+ Value *LSDAFieldPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 4, "lsda_gep");
+ Builder.CreateStore(LSDA, LSDAFieldPtr, /*isVolatile=*/true);
return FuncCtx;
}
@@ -417,48 +402,31 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
Value *FuncCtx =
setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
BasicBlock *EntryBB = F.begin();
- Type *Int32Ty = Type::getInt32Ty(F.getContext());
-
- Value *Idxs[2] = {
- ConstantInt::get(Int32Ty, 0), 0
- };
+ IRBuilder<> Builder(EntryBB->getTerminator());
// Get a reference to the jump buffer.
- Idxs[1] = ConstantInt::get(Int32Ty, 5);
- Value *JBufPtr = GetElementPtrInst::Create(FuncCtx, Idxs, "jbuf_gep",
- EntryBB->getTerminator());
+ Value *JBufPtr = Builder.CreateConstGEP2_32(FuncCtx, 0, 5, "jbuf_gep");
// Save the frame pointer.
- Idxs[1] = ConstantInt::get(Int32Ty, 0);
- Value *FramePtr = GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_fp_gep",
- EntryBB->getTerminator());
+ Value *FramePtr = Builder.CreateConstGEP2_32(JBufPtr, 0, 0, "jbuf_fp_gep");
- Value *Val = CallInst::Create(FrameAddrFn,
- ConstantInt::get(Int32Ty, 0),
- "fp",
- EntryBB->getTerminator());
- new StoreInst(Val, FramePtr, true, EntryBB->getTerminator());
+ Value *Val = Builder.CreateCall(FrameAddrFn, Builder.getInt32(0), "fp");
+ Builder.CreateStore(Val, FramePtr, /*isVolatile=*/true);
// Save the stack pointer.
- Idxs[1] = ConstantInt::get(Int32Ty, 2);
- Value *StackPtr = GetElementPtrInst::Create(JBufPtr, Idxs, "jbuf_sp_gep",
- EntryBB->getTerminator());
+ Value *StackPtr = Builder.CreateConstGEP2_32(JBufPtr, 0, 2, "jbuf_sp_gep");
- Val = CallInst::Create(StackAddrFn, "sp", EntryBB->getTerminator());
- new StoreInst(Val, StackPtr, true, EntryBB->getTerminator());
+ Val = Builder.CreateCall(StackAddrFn, "sp");
+ Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true);
// Call the setjmp intrinsic. It fills in the rest of the jmpbuf.
- Value *SetjmpArg = CastInst::Create(Instruction::BitCast, JBufPtr,
- Type::getInt8PtrTy(F.getContext()), "",
- EntryBB->getTerminator());
- CallInst::Create(BuiltinSetjmpFn, SetjmpArg, "", EntryBB->getTerminator());
+ Value *SetjmpArg = Builder.CreateBitCast(JBufPtr, Builder.getInt8PtrTy());
+ Builder.CreateCall(BuiltinSetjmpFn, SetjmpArg);
// Store a pointer to the function context so that the back-end will know
// where to look for it.
- Value *FuncCtxArg = CastInst::Create(Instruction::BitCast, FuncCtx,
- Type::getInt8PtrTy(F.getContext()), "",
- EntryBB->getTerminator());
- CallInst::Create(FuncCtxFn, FuncCtxArg, "", EntryBB->getTerminator());
+ Value *FuncCtxArg = Builder.CreateBitCast(FuncCtx, Builder.getInt8PtrTy());
+ Builder.CreateCall(FuncCtxFn, FuncCtxArg);
// At this point, we are all set up, update the invoke instructions to mark
// their call_site values.
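For illustration only, not part of the patch above: every CreateConstGEP2_32(Ptr, 0, N) call introduced here addresses field N of the pointed-to function-context struct; the leading 0 steps over the pointer itself and the second constant selects the field (2 = __data, 3 = personality, 4 = LSDA, 5 = jump buffer, matching the indices in the patch). A rough standalone sketch of that two-index addressing; the field types and the first two fields below are placeholders, not the real sjlj layout.

#include <cstdint>
#include <iostream>

// Placeholder layout: only the field numbering mirrors the patch.
struct FunctionContext {
  int32_t  CallSite;      // 0 (placeholder)
  int32_t  Pad;           // 1 (placeholder)
  int64_t  Data[4];       // 2: __data
  void    *Personality;   // 3: personality slot
  void    *LSDA;          // 4: LSDA slot
  void    *JBuf[5];       // 5: jump buffer
};

int main() {
  FunctionContext Ctx{};
  // CreateConstGEP2_32(Ctx, 0, 4) corresponds to &Ctx.LSDA.
  void **LSDAFieldPtr = &Ctx.LSDA;
  // CreateConstGEP2_32(Ctx, 0, 2) then (.., 0, 0) reaches __data[0].
  int64_t *ExceptionAddr = &Ctx.Data[0];
  std::cout << (void*)LSDAFieldPtr << " " << (void*)ExceptionAddr << "\n";
  return 0;
}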
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index c8c3fb37ad79..95faafab45a9 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -143,6 +143,7 @@ void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void SlotIndexes::dump() const {
for (IndexList::const_iterator itr = indexList.begin();
itr != indexList.end(); ++itr) {
@@ -159,6 +160,7 @@ void SlotIndexes::dump() const {
dbgs() << "BB#" << i << "\t[" << MBBRanges[i].first << ';'
<< MBBRanges[i].second << ")\n";
}
+#endif
// Print a SlotIndex to a raw_ostream.
void SlotIndex::print(raw_ostream &os) const {
@@ -168,9 +170,11 @@ void SlotIndex::print(raw_ostream &os) const {
os << "invalid";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Dump a SlotIndex to stderr.
void SlotIndex::dump() const {
print(dbgs());
dbgs() << "\n";
}
+#endif
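For illustration only, not part of the patch above: the dump() methods in SlotIndexes.cpp and SplitKit.cpp are now compiled only when asserts are enabled or LLVM_ENABLE_DUMP is defined, so release builds drop them entirely. A minimal sketch of the same guard applied to a hypothetical class.

#include <cstdio>

// Hypothetical example class; only the preprocessor pattern mirrors the patch.
struct Widget {
  int Value = 42;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  // Present in assert-enabled builds or when dumps are explicitly requested;
  // a release build compiled with -DNDEBUG omits the symbol entirely.
  void dump() const { std::printf("Widget(%d)\n", Value); }
#endif
};

int main() {
  Widget W;
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  W.dump();
#endif
  return W.Value == 42 ? 0 : 1;
}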
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 4a2b7ec1cf24..dca15ee7580f 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -356,6 +356,7 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
Edit->anyRematerializable(0);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void SplitEditor::dump() const {
if (RegAssign.empty()) {
dbgs() << " empty\n";
@@ -366,6 +367,7 @@ void SplitEditor::dump() const {
dbgs() << " [" << I.start() << ';' << I.stop() << "):" << I.value();
dbgs() << '\n';
}
+#endif
VNInfo *SplitEditor::defValue(unsigned RegIdx,
const VNInfo *ParentVNI,
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
new file mode 100644
index 000000000000..1cbee843a125
--- /dev/null
+++ b/lib/CodeGen/StackColoring.cpp
@@ -0,0 +1,783 @@
+//===-- StackColoring.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the stack-coloring optimization that looks for
+// lifetime marker machine instructions (LIFETIME_START and LIFETIME_END),
+// which represent the possible lifetime of stack slots. It attempts to
+// merge disjoint stack slots and reduce the used stack space.
+// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
+//
+// TODO: In the future we plan to improve stack coloring in the following ways:
+// 1. Allow merging multiple small slots into a single larger slot at different
+// offsets.
+// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
+// spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "stackcoloring"
+#include "MachineTraceMetrics.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/Instructions.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+DisableColoring("no-stack-coloring",
+ cl::init(false), cl::Hidden,
+ cl::desc("Disable stack coloring"));
+
+/// The user may write code that uses allocas outside of the declared lifetime
+/// zone. This can happen when the user returns a reference to a local
+/// data structure. If this flag is enabled, we detect these cases and do not
+/// merge the affected slots.
+static cl::opt<bool>
+ProtectFromEscapedAllocas("protect-from-escaped-allocas",
+ cl::init(false), cl::Hidden,
+ cl::desc("Do not optimize lifetime zones that are broken"));
+
+STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
+STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
+STATISTIC(StackSlotMerged, "Number of stack slots merged.");
+STATISTIC(EscapedAllocas,
+ "Number of allocas that escaped the lifetime region");
+
+//===----------------------------------------------------------------------===//
+// StackColoring Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// StackColoring - A machine pass for merging disjoint stack allocations,
+/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
+class StackColoring : public MachineFunctionPass {
+ MachineFrameInfo *MFI;
+ MachineFunction *MF;
+
+ /// A class representing liveness information for a single basic block.
+ /// Each bit in the BitVector represents the liveness property
+ /// for a different stack slot.
+ struct BlockLifetimeInfo {
+ /// Which slots BEGINs in each basic block.
+ BitVector Begin;
+ /// Which slots ENDs in each basic block.
+ BitVector End;
+ /// Which slots are marked as LIVE_IN, coming into each basic block.
+ BitVector LiveIn;
+ /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+ BitVector LiveOut;
+ };
+
+ /// Maps active slots (per bit) for each basic block.
+ DenseMap<MachineBasicBlock*, BlockLifetimeInfo> BlockLiveness;
+
+ /// Maps each basic block to its serial number.
+ DenseMap<MachineBasicBlock*, int> BasicBlocks;
+ /// Maps serial numbers back to basic blocks.
+ SmallVector<MachineBasicBlock*, 8> BasicBlockNumbering;
+
+ /// Liveness intervals, one for each stack slot.
+ SmallVector<LiveInterval*, 16> Intervals;
+ /// VNInfo is used for the construction of LiveIntervals.
+ VNInfo::Allocator VNInfoAllocator;
+ /// SlotIndex analysis object.
+ SlotIndexes *Indexes;
+
+ /// The list of lifetime markers found. These markers are to be removed
+ /// once the coloring is done.
+ SmallVector<MachineInstr*, 8> Markers;
+
+ /// SlotSizeSorter - A Sort utility for arranging stack slots according
+ /// to their size.
+ struct SlotSizeSorter {
+ MachineFrameInfo *MFI;
+ SlotSizeSorter(MachineFrameInfo *mfi) : MFI(mfi) { }
+ bool operator()(int LHS, int RHS) {
+ // We use -1 to denote an uninteresting slot. Place these slots at the end.
+ if (LHS == -1) return false;
+ if (RHS == -1) return true;
+ // Sort according to size.
+ return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
+ }
+};
+
+public:
+ static char ID;
+ StackColoring() : MachineFunctionPass(ID) {
+ initializeStackColoringPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+ bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+ /// Dump the internal liveness state (for debugging).
+ void dump();
+
+ /// Removes all of the lifetime marker instructions from the function.
+ /// \returns true if any markers were removed.
+ bool removeAllMarkers();
+
+ /// Scan the machine function and find all of the lifetime markers.
+ /// Record the findings in the BEGIN and END vectors.
+ /// \returns the number of markers found.
+ unsigned collectMarkers(unsigned NumSlot);
+
+ /// Perform the dataflow calculation and calculate the lifetime for each of
+ /// the slots, based on the BEGIN/END vectors. Set the LifetimeLIVE_IN and
+ /// LifetimeLIVE_OUT maps that represent which stack slots are live coming
+ /// into and out of each block.
+ void calculateLocalLiveness();
+
+ /// Construct the LiveIntervals for the slots.
+ void calculateLiveIntervals(unsigned NumSlots);
+
+ /// Go over the machine function and change instructions which use stack
+ /// slots to use the joint slots.
+ void remapInstructions(DenseMap<int, int> &SlotRemap);
+
+ /// The input program may contain instructions which are not inside lifetime
+ /// markers. This can happen due to a bug in the compiler or due to a bug in
+ /// user code (for example, returning a reference to a local variable).
+ /// This procedure checks all of the instructions in the function and
+ /// invalidates lifetime ranges which do not contain all of the instructions
+ /// which access that frame slot.
+ void removeInvalidSlotRanges();
+
+ /// Map entries which point to other entries to their destination.
+ /// A->B->C becomes A->C.
+ void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+};
+} // end anonymous namespace
+
+char StackColoring::ID = 0;
+char &llvm::StackColoringID = StackColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackColoring,
+ "stack-coloring", "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(StackColoring,
+ "stack-coloring", "Merge disjoint stack slots", false, false)
+
+void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void StackColoring::dump() {
+ for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
+ FI != FE; ++FI) {
+ unsigned Num = BasicBlocks[*FI];
+ DEBUG(dbgs()<<"Inspecting block #"<<Num<<" ["<<FI->getName()<<"]\n");
+ Num = 0;
+ DEBUG(dbgs()<<"BEGIN : {");
+ for (unsigned i=0; i < BlockLiveness[*FI].Begin.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].Begin.test(i)<<" ");
+ DEBUG(dbgs()<<"}\n");
+
+ DEBUG(dbgs()<<"END : {");
+ for (unsigned i=0; i < BlockLiveness[*FI].End.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].End.test(i)<<" ");
+
+ DEBUG(dbgs()<<"}\n");
+
+ DEBUG(dbgs()<<"LIVE_IN: {");
+ for (unsigned i=0; i < BlockLiveness[*FI].LiveIn.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].LiveIn.test(i)<<" ");
+
+ DEBUG(dbgs()<<"}\n");
+ DEBUG(dbgs()<<"LIVEOUT: {");
+ for (unsigned i=0; i < BlockLiveness[*FI].LiveOut.size(); ++i)
+ DEBUG(dbgs()<<BlockLiveness[*FI].LiveOut.test(i)<<" ");
+ DEBUG(dbgs()<<"}\n");
+ }
+}
+
+unsigned StackColoring::collectMarkers(unsigned NumSlot) {
+ unsigned MarkersFound = 0;
+ // Scan the function to find all lifetime markers.
+ // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
+ // deterministic numbering, and because we'll need a post-order iteration
+ // later for solving the liveness dataflow problem.
+ for (df_iterator<MachineFunction*> FI = df_begin(MF), FE = df_end(MF);
+ FI != FE; ++FI) {
+
+ // Assign a serial number to this basic block.
+ BasicBlocks[*FI] = BasicBlockNumbering.size();
+ BasicBlockNumbering.push_back(*FI);
+
+ BlockLiveness[*FI].Begin.resize(NumSlot);
+ BlockLiveness[*FI].End.resize(NumSlot);
+
+ for (MachineBasicBlock::iterator BI = (*FI)->begin(), BE = (*FI)->end();
+ BI != BE; ++BI) {
+
+ if (BI->getOpcode() != TargetOpcode::LIFETIME_START &&
+ BI->getOpcode() != TargetOpcode::LIFETIME_END)
+ continue;
+
+ Markers.push_back(BI);
+
+ bool IsStart = BI->getOpcode() == TargetOpcode::LIFETIME_START;
+ MachineOperand &MI = BI->getOperand(0);
+ unsigned Slot = MI.getIndex();
+
+ MarkersFound++;
+
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ DEBUG(dbgs()<<"Found a lifetime marker for slot #"<<Slot<<
+ " with allocation: "<< Allocation->getName()<<"\n");
+ }
+
+ if (IsStart) {
+ BlockLiveness[*FI].Begin.set(Slot);
+ } else {
+ if (BlockLiveness[*FI].Begin.test(Slot)) {
+ // Allocas that start and end within a single block are handled
+ // specially when computing the LiveIntervals to avoid pessimizing
+ // the liveness propagation.
+ BlockLiveness[*FI].Begin.reset(Slot);
+ } else {
+ BlockLiveness[*FI].End.set(Slot);
+ }
+ }
+ }
+ }
+
+ // Update statistics.
+ NumMarkerSeen += MarkersFound;
+ return MarkersFound;
+}
+
+void StackColoring::calculateLocalLiveness() {
+ // Perform a standard reverse dataflow computation to solve for
+ // global liveness. The BEGIN set here is equivalent to KILL in the standard
+ // formulation, and END is equivalent to GEN. The result of this computation
+ // is a map from blocks to bitvectors where the bitvectors represent which
+ // allocas are live in/out of that block.
+ SmallPtrSet<MachineBasicBlock*, 8> BBSet(BasicBlockNumbering.begin(),
+ BasicBlockNumbering.end());
+ unsigned NumSSMIters = 0;
+ bool changed = true;
+ while (changed) {
+ changed = false;
+ ++NumSSMIters;
+
+ SmallPtrSet<MachineBasicBlock*, 8> NextBBSet;
+
+ for (SmallVector<MachineBasicBlock*, 8>::iterator
+ PI = BasicBlockNumbering.begin(), PE = BasicBlockNumbering.end();
+ PI != PE; ++PI) {
+
+ MachineBasicBlock *BB = *PI;
+ if (!BBSet.count(BB)) continue;
+
+ BitVector LocalLiveIn;
+ BitVector LocalLiveOut;
+
+ // Forward propagation from begins to ends.
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ PE = BB->pred_end(); PI != PE; ++PI)
+ LocalLiveIn |= BlockLiveness[*PI].LiveOut;
+ LocalLiveIn |= BlockLiveness[BB].End;
+ LocalLiveIn.reset(BlockLiveness[BB].Begin);
+
+ // Reverse propagation from ends to begins.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ LocalLiveOut |= BlockLiveness[*SI].LiveIn;
+ LocalLiveOut |= BlockLiveness[BB].Begin;
+ LocalLiveOut.reset(BlockLiveness[BB].End);
+
+ LocalLiveIn |= LocalLiveOut;
+ LocalLiveOut |= LocalLiveIn;
+
+ // After adopting the live bits, we need to turn off the bits which
+ // are de-activated in this block.
+ LocalLiveOut.reset(BlockLiveness[BB].End);
+ LocalLiveIn.reset(BlockLiveness[BB].Begin);
+
+ // If we have both BEGIN and END markers in the same basic block then
+ // we know that the BEGIN marker comes after the END, because we already
+ // handle the case where the BEGIN comes before the END when collecting
+ // the markers (and building the BEGIN/END vectors).
+ // We want to enable the LIVE_IN and LIVE_OUT of slots that have both
+ // BEGIN and END because it means that the value lives before and after
+ // this basic block.
+ BitVector LocalEndBegin = BlockLiveness[BB].End;
+ LocalEndBegin &= BlockLiveness[BB].Begin;
+ LocalLiveIn |= LocalEndBegin;
+ LocalLiveOut |= LocalEndBegin;
+
+ if (LocalLiveIn.test(BlockLiveness[BB].LiveIn)) {
+ changed = true;
+ BlockLiveness[BB].LiveIn |= LocalLiveIn;
+
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ PE = BB->pred_end(); PI != PE; ++PI)
+ NextBBSet.insert(*PI);
+ }
+
+ if (LocalLiveOut.test(BlockLiveness[BB].LiveOut)) {
+ changed = true;
+ BlockLiveness[BB].LiveOut |= LocalLiveOut;
+
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ NextBBSet.insert(*SI);
+ }
+ }
+
+ BBSet = NextBBSet;
+ }// while changed.
+}
+
+void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
+ SmallVector<SlotIndex, 16> Starts;
+ SmallVector<SlotIndex, 16> Finishes;
+
+ // For each block, find which slots are active within this block
+ // and update the live intervals.
+ for (MachineFunction::iterator MBB = MF->begin(), MBBe = MF->end();
+ MBB != MBBe; ++MBB) {
+ Starts.clear();
+ Starts.resize(NumSlots);
+ Finishes.clear();
+ Finishes.resize(NumSlots);
+
+ // Create the interval for the basic blocks with lifetime markers in them.
+ for (SmallVector<MachineInstr*, 8>::iterator it = Markers.begin(),
+ e = Markers.end(); it != e; ++it) {
+ MachineInstr *MI = *it;
+ if (MI->getParent() != MBB)
+ continue;
+
+ assert((MI->getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI->getOpcode() == TargetOpcode::LIFETIME_END) &&
+ "Invalid Lifetime marker");
+
+ bool IsStart = MI->getOpcode() == TargetOpcode::LIFETIME_START;
+ MachineOperand &Mo = MI->getOperand(0);
+ int Slot = Mo.getIndex();
+ assert(Slot >= 0 && "Invalid slot");
+
+ SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
+
+ if (IsStart) {
+ if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+ Starts[Slot] = ThisIndex;
+ } else {
+ if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
+ Finishes[Slot] = ThisIndex;
+ }
+ }
+
+ // Create the interval for the blocks that we previously found to be 'alive'.
+ BitVector Alive = BlockLiveness[MBB].LiveIn;
+ Alive |= BlockLiveness[MBB].LiveOut;
+
+ if (Alive.any()) {
+ for (int pos = Alive.find_first(); pos != -1;
+ pos = Alive.find_next(pos)) {
+ if (!Starts[pos].isValid())
+ Starts[pos] = Indexes->getMBBStartIdx(MBB);
+ if (!Finishes[pos].isValid())
+ Finishes[pos] = Indexes->getMBBEndIdx(MBB);
+ }
+ }
+
+ for (unsigned i = 0; i < NumSlots; ++i) {
+ assert(Starts[i].isValid() == Finishes[i].isValid() && "Unmatched range");
+ if (!Starts[i].isValid())
+ continue;
+
+ assert(Starts[i] && Finishes[i] && "Invalid interval");
+ VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
+ SlotIndex S = Starts[i];
+ SlotIndex F = Finishes[i];
+ if (S < F) {
+ // We have a single consecutive region.
+ Intervals[i]->addRange(LiveRange(S, F, ValNum));
+ } else {
+ // We have two non-consecutive regions. This happens when
+ // LIFETIME_START appears after the LIFETIME_END marker.
+ SlotIndex NewStart = Indexes->getMBBStartIdx(MBB);
+ SlotIndex NewFin = Indexes->getMBBEndIdx(MBB);
+ Intervals[i]->addRange(LiveRange(NewStart, F, ValNum));
+ Intervals[i]->addRange(LiveRange(S, NewFin, ValNum));
+ }
+ }
+ }
+}
+
+bool StackColoring::removeAllMarkers() {
+ unsigned Count = 0;
+ for (unsigned i = 0; i < Markers.size(); ++i) {
+ Markers[i]->eraseFromParent();
+ Count++;
+ }
+ Markers.clear();
+
+ DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n");
+ return Count;
+}
+
+void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
+ unsigned FixedInstr = 0;
+ unsigned FixedMemOp = 0;
+ unsigned FixedDbg = 0;
+ MachineModuleInfo *MMI = &MF->getMMI();
+
+ // Remap debug information that refers to stack slots.
+ MachineModuleInfo::VariableDbgInfoMapTy &VMap = MMI->getVariableDbgInfo();
+ for (MachineModuleInfo::VariableDbgInfoMapTy::iterator VI = VMap.begin(),
+ VE = VMap.end(); VI != VE; ++VI) {
+ const MDNode *Var = VI->first;
+ if (!Var) continue;
+ std::pair<unsigned, DebugLoc> &VP = VI->second;
+ if (SlotRemap.count(VP.first)) {
+ DEBUG(dbgs()<<"Remapping debug info for ["<<Var->getName()<<"].\n");
+ VP.first = SlotRemap[VP.first];
+ FixedDbg++;
+ }
+ }
+
+ // Keep a list of *allocas* which need to be remapped.
+ DenseMap<const AllocaInst*, const AllocaInst*> Allocas;
+ for (DenseMap<int, int>::iterator it = SlotRemap.begin(),
+ e = SlotRemap.end(); it != e; ++it) {
+ const AllocaInst *From = MFI->getObjectAllocation(it->first);
+ const AllocaInst *To = MFI->getObjectAllocation(it->second);
+ assert(To && From && "Invalid allocation object");
+ Allocas[From] = To;
+ }
+
+ // Remap all instructions to the new stack slots.
+ MachineFunction::iterator BB, BBE;
+ MachineBasicBlock::iterator I, IE;
+ for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB)
+ for (I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+
+ // Skip lifetime markers. We'll remove them soon.
+ if (I->getOpcode() == TargetOpcode::LIFETIME_START ||
+ I->getOpcode() == TargetOpcode::LIFETIME_END)
+ continue;
+
+ // Update the MachineMemOperand to use the new alloca.
+ for (MachineInstr::mmo_iterator MM = I->memoperands_begin(),
+ E = I->memoperands_end(); MM != E; ++MM) {
+ MachineMemOperand *MMO = *MM;
+
+ const Value *V = MMO->getValue();
+
+ if (!V)
+ continue;
+
+ // Climb up and find the original alloca.
+ V = GetUnderlyingObject(V);
+ // If we did not find one, or if the one that we found is not in our
+ // map, then move on.
+ if (!V || !isa<AllocaInst>(V)) {
+ // Clear mem operand since we don't know for sure that it doesn't
+ // alias a merged alloca.
+ MMO->setValue(0);
+ continue;
+ }
+ const AllocaInst *AI= cast<AllocaInst>(V);
+ if (!Allocas.count(AI))
+ continue;
+
+ MMO->setValue(Allocas[AI]);
+ FixedMemOp++;
+ }
+
+ // Update all of the machine instruction operands.
+ for (unsigned i = 0 ; i < I->getNumOperands(); ++i) {
+ MachineOperand &MO = I->getOperand(i);
+
+ if (!MO.isFI())
+ continue;
+ int FromSlot = MO.getIndex();
+
+ // Don't touch arguments.
+ if (FromSlot<0)
+ continue;
+
+ // Only look at mapped slots.
+ if (!SlotRemap.count(FromSlot))
+ continue;
+
+ // In a debug build, check that the instruction that we are modifying is
+ // inside the expected live range. If the instruction is not inside
+ // the calculated range then it means that the alloca usage moved
+ // outside of the lifetime markers, or that the user has a bug.
+ // NOTE: Alloca address calculations which happen outside the lifetime
+ // zone are okay, despite the fact that we don't have a good way
+ // for validating all of the usages of the calculation.
+#ifndef NDEBUG
+ bool TouchesMemory = I->mayLoad() || I->mayStore();
+ // If we *don't* protect the user from escaped allocas, don't bother
+ // validating the instructions.
+ if (!I->isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ LiveInterval *Interval = Intervals[FromSlot];
+ assert(Interval->find(Index) != Interval->end() &&
+ "Found instruction usage outside of live range.");
+ }
+#endif
+
+ // Fix the machine instructions.
+ int ToSlot = SlotRemap[FromSlot];
+ MO.setIndex(ToSlot);
+ FixedInstr++;
+ }
+ }
+
+ DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n");
+ DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n");
+ DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
+}
+
+void StackColoring::removeInvalidSlotRanges() {
+ MachineFunction::iterator BB, BBE;
+ MachineBasicBlock::iterator I, IE;
+ for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB)
+ for (I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+
+ if (I->getOpcode() == TargetOpcode::LIFETIME_START ||
+ I->getOpcode() == TargetOpcode::LIFETIME_END || I->isDebugValue())
+ continue;
+
+ // Some intervals are suspicious! In some cases we find address
+ // calculations outside of the lifetime zone, but no actual memory
+ // reads or writes. Memory accesses outside of the lifetime zone are a clear
+ // violation, but address calculations are okay. This can happen when
+ // GEPs are hoisted outside of the lifetime zone.
+ // So, in here we only check instructions which can read or write memory.
+ if (!I->mayLoad() && !I->mayStore())
+ continue;
+
+ // Check all of the machine operands.
+ for (unsigned i = 0 ; i < I->getNumOperands(); ++i) {
+ MachineOperand &MO = I->getOperand(i);
+
+ if (!MO.isFI())
+ continue;
+
+ int Slot = MO.getIndex();
+
+ if (Slot<0)
+ continue;
+
+ if (Intervals[Slot]->empty())
+ continue;
+
+ // Check that the used slot is inside the calculated lifetime range.
+ // If it is not, warn about it and invalidate the range.
+ LiveInterval *Interval = Intervals[Slot];
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ if (Interval->find(Index) == Interval->end()) {
+ Intervals[Slot]->clear();
+ DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
+ EscapedAllocas++;
+ }
+ }
+ }
+}
+
+void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
+ unsigned NumSlots) {
+ // Expunge slot remap map.
+ for (unsigned i=0; i < NumSlots; ++i) {
+ // If we are remapping slot i, follow the chain to its final target.
+ if (SlotRemap.count(i)) {
+ int Target = SlotRemap[i];
+ // As long as our target is mapped to something else, follow it.
+ while (SlotRemap.count(Target)) {
+ Target = SlotRemap[Target];
+ SlotRemap[i] = Target;
+ }
+ }
+ }
+}
+
+bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
+ DEBUG(dbgs() << "********** Stack Coloring **********\n"
+ << "********** Function: "
+ << ((const Value*)Func.getFunction())->getName() << '\n');
+ MF = &Func;
+ MFI = MF->getFrameInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ BlockLiveness.clear();
+ BasicBlocks.clear();
+ BasicBlockNumbering.clear();
+ Markers.clear();
+ Intervals.clear();
+ VNInfoAllocator.Reset();
+
+ unsigned NumSlots = MFI->getObjectIndexEnd();
+
+ // If there are no stack slots then there are no markers to remove.
+ if (!NumSlots)
+ return false;
+
+ SmallVector<int, 8> SortedSlots;
+
+ SortedSlots.reserve(NumSlots);
+ Intervals.reserve(NumSlots);
+
+ unsigned NumMarkers = collectMarkers(NumSlots);
+
+ unsigned TotalSize = 0;
+ DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
+ DEBUG(dbgs()<<"Slot structure:\n");
+
+ for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
+ DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
+ TotalSize += MFI->getObjectSize(i);
+ }
+
+ DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");
+
+ // Don't continue if there are not enough lifetime markers, or the
+ // stack is too small, or we are told not to optimize the slots.
+ if (NumMarkers < 2 || TotalSize < 16 || DisableColoring) {
+ DEBUG(dbgs()<<"Will not try to merge slots.\n");
+ return removeAllMarkers();
+ }
+
+ for (unsigned i=0; i < NumSlots; ++i) {
+ LiveInterval *LI = new LiveInterval(i, 0);
+ Intervals.push_back(LI);
+ LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
+ SortedSlots.push_back(i);
+ }
+
+ // Calculate the liveness of each block.
+ calculateLocalLiveness();
+
+ // Propagate the liveness information.
+ calculateLiveIntervals(NumSlots);
+
+ // Search for allocas which are used outside of the declared lifetime
+ // markers.
+ if (ProtectFromEscapedAllocas)
+ removeInvalidSlotRanges();
+
+ // Maps old slots to new slots.
+ DenseMap<int, int> SlotRemap;
+ unsigned RemovedSlots = 0;
+ unsigned ReducedSize = 0;
+
+ // Do not bother looking at empty intervals.
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (Intervals[SortedSlots[I]]->empty())
+ SortedSlots[I] = -1;
+ }
+
+ // This is a simple greedy algorithm for merging allocas. First, sort the
+ // slots, placing the largest slots first. Next, perform an n^2 scan and look
+ // for disjoint slots. When you find disjoint slots, merge the smaller one
+ // into the bigger one and update the live interval. Remove the small alloca
+ // and continue.
+
+ // Sort the slots according to their size. Place unused slots at the end.
+ std::sort(SortedSlots.begin(), SortedSlots.end(), SlotSizeSorter(MFI));
+
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (SortedSlots[I] == -1)
+ continue;
+
+ for (unsigned J=I+1; J < NumSlots; ++J) {
+ if (SortedSlots[J] == -1)
+ continue;
+
+ int FirstSlot = SortedSlots[I];
+ int SecondSlot = SortedSlots[J];
+ LiveInterval *First = Intervals[FirstSlot];
+ LiveInterval *Second = Intervals[SecondSlot];
+ assert (!First->empty() && !Second->empty() && "Found an empty range");
+
+ // Merge disjoint slots.
+ if (!First->overlaps(*Second)) {
+ Changed = true;
+ First->MergeRangesInAsValue(*Second, First->getValNumInfo(0));
+ SlotRemap[SecondSlot] = FirstSlot;
+ SortedSlots[J] = -1;
+ DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
+ SecondSlot<<" together.\n");
+ unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
+ MFI->getObjectAlignment(SecondSlot));
+
+ assert(MFI->getObjectSize(FirstSlot) >=
+ MFI->getObjectSize(SecondSlot) &&
+ "Merging a small object into a larger one");
+
+ RemovedSlots+=1;
+ ReducedSize += MFI->getObjectSize(SecondSlot);
+ MFI->setObjectAlignment(FirstSlot, MaxAlignment);
+ MFI->RemoveStackObject(SecondSlot);
+ }
+ }
+ }
+ }// While changed.
+
+ // Record statistics.
+ StackSpaceSaved += ReducedSize;
+ StackSlotMerged += RemovedSlots;
+ DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<<
+ ReducedSize<<" bytes\n");
+
+ // Scan the entire function and update all machine operands that use frame
+ // indices to use the remapped frame index.
+ expungeSlotMap(SlotRemap, NumSlots);
+ remapInstructions(SlotRemap);
+
+ // Release the intervals.
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ delete Intervals[I];
+ }
+
+ return removeAllMarkers();
+}
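For illustration only, not part of the patch above: the heart of the new pass is the greedy loop near the end of runOnMachineFunction(): slots are sorted largest-first and any two slots whose live intervals never overlap are folded together, with the smaller slot's interval merged into the larger one. A simplified standalone sketch using a single [Start, End) range per slot in place of a real LiveInterval; the slot sizes and ranges are hypothetical.

#include <algorithm>
#include <cstdio>
#include <vector>

// Simplified stand-ins: each "slot" has a size and one liveness range.
struct Slot { unsigned Size; unsigned Start, End; bool Dead = false; };

static bool overlaps(const Slot &A, const Slot &B) {
  return A.Start < B.End && B.Start < A.End;
}

int main() {
  std::vector<Slot> Slots = {{32, 0, 10}, {16, 12, 20}, {8, 5, 15}};
  std::vector<unsigned> Order = {0, 1, 2};

  // Largest slots first, as SlotSizeSorter does.
  std::sort(Order.begin(), Order.end(), [&](unsigned L, unsigned R) {
    return Slots[L].Size > Slots[R].Size;
  });

  // Greedy n^2 scan: merge each smaller disjoint slot into the larger one.
  unsigned Saved = 0;
  for (unsigned I = 0; I < Order.size(); ++I) {
    if (Slots[Order[I]].Dead) continue;
    for (unsigned J = I + 1; J < Order.size(); ++J) {
      Slot &Big = Slots[Order[I]], &Small = Slots[Order[J]];
      if (Small.Dead || overlaps(Big, Small)) continue;
      // Extend the big slot's liveness to cover the merged slot.
      Big.Start = std::min(Big.Start, Small.Start);
      Big.End = std::max(Big.End, Small.End);
      Small.Dead = true;
      Saved += Small.Size;
    }
  }
  std::printf("saved %u bytes\n", Saved);
  return 0;
}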
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index f1eab1f8e70e..31e9ec0ac0b9 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -26,18 +26,12 @@
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/Triple.h"
using namespace llvm;
-// SSPBufferSize - The lower bound for a buffer to be considered for stack
-// smashing protection.
-static cl::opt<unsigned>
-SSPBufferSize("stack-protector-buffer-size", cl::init(8),
- cl::desc("Lower bound for a buffer to be considered for "
- "stack protection"));
-
namespace {
class StackProtector : public FunctionPass {
/// TLI - Keep a pointer of a TargetLowering to consult for determining
@@ -61,6 +55,11 @@ namespace {
/// check fails.
BasicBlock *CreateFailBB();
+ /// ContainsProtectableArray - Check whether the type either is an array or
+ /// contains an array of sufficient size so that we need stack protectors
+ /// for it.
+ bool ContainsProtectableArray(Type *Ty, bool InStruct = false) const;
+
/// RequiresStackProtector - Check whether or not this function needs a
/// stack protector based upon the stack protector level.
bool RequiresStackProtector() const;
@@ -100,21 +99,50 @@ bool StackProtector::runOnFunction(Function &Fn) {
return InsertStackProtectors();
}
+/// ContainsProtectableArray - Check whether the type either is an array or
+/// contains a char array of sufficient size so that we need stack protectors
+/// for it.
+bool StackProtector::ContainsProtectableArray(Type *Ty, bool InStruct) const {
+ if (!Ty) return false;
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ const TargetMachine &TM = TLI->getTargetMachine();
+ if (!AT->getElementType()->isIntegerTy(8)) {
+ Triple Trip(TM.getTargetTriple());
+
+ // If we're on a non-Darwin platform or we're inside of a structure, don't
+ // add stack protectors unless the array is a character array.
+ if (InStruct || !Trip.isOSDarwin())
+ return false;
+ }
+
+ // If an array has at least SSPBufferSize bytes of allocated space, then we
+ // emit stack protectors.
+ if (TM.Options.SSPBufferSize <= TLI->getDataLayout()->getTypeAllocSize(AT))
+ return true;
+ }
+
+ const StructType *ST = dyn_cast<StructType>(Ty);
+ if (!ST) return false;
+
+ for (StructType::element_iterator I = ST->element_begin(),
+ E = ST->element_end(); I != E; ++I)
+ if (ContainsProtectableArray(*I, true))
+ return true;
+
+ return false;
+}
+
/// RequiresStackProtector - Check whether or not this function needs a stack
/// protector based upon the stack protector level. The heuristic we use is to
/// add a guard variable to functions that call alloca, and functions with
/// buffers larger than SSPBufferSize bytes.
bool StackProtector::RequiresStackProtector() const {
- if (F->hasFnAttr(Attribute::StackProtectReq))
+ if (F->getFnAttributes().hasAttribute(Attributes::StackProtectReq))
return true;
- if (!F->hasFnAttr(Attribute::StackProtect))
+ if (!F->getFnAttributes().hasAttribute(Attributes::StackProtect))
return false;
- const TargetData *TD = TLI->getTargetData();
- const TargetMachine &TM = TLI->getTargetMachine();
- Triple Trip(TM.getTargetTriple());
-
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
BasicBlock *BB = I;
@@ -126,17 +154,8 @@ bool StackProtector::RequiresStackProtector() const {
// protectors.
return true;
- if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) {
- // If we're on a non-Darwin platform, don't add stack protectors
- // unless the array is a character array.
- if (!Trip.isOSDarwin() && !AT->getElementType()->isIntegerTy(8))
- continue;
-
- // If an array has more than SSPBufferSize bytes of allocated space,
- // then we emit stack protectors.
- if (SSPBufferSize <= TD->getTypeAllocSize(AT))
- return true;
- }
+ if (ContainsProtectableArray(AI->getAllocatedType()))
+ return true;
}
}
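For illustration only, not part of the patch above: ContainsProtectableArray() recurses through struct members, so a large enough array now triggers a stack protector even when it is nested inside a struct; outside Darwin (or once inside a struct) only i8 arrays qualify, and the size test mirrors the SSPBufferSize <= allocSize comparison in the patch. A simplified standalone sketch over a toy type representation; the Ty model and the threshold value are hypothetical.

#include <cstdio>
#include <memory>
#include <vector>

// Toy type model: either an array (ElemIsChar, SizeInBytes) or a struct of
// nested types. The real code queries ArrayType/StructType and DataLayout.
struct Ty {
  bool IsArray = false;
  bool ElemIsChar = false;
  unsigned SizeInBytes = 0;
  std::vector<std::shared_ptr<Ty>> Fields;
};

static bool containsProtectableArray(const Ty &T, unsigned SSPBufferSize,
                                     bool IsDarwin, bool InStruct = false) {
  if (T.IsArray) {
    // Outside Darwin, or once nested in a struct, only char arrays qualify.
    if (!T.ElemIsChar && (InStruct || !IsDarwin))
      return false;
    // Mirrors "SSPBufferSize <= allocated size" in the patch.
    return T.SizeInBytes >= SSPBufferSize;
  }
  for (const auto &F : T.Fields)
    if (containsProtectableArray(*F, SSPBufferSize, IsDarwin, /*InStruct=*/true))
      return true;
  return false;
}

int main() {
  auto Buf = std::make_shared<Ty>();
  Buf->IsArray = true;
  Buf->ElemIsChar = true;
  Buf->SizeInBytes = 64;

  Ty Outer;                      // roughly: struct { char buf[64]; }
  Outer.Fields.push_back(Buf);

  std::printf("%d\n", containsProtectableArray(Outer, 8, /*IsDarwin=*/false));
  return 0;
}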
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 20da36e8fb41..d349abc35774 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -11,8 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "stackcoloring"
-#include "llvm/Function.h"
+#define DEBUG_TYPE "stackslotcoloring"
#include "llvm/Module.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -391,8 +390,7 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
DEBUG({
dbgs() << "********** Stack Slot Coloring **********\n"
- << "********** Function: "
- << MF.getFunction()->getName() << '\n';
+ << "********** Function: " << MF.getName() << '\n';
});
MFI = MF.getFrameInfo();
diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp
index 5b0619504647..39fd600d4abf 100644
--- a/lib/CodeGen/StrongPHIElimination.cpp
+++ b/lib/CodeGen/StrongPHIElimination.cpp
@@ -404,9 +404,9 @@ bool StrongPHIElimination::runOnMachineFunction(MachineFunction &MF) {
}
void StrongPHIElimination::addReg(unsigned Reg) {
- if (RegNodeMap.count(Reg))
- return;
- RegNodeMap[Reg] = new (Allocator) Node(Reg);
+ Node *&N = RegNodeMap[Reg];
+ if (!N)
+ N = new (Allocator) Node(Reg);
}
StrongPHIElimination::Node*
@@ -714,8 +714,9 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI,
assert(getRegColor(CopyReg) == CopyReg);
}
- if (!InsertedSrcCopyMap.count(std::make_pair(PredBB, PHIColor)))
- InsertedSrcCopyMap[std::make_pair(PredBB, PHIColor)] = CopyInstr;
+ // Insert into map if not already there.
+ InsertedSrcCopyMap.insert(std::make_pair(std::make_pair(PredBB, PHIColor),
+ CopyInstr));
}
SrcMO.setReg(CopyReg);
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index a813fa65ac58..1497d1ba6287 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -552,7 +552,8 @@ TailDuplicatePass::shouldTailDuplicate(const MachineFunction &MF,
// compensate for the duplication.
unsigned MaxDuplicateCount;
if (TailDuplicateSize.getNumOccurrences() == 0 &&
- MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize))
MaxDuplicateCount = 1;
else
MaxDuplicateCount = TailDuplicateSize;
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index ddee6b240160..4439192fe2f4 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -99,17 +99,8 @@ MachineInstr *TargetInstrInfoImpl::commuteInstruction(MachineInstr *MI,
if (NewMI) {
// Create a new instruction.
- bool Reg0IsDead = HasDef ? MI->getOperand(0).isDead() : false;
MachineFunction &MF = *MI->getParent()->getParent();
- if (HasDef)
- return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
- .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead), SubReg0)
- .addReg(Reg2, getKillRegState(Reg2IsKill), SubReg2)
- .addReg(Reg1, getKillRegState(Reg1IsKill), SubReg1);
- else
- return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
- .addReg(Reg2, getKillRegState(Reg2IsKill), SubReg2)
- .addReg(Reg1, getKillRegState(Reg1IsKill), SubReg1);
+ MI = MF.CloneMachineInstr(MI);
}
if (HasDef) {
@@ -572,6 +563,8 @@ TargetInstrInfoImpl::getNumMicroOps(const InstrItineraryData *ItinData,
/// Return the default expected latency for a def based on it's opcode.
unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel,
const MachineInstr *DefMI) const {
+ if (DefMI->isTransient())
+ return 0;
if (DefMI->mayLoad())
return SchedModel->LoadLatency;
if (isHighLatencyDef(DefMI->getOpcode()))
@@ -615,13 +608,13 @@ getOperandLatency(const InstrItineraryData *ItinData,
/// If we can determine the operand latency from the def only, without itinerary
/// lookup, do so. Otherwise return -1.
-static int computeDefOperandLatency(
- const TargetInstrInfo *TII, const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, bool FindMin) {
+int TargetInstrInfo::computeDefOperandLatency(
+ const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, bool FindMin) const {
// Let the target hook getInstrLatency handle missing itineraries.
if (!ItinData)
- return TII->getInstrLatency(ItinData, DefMI);
+ return getInstrLatency(ItinData, DefMI);
// Return a latency based on the itinerary properties and defining instruction
// if possible. Some common subtargets don't require per-operand latency,
@@ -630,7 +623,7 @@ static int computeDefOperandLatency(
// If MinLatency is valid, call getInstrLatency. This uses Stage latency if
// it exists before defaulting to MinLatency.
if (ItinData->SchedModel->MinLatency >= 0)
- return TII->getInstrLatency(ItinData, DefMI);
+ return getInstrLatency(ItinData, DefMI);
// If MinLatency is invalid, OperandLatency is interpreted as MinLatency.
   // For empty itineraries, short-circuit the check and default to one cycle.
@@ -638,29 +631,42 @@ static int computeDefOperandLatency(
return 1;
}
else if(ItinData->isEmpty())
- return TII->defaultDefLatency(ItinData->SchedModel, DefMI);
+ return defaultDefLatency(ItinData->SchedModel, DefMI);
// ...operand lookup required
return -1;
}
/// computeOperandLatency - Compute and return the latency of the given data
-/// dependent def and use when the operand indices are already known.
+/// dependent def and use when the operand indices are already known. UseMI may
+/// be NULL for an unknown use.
+///
+/// FindMin may be set to get the minimum vs. expected latency. Minimum
+/// latency is used for scheduling groups, while expected latency is for
+/// instruction cost and critical path.
///
-/// FindMin may be set to get the minimum vs. expected latency.
+/// Depending on the subtarget's itinerary properties, this may or may not need
+/// to call getOperandLatency(). For most subtargets, we don't need DefIdx or
+/// UseIdx to compute min latency.
unsigned TargetInstrInfo::
computeOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *UseMI, unsigned UseIdx,
bool FindMin) const {
- int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin);
+ int DefLatency = computeDefOperandLatency(ItinData, DefMI, FindMin);
if (DefLatency >= 0)
return DefLatency;
assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
- int OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ int OperLatency = 0;
+ if (UseMI)
+ OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
+ else {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
+ }
if (OperLatency >= 0)
return OperLatency;
@@ -673,77 +679,3 @@ computeOperandLatency(const InstrItineraryData *ItinData,
defaultDefLatency(ItinData->SchedModel, DefMI));
return InstrLatency;
}
-
-/// computeOperandLatency - Compute and return the latency of the given data
-/// dependent def and use. DefMI must be a valid def. UseMI may be NULL for an
-/// unknown use. Depending on the subtarget's itinerary properties, this may or
-/// may not need to call getOperandLatency().
-///
-/// FindMin may be set to get the minimum vs. expected latency. Minimum
-/// latency is used for scheduling groups, while expected latency is for
-/// instruction cost and critical path.
-///
-/// For most subtargets, we don't need DefIdx or UseIdx to compute min latency.
-/// DefMI must be a valid definition, but UseMI may be NULL for an unknown use.
-unsigned TargetInstrInfo::
-computeOperandLatency(const InstrItineraryData *ItinData,
- const TargetRegisterInfo *TRI,
- const MachineInstr *DefMI, const MachineInstr *UseMI,
- unsigned Reg, bool FindMin) const {
-
- int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin);
- if (DefLatency >= 0)
- return DefLatency;
-
- assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
-
- // Find the definition of the register in the defining instruction.
- int DefIdx = DefMI->findRegisterDefOperandIdx(Reg);
- if (DefIdx != -1) {
- const MachineOperand &MO = DefMI->getOperand(DefIdx);
- if (MO.isReg() && MO.isImplicit() &&
- DefIdx >= (int)DefMI->getDesc().getNumOperands()) {
- // This is an implicit def, getOperandLatency() won't return the correct
- // latency. e.g.
- // %D6<def>, %D7<def> = VLD1q16 %R2<kill>, 0, ..., %Q3<imp-def>
- // %Q1<def> = VMULv8i16 %Q1<kill>, %Q3<kill>, ...
- // What we want is to compute latency between def of %D6/%D7 and use of
- // %Q3 instead.
- unsigned Op2 = DefMI->findRegisterDefOperandIdx(Reg, false, true, TRI);
- if (DefMI->getOperand(Op2).isReg())
- DefIdx = Op2;
- }
- // For all uses of the register, calculate the maxmimum latency
- int OperLatency = -1;
-
- // UseMI is null, then it must be a scheduling barrier.
- if (!UseMI) {
- unsigned DefClass = DefMI->getDesc().getSchedClass();
- OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
- }
- else {
- for (unsigned i = 0, e = UseMI->getNumOperands(); i != e; ++i) {
- const MachineOperand &MO = UseMI->getOperand(i);
- if (!MO.isReg() || !MO.isUse())
- continue;
- unsigned MOReg = MO.getReg();
- if (MOReg != Reg)
- continue;
-
- int UseCycle = getOperandLatency(ItinData, DefMI, DefIdx, UseMI, i);
- OperLatency = std::max(OperLatency, UseCycle);
- }
- }
- // If we found an operand latency, we're done.
- if (OperLatency >= 0)
- return OperLatency;
- }
- // No operand latency was found.
- unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
-
- // Expected latency is the max of the stage latency and itinerary props.
- if (!FindMin)
- InstrLatency = std::max(InstrLatency,
- defaultDefLatency(ItinData->SchedModel, DefMI));
- return InstrLatency;
-}
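
The surviving computeOperandLatency above now accepts a null UseMI (an unknown use, for example a scheduling barrier) and falls back to the def operand's ready cycle in its scheduling class. A standalone sketch of that decision with toy types standing in for MachineInstr and InstrItineraryData; everything here is hypothetical except the shape of the fallback:

    #include <cstdio>

    struct ToyItineraries {
      // Cycle at which def operand DefIdx of scheduling class DefClass is ready.
      int getOperandCycle(unsigned DefClass, unsigned DefIdx) const {
        (void)DefClass;
        return 2 + (int)DefIdx; // made-up numbers for illustration
      }
    };

    // With a known use, take the def->use latency; with no use, take the def
    // operand's ready cycle from the itinerary.
    int operandLatency(const ToyItineraries &Itins, unsigned DefClass,
                       unsigned DefIdx, bool HaveUse, int DefUseLatency) {
      if (HaveUse)
        return DefUseLatency;
      return Itins.getOperandCycle(DefClass, DefIdx);
    }

    int main() {
      ToyItineraries Itins;
      std::printf("%d %d\n", operandLatency(Itins, 0, 1, true, 4),
                  operandLatency(Itins, 0, 1, false, -1)); // prints "4 3"
    }
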
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 2a2fa9e54325..8f5d770f6651 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -27,7 +27,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Dwarf.h"
@@ -77,9 +77,9 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer,
Flags,
SectionKind::getDataRel(),
0, Label->getName());
- unsigned Size = TM.getTargetData()->getPointerSize();
+ unsigned Size = TM.getDataLayout()->getPointerSize();
Streamer.SwitchSection(Sec);
- Streamer.EmitValueToAlignment(TM.getTargetData()->getPointerABIAlignment());
+ Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment());
Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
const MCExpr *E = MCConstantExpr::Create(Size, getContext());
Streamer.EmitELFSize(Label, E);
@@ -247,7 +247,7 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
// FIXME: this is getting the alignment of the character, not the
// alignment of the global!
unsigned Align =
- TM.getTargetData()->getPreferredAlignment(cast<GlobalVariable>(GV));
+ TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV));
const char *SizeSpec = ".rodata.str1.";
if (Kind.isMergeable2ByteCString())
@@ -522,14 +522,14 @@ SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
// FIXME: Alignment check should be handled by section classifier.
if (Kind.isMergeable1ByteCString() &&
- TM.getTargetData()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
+ TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
return CStringSection;
// Do not put 16-bit arrays in the UString section if they have an
// externally visible label, this runs into issues with certain linker
// versions.
if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() &&
- TM.getTargetData()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
+ TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
return UStringSection;
if (Kind.isMergeableConst()) {
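
Every change in this file is a mechanical rename from TM.getTargetData() to TM.getDataLayout(); the queries themselves (pointer size, ABI alignment, preferred global alignment) keep their meaning. A short sketch of the renamed interface, assuming only the headers and accessors that appear in this hunk:

    #include "llvm/DataLayout.h"
    #include "llvm/Target/TargetMachine.h"

    // The same kinds of DataLayout queries this file now performs.
    static void describeLayout(const llvm::TargetMachine &TM) {
      const llvm::DataLayout *DL = TM.getDataLayout();
      unsigned PtrBytes = DL->getPointerSize();         // e.g. 8 on x86-64
      unsigned PtrAlign = DL->getPointerABIAlignment(); // ABI pointer alignment
      (void)PtrBytes;
      (void)PtrAlign;
    }
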
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
new file mode 100644
index 000000000000..ca3b0e0b1173
--- /dev/null
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -0,0 +1,306 @@
+//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a wrapper around MCSchedModel that allows the interface
+// to benefit from information currently only available in TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSchedModel("schedmodel", cl::Hidden, cl::init(true),
+ cl::desc("Use TargetSchedModel for latency lookup"));
+
+static cl::opt<bool> EnableSchedItins("scheditins", cl::Hidden, cl::init(true),
+ cl::desc("Use InstrItineraryData for latency lookup"));
+
+bool TargetSchedModel::hasInstrSchedModel() const {
+ return EnableSchedModel && SchedModel.hasInstrSchedModel();
+}
+
+bool TargetSchedModel::hasInstrItineraries() const {
+ return EnableSchedItins && !InstrItins.isEmpty();
+}
+
+static unsigned gcd(unsigned Dividend, unsigned Divisor) {
+ // Dividend and Divisor will be naturally swapped as needed.
+ while(Divisor) {
+ unsigned Rem = Dividend % Divisor;
+ Dividend = Divisor;
+ Divisor = Rem;
+  }
+ return Dividend;
+}
+static unsigned lcm(unsigned A, unsigned B) {
+ unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
+ assert((LCM >= A && LCM >= B) && "LCM overflow");
+ return LCM;
+}
+
+void TargetSchedModel::init(const MCSchedModel &sm,
+ const TargetSubtargetInfo *sti,
+ const TargetInstrInfo *tii) {
+ SchedModel = sm;
+ STI = sti;
+ TII = tii;
+ STI->initInstrItins(InstrItins);
+
+ unsigned NumRes = SchedModel.getNumProcResourceKinds();
+ ResourceFactors.resize(NumRes);
+ ResourceLCM = SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ if (NumUnits > 0)
+ ResourceLCM = lcm(ResourceLCM, NumUnits);
+ }
+ MicroOpFactor = ResourceLCM / SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ ResourceFactors[Idx] = NumUnits ? (ResourceLCM / NumUnits) : 0;
+ }
+}
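
init() above normalizes all per-resource counts to a common scale: ResourceLCM is the least common multiple of the issue width and every resource's unit count, so later comparisons of resource usage stay in integer arithmetic. A self-contained worked example with invented numbers (issue width 4, resources with 2, 1 and 3 units) that reproduces the same arithmetic:

    #include <cstdio>
    #include <vector>

    static unsigned gcd(unsigned Dividend, unsigned Divisor) {
      while (Divisor) {
        unsigned Rem = Dividend % Divisor;
        Dividend = Divisor;
        Divisor = Rem;
      }
      return Dividend;
    }

    static unsigned lcm(unsigned A, unsigned B) {
      return (unsigned)((unsigned long long)A * B / gcd(A, B));
    }

    int main() {
      unsigned IssueWidth = 4;                    // invented machine
      std::vector<unsigned> NumUnits = {2, 1, 3}; // invented resource counts

      unsigned ResourceLCM = IssueWidth;
      for (unsigned U : NumUnits)
        if (U > 0)
          ResourceLCM = lcm(ResourceLCM, U);      // 12

      std::printf("MicroOpFactor = %u\n", ResourceLCM / IssueWidth); // 3
      for (unsigned U : NumUnits)
        std::printf("factor = %u\n", U ? ResourceLCM / U : 0);       // 6, 12, 4
    }
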
+
+unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrItineraries()) {
+ int UOps = InstrItins.getNumMicroOps(MI->getDesc().getSchedClass());
+ return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, MI);
+ }
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->NumMicroOps;
+ }
+ return MI->isTransient() ? 0 : 1;
+}
+
+// The machine model may explicitly specify an invalid latency, which
+// effectively means infinite latency. Since users of the TargetSchedule API
+// don't know how to handle this, we convert it to a very large latency that is
+// easy to distinguish when debugging the DAG but won't induce overflow.
+static unsigned convertLatency(int Cycles) {
+ return Cycles >= 0 ? Cycles : 1000;
+}
+
+/// If we can determine the operand latency from the def only, without machine
+/// model or itinerary lookup, do so. Otherwise return -1.
+int TargetSchedModel::getDefLatency(const MachineInstr *DefMI,
+ bool FindMin) const {
+
+ // Return a latency based on the itinerary properties and defining instruction
+ // if possible. Some common subtargets don't require per-operand latency,
+ // especially for minimum latencies.
+ if (FindMin) {
+ // If MinLatency is invalid, then use the itinerary for MinLatency. If no
+ // itinerary exists either, then use single cycle latency.
+ if (SchedModel.MinLatency < 0 && !hasInstrItineraries()) {
+ return 1;
+ }
+ return SchedModel.MinLatency;
+ }
+ else if (!hasInstrSchedModel() && !hasInstrItineraries()) {
+ return TII->defaultDefLatency(&SchedModel, DefMI);
+ }
+ // ...operand lookup required
+ return -1;
+}
+
+/// Return the MCSchedClassDesc for this instruction. Some SchedClasses require
+/// evaluation of predicates that depend on instruction operands or flags.
+const MCSchedClassDesc *TargetSchedModel::
+resolveSchedClass(const MachineInstr *MI) const {
+
+ // Get the definition's scheduling class descriptor from this machine model.
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+
+#ifndef NDEBUG
+ unsigned NIter = 0;
+#endif
+ while (SCDesc->isVariant()) {
+ assert(++NIter < 6 && "Variants are nested deeper than the magic number");
+
+ SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
+ SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ }
+ return SCDesc;
+}
+
+/// Find the def index of this operand. This index maps to the machine model and
+/// is independent of use operands. Def operands may be reordered with uses or
+/// merged with uses without affecting the def index (e.g. before/after
+/// regalloc). However, an instruction's def operands must never be reordered
+/// with respect to each other.
+static unsigned findDefIdx(const MachineInstr *MI, unsigned DefOperIdx) {
+ unsigned DefIdx = 0;
+ for (unsigned i = 0; i != DefOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef())
+ ++DefIdx;
+ }
+ return DefIdx;
+}
+
+/// Find the use index of this operand. This is independent of the instruction's
+/// def operands.
+///
+/// Note that uses are not determined by the operand's isUse property, which
+/// is simply the inverse of isDef. Here we consider any readsReg operand to be
+/// a "use". The machine model allows an operand to be both a Def and Use.
+static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
+ unsigned UseIdx = 0;
+ for (unsigned i = 0; i != UseOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.readsReg())
+ ++UseIdx;
+ }
+ return UseIdx;
+}
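
findDefIdx and findUseIdx above translate a MachineInstr operand position into the machine model's def or use numbering by counting only the qualifying operands in front of it. A toy illustration of the counting; the operand layout is invented:

    #include <cstdio>
    #include <vector>

    struct Op { bool IsReg, IsDef, ReadsReg; };

    // Count the register defs that precede position DefOperIdx.
    static unsigned findDefIdx(const std::vector<Op> &Ops, unsigned DefOperIdx) {
      unsigned DefIdx = 0;
      for (unsigned i = 0; i != DefOperIdx; ++i)
        if (Ops[i].IsReg && Ops[i].IsDef)
          ++DefIdx;
      return DefIdx;
    }

    int main() {
      // def, use, immediate, def: the def at position 3 is model def index 1.
      std::vector<Op> Ops = {{true, true, false},
                             {true, false, true},
                             {false, false, false},
                             {true, true, false}};
      std::printf("%u\n", findDefIdx(Ops, 3)); // prints 1
    }
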
+
+// Top-level API for clients that know the operand indices.
+unsigned TargetSchedModel::computeOperandLatency(
+ const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *UseMI, unsigned UseOperIdx,
+ bool FindMin) const {
+
+ int DefLatency = getDefLatency(DefMI, FindMin);
+ if (DefLatency >= 0)
+ return DefLatency;
+
+ if (hasInstrItineraries()) {
+ int OperLatency = 0;
+ if (UseMI) {
+ OperLatency =
+ TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx, UseMI, UseOperIdx);
+ }
+ else {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = InstrItins.getOperandCycle(DefClass, DefOperIdx);
+ }
+ if (OperLatency >= 0)
+ return OperLatency;
+
+ // No operand latency was found.
+ unsigned InstrLatency = TII->getInstrLatency(&InstrItins, DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ // Rather than directly querying InstrItins stage latency, we call a TII
+ // hook to allow subtargets to specialize latency. This hook is only
+ // applicable to the InstrItins model. InstrSchedModel should model all
+ // special cases without TII hooks.
+ if (!FindMin)
+ InstrLatency = std::max(InstrLatency,
+ TII->defaultDefLatency(&SchedModel, DefMI));
+ return InstrLatency;
+ }
+ assert(!FindMin && hasInstrSchedModel() &&
+ "Expected a SchedModel for this cpu");
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
+ if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI->getWriteLatencyEntry(SCDesc, DefIdx);
+ unsigned WriteID = WLEntry->WriteResourceID;
+ unsigned Latency = convertLatency(WLEntry->Cycles);
+ if (!UseMI)
+ return Latency;
+
+ // Lookup the use's latency adjustment in SubtargetInfo.
+ const MCSchedClassDesc *UseDesc = resolveSchedClass(UseMI);
+ if (UseDesc->NumReadAdvanceEntries == 0)
+ return Latency;
+ unsigned UseIdx = findUseIdx(UseMI, UseOperIdx);
+ return Latency - STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+ }
+ // If DefIdx does not exist in the model (e.g. implicit defs), then return
+ // unit latency (defaultDefLatency may be too conservative).
+#ifndef NDEBUG
+ if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
+ && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()) {
+ std::string Err;
+ raw_string_ostream ss(Err);
+ ss << "DefIdx " << DefIdx << " exceeds machine model writes for "
+ << *DefMI;
+ report_fatal_error(ss.str());
+ }
+#endif
+ return DefMI->isTransient() ? 0 : 1;
+}
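
In the scheduling-model path above, the effective def-to-use latency is the write's latency minus any ReadAdvance the consumer declares for that write resource. A tiny numeric sketch; all cycle counts are invented:

    #include <cstdio>

    // WriteCycles comes from the def's write latency entry, ReadAdvance from
    // the use's read-advance table; the difference is the visible latency.
    static int effectiveLatency(int WriteCycles, int ReadAdvance) {
      return WriteCycles - ReadAdvance;
    }

    int main() {
      // A 4-cycle write read 1 cycle early is seen as a 3-cycle latency.
      std::printf("%d\n", effectiveLatency(4, 1));
    }
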
+
+unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI) const {
+ // For the itinerary model, fall back to the old subtarget hook.
+ // Allow subtargets to compute Bundle latencies outside the machine model.
+ if (hasInstrItineraries() || MI->isBundle())
+ return TII->getInstrLatency(&InstrItins, MI);
+
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(MI);
+ if (SCDesc->isValid()) {
+ unsigned Latency = 0;
+ for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries;
+ DefIdx != DefEnd; ++DefIdx) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI->getWriteLatencyEntry(SCDesc, DefIdx);
+ Latency = std::max(Latency, convertLatency(WLEntry->Cycles));
+ }
+ return Latency;
+ }
+ }
+ return TII->defaultDefLatency(&SchedModel, MI);
+}
+
+unsigned TargetSchedModel::
+computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *DepMI) const {
+ // MinLatency == -1 is for in-order processors that always have unit
+ // MinLatency. MinLatency > 0 is for in-order processors with varying min
+ // latencies, but since this is not a RAW dep, we always use unit latency.
+ if (SchedModel.MinLatency != 0)
+ return 1;
+
+ // MinLatency == 0 indicates an out-of-order processor that can dispatch
+ // WAW dependencies in the same cycle.
+
+ // Treat predication as a data dependency for out-of-order cpus. In-order
+ // cpus do not need to treat predicated writes specially.
+ //
+ // TODO: The following hack exists because predication passes do not
+ // correctly append imp-use operands, and readsReg() strangely returns false
+ // for predicated defs.
+ unsigned Reg = DefMI->getOperand(DefOperIdx).getReg();
+ const MachineFunction &MF = *DefMI->getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+ if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(DepMI))
+ return computeInstrLatency(DefMI);
+
+ // If we have a per operand scheduling model, check if this def is writing
+  // an unbuffered resource. If so, it is treated like an in-order cpu.
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ if (SCDesc->isValid()) {
+ for (const MCWriteProcResEntry *PRI = STI->getWriteProcResBegin(SCDesc),
+ *PRE = STI->getWriteProcResEnd(SCDesc); PRI != PRE; ++PRI) {
+ if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->IsBuffered)
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
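
computeOutputLatency above keys off the sign of MinLatency: negative and positive values describe in-order pipelines where a WAW dependence still costs a cycle, while zero marks an out-of-order core that can dispatch the dependent write in the same cycle. A compact restatement of just that decision, with predication and unbuffered-resource handling omitted:

    // -1 or >0: in-order, an output dependence takes one cycle; 0: out-of-order.
    static unsigned basicOutputLatency(int MinLatency) {
      return MinLatency != 0 ? 1u : 0u;
    }
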
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index aa601af21b0c..a9058bc7f6d9 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -60,116 +60,108 @@ STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up");
STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down");
namespace {
- class TwoAddressInstructionPass : public MachineFunctionPass {
- MachineFunction *MF;
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
- const InstrItineraryData *InstrItins;
- MachineRegisterInfo *MRI;
- LiveVariables *LV;
- SlotIndexes *Indexes;
- LiveIntervals *LIS;
- AliasAnalysis *AA;
- CodeGenOpt::Level OptLevel;
-
- // DistanceMap - Keep track the distance of a MI from the start of the
- // current basic block.
- DenseMap<MachineInstr*, unsigned> DistanceMap;
-
- // SrcRegMap - A map from virtual registers to physical registers which
- // are likely targets to be coalesced to due to copies from physical
- // registers to virtual registers. e.g. v1024 = move r0.
- DenseMap<unsigned, unsigned> SrcRegMap;
-
- // DstRegMap - A map from virtual registers to physical registers which
- // are likely targets to be coalesced to due to copies to physical
- // registers from virtual registers. e.g. r1 = move v1024.
- DenseMap<unsigned, unsigned> DstRegMap;
-
- /// RegSequences - Keep track the list of REG_SEQUENCE instructions seen
- /// during the initial walk of the machine function.
- SmallVector<MachineInstr*, 16> RegSequences;
-
- bool Sink3AddrInstruction(MachineBasicBlock *MBB, MachineInstr *MI,
- unsigned Reg,
- MachineBasicBlock::iterator OldPos);
-
- bool NoUseAfterLastDef(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist,
- unsigned &LastDef);
-
- bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
- MachineInstr *MI, MachineBasicBlock *MBB,
- unsigned Dist);
+class TwoAddressInstructionPass : public MachineFunctionPass {
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const InstrItineraryData *InstrItins;
+ MachineRegisterInfo *MRI;
+ LiveVariables *LV;
+ SlotIndexes *Indexes;
+ LiveIntervals *LIS;
+ AliasAnalysis *AA;
+ CodeGenOpt::Level OptLevel;
+
+ // The current basic block being processed.
+ MachineBasicBlock *MBB;
+
+  // DistanceMap - Keep track of the distance of an MI from the start of
+  // the current basic block.
+ DenseMap<MachineInstr*, unsigned> DistanceMap;
+
+ // Set of already processed instructions in the current block.
+ SmallPtrSet<MachineInstr*, 8> Processed;
- bool CommuteInstruction(MachineBasicBlock::iterator &mi,
- MachineFunction::iterator &mbbi,
- unsigned RegB, unsigned RegC, unsigned Dist);
+ // SrcRegMap - A map from virtual registers to physical registers which are
+ // likely targets to be coalesced to due to copies from physical registers to
+ // virtual registers. e.g. v1024 = move r0.
+ DenseMap<unsigned, unsigned> SrcRegMap;
- bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);
+ // DstRegMap - A map from virtual registers to physical registers which are
+ // likely targets to be coalesced to due to copies to physical registers from
+ // virtual registers. e.g. r1 = move v1024.
+ DenseMap<unsigned, unsigned> DstRegMap;
- bool ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- MachineFunction::iterator &mbbi,
- unsigned RegA, unsigned RegB, unsigned Dist);
+  /// RegSequences - Keep track of the list of REG_SEQUENCE instructions seen
+ /// during the initial walk of the machine function.
+ SmallVector<MachineInstr*, 16> RegSequences;
- bool isDefTooClose(unsigned Reg, unsigned Dist,
- MachineInstr *MI, MachineBasicBlock *MBB);
+ bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg,
+ MachineBasicBlock::iterator OldPos);
- bool RescheduleMIBelowKill(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- unsigned Reg);
- bool RescheduleKillAboveMI(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- unsigned Reg);
+ bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef);
- bool TryInstructionTransform(MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- MachineFunction::iterator &mbbi,
- unsigned SrcIdx, unsigned DstIdx,
- unsigned Dist,
- SmallPtrSet<MachineInstr*, 8> &Processed);
+ bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
+ MachineInstr *MI, unsigned Dist);
- void ScanUses(unsigned DstReg, MachineBasicBlock *MBB,
- SmallPtrSet<MachineInstr*, 8> &Processed);
+ bool commuteInstruction(MachineBasicBlock::iterator &mi,
+ unsigned RegB, unsigned RegC, unsigned Dist);
- void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB,
- SmallPtrSet<MachineInstr*, 8> &Processed);
+ bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);
- typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;
- typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;
- bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
- void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
+ bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned RegA, unsigned RegB, unsigned Dist);
- void CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, unsigned DstReg);
+ bool isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI);
- /// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
- /// of the de-ssa process. This replaces sources of REG_SEQUENCE as
- /// sub-register references of the register defined by REG_SEQUENCE.
- bool EliminateRegSequences();
+ bool rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg);
+ bool rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg);
- public:
- static char ID; // Pass identification, replacement for typeid
- TwoAddressInstructionPass() : MachineFunctionPass(ID) {
- initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
- }
+ bool tryInstructionTransform(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned SrcIdx, unsigned DstIdx,
+ unsigned Dist);
- virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<AliasAnalysis>();
- AU.addPreserved<LiveVariables>();
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreservedID(MachineLoopInfoID);
- AU.addPreservedID(MachineDominatorsID);
- MachineFunctionPass::getAnalysisUsage(AU);
- }
+ void scanUses(unsigned DstReg);
- /// runOnMachineFunction - Pass entry point.
- bool runOnMachineFunction(MachineFunction&);
- };
-}
+ void processCopy(MachineInstr *MI);
+
+ typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;
+ typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;
+ bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
+ void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
+
+ /// eliminateRegSequences - Eliminate REG_SEQUENCE instructions as part of
+ /// the de-ssa process. This replaces sources of REG_SEQUENCE as sub-register
+ /// references of the register defined by REG_SEQUENCE.
+ bool eliminateRegSequences();
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ TwoAddressInstructionPass() : MachineFunctionPass(ID) {
+ initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AliasAnalysis>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// runOnMachineFunction - Pass entry point.
+ bool runOnMachineFunction(MachineFunction&);
+};
+} // end anonymous namespace
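
The reshuffled class above hoists the per-block state (the current MachineBasicBlock and the Processed set) into members, which is what lets the helper signatures further down drop their MBB and Processed parameters. A schematic of the pattern on a hypothetical pass-like class; all names are invented:

    #include <set>

    // Before: every helper took the current block and the processed set.
    // After: they are members, set once per block in run().
    class ToyPass {
      int CurrentBlock = -1;
      std::set<int> Processed;

      void processItem(int Item) { Processed.insert(Item); }

    public:
      void run(int NumBlocks) {
        for (int B = 0; B < NumBlocks; ++B) {
          CurrentBlock = B;
          Processed.clear(); // reset per-block state, as runOnMachineFunction does
          processItem(B);
        }
      }
    };
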
char TwoAddressInstructionPass::ID = 0;
INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction",
@@ -180,13 +172,13 @@ INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction",
char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
-/// Sink3AddrInstruction - A two-address instruction has been converted to a
+/// sink3AddrInstruction - A two-address instruction has been converted to a
/// three-address instruction to avoid clobbering a register. Try to sink it
/// past the instruction that would kill the above mentioned register to reduce
/// register pressure.
-bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
- MachineInstr *MI, unsigned SavedReg,
- MachineBasicBlock::iterator OldPos) {
+bool TwoAddressInstructionPass::
+sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
+ MachineBasicBlock::iterator OldPos) {
// FIXME: Shouldn't we be trying to do this before we three-addressify the
// instruction? After this transformation is done, we no longer need
// the instruction to be in three-address form.
@@ -299,13 +291,12 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
return true;
}
-/// NoUseAfterLastDef - Return true if there are no intervening uses between the
+/// noUseAfterLastDef - Return true if there are no intervening uses between the
/// last instruction in the MBB that defines the specified register and the
/// two-address instruction which is being processed. It also returns the last
/// def location by reference
-bool TwoAddressInstructionPass::NoUseAfterLastDef(unsigned Reg,
- MachineBasicBlock *MBB, unsigned Dist,
- unsigned &LastDef) {
+bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist,
+ unsigned &LastDef) {
LastDef = 0;
unsigned LastUse = Dist;
for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg),
@@ -465,10 +456,9 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
/// isProfitableToCommute - Return true if it's potentially profitable to commute
/// the two-address instruction that's being processed.
bool
-TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB,
- unsigned regC,
- MachineInstr *MI, MachineBasicBlock *MBB,
- unsigned Dist) {
+TwoAddressInstructionPass::
+isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
+ MachineInstr *MI, unsigned Dist) {
if (OptLevel == CodeGenOpt::None)
return false;
@@ -516,13 +506,13 @@ TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB,
// If there is a use of regC between its last def (could be livein) and this
// instruction, then bail.
unsigned LastDefC = 0;
- if (!NoUseAfterLastDef(regC, MBB, Dist, LastDefC))
+ if (!noUseAfterLastDef(regC, Dist, LastDefC))
return false;
// If there is a use of regB between its last def (could be livein) and this
// instruction, then go ahead and make this transformation.
unsigned LastDefB = 0;
- if (!NoUseAfterLastDef(regB, MBB, Dist, LastDefB))
+ if (!noUseAfterLastDef(regB, Dist, LastDefB))
return true;
// Since there are no intervening uses for both registers, then commute
@@ -530,13 +520,12 @@ TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB,
return LastDefB && LastDefC && LastDefC > LastDefB;
}
-/// CommuteInstruction - Commute a two-address instruction and update the basic
+/// commuteInstruction - Commute a two-address instruction and update the basic
/// block, distance map, and live variables if needed. Return true if it is
/// successful.
-bool
-TwoAddressInstructionPass::CommuteInstruction(MachineBasicBlock::iterator &mi,
- MachineFunction::iterator &mbbi,
- unsigned RegB, unsigned RegC, unsigned Dist) {
+bool TwoAddressInstructionPass::
+commuteInstruction(MachineBasicBlock::iterator &mi,
+ unsigned RegB, unsigned RegC, unsigned Dist) {
MachineInstr *MI = mi;
DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
MachineInstr *NewMI = TII->commuteInstruction(MI);
@@ -555,8 +544,8 @@ TwoAddressInstructionPass::CommuteInstruction(MachineBasicBlock::iterator &mi,
if (Indexes)
Indexes->replaceMachineInstrInMaps(MI, NewMI);
- mbbi->insert(mi, NewMI); // Insert the new inst
- mbbi->erase(mi); // Nuke the old inst.
+ MBB->insert(mi, NewMI); // Insert the new inst
+ MBB->erase(mi); // Nuke the old inst.
mi = NewMI;
DistanceMap.insert(std::make_pair(NewMI, Dist));
}
@@ -588,51 +577,51 @@ TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){
return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI));
}
-/// ConvertInstTo3Addr - Convert the specified two-address instruction into a
+/// convertInstTo3Addr - Convert the specified two-address instruction into a
/// three address one. Return true if this transformation was successful.
bool
-TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi,
+TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,
- MachineFunction::iterator &mbbi,
unsigned RegA, unsigned RegB,
unsigned Dist) {
- MachineInstr *NewMI = TII->convertToThreeAddress(mbbi, mi, LV);
- if (NewMI) {
- DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
- DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
- bool Sunk = false;
+ // FIXME: Why does convertToThreeAddress() need an iterator reference?
+ MachineFunction::iterator MFI = MBB;
+ MachineInstr *NewMI = TII->convertToThreeAddress(MFI, mi, LV);
+ assert(MBB == MFI && "convertToThreeAddress changed iterator reference");
+ if (!NewMI)
+ return false;
- if (Indexes)
- Indexes->replaceMachineInstrInMaps(mi, NewMI);
+ DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+ DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+ bool Sunk = false;
- if (NewMI->findRegisterUseOperand(RegB, false, TRI))
- // FIXME: Temporary workaround. If the new instruction doesn't
- // uses RegB, convertToThreeAddress must have created more
- // then one instruction.
- Sunk = Sink3AddrInstruction(mbbi, NewMI, RegB, mi);
+ if (Indexes)
+ Indexes->replaceMachineInstrInMaps(mi, NewMI);
- mbbi->erase(mi); // Nuke the old inst.
+ if (NewMI->findRegisterUseOperand(RegB, false, TRI))
+    // FIXME: Temporary workaround. If the new instruction doesn't
+    // use RegB, convertToThreeAddress must have created more
+    // than one instruction.
+ Sunk = sink3AddrInstruction(NewMI, RegB, mi);
- if (!Sunk) {
- DistanceMap.insert(std::make_pair(NewMI, Dist));
- mi = NewMI;
- nmi = llvm::next(mi);
- }
+ MBB->erase(mi); // Nuke the old inst.
- // Update source and destination register maps.
- SrcRegMap.erase(RegA);
- DstRegMap.erase(RegB);
- return true;
+ if (!Sunk) {
+ DistanceMap.insert(std::make_pair(NewMI, Dist));
+ mi = NewMI;
+ nmi = llvm::next(mi);
}
- return false;
+ // Update source and destination register maps.
+ SrcRegMap.erase(RegA);
+ DstRegMap.erase(RegB);
+ return true;
}
-/// ScanUses - Scan forward recursively for only uses, update maps if the use
+/// scanUses - Scan forward recursively for only uses, update maps if the use
/// is a copy or a two-address instruction.
void
-TwoAddressInstructionPass::ScanUses(unsigned DstReg, MachineBasicBlock *MBB,
- SmallPtrSet<MachineInstr*, 8> &Processed) {
+TwoAddressInstructionPass::scanUses(unsigned DstReg) {
SmallVector<unsigned, 4> VirtRegPairs;
bool IsDstPhys;
bool IsCopy = false;
@@ -676,7 +665,7 @@ TwoAddressInstructionPass::ScanUses(unsigned DstReg, MachineBasicBlock *MBB,
}
}
-/// ProcessCopy - If the specified instruction is not yet processed, process it
+/// processCopy - If the specified instruction is not yet processed, process it
/// if it's a copy. For a copy instruction, we find the physical registers the
/// source and destination registers might be mapped to. These are kept in
/// point-to maps used to determine future optimizations. e.g.
@@ -688,9 +677,7 @@ TwoAddressInstructionPass::ScanUses(unsigned DstReg, MachineBasicBlock *MBB,
/// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is
/// potentially joined with r1 on the output side. It's worthwhile to commute
/// 'add' to eliminate a copy.
-void TwoAddressInstructionPass::ProcessCopy(MachineInstr *MI,
- MachineBasicBlock *MBB,
- SmallPtrSet<MachineInstr*, 8> &Processed) {
+void TwoAddressInstructionPass::processCopy(MachineInstr *MI) {
if (Processed.count(MI))
return;
@@ -707,21 +694,20 @@ void TwoAddressInstructionPass::ProcessCopy(MachineInstr *MI,
assert(SrcRegMap[DstReg] == SrcReg &&
"Can't map to two src physical registers!");
- ScanUses(DstReg, MBB, Processed);
+ scanUses(DstReg);
}
Processed.insert(MI);
return;
}
-/// RescheduleMIBelowKill - If there is one more local instruction that reads
+/// rescheduleMIBelowKill - If there is one more local instruction that reads
 /// 'Reg' and it kills 'Reg', consider moving the instruction below the kill
/// instruction in order to eliminate the need for the copy.
-bool
-TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- unsigned Reg) {
+bool TwoAddressInstructionPass::
+rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg) {
// Bail immediately if we don't have LV available. We use it to find kills
// efficiently.
if (!LV)
@@ -853,8 +839,7 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB,
/// isDefTooClose - Return true if the re-scheduling will put the given
/// instruction too close to the defs of its register dependencies.
bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
- MachineInstr *MI,
- MachineBasicBlock *MBB) {
+ MachineInstr *MI) {
for (MachineRegisterInfo::def_iterator DI = MRI->def_begin(Reg),
DE = MRI->def_end(); DI != DE; ++DI) {
MachineInstr *DefMI = &*DI;
@@ -873,15 +858,14 @@ bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
return false;
}
-/// RescheduleKillAboveMI - If there is one more local instruction that reads
+/// rescheduleKillAboveMI - If there is one more local instruction that reads
 /// 'Reg' and it kills 'Reg', consider moving the kill instruction above the
/// current two-address instruction in order to eliminate the need for the
/// copy.
-bool
-TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
- MachineBasicBlock::iterator &mi,
- MachineBasicBlock::iterator &nmi,
- unsigned Reg) {
+bool TwoAddressInstructionPass::
+rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg) {
// Bail immediately if we don't have LV available. We use it to find kills
// efficiently.
if (!LV)
@@ -918,7 +902,7 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
if (MO.isUse()) {
if (!MOReg)
continue;
- if (isDefTooClose(MOReg, DI->second, MI, MBB))
+ if (isDefTooClose(MOReg, DI->second, MI))
return false;
if (MOReg == Reg && !MO.isKill())
return false;
@@ -1006,18 +990,16 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB,
return true;
}
-/// TryInstructionTransform - For the case where an instruction has a single
+/// tryInstructionTransform - For the case where an instruction has a single
/// pair of tied register operands, attempt some transformations that may
/// either eliminate the tied operands or improve the opportunities for
/// coalescing away the register copy. Returns true if no copy needs to be
/// inserted to untie mi's operands (either because they were untied, or
/// because mi was rescheduled, and will be visited again later).
bool TwoAddressInstructionPass::
-TryInstructionTransform(MachineBasicBlock::iterator &mi,
+tryInstructionTransform(MachineBasicBlock::iterator &mi,
MachineBasicBlock::iterator &nmi,
- MachineFunction::iterator &mbbi,
- unsigned SrcIdx, unsigned DstIdx, unsigned Dist,
- SmallPtrSet<MachineInstr*, 8> &Processed) {
+ unsigned SrcIdx, unsigned DstIdx, unsigned Dist) {
if (OptLevel == CodeGenOpt::None)
return false;
@@ -1030,7 +1012,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
bool regBKilled = isKilled(MI, regB, MRI, TII);
if (TargetRegisterInfo::isVirtualRegister(regA))
- ScanUses(regA, &*mbbi, Processed);
+ scanUses(regA);
// Check if it is profitable to commute the operands.
unsigned SrcOp1, SrcOp2;
@@ -1051,7 +1033,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
// If C dies but B does not, swap the B and C operands.
// This makes the live ranges of A and C joinable.
TryCommute = true;
- else if (isProfitableToCommute(regA, regB, regC, &MI, mbbi, Dist)) {
+ else if (isProfitableToCommute(regA, regB, regC, &MI, Dist)) {
TryCommute = true;
AggressiveCommute = true;
}
@@ -1059,7 +1041,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
}
// If it's profitable to commute, try to do so.
- if (TryCommute && CommuteInstruction(mi, mbbi, regB, regC, Dist)) {
+ if (TryCommute && commuteInstruction(mi, regB, regC, Dist)) {
++NumCommuted;
if (AggressiveCommute)
++NumAggrCommuted;
@@ -1068,7 +1050,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
// If there is one more use of regB later in the same MBB, consider
// re-schedule this MI below it.
- if (RescheduleMIBelowKill(mbbi, mi, nmi, regB)) {
+ if (rescheduleMIBelowKill(mi, nmi, regB)) {
++NumReSchedDowns;
return true;
}
@@ -1078,7 +1060,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
// three-address instruction. Check if it is profitable.
if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
// Try to convert it.
- if (ConvertInstTo3Addr(mi, nmi, mbbi, regA, regB, Dist)) {
+ if (convertInstTo3Addr(mi, nmi, regA, regB, Dist)) {
++NumConvertedTo3Addr;
return true; // Done with this instruction.
}
@@ -1087,7 +1069,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
// If there is one more use of regB later in the same MBB, consider
// re-schedule it before this MI if it's legal.
- if (RescheduleKillAboveMI(mbbi, mi, nmi, regB)) {
+ if (rescheduleKillAboveMI(mi, nmi, regB)) {
++NumReSchedUps;
return true;
}
@@ -1131,8 +1113,8 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
// Tentatively insert the instructions into the block so that they
// look "normal" to the transformation logic.
- mbbi->insert(mi, NewMIs[0]);
- mbbi->insert(mi, NewMIs[1]);
+ MBB->insert(mi, NewMIs[0]);
+ MBB->insert(mi, NewMIs[1]);
DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
<< "2addr: NEW INST: " << *NewMIs[1]);
@@ -1142,8 +1124,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi,
unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB);
MachineBasicBlock::iterator NewMI = NewMIs[1];
bool TransformSuccess =
- TryInstructionTransform(NewMI, mi, mbbi,
- NewSrcIdx, NewDstIdx, Dist, Processed);
+ tryInstructionTransform(NewMI, mi, NewSrcIdx, NewDstIdx, Dist);
if (TransformSuccess ||
NewMIs[1]->getOperand(NewSrcIdx).isKill()) {
// Success, or at least we made an improvement. Keep the unfolded
@@ -1202,8 +1183,7 @@ bool TwoAddressInstructionPass::
collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
const MCInstrDesc &MCID = MI->getDesc();
bool AnyOps = false;
- unsigned NumOps = MI->isInlineAsm() ?
- MI->getNumOperands() : MCID.getNumOperands();
+ unsigned NumOps = MI->getNumOperands();
for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
unsigned DstIdx = 0;
@@ -1373,22 +1353,21 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
DEBUG(dbgs() << "********** Function: "
- << MF->getFunction()->getName() << '\n');
+ << MF->getName() << '\n');
// This pass takes the function out of SSA form.
MRI->leaveSSA();
TiedOperandMap TiedOperands;
-
- SmallPtrSet<MachineInstr*, 8> Processed;
- for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end();
- mbbi != mbbe; ++mbbi) {
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ MBB = MBBI;
unsigned Dist = 0;
DistanceMap.clear();
SrcRegMap.clear();
DstRegMap.clear();
Processed.clear();
- for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+ for (MachineBasicBlock::iterator mi = MBB->begin(), me = MBB->end();
mi != me; ) {
MachineBasicBlock::iterator nmi = llvm::next(mi);
if (mi->isDebugValue()) {
@@ -1402,7 +1381,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
DistanceMap.insert(std::make_pair(mi, ++Dist));
- ProcessCopy(&*mi, &*mbbi, Processed);
+ processCopy(&*mi);
// First scan through all the tied register uses in this instruction
// and record a list of pairs of tied operands for each register.
@@ -1427,8 +1406,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
unsigned SrcReg = mi->getOperand(SrcIdx).getReg();
unsigned DstReg = mi->getOperand(DstIdx).getReg();
if (SrcReg != DstReg &&
- TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist,
- Processed)) {
+ tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist)) {
// The tied operands have been eliminated or shifted further down the
// block to ease elimination. Continue processing with 'nmi'.
TiedOperands.clear();
@@ -1468,7 +1446,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
   // Eliminate REG_SEQUENCE instructions. Their whole purpose was to preserve
// SSA form. It's now safe to de-SSA.
- MadeChange |= EliminateRegSequences();
+ MadeChange |= eliminateRegSequences();
return MadeChange;
}
@@ -1515,127 +1493,6 @@ static MachineInstr *findFirstDef(unsigned Reg, MachineRegisterInfo *MRI) {
return First;
}
-/// CoalesceExtSubRegs - If a number of sources of the REG_SEQUENCE are
-/// EXTRACT_SUBREG from the same register and to the same virtual register
-/// with different sub-register indices, attempt to combine the
-/// EXTRACT_SUBREGs and pre-coalesce them. e.g.
-/// %reg1026<def> = VLDMQ %reg1025<kill>, 260, pred:14, pred:%reg0
-/// %reg1029:6<def> = EXTRACT_SUBREG %reg1026, 6
-/// %reg1029:5<def> = EXTRACT_SUBREG %reg1026<kill>, 5
-/// Since D subregs 5, 6 can combine to a Q register, we can coalesce
-/// reg1026 to reg1029.
-void
-TwoAddressInstructionPass::CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs,
- unsigned DstReg) {
- SmallSet<unsigned, 4> Seen;
- for (unsigned i = 0, e = Srcs.size(); i != e; ++i) {
- unsigned SrcReg = Srcs[i];
- if (!Seen.insert(SrcReg))
- continue;
-
- // Check that the instructions are all in the same basic block.
- MachineInstr *SrcDefMI = MRI->getUniqueVRegDef(SrcReg);
- MachineInstr *DstDefMI = MRI->getUniqueVRegDef(DstReg);
- if (!SrcDefMI || !DstDefMI ||
- SrcDefMI->getParent() != DstDefMI->getParent())
- continue;
-
- // If there are no other uses than copies which feed into
- // the reg_sequence, then we might be able to coalesce them.
- bool CanCoalesce = true;
- SmallVector<unsigned, 4> SrcSubIndices, DstSubIndices;
- for (MachineRegisterInfo::use_nodbg_iterator
- UI = MRI->use_nodbg_begin(SrcReg),
- UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
- MachineInstr *UseMI = &*UI;
- if (!UseMI->isCopy() || UseMI->getOperand(0).getReg() != DstReg) {
- CanCoalesce = false;
- break;
- }
- SrcSubIndices.push_back(UseMI->getOperand(1).getSubReg());
- DstSubIndices.push_back(UseMI->getOperand(0).getSubReg());
- }
-
- if (!CanCoalesce || SrcSubIndices.size() < 2)
- continue;
-
- // Check that the source subregisters can be combined.
- std::sort(SrcSubIndices.begin(), SrcSubIndices.end());
- unsigned NewSrcSubIdx = 0;
- if (!TRI->canCombineSubRegIndices(MRI->getRegClass(SrcReg), SrcSubIndices,
- NewSrcSubIdx))
- continue;
-
- // Check that the destination subregisters can also be combined.
- std::sort(DstSubIndices.begin(), DstSubIndices.end());
- unsigned NewDstSubIdx = 0;
- if (!TRI->canCombineSubRegIndices(MRI->getRegClass(DstReg), DstSubIndices,
- NewDstSubIdx))
- continue;
-
- // If neither source nor destination can be combined to the full register,
- // just give up. This could be improved if it ever matters.
- if (NewSrcSubIdx != 0 && NewDstSubIdx != 0)
- continue;
-
- // Now that we know that all the uses are extract_subregs and that those
- // subregs can somehow be combined, scan all the extract_subregs again to
- // make sure the subregs are in the right order and can be composed.
- MachineInstr *SomeMI = 0;
- CanCoalesce = true;
- for (MachineRegisterInfo::use_nodbg_iterator
- UI = MRI->use_nodbg_begin(SrcReg),
- UE = MRI->use_nodbg_end(); UI != UE; ++UI) {
- MachineInstr *UseMI = &*UI;
- assert(UseMI->isCopy());
- unsigned DstSubIdx = UseMI->getOperand(0).getSubReg();
- unsigned SrcSubIdx = UseMI->getOperand(1).getSubReg();
- assert(DstSubIdx != 0 && "missing subreg from RegSequence elimination");
- if ((NewDstSubIdx == 0 &&
- TRI->composeSubRegIndices(NewSrcSubIdx, DstSubIdx) != SrcSubIdx) ||
- (NewSrcSubIdx == 0 &&
- TRI->composeSubRegIndices(NewDstSubIdx, SrcSubIdx) != DstSubIdx)) {
- CanCoalesce = false;
- break;
- }
- // Keep track of one of the uses. Preferably the first one which has a
- // <def,undef> flag.
- if (!SomeMI || UseMI->getOperand(0).isUndef())
- SomeMI = UseMI;
- }
- if (!CanCoalesce)
- continue;
-
- // Insert a copy to replace the original.
- MachineInstr *CopyMI = BuildMI(*SomeMI->getParent(), SomeMI,
- SomeMI->getDebugLoc(),
- TII->get(TargetOpcode::COPY))
- .addReg(DstReg, RegState::Define |
- getUndefRegState(SomeMI->getOperand(0).isUndef()),
- NewDstSubIdx)
- .addReg(SrcReg, 0, NewSrcSubIdx);
-
- // Remove all the old extract instructions.
- for (MachineRegisterInfo::use_nodbg_iterator
- UI = MRI->use_nodbg_begin(SrcReg),
- UE = MRI->use_nodbg_end(); UI != UE; ) {
- MachineInstr *UseMI = &*UI;
- ++UI;
- if (UseMI == CopyMI)
- continue;
- assert(UseMI->isCopy());
- // Move any kills to the new copy or extract instruction.
- if (UseMI->getOperand(1).isKill()) {
- CopyMI->getOperand(1).setIsKill();
- if (LV)
- // Update live variables
- LV->replaceKillInstruction(SrcReg, UseMI, &*CopyMI);
- }
- UseMI->eraseFromParent();
- }
- }
-}
-
static bool HasOtherRegSequenceUses(unsigned Reg, MachineInstr *RegSeq,
MachineRegisterInfo *MRI) {
for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(Reg),
@@ -1647,7 +1504,7 @@ static bool HasOtherRegSequenceUses(unsigned Reg, MachineInstr *RegSeq,
return false;
}
-/// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
+/// eliminateRegSequences - Eliminate REG_SEQUENCE instructions as part
/// of the de-ssa process. This replaces sources of REG_SEQUENCE as
/// sub-register references of the register defined by REG_SEQUENCE. e.g.
///
@@ -1655,7 +1512,7 @@ static bool HasOtherRegSequenceUses(unsigned Reg, MachineInstr *RegSeq,
/// %reg1031<def> = REG_SEQUENCE %reg1029<kill>, 5, %reg1030<kill>, 6
/// =>
/// %reg1031:5<def>, %reg1031:6<def> = VLD1q16 %reg1024<kill>, ...
-bool TwoAddressInstructionPass::EliminateRegSequences() {
+bool TwoAddressInstructionPass::eliminateRegSequences() {
if (RegSequences.empty())
return false;
@@ -1759,10 +1616,6 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
if (MO.isReg() && MO.isDef() && MO.getReg() == DstReg)
MO.setIsUndef();
}
- // Make sure there is a full non-subreg imp-def operand on the
- // instruction. This shouldn't be necessary, but it seems that at least
- // RAFast requires it.
- Def->addRegisterDefined(DstReg, TRI);
DEBUG(dbgs() << "First def: " << *Def);
}
@@ -1775,12 +1628,6 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
DEBUG(dbgs() << "Eliminated: " << *MI);
MI->eraseFromParent();
}
-
- // Try coalescing some EXTRACT_SUBREG instructions. This can create
- // INSERT_SUBREG instructions that must have <undef> flags added by
- // LiveIntervalAnalysis, so only run it when LiveVariables is available.
- if (LV)
- CoalesceExtSubRegs(RealSrcs, DstReg);
}
RegSequences.clear();
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 93840f0544ac..bb93bdc0bc25 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -19,8 +19,8 @@
#define DEBUG_TYPE "regalloc"
#include "VirtRegMap.h"
#include "LiveDebugVariables.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -127,9 +127,11 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const {
OS << '\n';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VirtRegMap::dump() const {
print(dbgs());
}
+#endif
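
The guard added above keeps dump() out of release builds unless LLVM_ENABLE_DUMP is defined. The same pattern on a hypothetical struct, with std::printf standing in for the debug stream:

    #include <cstdio>

    struct SpillCounters {
      unsigned Spills = 0;
    #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      // Present in asserts builds, or when dumps are explicitly requested.
      void dump() const { std::printf("spills: %u\n", Spills); }
    #endif
    };
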
//===----------------------------------------------------------------------===//
// VirtRegRewriter
@@ -170,6 +172,7 @@ INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
"Virtual Register Rewriter", false, false)
@@ -182,6 +185,8 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
AU.addRequired<VirtRegMap>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -197,11 +202,11 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
VRM = &getAnalysis<VirtRegMap>();
DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
<< "********** Function: "
- << MF->getFunction()->getName() << '\n');
+ << MF->getName() << '\n');
DEBUG(VRM->dump());
// Add kill flags while we still have virtual registers.
- LIS->addKillFlags();
+ LIS->addKillFlags(VRM);
// Live-in lists on basic blocks are required for physregs.
addMBBLiveIns();
@@ -252,9 +257,6 @@ void VirtRegRewriter::rewrite() {
SmallVector<unsigned, 8> SuperDeads;
SmallVector<unsigned, 8> SuperDefs;
SmallVector<unsigned, 8> SuperKills;
-#ifndef NDEBUG
- BitVector Reserved = TRI->getReservedRegs(*MF);
-#endif
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
@@ -278,7 +280,7 @@ void VirtRegRewriter::rewrite() {
unsigned PhysReg = VRM->getPhys(VirtReg);
assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
"Instruction uses unmapped VirtReg");
- assert(!Reserved.test(PhysReg) && "Reserved register assignment");
+ assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
// Preserve semantics of sub-register operands.
if (MO.getSubReg()) {
diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h
index c3209854a480..7974dda66a5f 100644
--- a/lib/CodeGen/VirtRegMap.h
+++ b/lib/CodeGen/VirtRegMap.h
@@ -63,8 +63,8 @@ namespace llvm {
/// createSpillSlot - Allocate a spill slot for RC from MFI.
unsigned createSpillSlot(const TargetRegisterClass *RC);
- VirtRegMap(const VirtRegMap&); // DO NOT IMPLEMENT
- void operator=(const VirtRegMap&); // DO NOT IMPLEMENT
+ VirtRegMap(const VirtRegMap&) LLVM_DELETED_FUNCTION;
+ void operator=(const VirtRegMap&) LLVM_DELETED_FUNCTION;
public:
static char ID;
diff --git a/lib/DebugInfo/CMakeLists.txt b/lib/DebugInfo/CMakeLists.txt
index 441f1e86dcd8..1e9e509fd2a1 100644
--- a/lib/DebugInfo/CMakeLists.txt
+++ b/lib/DebugInfo/CMakeLists.txt
@@ -8,5 +8,6 @@ add_llvm_library(LLVMDebugInfo
DWARFDebugAranges.cpp
DWARFDebugInfoEntry.cpp
DWARFDebugLine.cpp
+ DWARFDebugRangeList.cpp
DWARFFormValue.cpp
)
diff --git a/lib/DebugInfo/DIContext.cpp b/lib/DebugInfo/DIContext.cpp
index e2fd55fd6ef8..691a92c392c2 100644
--- a/lib/DebugInfo/DIContext.cpp
+++ b/lib/DebugInfo/DIContext.cpp
@@ -18,7 +18,10 @@ DIContext *DIContext::getDWARFContext(bool isLittleEndian,
StringRef abbrevSection,
StringRef aRangeSection,
StringRef lineSection,
- StringRef stringSection) {
+ StringRef stringSection,
+ StringRef rangeSection,
+ const RelocAddrMap &Map) {
return new DWARFContextInMemory(isLittleEndian, infoSection, abbrevSection,
- aRangeSection, lineSection, stringSection);
+ aRangeSection, lineSection, stringSection,
+ rangeSection, Map);
}
diff --git a/lib/DebugInfo/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARFCompileUnit.cpp
index b27d57bef1c5..bdd65b77e4b6 100644
--- a/lib/DebugInfo/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARFCompileUnit.cpp
@@ -63,7 +63,7 @@ DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
Version = debug_info_data.getU16(&offset);
bool abbrevsOK = debug_info_data.getU32(&offset) == abbrevs->getOffset();
Abbrevs = abbrevs;
- AddrSize = debug_info_data.getU8 (&offset);
+ AddrSize = debug_info_data.getU8(&offset);
bool versionOK = DWARFContext::isSupportedVersion(Version);
bool addrSizeOK = AddrSize == 4 || AddrSize == 8;
@@ -75,6 +75,15 @@ DWARFCompileUnit::extract(uint32_t offset, DataExtractor debug_info_data,
return 0;
}
+bool DWARFCompileUnit::extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const {
+ // Require that compile unit is extracted.
+ assert(DieArray.size() > 0);
+ DataExtractor RangesData(Context.getRangeSection(),
+ Context.isLittleEndian(), AddrSize);
+ return RangeList.extract(RangesData, &RangeListOffset);
+}
+
void DWARFCompileUnit::clear() {
Offset = 0;
Length = 0;
@@ -94,7 +103,9 @@ void DWARFCompileUnit::dump(raw_ostream &OS) {
<< " (next CU at " << format("0x%08x", getNextCompileUnitOffset())
<< ")\n";
- getCompileUnitDIE(false)->dump(OS, this, -1U);
+ const DWARFDebugInfoEntryMinimal *CU = getCompileUnitDIE(false);
+ assert(CU && "Null Compile Unit?");
+ CU->dump(OS, this, -1U);
}
const char *DWARFCompileUnit::getCompilationDir() {
@@ -174,11 +185,11 @@ size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) {
addDIE(die);
return 1;
}
- else if (depth == 0 && initial_die_array_size == 1) {
+ else if (depth == 0 && initial_die_array_size == 1)
// Don't append the CU die as we already did that
- } else {
- addDIE (die);
- }
+ ;
+ else
+ addDIE(die);
const DWARFAbbreviationDeclaration *abbrDecl =
die.getAbbreviationDeclarationPtr();
@@ -199,9 +210,9 @@ size_t DWARFCompileUnit::extractDIEsIfNeeded(bool cu_die_only) {
// Give a little bit of info if we encounter corrupt DWARF (our offset
// should always terminate at or before the start of the next compilation
// unit header).
- if (offset > next_cu_offset) {
- fprintf (stderr, "warning: DWARF compile unit extends beyond its bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), offset);
- }
+ if (offset > next_cu_offset)
+ fprintf(stderr, "warning: DWARF compile unit extends beyond its"
+ "bounds cu 0x%8.8x at 0x%8.8x'\n", getOffset(), offset);
setDIERelations();
return DieArray.size();
@@ -244,12 +255,21 @@ DWARFCompileUnit::buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
clearDIEs(true);
}
-const DWARFDebugInfoEntryMinimal*
-DWARFCompileUnit::getFunctionDIEForAddress(int64_t address) {
+DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFCompileUnit::getInlinedChainForAddress(uint64_t Address) {
+ // First, find a subprogram that contains the given address (the root
+ // of inlined chain).
extractDIEsIfNeeded(false);
+ const DWARFDebugInfoEntryMinimal *SubprogramDIE = 0;
for (size_t i = 0, n = DieArray.size(); i != n; i++) {
- if (DieArray[i].addressRangeContainsAddress(this, address))
- return &DieArray[i];
+ if (DieArray[i].isSubprogramDIE() &&
+ DieArray[i].addressRangeContainsAddress(this, Address)) {
+ SubprogramDIE = &DieArray[i];
+ break;
+ }
}
- return 0;
+ // Get inlined chain rooted at this subprogram DIE.
+ if (!SubprogramDIE)
+ return DWARFDebugInfoEntryMinimal::InlinedChain();
+ return SubprogramDIE->getInlinedChainForAddress(this, Address);
}
diff --git a/lib/DebugInfo/DWARFCompileUnit.h b/lib/DebugInfo/DWARFCompileUnit.h
index b34a5965af85..03e28620d4b3 100644
--- a/lib/DebugInfo/DWARFCompileUnit.h
+++ b/lib/DebugInfo/DWARFCompileUnit.h
@@ -12,6 +12,7 @@
#include "DWARFDebugAbbrev.h"
#include "DWARFDebugInfoEntry.h"
+#include "DWARFDebugRangeList.h"
#include <vector>
namespace llvm {
@@ -45,6 +46,11 @@ public:
/// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
/// hasn't already been done. Returns the number of DIEs parsed at this call.
size_t extractDIEsIfNeeded(bool cu_die_only);
+ /// extractRangeList - extracts the range list referenced by this compile
+ /// unit from .debug_ranges section. Returns true on success.
+ /// Requires that compile unit is already extracted.
+ bool extractRangeList(uint32_t RangeListOffset,
+ DWARFDebugRangeList &RangeList) const;
void clear();
void dump(raw_ostream &OS);
uint32_t getOffset() const { return Offset; }
@@ -106,11 +112,11 @@ public:
void buildAddressRangeTable(DWARFDebugAranges *debug_aranges,
bool clear_dies_if_already_not_parsed);
- /// getFunctionDIEForAddress - Returns pointer to parsed subprogram DIE,
- /// address ranges of which contain the provided address,
- /// or NULL if there is no such subprogram. The pointer
- /// is valid until DWARFCompileUnit::clear() or clearDIEs() is called.
- const DWARFDebugInfoEntryMinimal *getFunctionDIEForAddress(int64_t address);
+
+ /// getInlinedChainForAddress - fetches inlined chain for a given address.
+ /// Returns empty chain if there is no subprogram containing address.
+ DWARFDebugInfoEntryMinimal::InlinedChain getInlinedChainForAddress(
+ uint64_t Address);
};
}
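
(Illustrative sketch, not part of the patch.) How the new CU-level lookup might be driven, assuming CU points at an already parsed DWARFCompileUnit (DWARFCompileUnit.h is a lib/DebugInfo-internal header) and PC lies inside that unit; printInlineStack is a hypothetical helper:

    #include "DWARFCompileUnit.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void printInlineStack(DWARFCompileUnit *CU, uint64_t PC, raw_ostream &OS) {
      DWARFDebugInfoEntryMinimal::InlinedChain Chain =
          CU->getInlinedChainForAddress(PC);
      // Chain[0] is the deepest inlined frame; the root subprogram comes last.
      for (size_t i = 0, e = Chain.size(); i != e; ++i)
        if (const char *Name = Chain[i].getSubroutineName(CU))
          OS << Name << "\n";
    }
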
diff --git a/lib/DebugInfo/DWARFContext.cpp b/lib/DebugInfo/DWARFContext.cpp
index 797662b083f1..afd614cc356e 100644
--- a/lib/DebugInfo/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARFContext.cpp
@@ -17,6 +17,8 @@
using namespace llvm;
using namespace dwarf;
+typedef DWARFDebugLine::LineTable DWARFLineTable;
+
void DWARFContext::dump(raw_ostream &OS) {
OS << ".debug_abbrev contents:\n";
getDebugAbbrev()->dump(OS);
@@ -32,15 +34,17 @@ void DWARFContext::dump(raw_ostream &OS) {
while (set.extract(arangesData, &offset))
set.dump(OS);
+ uint8_t savedAddressByteSize = 0;
OS << "\n.debug_lines contents:\n";
for (unsigned i = 0, e = getNumCompileUnits(); i != e; ++i) {
DWARFCompileUnit *cu = getCompileUnitAtIndex(i);
+ savedAddressByteSize = cu->getAddressByteSize();
unsigned stmtOffset =
cu->getCompileUnitDIE()->getAttributeValueAsUnsigned(cu, DW_AT_stmt_list,
-1U);
if (stmtOffset != -1U) {
DataExtractor lineData(getLineSection(), isLittleEndian(),
- cu->getAddressByteSize());
+ savedAddressByteSize);
DWARFDebugLine::DumpingState state(OS);
DWARFDebugLine::parseStatementTable(lineData, &stmtOffset, state);
}
@@ -54,6 +58,18 @@ void DWARFContext::dump(raw_ostream &OS) {
OS << format("0x%8.8x: \"%s\"\n", lastOffset, s);
lastOffset = offset;
}
+
+ OS << "\n.debug_ranges contents:\n";
+ // In fact, different compile units may have different address byte
+ // sizes, but for simplicity we just use the address byte size of the last
+ // compile unit (there is no easy and fast way to associate address range
+ // list and the compile unit it describes).
+ DataExtractor rangesData(getRangeSection(), isLittleEndian(),
+ savedAddressByteSize);
+ offset = 0;
+ DWARFDebugRangeList rangeList;
+ while (rangeList.extract(rangesData, &offset))
+ rangeList.dump(OS);
}
const DWARFDebugAbbrev *DWARFContext::getDebugAbbrev() {
@@ -80,7 +96,7 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() {
return Aranges.get();
}
-const DWARFDebugLine::LineTable *
+const DWARFLineTable *
DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
if (!Line)
Line.reset(new DWARFDebugLine());
@@ -92,7 +108,7 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
return 0; // No line table for this compile unit.
// See if the line table is cached.
- if (const DWARFDebugLine::LineTable *lt = Line->getLineTable(stmtOffset))
+ if (const DWARFLineTable *lt = Line->getLineTable(stmtOffset))
return lt;
// We have to parse it first.
@@ -103,11 +119,11 @@ DWARFContext::getLineTableForCompileUnit(DWARFCompileUnit *cu) {
void DWARFContext::parseCompileUnits() {
uint32_t offset = 0;
- const DataExtractor &debug_info_data = DataExtractor(getInfoSection(),
- isLittleEndian(), 0);
- while (debug_info_data.isValidOffset(offset)) {
+ const DataExtractor &DIData = DataExtractor(getInfoSection(),
+ isLittleEndian(), 0);
+ while (DIData.isValidOffset(offset)) {
CUs.push_back(DWARFCompileUnit(*this));
- if (!CUs.back().extract(debug_info_data, &offset)) {
+ if (!CUs.back().extract(DIData, &offset)) {
CUs.pop_back();
break;
}
@@ -131,75 +147,155 @@ namespace {
};
}
-DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t offset) {
+DWARFCompileUnit *DWARFContext::getCompileUnitForOffset(uint32_t Offset) {
if (CUs.empty())
parseCompileUnits();
- DWARFCompileUnit *i = std::lower_bound(CUs.begin(), CUs.end(), offset,
- OffsetComparator());
- if (i != CUs.end())
- return &*i;
+ DWARFCompileUnit *CU = std::lower_bound(CUs.begin(), CUs.end(), Offset,
+ OffsetComparator());
+ if (CU != CUs.end())
+ return &*CU;
return 0;
}
-DILineInfo DWARFContext::getLineInfoForAddress(uint64_t address,
- DILineInfoSpecifier specifier) {
+DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
// First, get the offset of the compile unit.
- uint32_t cuOffset = getDebugAranges()->findAddress(address);
+ uint32_t CUOffset = getDebugAranges()->findAddress(Address);
// Retrieve the compile unit.
- DWARFCompileUnit *cu = getCompileUnitForOffset(cuOffset);
- if (!cu)
+ return getCompileUnitForOffset(CUOffset);
+}
+
+static bool getFileNameForCompileUnit(DWARFCompileUnit *CU,
+ const DWARFLineTable *LineTable,
+ uint64_t FileIndex,
+ bool NeedsAbsoluteFilePath,
+ std::string &FileName) {
+ if (CU == 0 ||
+ LineTable == 0 ||
+ !LineTable->getFileNameByIndex(FileIndex, NeedsAbsoluteFilePath,
+ FileName))
+ return false;
+ if (NeedsAbsoluteFilePath && sys::path::is_relative(FileName)) {
+ // We may still need to append compilation directory of compile unit.
+ SmallString<16> AbsolutePath;
+ if (const char *CompilationDir = CU->getCompilationDir()) {
+ sys::path::append(AbsolutePath, CompilationDir);
+ }
+ sys::path::append(AbsolutePath, FileName);
+ FileName = AbsolutePath.str();
+ }
+ return true;
+}
+
+static bool getFileLineInfoForCompileUnit(DWARFCompileUnit *CU,
+ const DWARFLineTable *LineTable,
+ uint64_t Address,
+ bool NeedsAbsoluteFilePath,
+ std::string &FileName,
+ uint32_t &Line, uint32_t &Column) {
+ if (CU == 0 || LineTable == 0)
+ return false;
+ // Get the index of row we're looking for in the line table.
+ uint32_t RowIndex = LineTable->lookupAddress(Address);
+ if (RowIndex == -1U)
+ return false;
+ // Take file number and line/column from the row.
+ const DWARFDebugLine::Row &Row = LineTable->Rows[RowIndex];
+ if (!getFileNameForCompileUnit(CU, LineTable, Row.File,
+ NeedsAbsoluteFilePath, FileName))
+ return false;
+ Line = Row.Line;
+ Column = Row.Column;
+ return true;
+}
+
+DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier) {
+ DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+ if (!CU)
return DILineInfo();
- SmallString<16> fileName("<invalid>");
- SmallString<16> functionName("<invalid>");
- uint32_t line = 0;
- uint32_t column = 0;
- if (specifier.needs(DILineInfoSpecifier::FunctionName)) {
- const DWARFDebugInfoEntryMinimal *function_die =
- cu->getFunctionDIEForAddress(address);
- if (function_die) {
- if (const char *name = function_die->getSubprogramName(cu))
- functionName = name;
+ std::string FileName = "<invalid>";
+ std::string FunctionName = "<invalid>";
+ uint32_t Line = 0;
+ uint32_t Column = 0;
+ if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+ // The address may correspond to instruction in some inlined function,
+ // so we have to build the chain of inlined functions and take the
+ // name of the topmost function in it.
+ const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+ CU->getInlinedChainForAddress(Address);
+ if (InlinedChain.size() > 0) {
+ const DWARFDebugInfoEntryMinimal &TopFunctionDIE = InlinedChain[0];
+ if (const char *Name = TopFunctionDIE.getSubroutineName(CU))
+ FunctionName = Name;
}
}
- if (specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
- // Get the line table for this compile unit.
- const DWARFDebugLine::LineTable *lineTable = getLineTableForCompileUnit(cu);
- if (lineTable) {
- // Get the index of the row we're looking for in the line table.
- uint32_t rowIndex = lineTable->lookupAddress(address);
- if (rowIndex != -1U) {
- const DWARFDebugLine::Row &row = lineTable->Rows[rowIndex];
- // Take file/line info from the line table.
- const DWARFDebugLine::FileNameEntry &fileNameEntry =
- lineTable->Prologue.FileNames[row.File - 1];
- fileName = fileNameEntry.Name;
- if (specifier.needs(DILineInfoSpecifier::AbsoluteFilePath) &&
- sys::path::is_relative(fileName.str())) {
- // Append include directory of file (if it is present in line table)
- // and compilation directory of compile unit to make path absolute.
- const char *includeDir = 0;
- if (uint64_t includeDirIndex = fileNameEntry.DirIdx) {
- includeDir = lineTable->Prologue
- .IncludeDirectories[includeDirIndex - 1];
- }
- SmallString<16> absFileName;
- if (includeDir == 0 || sys::path::is_relative(includeDir)) {
- if (const char *compilationDir = cu->getCompilationDir())
- sys::path::append(absFileName, compilationDir);
- }
- if (includeDir) {
- sys::path::append(absFileName, includeDir);
- }
- sys::path::append(absFileName, fileName.str());
- fileName = absFileName;
- }
- line = row.Line;
- column = row.Column;
+ if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+ const DWARFLineTable *LineTable = getLineTableForCompileUnit(CU);
+ const bool NeedsAbsoluteFilePath =
+ Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+ getFileLineInfoForCompileUnit(CU, LineTable, Address,
+ NeedsAbsoluteFilePath,
+ FileName, Line, Column);
+ }
+ return DILineInfo(StringRef(FileName), StringRef(FunctionName),
+ Line, Column);
+}
+
+DIInliningInfo DWARFContext::getInliningInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier) {
+ DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
+ if (!CU)
+ return DIInliningInfo();
+
+ const DWARFDebugInfoEntryMinimal::InlinedChain &InlinedChain =
+ CU->getInlinedChainForAddress(Address);
+ if (InlinedChain.size() == 0)
+ return DIInliningInfo();
+
+ DIInliningInfo InliningInfo;
+ uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
+ const DWARFLineTable *LineTable = 0;
+ for (uint32_t i = 0, n = InlinedChain.size(); i != n; i++) {
+ const DWARFDebugInfoEntryMinimal &FunctionDIE = InlinedChain[i];
+ std::string FileName = "<invalid>";
+ std::string FunctionName = "<invalid>";
+ uint32_t Line = 0;
+ uint32_t Column = 0;
+ // Get function name if necessary.
+ if (Specifier.needs(DILineInfoSpecifier::FunctionName)) {
+ if (const char *Name = FunctionDIE.getSubroutineName(CU))
+ FunctionName = Name;
+ }
+ if (Specifier.needs(DILineInfoSpecifier::FileLineInfo)) {
+ const bool NeedsAbsoluteFilePath =
+ Specifier.needs(DILineInfoSpecifier::AbsoluteFilePath);
+ if (i == 0) {
+ // For the topmost frame, initialize the line table of this
+ // compile unit and fetch file/line info from it.
+ LineTable = getLineTableForCompileUnit(CU);
+ // For the topmost routine, get file/line info from line table.
+ getFileLineInfoForCompileUnit(CU, LineTable, Address,
+ NeedsAbsoluteFilePath,
+ FileName, Line, Column);
+ } else {
+ // Otherwise, use call file, call line and call column from
+ // previous DIE in inlined chain.
+ getFileNameForCompileUnit(CU, LineTable, CallFile,
+ NeedsAbsoluteFilePath, FileName);
+ Line = CallLine;
+ Column = CallColumn;
+ }
+ // Get call file/line/column of a current DIE.
+ if (i + 1 < n) {
+ FunctionDIE.getCallerFrame(CU, CallFile, CallLine, CallColumn);
}
}
+ DILineInfo Frame(StringRef(FileName), StringRef(FunctionName),
+ Line, Column);
+ InliningInfo.addFrame(Frame);
}
- return DILineInfo(fileName, functionName, line, column);
+ return InliningInfo;
}
void DWARFContextInMemory::anchor() { }
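
(Illustrative sketch, not part of the patch.) Querying the two lookups this file now implements, with the default DILineInfoSpecifier shown in DWARFContext.h. It assumes DIContext declares the same virtual lookups that DWARFContext overrides here, and that DILineInfo exposes getFileName()/getLine() and DIInliningInfo exposes getNumberOfFrames()/getFrame() accessors in llvm/DebugInfo/DIContext.h; those accessor names are assumptions, not confirmed by the patch text:

    #include "llvm/DebugInfo/DIContext.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void symbolize(DIContext &Ctx, uint64_t Addr, raw_ostream &OS) {
      // Plain file:line:function lookup for the address.
      DILineInfo Info = Ctx.getLineInfoForAddress(Addr);
      OS << Info.getFileName() << ":" << Info.getLine() << "\n";

      // Full inlined-frame stack for the same address.
      DIInliningInfo Inlining = Ctx.getInliningInfoForAddress(Addr);
      for (uint32_t i = 0, n = Inlining.getNumberOfFrames(); i != n; ++i) {
        DILineInfo Frame = Inlining.getFrame(i);
        OS << "  inlined at " << Frame.getFileName() << ":"
           << Frame.getLine() << "\n";
      }
    }
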
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index e55a27e69840..4001792b3d5f 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -13,6 +13,7 @@
#include "DWARFCompileUnit.h"
#include "DWARFDebugAranges.h"
#include "DWARFDebugLine.h"
+#include "DWARFDebugRangeList.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/SmallVector.h"
@@ -25,21 +26,24 @@ namespace llvm {
/// methods that a concrete implementation provides.
class DWARFContext : public DIContext {
bool IsLittleEndian;
+ const RelocAddrMap &RelocMap;
SmallVector<DWARFCompileUnit, 1> CUs;
OwningPtr<DWARFDebugAbbrev> Abbrev;
OwningPtr<DWARFDebugAranges> Aranges;
OwningPtr<DWARFDebugLine> Line;
- DWARFContext(DWARFContext &); // = delete
- DWARFContext &operator=(DWARFContext &); // = delete
+ DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
+ DWARFContext &operator=(DWARFContext &) LLVM_DELETED_FUNCTION;
/// Read compile units from the debug_info section and store them in CUs.
void parseCompileUnits();
protected:
- DWARFContext(bool isLittleEndian) : IsLittleEndian(isLittleEndian) {}
+ DWARFContext(bool isLittleEndian, const RelocAddrMap &Map) :
+ IsLittleEndian(isLittleEndian), RelocMap(Map) {}
public:
virtual void dump(raw_ostream &OS);
+
/// Get the number of compile units in this context.
unsigned getNumCompileUnits() {
if (CUs.empty())
@@ -53,9 +57,6 @@ public:
return &CUs[index];
}
- /// Return the compile unit that includes an offset (relative to .debug_info).
- DWARFCompileUnit *getCompileUnitForOffset(uint32_t offset);
-
/// Get a pointer to the parsed DebugAbbrev object.
const DWARFDebugAbbrev *getDebugAbbrev();
@@ -66,22 +67,32 @@ public:
const DWARFDebugLine::LineTable *
getLineTableForCompileUnit(DWARFCompileUnit *cu);
- virtual DILineInfo getLineInfoForAddress(uint64_t address,
- DILineInfoSpecifier specifier = DILineInfoSpecifier());
+ virtual DILineInfo getLineInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier());
+ virtual DIInliningInfo getInliningInfoForAddress(uint64_t Address,
+ DILineInfoSpecifier Specifier = DILineInfoSpecifier());
bool isLittleEndian() const { return IsLittleEndian; }
+ const RelocAddrMap &relocMap() const { return RelocMap; }
virtual StringRef getInfoSection() = 0;
virtual StringRef getAbbrevSection() = 0;
virtual StringRef getARangeSection() = 0;
virtual StringRef getLineSection() = 0;
virtual StringRef getStringSection() = 0;
+ virtual StringRef getRangeSection() = 0;
static bool isSupportedVersion(unsigned version) {
return version == 2 || version == 3;
}
-};
+private:
+ /// Return the compile unit that includes an offset (relative to .debug_info).
+ DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset);
+ /// Return the compile unit which contains instruction with provided
+ /// address.
+ DWARFCompileUnit *getCompileUnitForAddress(uint64_t Address);
+};
/// DWARFContextInMemory is the simplest possible implementation of a
/// DWARFContext. It assumes all content is available in memory and stores
@@ -93,19 +104,23 @@ class DWARFContextInMemory : public DWARFContext {
StringRef ARangeSection;
StringRef LineSection;
StringRef StringSection;
+ StringRef RangeSection;
public:
DWARFContextInMemory(bool isLittleEndian,
StringRef infoSection,
StringRef abbrevSection,
StringRef aRangeSection,
StringRef lineSection,
- StringRef stringSection)
- : DWARFContext(isLittleEndian),
+ StringRef stringSection,
+ StringRef rangeSection,
+ const RelocAddrMap &Map = RelocAddrMap())
+ : DWARFContext(isLittleEndian, Map),
InfoSection(infoSection),
AbbrevSection(abbrevSection),
ARangeSection(aRangeSection),
LineSection(lineSection),
- StringSection(stringSection)
+ StringSection(stringSection),
+ RangeSection(rangeSection)
{}
virtual StringRef getInfoSection() { return InfoSection; }
@@ -113,6 +128,7 @@ public:
virtual StringRef getARangeSection() { return ARangeSection; }
virtual StringRef getLineSection() { return LineSection; }
virtual StringRef getStringSection() { return StringSection; }
+ virtual StringRef getRangeSection() { return RangeSection; }
};
}
diff --git a/lib/DebugInfo/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARFDebugAranges.cpp
index ef470e5799cd..f9a34c908f1d 100644
--- a/lib/DebugInfo/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARFDebugAranges.cpp
@@ -62,7 +62,6 @@ bool DWARFDebugAranges::extract(DataExtractor debug_aranges_data) {
uint32_t offset = 0;
typedef std::vector<DWARFDebugArangeSet> SetCollection;
- typedef SetCollection::const_iterator SetCollectionIter;
SetCollection sets;
DWARFDebugArangeSet set;
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
index 429a36c0871e..ab6746445388 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugInfoEntry.cpp --------------------------------------------===//
+//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -101,7 +101,7 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
DataExtractor debug_info_data = cu->getDebugInfoExtractor();
uint64_t abbrCode = debug_info_data.getULEB128(offset_ptr);
- assert (fixed_form_sizes); // For best performance this should be specified!
+ assert(fixed_form_sizes); // For best performance this should be specified!
if (abbrCode) {
uint32_t offset = *offset_ptr;
@@ -126,6 +126,7 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
switch (form) {
// Blocks if inlined data that have a length field and the data bytes
// inlined in the .debug_info.
+ case DW_FORM_exprloc:
case DW_FORM_block:
form_size = debug_info_data.getULEB128(&offset);
break;
@@ -150,6 +151,11 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
form_size = cu->getAddressByteSize();
break;
+ // 0 sized form.
+ case DW_FORM_flag_present:
+ form_size = 0;
+ break;
+
// 1 byte values
case DW_FORM_data1:
case DW_FORM_flag:
@@ -173,6 +179,7 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
// 8 byte values
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
form_size = 8;
break;
@@ -188,6 +195,13 @@ bool DWARFDebugInfoEntryMinimal::extractFast(const DWARFCompileUnit *cu,
form = debug_info_data.getULEB128(&offset);
break;
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ debug_info_data.getU32(offset_ptr);
+ else
+ debug_info_data.getU64(offset_ptr);
+ break;
+
default:
*offset_ptr = Offset;
return false;
@@ -249,6 +263,7 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
switch (form) {
// Blocks if inlined data that have a length field and the data
// bytes // inlined in the .debug_info
+ case DW_FORM_exprloc:
case DW_FORM_block:
form_size = debug_info_data.getULEB128(&offset);
break;
@@ -273,6 +288,11 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
form_size = cu_addr_size;
break;
+ // 0 byte value
+ case DW_FORM_flag_present:
+ form_size = 0;
+ break;
+
// 1 byte values
case DW_FORM_data1:
case DW_FORM_flag:
@@ -299,6 +319,7 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
// 8 byte values
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
form_size = 8;
break;
@@ -314,6 +335,13 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
form_is_indirect = true;
break;
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ debug_info_data.getU32(offset_ptr);
+ else
+ debug_info_data.getU64(offset_ptr);
+ break;
+
default:
*offset_ptr = offset;
return false;
@@ -336,6 +364,16 @@ DWARFDebugInfoEntryMinimal::extract(const DWARFCompileUnit *cu,
return false;
}
+bool DWARFDebugInfoEntryMinimal::isSubprogramDIE() const {
+ return getTag() == DW_TAG_subprogram;
+}
+
+bool DWARFDebugInfoEntryMinimal::isSubroutineDIE() const {
+ uint32_t Tag = getTag();
+ return Tag == DW_TAG_subprogram ||
+ Tag == DW_TAG_inlined_subroutine;
+}
+
uint32_t
DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
const uint16_t attr,
@@ -373,9 +411,10 @@ DWARFDebugInfoEntryMinimal::getAttributeValue(const DWARFCompileUnit *cu,
const char*
DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- const char* fail_value) const {
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ const char* fail_value)
+ const {
DWARFFormValue form_value;
if (getAttributeValue(cu, attr, form_value)) {
DataExtractor stringExtractor(cu->getContext().getStringSection(),
@@ -387,9 +426,9 @@ DWARFDebugInfoEntryMinimal::getAttributeValueAsString(
uint64_t
DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- uint64_t fail_value) const {
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ uint64_t fail_value) const {
DWARFFormValue form_value;
if (getAttributeValue(cu, attr, form_value))
return form_value.getUnsigned();
@@ -398,9 +437,9 @@ DWARFDebugInfoEntryMinimal::getAttributeValueAsUnsigned(
int64_t
DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- int64_t fail_value) const {
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ int64_t fail_value) const {
DWARFFormValue form_value;
if (getAttributeValue(cu, attr, form_value))
return form_value.getSigned();
@@ -409,33 +448,42 @@ DWARFDebugInfoEntryMinimal::getAttributeValueAsSigned(
uint64_t
DWARFDebugInfoEntryMinimal::getAttributeValueAsReference(
- const DWARFCompileUnit* cu,
- const uint16_t attr,
- uint64_t fail_value) const {
+ const DWARFCompileUnit* cu,
+ const uint16_t attr,
+ uint64_t fail_value)
+ const {
DWARFFormValue form_value;
if (getAttributeValue(cu, attr, form_value))
return form_value.getReference(cu);
return fail_value;
}
+bool DWARFDebugInfoEntryMinimal::getLowAndHighPC(const DWARFCompileUnit *CU,
+ uint64_t &LowPC,
+ uint64_t &HighPC) const {
+ HighPC = -1ULL;
+ LowPC = getAttributeValueAsUnsigned(CU, DW_AT_low_pc, -1ULL);
+ if (LowPC != -1ULL)
+ HighPC = getAttributeValueAsUnsigned(CU, DW_AT_high_pc, -1ULL);
+ return (HighPC != -1ULL);
+}
+
void
-DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu,
- DWARFDebugAranges *debug_aranges)
+DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *CU,
+ DWARFDebugAranges *DebugAranges)
const {
if (AbbrevDecl) {
- uint16_t tag = AbbrevDecl->getTag();
- if (tag == DW_TAG_subprogram) {
- uint64_t hi_pc = -1ULL;
- uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL);
- if (lo_pc != -1ULL)
- hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL);
- if (hi_pc != -1ULL)
- debug_aranges->appendRange(cu->getOffset(), lo_pc, hi_pc);
+ if (isSubprogramDIE()) {
+ uint64_t LowPC, HighPC;
+ if (getLowAndHighPC(CU, LowPC, HighPC)) {
+ DebugAranges->appendRange(CU->getOffset(), LowPC, HighPC);
+ }
+ // FIXME: try to append ranges from .debug_ranges section.
}
const DWARFDebugInfoEntryMinimal *child = getFirstChild();
while (child) {
- child->buildAddressRangeTable(cu, debug_aranges);
+ child->buildAddressRangeTable(CU, DebugAranges);
child = child->getSibling();
}
}
@@ -443,51 +491,95 @@ DWARFDebugInfoEntryMinimal::buildAddressRangeTable(const DWARFCompileUnit *cu,
bool
DWARFDebugInfoEntryMinimal::addressRangeContainsAddress(
- const DWARFCompileUnit *cu, const uint64_t address) const {
- if (!isNULL() && getTag() == DW_TAG_subprogram) {
- uint64_t hi_pc = -1ULL;
- uint64_t lo_pc = getAttributeValueAsUnsigned(cu, DW_AT_low_pc, -1ULL);
- if (lo_pc != -1ULL)
- hi_pc = getAttributeValueAsUnsigned(cu, DW_AT_high_pc, -1ULL);
- if (hi_pc != -1ULL) {
- return (lo_pc <= address && address < hi_pc);
- }
+ const DWARFCompileUnit *CU,
+ const uint64_t Address)
+ const {
+ if (isNULL())
+ return false;
+ uint64_t LowPC, HighPC;
+ if (getLowAndHighPC(CU, LowPC, HighPC))
+ return (LowPC <= Address && Address <= HighPC);
+ // Try to get address ranges from .debug_ranges section.
+ uint32_t RangesOffset = getAttributeValueAsReference(CU, DW_AT_ranges, -1U);
+ if (RangesOffset != -1U) {
+ DWARFDebugRangeList RangeList;
+ if (CU->extractRangeList(RangesOffset, RangeList))
+ return RangeList.containsAddress(CU->getBaseAddress(), Address);
}
return false;
}
const char*
-DWARFDebugInfoEntryMinimal::getSubprogramName(
- const DWARFCompileUnit *cu) const {
- if (isNULL() || getTag() != DW_TAG_subprogram)
+DWARFDebugInfoEntryMinimal::getSubroutineName(const DWARFCompileUnit *CU)
+ const {
+ if (!isSubroutineDIE())
return 0;
// Try to get mangled name if possible.
if (const char *name =
- getAttributeValueAsString(cu, DW_AT_MIPS_linkage_name, 0))
+ getAttributeValueAsString(CU, DW_AT_MIPS_linkage_name, 0))
return name;
- if (const char *name = getAttributeValueAsString(cu, DW_AT_linkage_name, 0))
+ if (const char *name = getAttributeValueAsString(CU, DW_AT_linkage_name, 0))
return name;
- if (const char *name = getAttributeValueAsString(cu, DW_AT_name, 0))
+ if (const char *name = getAttributeValueAsString(CU, DW_AT_name, 0))
return name;
// Try to get name from specification DIE.
uint32_t spec_ref =
- getAttributeValueAsReference(cu, DW_AT_specification, -1U);
+ getAttributeValueAsReference(CU, DW_AT_specification, -1U);
if (spec_ref != -1U) {
DWARFDebugInfoEntryMinimal spec_die;
- if (spec_die.extract(cu, &spec_ref)) {
- if (const char *name = spec_die.getSubprogramName(cu))
+ if (spec_die.extract(CU, &spec_ref)) {
+ if (const char *name = spec_die.getSubroutineName(CU))
return name;
}
}
// Try to get name from abstract origin DIE.
uint32_t abs_origin_ref =
- getAttributeValueAsReference(cu, DW_AT_abstract_origin, -1U);
+ getAttributeValueAsReference(CU, DW_AT_abstract_origin, -1U);
if (abs_origin_ref != -1U) {
DWARFDebugInfoEntryMinimal abs_origin_die;
- if (abs_origin_die.extract(cu, &abs_origin_ref)) {
- if (const char *name = abs_origin_die.getSubprogramName(cu))
+ if (abs_origin_die.extract(CU, &abs_origin_ref)) {
+ if (const char *name = abs_origin_die.getSubroutineName(CU))
return name;
}
}
return 0;
}
+
+void DWARFDebugInfoEntryMinimal::getCallerFrame(const DWARFCompileUnit *CU,
+ uint32_t &CallFile,
+ uint32_t &CallLine,
+ uint32_t &CallColumn) const {
+ CallFile = getAttributeValueAsUnsigned(CU, DW_AT_call_file, 0);
+ CallLine = getAttributeValueAsUnsigned(CU, DW_AT_call_line, 0);
+ CallColumn = getAttributeValueAsUnsigned(CU, DW_AT_call_column, 0);
+}
+
+DWARFDebugInfoEntryMinimal::InlinedChain
+DWARFDebugInfoEntryMinimal::getInlinedChainForAddress(
+ const DWARFCompileUnit *CU,
+ const uint64_t Address)
+ const {
+ DWARFDebugInfoEntryMinimal::InlinedChain InlinedChain;
+ if (isNULL())
+ return InlinedChain;
+ for (const DWARFDebugInfoEntryMinimal *DIE = this; DIE; ) {
+ // Append current DIE to inlined chain only if it has correct tag
+ // (e.g. it is not a lexical block).
+ if (DIE->isSubroutineDIE()) {
+ InlinedChain.push_back(*DIE);
+ }
+ // Try to get child which also contains provided address.
+ const DWARFDebugInfoEntryMinimal *Child = DIE->getFirstChild();
+ while (Child) {
+ if (Child->addressRangeContainsAddress(CU, Address)) {
+ // Assume there is only one such child.
+ break;
+ }
+ Child = Child->getSibling();
+ }
+ DIE = Child;
+ }
+ // Reverse the obtained chain to make the root of inlined chain last.
+ std::reverse(InlinedChain.begin(), InlinedChain.end());
+ return InlinedChain;
+}
diff --git a/lib/DebugInfo/DWARFDebugInfoEntry.h b/lib/DebugInfo/DWARFDebugInfoEntry.h
index d5d86b9ec0c1..9c1b2be0a71f 100644
--- a/lib/DebugInfo/DWARFDebugInfoEntry.h
+++ b/lib/DebugInfo/DWARFDebugInfoEntry.h
@@ -11,6 +11,7 @@
#define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
#include "DWARFAbbreviationDeclaration.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -19,6 +20,7 @@ class DWARFDebugAranges;
class DWARFCompileUnit;
class DWARFContext;
class DWARFFormValue;
+class DWARFInlinedSubroutineChain;
/// DWARFDebugInfoEntryMinimal - A DIE with only the minimum required data.
class DWARFDebugInfoEntryMinimal {
@@ -52,6 +54,13 @@ public:
uint32_t getTag() const { return AbbrevDecl ? AbbrevDecl->getTag() : 0; }
bool isNULL() const { return AbbrevDecl == 0; }
+
+ /// Returns true if DIE represents a subprogram (not inlined).
+ bool isSubprogramDIE() const;
+ /// Returns true if DIE represents a subprogram or an inlined
+ /// subroutine.
+ bool isSubroutineDIE() const;
+
uint32_t getOffset() const { return Offset; }
uint32_t getNumAttributes() const {
return !isNULL() ? AbbrevDecl->getNumAttributes() : 0;
@@ -126,17 +135,40 @@ public:
const uint16_t attr,
int64_t fail_value) const;
- void buildAddressRangeTable(const DWARFCompileUnit *cu,
- DWARFDebugAranges *debug_aranges) const;
-
- bool addressRangeContainsAddress(const DWARFCompileUnit *cu,
- const uint64_t address) const;
-
- // If a DIE represents a subprogram, returns its mangled name
- // (or short name, if mangled is missing). This name may be fetched
- // from specification or abstract origin for this subprogram.
- // Returns null if no name is found.
- const char* getSubprogramName(const DWARFCompileUnit *cu) const;
+ /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU.
+ /// Returns true if both attributes are present.
+ bool getLowAndHighPC(const DWARFCompileUnit *CU,
+ uint64_t &LowPC, uint64_t &HighPC) const;
+
+ void buildAddressRangeTable(const DWARFCompileUnit *CU,
+ DWARFDebugAranges *DebugAranges) const;
+
+ bool addressRangeContainsAddress(const DWARFCompileUnit *CU,
+ const uint64_t Address) const;
+
+ /// If a DIE represents a subprogram (or inlined subroutine),
+ /// returns its mangled name (or short name, if mangled is missing).
+ /// This name may be fetched from specification or abstract origin
+ /// for this subprogram. Returns null if no name is found.
+ const char* getSubroutineName(const DWARFCompileUnit *CU) const;
+
+ /// Retrieves values of DW_AT_call_file, DW_AT_call_line and
+ /// DW_AT_call_column from DIE (or zeroes if they are missing).
+ void getCallerFrame(const DWARFCompileUnit *CU, uint32_t &CallFile,
+ uint32_t &CallLine, uint32_t &CallColumn) const;
+
+ /// InlinedChain - represents a chain of inlined_subroutine
+ /// DIEs (possibly ending with a subprogram DIE), all of which are contained
+ /// in some concrete inlined instance tree. The address range of each DIE
+ /// (except the last DIE) in this chain is contained in the address
+ /// range of the next DIE in the chain.
+ typedef SmallVector<DWARFDebugInfoEntryMinimal, 4> InlinedChain;
+
+ /// Get inlined chain for a given address, rooted at the current DIE.
+ /// Returns empty chain if address is not contained in address range
+ /// of current DIE.
+ InlinedChain getInlinedChainForAddress(const DWARFCompileUnit *CU,
+ const uint64_t Address) const;
};
}
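
(Illustrative sketch, not part of the patch.) Reading the call site recorded on an inlined_subroutine DIE with the new getCallerFrame helper; per the doc comment above, absent attributes come back as zero. DIE and CU are assumed to be a parsed entry and its compile unit (lib/DebugInfo-internal types), and dumpCallSite is a hypothetical helper:

    #include "DWARFDebugInfoEntry.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void dumpCallSite(const DWARFDebugInfoEntryMinimal &DIE,
                      const DWARFCompileUnit *CU, raw_ostream &OS) {
      uint32_t CallFile = 0, CallLine = 0, CallColumn = 0;
      DIE.getCallerFrame(CU, CallFile, CallLine, CallColumn);
      OS << "call site: file index " << CallFile << ", line " << CallLine
         << ", column " << CallColumn << "\n";
    }
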
diff --git a/lib/DebugInfo/DWARFDebugLine.cpp b/lib/DebugInfo/DWARFDebugLine.cpp
index d99575d80033..267364adfaca 100644
--- a/lib/DebugInfo/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARFDebugLine.cpp
@@ -10,6 +10,7 @@
#include "DWARFDebugLine.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
using namespace llvm;
@@ -513,3 +514,29 @@ DWARFDebugLine::LineTable::lookupAddress(uint64_t address) const {
}
return index;
}
+
+bool
+DWARFDebugLine::LineTable::getFileNameByIndex(uint64_t FileIndex,
+ bool NeedsAbsoluteFilePath,
+ std::string &Result) const {
+ if (FileIndex == 0 || FileIndex > Prologue.FileNames.size())
+ return false;
+ const FileNameEntry &Entry = Prologue.FileNames[FileIndex - 1];
+ const char *FileName = Entry.Name;
+ if (!NeedsAbsoluteFilePath ||
+ sys::path::is_absolute(FileName)) {
+ Result = FileName;
+ return true;
+ }
+ SmallString<16> FilePath;
+ uint64_t IncludeDirIndex = Entry.DirIdx;
+ // Be defensive about the contents of Entry.
+ if (IncludeDirIndex > 0 &&
+ IncludeDirIndex <= Prologue.IncludeDirectories.size()) {
+ const char *IncludeDir = Prologue.IncludeDirectories[IncludeDirIndex - 1];
+ sys::path::append(FilePath, IncludeDir);
+ }
+ sys::path::append(FilePath, FileName);
+ Result = FilePath.str();
+ return true;
+}
diff --git a/lib/DebugInfo/DWARFDebugLine.h b/lib/DebugInfo/DWARFDebugLine.h
index 6382b45a93ab..586dd7e8784f 100644
--- a/lib/DebugInfo/DWARFDebugLine.h
+++ b/lib/DebugInfo/DWARFDebugLine.h
@@ -12,6 +12,7 @@
#include "llvm/Support/DataExtractor.h"
#include <map>
+#include <string>
#include <vector>
namespace llvm {
@@ -174,6 +175,13 @@ public:
// Returns the index of the row with file/line info for a given address,
// or -1 if there is no such row.
uint32_t lookupAddress(uint64_t address) const;
+
+ // Extracts filename by its index in filename table in prologue.
+ // Returns true on success.
+ bool getFileNameByIndex(uint64_t FileIndex,
+ bool NeedsAbsoluteFilePath,
+ std::string &Result) const;
+
void dump(raw_ostream &OS) const;
struct Prologue Prologue;
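
(Illustrative sketch, not part of the patch.) Resolving an address to file:line with the new getFileNameByIndex helper, assuming LT is a parsed DWARFDebugLine::LineTable for the compile unit covering Addr (DWARFDebugLine.h is a lib/DebugInfo-internal header); resolveFileLine is a hypothetical helper:

    #include "DWARFDebugLine.h"
    #include <string>
    using namespace llvm;

    bool resolveFileLine(const DWARFDebugLine::LineTable *LT, uint64_t Addr,
                         std::string &File, uint32_t &Line) {
      uint32_t RowIndex = LT->lookupAddress(Addr);
      if (RowIndex == -1U)
        return false;  // no row covers this address
      const DWARFDebugLine::Row &Row = LT->Rows[RowIndex];
      if (!LT->getFileNameByIndex(Row.File, /*NeedsAbsoluteFilePath=*/false,
                                  File))
        return false;
      Line = Row.Line;
      return true;
    }
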
diff --git a/lib/DebugInfo/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARFDebugRangeList.cpp
new file mode 100644
index 000000000000..1806beee7285
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugRangeList.cpp
@@ -0,0 +1,67 @@
+//===-- DWARFDebugRangeList.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFDebugRangeList.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+void DWARFDebugRangeList::clear() {
+ Offset = -1U;
+ AddressSize = 0;
+ Entries.clear();
+}
+
+bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr) {
+ clear();
+ if (!data.isValidOffset(*offset_ptr))
+ return false;
+ AddressSize = data.getAddressSize();
+ if (AddressSize != 4 && AddressSize != 8)
+ return false;
+ Offset = *offset_ptr;
+ while (true) {
+ RangeListEntry entry;
+ uint32_t prev_offset = *offset_ptr;
+ entry.StartAddress = data.getAddress(offset_ptr);
+ entry.EndAddress = data.getAddress(offset_ptr);
+ // Check that both values were extracted correctly.
+ if (*offset_ptr != prev_offset + 2 * AddressSize) {
+ clear();
+ return false;
+ }
+ if (entry.isEndOfListEntry())
+ break;
+ Entries.push_back(entry);
+ }
+ return true;
+}
+
+void DWARFDebugRangeList::dump(raw_ostream &OS) const {
+ for (int i = 0, n = Entries.size(); i != n; ++i) {
+ const char *format_str = (AddressSize == 4
+ ? "%08x %08" PRIx64 " %08" PRIx64 "\n"
+ : "%08x %016" PRIx64 " %016" PRIx64 "\n");
+ OS << format(format_str, Offset, Entries[i].StartAddress,
+ Entries[i].EndAddress);
+ }
+ OS << format("%08x <End of list>\n", Offset);
+}
+
+bool DWARFDebugRangeList::containsAddress(uint64_t BaseAddress,
+ uint64_t Address) const {
+ for (int i = 0, n = Entries.size(); i != n; ++i) {
+ if (Entries[i].isBaseAddressSelectionEntry(AddressSize))
+ BaseAddress = Entries[i].EndAddress;
+ else if (Entries[i].containsAddress(BaseAddress, Address))
+ return true;
+ }
+ return false;
+}
diff --git a/lib/DebugInfo/DWARFDebugRangeList.h b/lib/DebugInfo/DWARFDebugRangeList.h
new file mode 100644
index 000000000000..4e34a916f4a3
--- /dev/null
+++ b/lib/DebugInfo/DWARFDebugRangeList.h
@@ -0,0 +1,78 @@
+//===-- DWARFDebugRangeList.h -----------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#define LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+
+#include "llvm/Support/DataExtractor.h"
+#include <vector>
+
+namespace llvm {
+
+class raw_ostream;
+
+class DWARFDebugRangeList {
+public:
+ struct RangeListEntry {
+ // A beginning address offset. This address offset has the size of an
+ // address and is relative to the applicable base address of the
+ // compilation unit referencing this range list. It marks the beginning
+ // of an address range.
+ uint64_t StartAddress;
+ // An ending address offset. This address offset again has the size of
+ // an address and is relative to the applicable base address of the
+ // compilation unit referencing this range list. It marks the first
+ // address past the end of the address range. The ending address must
+ // be greater than or equal to the beginning address.
+ uint64_t EndAddress;
+ // The end of any given range list is marked by an end of list entry,
+ // which consists of a 0 for the beginning address offset
+ // and a 0 for the ending address offset.
+ bool isEndOfListEntry() const {
+ return (StartAddress == 0) && (EndAddress == 0);
+ }
+ // A base address selection entry consists of:
+ // 1. The value of the largest representable address offset
+ // (for example, 0xffffffff when the size of an address is 32 bits).
+ // 2. An address, which defines the appropriate base address for
+ // use in interpreting the beginning and ending address offsets of
+ // subsequent entries of the location list.
+ bool isBaseAddressSelectionEntry(uint8_t AddressSize) const {
+ assert(AddressSize == 4 || AddressSize == 8);
+ if (AddressSize == 4)
+ return StartAddress == -1U;
+ else
+ return StartAddress == -1ULL;
+ }
+ bool containsAddress(uint64_t BaseAddress, uint64_t Address) const {
+ return (BaseAddress + StartAddress <= Address) &&
+ (Address < BaseAddress + EndAddress);
+ }
+ };
+
+private:
+ // Offset in .debug_ranges section.
+ uint32_t Offset;
+ uint8_t AddressSize;
+ std::vector<RangeListEntry> Entries;
+
+public:
+ DWARFDebugRangeList() { clear(); }
+ void clear();
+ void dump(raw_ostream &OS) const;
+ bool extract(DataExtractor data, uint32_t *offset_ptr);
+ /// containsAddress - Returns true if range list contains the given
+ /// address. Has to be passed base address of the compile unit that
+ /// references this range list.
+ bool containsAddress(uint64_t BaseAddress, uint64_t Address) const;
+};
+
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
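
(Illustrative sketch, not part of the patch.) Walking one .debug_ranges contribution with the new class; RangesBytes, AddrSize and Offset are placeholders for data a real caller would take from the object file and the referencing compile unit, and addressInRangeList is a hypothetical helper:

    #include "DWARFDebugRangeList.h"
    #include "llvm/Support/DataExtractor.h"
    using namespace llvm;

    bool addressInRangeList(StringRef RangesBytes, bool IsLittleEndian,
                            uint8_t AddrSize, uint32_t Offset,
                            uint64_t CUBaseAddress, uint64_t Addr) {
      DataExtractor Data(RangesBytes, IsLittleEndian, AddrSize);
      DWARFDebugRangeList List;
      if (!List.extract(Data, &Offset))
        return false;  // malformed or out-of-bounds range list
      return List.containsAddress(CUBaseAddress, Addr);
    }
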
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index ee2a3ab7b789..fea9fd7f7d34 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -10,6 +10,7 @@
#include "DWARFFormValue.h"
#include "DWARFCompileUnit.h"
#include "DWARFContext.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -41,6 +42,10 @@ static const uint8_t form_sizes_addr4[] = {
8, // 0x14 DW_FORM_ref8
0, // 0x15 DW_FORM_ref_udata
0, // 0x16 DW_FORM_indirect
+ 4, // 0x17 DW_FORM_sec_offset
+ 0, // 0x18 DW_FORM_exprloc
+ 0, // 0x19 DW_FORM_flag_present
+ 8, // 0x20 DW_FORM_ref_sig8
};
static const uint8_t form_sizes_addr8[] = {
@@ -67,6 +72,10 @@ static const uint8_t form_sizes_addr8[] = {
8, // 0x14 DW_FORM_ref8
0, // 0x15 DW_FORM_ref_udata
0, // 0x16 DW_FORM_indirect
+ 8, // 0x17 DW_FORM_sec_offset
+ 0, // 0x18 DW_FORM_exprloc
+ 0, // 0x19 DW_FORM_flag_present
+ 8, // 0x20 DW_FORM_ref_sig8
};
const uint8_t *
@@ -90,9 +99,18 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
indirect = false;
switch (Form) {
case DW_FORM_addr:
- case DW_FORM_ref_addr:
- Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
+ case DW_FORM_ref_addr: {
+ RelocAddrMap::const_iterator AI
+ = cu->getContext().relocMap().find(*offset_ptr);
+ if (AI != cu->getContext().relocMap().end()) {
+ const std::pair<uint8_t, int64_t> &R = AI->second;
+ Value.uval = R.second;
+ *offset_ptr += R.first;
+ } else
+ Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
+ }
break;
+ case DW_FORM_exprloc:
case DW_FORM_block:
Value.uval = data.getULEB128(offset_ptr);
is_block = true;
@@ -129,9 +147,17 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
case DW_FORM_sdata:
Value.sval = data.getSLEB128(offset_ptr);
break;
- case DW_FORM_strp:
- Value.uval = data.getU32(offset_ptr);
+ case DW_FORM_strp: {
+ RelocAddrMap::const_iterator AI
+ = cu->getContext().relocMap().find(*offset_ptr);
+ if (AI != cu->getContext().relocMap().end()) {
+ const std::pair<uint8_t, int64_t> &R = AI->second;
+ Value.uval = R.second;
+ *offset_ptr += R.first;
+ } else
+ Value.uval = data.getU32(offset_ptr);
break;
+ }
case DW_FORM_udata:
case DW_FORM_ref_udata:
Value.uval = data.getULEB128(offset_ptr);
@@ -141,12 +167,24 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
// Set the string value to also be the data for inlined cstr form
// values only so we can tell the difference between DW_FORM_string
// and DW_FORM_strp form values
- Value.data = (uint8_t*)Value.cstr;
+ Value.data = (const uint8_t*)Value.cstr;
break;
case DW_FORM_indirect:
Form = data.getULEB128(offset_ptr);
indirect = true;
break;
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ Value.uval = data.getU32(offset_ptr);
+ else
+ Value.uval = data.getU64(offset_ptr);
+ break;
+ case DW_FORM_flag_present:
+ Value.uval = 1;
+ break;
+ case DW_FORM_ref_sig8:
+ Value.uval = data.getU64(offset_ptr);
+ break;
default:
return false;
}
@@ -179,6 +217,7 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
switch (form) {
// Blocks if inlined data that have a length field and the data bytes
// inlined in the .debug_info
+ case DW_FORM_exprloc:
case DW_FORM_block: {
uint64_t size = debug_info_data.getULEB128(offset_ptr);
*offset_ptr += size;
@@ -211,6 +250,10 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
*offset_ptr += cu->getAddressByteSize();
return true;
+ // 0 byte values - implied from the form.
+ case DW_FORM_flag_present:
+ return true;
+
// 1 byte values
case DW_FORM_data1:
case DW_FORM_flag:
@@ -234,6 +277,7 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
// 8 byte values
case DW_FORM_data8:
case DW_FORM_ref8:
+ case DW_FORM_ref_sig8:
*offset_ptr += 8;
return true;
@@ -249,6 +293,15 @@ DWARFFormValue::skipValue(uint16_t form, DataExtractor debug_info_data,
indirect = true;
form = debug_info_data.getULEB128(offset_ptr);
break;
+
+ // 4 for DWARF32, 8 for DWARF64.
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ *offset_ptr += 4;
+ else
+ *offset_ptr += 8;
+ return true;
+
default:
return false;
}
@@ -264,22 +317,26 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
switch (Form) {
case DW_FORM_addr: OS << format("0x%016" PRIx64, uvalue); break;
+ case DW_FORM_flag_present: OS << "true"; break;
case DW_FORM_flag:
case DW_FORM_data1: OS << format("0x%02x", (uint8_t)uvalue); break;
case DW_FORM_data2: OS << format("0x%04x", (uint16_t)uvalue); break;
case DW_FORM_data4: OS << format("0x%08x", (uint32_t)uvalue); break;
+ case DW_FORM_ref_sig8:
case DW_FORM_data8: OS << format("0x%016" PRIx64, uvalue); break;
case DW_FORM_string:
OS << '"';
OS.write_escaped(getAsCString(NULL));
OS << '"';
break;
+ case DW_FORM_exprloc:
case DW_FORM_block:
case DW_FORM_block1:
case DW_FORM_block2:
case DW_FORM_block4:
if (uvalue > 0) {
switch (Form) {
+ case DW_FORM_exprloc:
case DW_FORM_block: OS << format("<0x%" PRIx64 "> ", uvalue); break;
case DW_FORM_block1: OS << format("<0x%2.2x> ", (uint8_t)uvalue); break;
case DW_FORM_block2: OS << format("<0x%4.4x> ", (uint16_t)uvalue); break;
@@ -342,6 +399,14 @@ DWARFFormValue::dump(raw_ostream &OS, const DWARFCompileUnit *cu) const {
case DW_FORM_indirect:
OS << "DW_FORM_indirect";
break;
+
+ case DW_FORM_sec_offset:
+ if (cu->getAddressByteSize() == 4)
+ OS << format("0x%08x", (uint32_t)uvalue);
+ else
+ OS << format("0x%016" PRIx64, uvalue);
+ break;
+
default:
OS << format("DW_FORM(0x%4.4x)", Form);
break;
@@ -404,6 +469,7 @@ const uint8_t *DWARFFormValue::BlockData() const {
bool DWARFFormValue::isBlockForm(uint16_t form) {
switch (form) {
+ case DW_FORM_exprloc:
case DW_FORM_block:
case DW_FORM_block1:
case DW_FORM_block2:
diff --git a/lib/DebugInfo/DWARFFormValue.h b/lib/DebugInfo/DWARFFormValue.h
index 22ac0116646e..c5b590db95f5 100644
--- a/lib/DebugInfo/DWARFFormValue.h
+++ b/lib/DebugInfo/DWARFFormValue.h
@@ -52,7 +52,7 @@ public:
bool extractValue(DataExtractor data, uint32_t *offset_ptr,
const DWARFCompileUnit *cu);
bool isInlinedCStr() const {
- return Value.data != NULL && Value.data == (uint8_t*)Value.cstr;
+ return Value.data != NULL && Value.data == (const uint8_t*)Value.cstr;
}
const uint8_t *BlockData() const;
uint64_t getReference(const DWARFCompileUnit* cu) const;
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index a744d0c1e798..05987f2b74e7 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -29,7 +29,7 @@
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
#include <cstring>
@@ -91,11 +91,11 @@ class GVMemoryBlock : public CallbackVH {
public:
/// \brief Returns the address the GlobalVariable should be written into. The
/// GVMemoryBlock object prefixes that.
- static char *Create(const GlobalVariable *GV, const TargetData& TD) {
+ static char *Create(const GlobalVariable *GV, const DataLayout& TD) {
Type *ElTy = GV->getType()->getElementType();
size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy);
void *RawMemory = ::operator new(
- TargetData::RoundUpAlignment(sizeof(GVMemoryBlock),
+ DataLayout::RoundUpAlignment(sizeof(GVMemoryBlock),
TD.getPreferredAlignment(GV))
+ GVSize);
new(RawMemory) GVMemoryBlock(GV);
@@ -113,7 +113,7 @@ public:
} // anonymous namespace
char *ExecutionEngine::getMemoryForGV(const GlobalVariable *GV) {
- return GVMemoryBlock::Create(GV, *getTargetData());
+ return GVMemoryBlock::Create(GV, *getDataLayout());
}
bool ExecutionEngine::removeModule(Module *M) {
@@ -267,7 +267,7 @@ public:
void *ArgvArray::reset(LLVMContext &C, ExecutionEngine *EE,
const std::vector<std::string> &InputArgv) {
clear(); // Free the old contents.
- unsigned PtrSize = EE->getTargetData()->getPointerSize();
+ unsigned PtrSize = EE->getDataLayout()->getPointerSize();
Array = new char[(InputArgv.size()+1)*PtrSize];
DEBUG(dbgs() << "JIT: ARGV = " << (void*)Array << "\n");
@@ -342,7 +342,7 @@ void ExecutionEngine::runStaticConstructorsDestructors(bool isDtors) {
#ifndef NDEBUG
/// isTargetNullPtr - Return whether the target pointer stored at Loc is null.
static bool isTargetNullPtr(ExecutionEngine *EE, void *Loc) {
- unsigned PtrSize = EE->getTargetData()->getPointerSize();
+ unsigned PtrSize = EE->getDataLayout()->getPointerSize();
for (unsigned i = 0; i < PtrSize; ++i)
if (*(i + (uint8_t*)Loc))
return false;
@@ -501,7 +501,8 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
return 0;
}
- if ((WhichEngine & EngineKind::JIT) && ExecutionEngine::JITCtor == 0) {
+ if ((WhichEngine & EngineKind::JIT) && ExecutionEngine::JITCtor == 0 &&
+ ExecutionEngine::MCJITCtor == 0) {
if (ErrorStr)
*ErrorStr = "JIT has not been linked in.";
}
@@ -643,15 +644,17 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
}
case Instruction::PtrToInt: {
GenericValue GV = getConstantValue(Op0);
- uint32_t PtrWidth = TD->getPointerSizeInBits();
+ uint32_t PtrWidth = TD->getTypeSizeInBits(Op0->getType());
+ assert(PtrWidth <= 64 && "Bad pointer width");
GV.IntVal = APInt(PtrWidth, uintptr_t(GV.PointerVal));
+ uint32_t IntWidth = TD->getTypeSizeInBits(CE->getType());
+ GV.IntVal = GV.IntVal.zextOrTrunc(IntWidth);
return GV;
}
case Instruction::IntToPtr: {
GenericValue GV = getConstantValue(Op0);
- uint32_t PtrWidth = TD->getPointerSizeInBits();
- if (PtrWidth != GV.IntVal.getBitWidth())
- GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth);
+ uint32_t PtrWidth = TD->getTypeSizeInBits(CE->getType());
+ GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth);
assert(GV.IntVal.getBitWidth() <= 64 && "Bad pointer width");
GV.PointerVal = PointerTy(uintptr_t(GV.IntVal.getZExtValue()));
return GV;
@@ -832,7 +835,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
unsigned StoreBytes) {
assert((IntVal.getBitWidth()+7)/8 >= StoreBytes && "Integer too small!");
- uint8_t *Src = (uint8_t *)IntVal.getRawData();
+ const uint8_t *Src = (const uint8_t *)IntVal.getRawData();
if (sys::isLittleEndianHost()) {
// Little-endian host - the source is ordered from LSB to MSB. Order the
@@ -855,7 +858,7 @@ static void StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
GenericValue *Ptr, Type *Ty) {
- const unsigned StoreBytes = getTargetData()->getTypeStoreSize(Ty);
+ const unsigned StoreBytes = getDataLayout()->getTypeStoreSize(Ty);
switch (Ty->getTypeID()) {
case Type::IntegerTyID:
@@ -881,7 +884,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
dbgs() << "Cannot store value of type " << *Ty << "!\n";
}
- if (sys::isLittleEndianHost() != getTargetData()->isLittleEndian())
+ if (sys::isLittleEndianHost() != getDataLayout()->isLittleEndian())
// Host and target are different endian - reverse the stored bytes.
std::reverse((uint8_t*)Ptr, StoreBytes + (uint8_t*)Ptr);
}
@@ -917,7 +920,7 @@ static void LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) {
void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
GenericValue *Ptr,
Type *Ty) {
- const unsigned LoadBytes = getTargetData()->getTypeStoreSize(Ty);
+ const unsigned LoadBytes = getDataLayout()->getTypeStoreSize(Ty);
switch (Ty->getTypeID()) {
case Type::IntegerTyID:
@@ -958,20 +961,20 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) {
unsigned ElementSize =
- getTargetData()->getTypeAllocSize(CP->getType()->getElementType());
+ getDataLayout()->getTypeAllocSize(CP->getType()->getElementType());
for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize);
return;
}
if (isa<ConstantAggregateZero>(Init)) {
- memset(Addr, 0, (size_t)getTargetData()->getTypeAllocSize(Init->getType()));
+ memset(Addr, 0, (size_t)getDataLayout()->getTypeAllocSize(Init->getType()));
return;
}
if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) {
unsigned ElementSize =
- getTargetData()->getTypeAllocSize(CPA->getType()->getElementType());
+ getDataLayout()->getTypeAllocSize(CPA->getType()->getElementType());
for (unsigned i = 0, e = CPA->getNumOperands(); i != e; ++i)
InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize);
return;
@@ -979,7 +982,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) {
const StructLayout *SL =
- getTargetData()->getStructLayout(cast<StructType>(CPS->getType()));
+ getDataLayout()->getStructLayout(cast<StructType>(CPS->getType()));
for (unsigned i = 0, e = CPS->getNumOperands(); i != e; ++i)
InitializeMemory(CPS->getOperand(i), (char*)Addr+SL->getElementOffset(i));
return;
@@ -1126,7 +1129,7 @@ void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
InitializeMemory(GV->getInitializer(), GA);
Type *ElTy = GV->getType()->getElementType();
- size_t GVSize = (size_t)getTargetData()->getTypeAllocSize(ElTy);
+ size_t GVSize = (size_t)getDataLayout()->getTypeAllocSize(ElTy);
NumInitBytes += (unsigned)GVSize;
++NumGlobals;
}
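
(Illustrative sketch, not part of the patch.) The client-visible effect of the TargetData to DataLayout rename in this hunk: callers switch the accessor name while the queries themselves are unchanged. allocSizeOf is a hypothetical helper, and the include paths assume the tree layout this patch itself uses (llvm/DataLayout.h):

    #include "llvm/DataLayout.h"
    #include "llvm/Type.h"
    #include "llvm/ExecutionEngine/ExecutionEngine.h"
    using namespace llvm;

    uint64_t allocSizeOf(ExecutionEngine *EE, Type *Ty) {
      const DataLayout *DL = EE->getDataLayout();  // was getTargetData()
      return DL->getTypeAllocSize(Ty);
    }
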
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 75e680ab3612..1e790e781da0 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -239,7 +239,7 @@ void *LLVMRecompileAndRelinkFunction(LLVMExecutionEngineRef EE, LLVMValueRef Fn)
}
LLVMTargetDataRef LLVMGetExecutionEngineTargetData(LLVMExecutionEngineRef EE) {
- return wrap(unwrap(EE)->getTargetData());
+ return wrap(unwrap(EE)->getDataLayout());
}
void LLVMAddGlobalMapping(LLVMExecutionEngineRef EE, LLVMValueRef Global,
diff --git a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
index 7d67d0d8bee1..348308897dc4 100644
--- a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
@@ -1,11 +1,6 @@
-
-include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-set(system_libs
- ${system_libs}
- jitprofiling
- )
+include_directories( ${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMIntelJITEvents
IntelJITEventListener.cpp
+ jitprofiling.c
)
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index c11c17eac7e2..4cb0270d576d 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -22,12 +22,12 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/ExecutionEngine/IntelJITEventsWrapper.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Errno.h"
#include "llvm/Support/ValueHandle.h"
#include "EventListenerCommon.h"
+#include "IntelJITEventsWrapper.h"
using namespace llvm;
using namespace llvm::jitprofiling;
@@ -37,13 +37,13 @@ namespace {
class IntelJITEventListener : public JITEventListener {
typedef DenseMap<void*, unsigned int> MethodIDMap;
- IntelJITEventsWrapper& Wrapper;
+ OwningPtr<IntelJITEventsWrapper> Wrapper;
MethodIDMap MethodIDs;
FilenameCache Filenames;
public:
- IntelJITEventListener(IntelJITEventsWrapper& libraryWrapper)
- : Wrapper(libraryWrapper) {
+ IntelJITEventListener(IntelJITEventsWrapper* libraryWrapper) {
+ Wrapper.reset(libraryWrapper);
}
~IntelJITEventListener() {
@@ -54,6 +54,10 @@ public:
const EmittedFunctionDetails &Details);
virtual void NotifyFreeingMachineCode(void *OldPtr);
+
+ virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+
+ virtual void NotifyFreeingObject(const ObjectImage &Obj);
};
static LineNumberInfo LineStartToIntelJITFormat(
@@ -94,7 +98,7 @@ static iJIT_Method_Load FunctionDescToIntelJITFormat(
void IntelJITEventListener::NotifyFunctionEmitted(
const Function &F, void *FnStart, size_t FnSize,
const EmittedFunctionDetails &Details) {
- iJIT_Method_Load FunctionMessage = FunctionDescToIntelJITFormat(Wrapper,
+ iJIT_Method_Load FunctionMessage = FunctionDescToIntelJITFormat(*Wrapper,
F.getName().data(),
reinterpret_cast<uint64_t>(FnStart),
FnSize);
@@ -151,32 +155,36 @@ void IntelJITEventListener::NotifyFunctionEmitted(
FunctionMessage.line_number_table = 0;
}
- Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
- &FunctionMessage);
+ Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
+ &FunctionMessage);
MethodIDs[FnStart] = FunctionMessage.method_id;
}
void IntelJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
MethodIDMap::iterator I = MethodIDs.find(FnStart);
if (I != MethodIDs.end()) {
- Wrapper.iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START, &I->second);
+ Wrapper->iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_UNLOAD_START, &I->second);
MethodIDs.erase(I);
}
}
+void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+}
+
+void IntelJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+}
+
} // anonymous namespace.
namespace llvm {
JITEventListener *JITEventListener::createIntelJITEventListener() {
- static OwningPtr<IntelJITEventsWrapper> JITProfilingWrapper(
- new IntelJITEventsWrapper);
- return new IntelJITEventListener(*JITProfilingWrapper);
+ return new IntelJITEventListener(new IntelJITEventsWrapper);
}
// for testing
JITEventListener *JITEventListener::createIntelJITEventListener(
IntelJITEventsWrapper* TestImpl) {
- return new IntelJITEventListener(*TestImpl);
+ return new IntelJITEventListener(TestImpl);
}
} // namespace llvm
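
With this change the listener owns the wrapper it is handed, so the factory simply allocates a fresh one per listener instead of sharing a static instance. A minimal sketch of how a client would attach it, assuming a populated Module *M and that the JIT plus Intel JIT events support are linked in (error handling omitted; attachIntelProfiling is an illustrative name):

    #include "llvm/ExecutionEngine/ExecutionEngine.h"
    #include "llvm/ExecutionEngine/JITEventListener.h"

    void attachIntelProfiling(llvm::Module *M) {
      using namespace llvm;
      ExecutionEngine *EE = EngineBuilder(M).create();
      // The factory now allocates and owns its own IntelJITEventsWrapper.
      JITEventListener *Listener = JITEventListener::createIntelJITEventListener();
      EE->RegisterJITEventListener(Listener);
      // ... emitted functions are reported to the VTune collector from here on.
    }
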
diff --git a/include/llvm/ExecutionEngine/IntelJITEventsWrapper.h b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
index ca873420299c..7ab08e15a8b3 100644
--- a/include/llvm/ExecutionEngine/IntelJITEventsWrapper.h
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h
@@ -18,7 +18,7 @@
#ifndef INTEL_JIT_EVENTS_WRAPPER_H
#define INTEL_JIT_EVENTS_WRAPPER_H
-#include <jitprofiling.h>
+#include "jitprofiling.h"
namespace llvm {
diff --git a/lib/ExecutionEngine/IntelJITEvents/Makefile b/lib/ExecutionEngine/IntelJITEvents/Makefile
index ba75ac6f6462..dcf3126cc529 100644
--- a/lib/ExecutionEngine/IntelJITEvents/Makefile
+++ b/lib/ExecutionEngine/IntelJITEvents/Makefile
@@ -11,7 +11,8 @@ LIBRARYNAME = LLVMIntelJITEvents
include $(LEVEL)/Makefile.config
-SOURCES := IntelJITEventListener.cpp
-CPPFLAGS += -I$(INTEL_JITEVENTS_INCDIR) -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+SOURCES := IntelJITEventListener.cpp \
+ jitprofiling.c
+CPPFLAGS += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h b/lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h
new file mode 100644
index 000000000000..1f029fb1c45b
--- /dev/null
+++ b/lib/ExecutionEngine/IntelJITEvents/ittnotify_config.h
@@ -0,0 +1,454 @@
+/*===-- ittnotify_config.h - JIT Profiling API internal config-----*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file provides Intel(R) Performance Analyzer JIT (Just-In-Time)
+ * Profiling API internal config.
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since this is a piece of code shared from Intel(R)
+ * products. Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base easier.
+ *
+ *===----------------------------------------------------------------------===*/
+#ifndef _ITTNOTIFY_CONFIG_H_
+#define _ITTNOTIFY_CONFIG_H_
+
+/** @cond exclude_from_documentation */
+#ifndef ITT_OS_WIN
+# define ITT_OS_WIN 1
+#endif /* ITT_OS_WIN */
+
+#ifndef ITT_OS_LINUX
+# define ITT_OS_LINUX 2
+#endif /* ITT_OS_LINUX */
+
+#ifndef ITT_OS_MAC
+# define ITT_OS_MAC 3
+#endif /* ITT_OS_MAC */
+
+#ifndef ITT_OS
+# if defined WIN32 || defined _WIN32
+# define ITT_OS ITT_OS_WIN
+# elif defined( __APPLE__ ) && defined( __MACH__ )
+# define ITT_OS ITT_OS_MAC
+# else
+# define ITT_OS ITT_OS_LINUX
+# endif
+#endif /* ITT_OS */
+
+#ifndef ITT_PLATFORM_WIN
+# define ITT_PLATFORM_WIN 1
+#endif /* ITT_PLATFORM_WIN */
+
+#ifndef ITT_PLATFORM_POSIX
+# define ITT_PLATFORM_POSIX 2
+#endif /* ITT_PLATFORM_POSIX */
+
+#ifndef ITT_PLATFORM
+# if ITT_OS==ITT_OS_WIN
+# define ITT_PLATFORM ITT_PLATFORM_WIN
+# else
+# define ITT_PLATFORM ITT_PLATFORM_POSIX
+# endif /* _WIN32 */
+#endif /* ITT_PLATFORM */
+
+#if defined(_UNICODE) && !defined(UNICODE)
+#define UNICODE
+#endif
+
+#include <stddef.h>
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <tchar.h>
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <stdint.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE || _UNICODE */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#ifndef CDECL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define CDECL __cdecl
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+# define CDECL /* not actual on x86_64 platform */
+# else /* _M_X64 || _M_AMD64 || __x86_64__ */
+# define CDECL __attribute__ ((cdecl))
+# endif /* _M_X64 || _M_AMD64 || __x86_64__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* CDECL */
+
+#ifndef STDCALL
+# if ITT_PLATFORM==ITT_PLATFORM_WIN
+# define STDCALL __stdcall
+# else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+# if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+# define STDCALL /* not supported on x86_64 platform */
+# else /* _M_X64 || _M_AMD64 || __x86_64__ */
+# define STDCALL __attribute__ ((stdcall))
+# endif /* _M_X64 || _M_AMD64 || __x86_64__ */
+# endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#endif /* STDCALL */
+
+#define ITTAPI CDECL
+#define LIBITTAPI CDECL
+
+/* TODO: Temporary for compatibility! */
+#define ITTAPI_CALL CDECL
+#define LIBITTAPI_CALL CDECL
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+/* use __forceinline (VC++ specific) */
+#define ITT_INLINE __forceinline
+#define ITT_INLINE_ATTRIBUTE /* nothing */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/*
+ * Generally, functions are not inlined unless optimization is specified.
+ * For functions declared inline, this attribute inlines the function even
+ * if no optimization level was specified.
+ */
+#ifdef __STRICT_ANSI__
+#define ITT_INLINE static
+#else /* __STRICT_ANSI__ */
+#define ITT_INLINE static inline
+#endif /* __STRICT_ANSI__ */
+#define ITT_INLINE_ATTRIBUTE __attribute__ ((always_inline))
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+/** @endcond */
+
+#ifndef ITT_ARCH_IA32
+# define ITT_ARCH_IA32 1
+#endif /* ITT_ARCH_IA32 */
+
+#ifndef ITT_ARCH_IA32E
+# define ITT_ARCH_IA32E 2
+#endif /* ITT_ARCH_IA32E */
+
+#ifndef ITT_ARCH_IA64
+# define ITT_ARCH_IA64 3
+#endif /* ITT_ARCH_IA64 */
+
+#ifndef ITT_ARCH
+# if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+# define ITT_ARCH ITT_ARCH_IA32E
+# elif defined _M_IA64 || defined __ia64
+# define ITT_ARCH ITT_ARCH_IA64
+# else
+# define ITT_ARCH ITT_ARCH_IA32
+# endif
+#endif
+
+#ifdef __cplusplus
+# define ITT_EXTERN_C extern "C"
+#else
+# define ITT_EXTERN_C /* nothing */
+#endif /* __cplusplus */
+
+#define ITT_TO_STR_AUX(x) #x
+#define ITT_TO_STR(x) ITT_TO_STR_AUX(x)
+
+#define __ITT_BUILD_ASSERT(expr, suffix) do { \
+ static char __itt_build_check_##suffix[(expr) ? 1 : -1]; \
+ __itt_build_check_##suffix[0] = 0; \
+} while(0)
+#define _ITT_BUILD_ASSERT(expr, suffix) __ITT_BUILD_ASSERT((expr), suffix)
+#define ITT_BUILD_ASSERT(expr) _ITT_BUILD_ASSERT((expr), __LINE__)
+
+#define ITT_MAGIC { 0xED, 0xAB, 0xAB, 0xEC, 0x0D, 0xEE, 0xDA, 0x30 }
+
+/* Replace with snapshot date YYYYMMDD for promotion build. */
+#define API_VERSION_BUILD 20111111
+
+#ifndef API_VERSION_NUM
+#define API_VERSION_NUM 0.0.0
+#endif /* API_VERSION_NUM */
+
+#define API_VERSION "ITT-API-Version " ITT_TO_STR(API_VERSION_NUM) \
+ " (" ITT_TO_STR(API_VERSION_BUILD) ")"
+
+/* OS communication functions */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+typedef HMODULE lib_t;
+typedef DWORD TIDT;
+typedef CRITICAL_SECTION mutex_t;
+#define MUTEX_INITIALIZER { 0 }
+#define strong_alias(name, aliasname) /* empty for Windows */
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <dlfcn.h>
+#if defined(UNICODE) || defined(_UNICODE)
+#include <wchar.h>
+#endif /* UNICODE */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1 /* need for PTHREAD_MUTEX_RECURSIVE */
+#endif /* _GNU_SOURCE */
+#include <pthread.h>
+typedef void* lib_t;
+typedef pthread_t TIDT;
+typedef pthread_mutex_t mutex_t;
+#define MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER
+#define _strong_alias(name, aliasname) \
+ extern __typeof (name) aliasname __attribute__ ((alias (#name)));
+#define strong_alias(name, aliasname) _strong_alias(name, aliasname)
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define __itt_get_proc(lib, name) GetProcAddress(lib, name)
+#define __itt_mutex_init(mutex) InitializeCriticalSection(mutex)
+#define __itt_mutex_lock(mutex) EnterCriticalSection(mutex)
+#define __itt_mutex_unlock(mutex) LeaveCriticalSection(mutex)
+#define __itt_load_lib(name) LoadLibraryA(name)
+#define __itt_unload_lib(handle) FreeLibrary(handle)
+#define __itt_system_error() (int)GetLastError()
+#define __itt_fstrcmp(s1, s2) lstrcmpA(s1, s2)
+#define __itt_fstrlen(s) lstrlenA(s)
+#define __itt_fstrcpyn(s1, s2, l) lstrcpynA(s1, s2, l)
+#define __itt_fstrdup(s) _strdup(s)
+#define __itt_thread_id() GetCurrentThreadId()
+#define __itt_thread_yield() SwitchToThread()
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+ return InterlockedIncrement(ptr);
+}
+#endif /* ITT_SIMPLE_INIT */
+#else /* ITT_PLATFORM!=ITT_PLATFORM_WIN */
+#define __itt_get_proc(lib, name) dlsym(lib, name)
+#define __itt_mutex_init(mutex) {\
+ pthread_mutexattr_t mutex_attr; \
+ int error_code = pthread_mutexattr_init(&mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_init", \
+ error_code); \
+ error_code = pthread_mutexattr_settype(&mutex_attr, \
+ PTHREAD_MUTEX_RECURSIVE); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_settype", \
+ error_code); \
+ error_code = pthread_mutex_init(mutex, &mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutex_init", \
+ error_code); \
+ error_code = pthread_mutexattr_destroy(&mutex_attr); \
+ if (error_code) \
+ __itt_report_error(__itt_error_system, "pthread_mutexattr_destroy", \
+ error_code); \
+}
+#define __itt_mutex_lock(mutex) pthread_mutex_lock(mutex)
+#define __itt_mutex_unlock(mutex) pthread_mutex_unlock(mutex)
+#define __itt_load_lib(name) dlopen(name, RTLD_LAZY)
+#define __itt_unload_lib(handle) dlclose(handle)
+#define __itt_system_error() errno
+#define __itt_fstrcmp(s1, s2) strcmp(s1, s2)
+#define __itt_fstrlen(s) strlen(s)
+#define __itt_fstrcpyn(s1, s2, l) strncpy(s1, s2, l)
+#define __itt_fstrdup(s) strdup(s)
+#define __itt_thread_id() pthread_self()
+#define __itt_thread_yield() sched_yield()
+#if ITT_ARCH==ITT_ARCH_IA64
+#ifdef __INTEL_COMPILER
+#define __TBB_machine_fetchadd4(addr, val) __fetchadd4_acq((void *)addr, val)
+#else /* __INTEL_COMPILER */
+/* TODO: Add Support for not Intel compilers for IA64 */
+#endif /* __INTEL_COMPILER */
+#else /* ITT_ARCH!=ITT_ARCH_IA64 */
+ITT_INLINE long
+__TBB_machine_fetchadd4(volatile void* ptr, long addend) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __TBB_machine_fetchadd4(volatile void* ptr, long addend)
+{
+ long result;
+ __asm__ __volatile__("lock\nxadd %0,%1"
+ : "=r"(result),"=m"(*(long*)ptr)
+ : "0"(addend), "m"(*(long*)ptr)
+ : "memory");
+ return result;
+}
+#endif /* ITT_ARCH==ITT_ARCH_IA64 */
+#ifndef ITT_SIMPLE_INIT
+ITT_INLINE long
+__itt_interlocked_increment(volatile long* ptr) ITT_INLINE_ATTRIBUTE;
+ITT_INLINE long __itt_interlocked_increment(volatile long* ptr)
+{
+ return __TBB_machine_fetchadd4(ptr, 1) + 1L;
+}
+#endif /* ITT_SIMPLE_INIT */
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+typedef enum {
+ __itt_collection_normal = 0,
+ __itt_collection_paused = 1
+} __itt_collection_state;
+
+typedef enum {
+ __itt_thread_normal = 0,
+ __itt_thread_ignored = 1
+} __itt_thread_state;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_thread_info
+{
+ const char* nameA; /*!< Copy of original name in ASCII. */
+#if defined(UNICODE) || defined(_UNICODE)
+ const wchar_t* nameW; /*!< Copy of original name in UNICODE. */
+#else /* UNICODE || _UNICODE */
+ void* nameW;
+#endif /* UNICODE || _UNICODE */
+ TIDT tid;
+ __itt_thread_state state; /*!< Thread state (paused or normal) */
+ int extra1; /*!< Reserved to the runtime */
+ void* extra2; /*!< Reserved to the runtime */
+ struct ___itt_thread_info* next;
+} __itt_thread_info;
+
+#include "ittnotify_types.h" /* For __itt_group_id definition */
+
+typedef struct ___itt_api_info_20101001
+{
+ const char* name;
+ void** func_ptr;
+ void* init_func;
+ __itt_group_id group;
+} __itt_api_info_20101001;
+
+typedef struct ___itt_api_info
+{
+ const char* name;
+ void** func_ptr;
+ void* init_func;
+ void* null_func;
+ __itt_group_id group;
+} __itt_api_info;
+
+struct ___itt_domain;
+struct ___itt_string_handle;
+
+typedef struct ___itt_global
+{
+ unsigned char magic[8];
+ unsigned long version_major;
+ unsigned long version_minor;
+ unsigned long version_build;
+ volatile long api_initialized;
+ volatile long mutex_initialized;
+ volatile long atomic_counter;
+ mutex_t mutex;
+ lib_t lib;
+ void* error_handler;
+ const char** dll_path_ptr;
+ __itt_api_info* api_list_ptr;
+ struct ___itt_global* next;
+ /* Joinable structures below */
+ __itt_thread_info* thread_list;
+ struct ___itt_domain* domain_list;
+ struct ___itt_string_handle* string_list;
+ __itt_collection_state state;
+} __itt_global;
+
+#pragma pack(pop)
+
+#define NEW_THREAD_INFO_W(gptr,h,h_tail,t,s,n) { \
+ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+ if (h != NULL) { \
+ h->tid = t; \
+ h->nameA = NULL; \
+ h->nameW = n ? _wcsdup(n) : NULL; \
+ h->state = s; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->thread_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_THREAD_INFO_A(gptr,h,h_tail,t,s,n) { \
+ h = (__itt_thread_info*)malloc(sizeof(__itt_thread_info)); \
+ if (h != NULL) { \
+ h->tid = t; \
+ h->nameA = n ? __itt_fstrdup(n) : NULL; \
+ h->nameW = NULL; \
+ h->state = s; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->thread_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_DOMAIN_W(gptr,h,h_tail,name) { \
+ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+ if (h != NULL) { \
+ h->flags = 0; /* domain is disabled by default */ \
+ h->nameA = NULL; \
+ h->nameW = name ? _wcsdup(name) : NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->domain_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_DOMAIN_A(gptr,h,h_tail,name) { \
+ h = (__itt_domain*)malloc(sizeof(__itt_domain)); \
+ if (h != NULL) { \
+ h->flags = 0; /* domain is disabled by default */ \
+ h->nameA = name ? __itt_fstrdup(name) : NULL; \
+ h->nameW = NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->domain_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_STRING_HANDLE_W(gptr,h,h_tail,name) { \
+ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+ if (h != NULL) { \
+ h->strA = NULL; \
+ h->strW = name ? _wcsdup(name) : NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->string_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#define NEW_STRING_HANDLE_A(gptr,h,h_tail,name) { \
+ h = (__itt_string_handle*)malloc(sizeof(__itt_string_handle)); \
+ if (h != NULL) { \
+ h->strA = name ? __itt_fstrdup(name) : NULL; \
+ h->strW = NULL; \
+ h->extra1 = 0; /* reserved */ \
+ h->extra2 = NULL; /* reserved */ \
+ h->next = NULL; \
+ if (h_tail == NULL) \
+ (gptr)->string_list = h; \
+ else \
+ h_tail->next = h; \
+ } \
+}
+
+#endif /* _ITTNOTIFY_CONFIG_H_ */
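
ITT_BUILD_ASSERT above is a pre-C11 compile-time check: a false expression yields an array with negative size and the build fails. A small standalone sketch of the same idea (the macro and function names here are illustrative, not part of the header):

    #define MY_BUILD_ASSERT(expr, suffix) \
      do { static char check_##suffix[(expr) ? 1 : -1]; check_##suffix[0] = 0; } while (0)

    void sanityChecks() {
      MY_BUILD_ASSERT(sizeof(long long) >= 8, longlong_size);   /* compiles */
      /* MY_BUILD_ASSERT(sizeof(char) == 2, bad_char_size);  <- would fail to compile */
    }
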
diff --git a/lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h b/lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h
new file mode 100644
index 000000000000..5df752f66f10
--- /dev/null
+++ b/lib/ExecutionEngine/IntelJITEvents/ittnotify_types.h
@@ -0,0 +1,70 @@
+/*===-- ittnotify_types.h - JIT Profiling API internal types--------*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since this is a piece of code shared from Intel(R)
+ * products. Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base easier.
+ *
+ *===----------------------------------------------------------------------===*/
+#ifndef _ITTNOTIFY_TYPES_H_
+#define _ITTNOTIFY_TYPES_H_
+
+typedef enum ___itt_group_id
+{
+ __itt_group_none = 0,
+ __itt_group_legacy = 1<<0,
+ __itt_group_control = 1<<1,
+ __itt_group_thread = 1<<2,
+ __itt_group_mark = 1<<3,
+ __itt_group_sync = 1<<4,
+ __itt_group_fsync = 1<<5,
+ __itt_group_jit = 1<<6,
+ __itt_group_model = 1<<7,
+ __itt_group_splitter_min = 1<<7,
+ __itt_group_counter = 1<<8,
+ __itt_group_frame = 1<<9,
+ __itt_group_stitch = 1<<10,
+ __itt_group_heap = 1<<11,
+ __itt_group_splitter_max = 1<<12,
+ __itt_group_structure = 1<<12,
+ __itt_group_suppress = 1<<13,
+ __itt_group_all = -1
+} __itt_group_id;
+
+#pragma pack(push, 8)
+
+typedef struct ___itt_group_list
+{
+ __itt_group_id id;
+ const char* name;
+} __itt_group_list;
+
+#pragma pack(pop)
+
+#define ITT_GROUP_LIST(varname) \
+ static __itt_group_list varname[] = { \
+ { __itt_group_all, "all" }, \
+ { __itt_group_control, "control" }, \
+ { __itt_group_thread, "thread" }, \
+ { __itt_group_mark, "mark" }, \
+ { __itt_group_sync, "sync" }, \
+ { __itt_group_fsync, "fsync" }, \
+ { __itt_group_jit, "jit" }, \
+ { __itt_group_model, "model" }, \
+ { __itt_group_counter, "counter" }, \
+ { __itt_group_frame, "frame" }, \
+ { __itt_group_stitch, "stitch" }, \
+ { __itt_group_heap, "heap" }, \
+ { __itt_group_structure, "structure" }, \
+ { __itt_group_suppress, "suppress" }, \
+ { __itt_group_none, NULL } \
+ }
+
+#endif /* _ITTNOTIFY_TYPES_H_ */
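
ITT_GROUP_LIST expands to a static, NULL-terminated table mapping group bits to names. A short sketch of how such a table would be walked (the dumpGroups helper is illustrative, not part of the patch):

    #include <cstdio>
    #include "ittnotify_types.h"

    static void dumpGroups() {
      ITT_GROUP_LIST(group_list);   /* defines: static __itt_group_list group_list[] */
      for (int i = 0; group_list[i].name != NULL; ++i)
        std::printf("group 0x%x: %s\n", (unsigned)group_list[i].id, group_list[i].name);
    }
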
diff --git a/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c b/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
new file mode 100644
index 000000000000..7b507de864cd
--- /dev/null
+++ b/lib/ExecutionEngine/IntelJITEvents/jitprofiling.c
@@ -0,0 +1,481 @@
+/*===-- jitprofiling.c - JIT (Just-In-Time) Profiling API----------*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file provides Intel(R) Performance Analyzer JIT (Just-In-Time)
+ * Profiling API implementation.
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since this is a piece of code shared from Intel(R)
+ * products. Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base easier.
+ *
+ *===----------------------------------------------------------------------===*/
+#include "ittnotify_config.h"
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#include <windows.h>
+#pragma optimize("", off)
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <pthread.h>
+#include <dlfcn.h>
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#include <malloc.h>
+#include <stdlib.h>
+
+#include "jitprofiling.h"
+
+static const char rcsid[] = "\n@(#) $Revision: 243501 $\n";
+
+#define DLL_ENVIRONMENT_VAR "VS_PROFILER"
+
+#ifndef NEW_DLL_ENVIRONMENT_VAR
+#if ITT_ARCH==ITT_ARCH_IA32
+#define NEW_DLL_ENVIRONMENT_VAR "INTEL_JIT_PROFILER32"
+#else
+#define NEW_DLL_ENVIRONMENT_VAR "INTEL_JIT_PROFILER64"
+#endif
+#endif /* NEW_DLL_ENVIRONMENT_VAR */
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+#define DEFAULT_DLLNAME "JitPI.dll"
+HINSTANCE m_libHandle = NULL;
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+#define DEFAULT_DLLNAME "libJitPI.so"
+void* m_libHandle = NULL;
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+/* default location of JIT profiling agent on Android */
+#define ANDROID_JIT_AGENT_PATH "/data/intel/libittnotify.so"
+
+/* the function pointers */
+typedef unsigned int(*TPInitialize)(void);
+static TPInitialize FUNC_Initialize=NULL;
+
+typedef unsigned int(*TPNotify)(unsigned int, void*);
+static TPNotify FUNC_NotifyEvent=NULL;
+
+static iJIT_IsProfilingActiveFlags executionMode = iJIT_NOTHING_RUNNING;
+
+/* end collector dll part. */
+
+/* loadiJIT_Funcs() : this function is called once at the beginning
+ * and is responsible for loading the functions from BistroJavaCollector.dll
+ * result:
+ * on success: the functions are loaded, iJIT_DLL_is_missing=0, return value = 1
+ * on failure: the functions are NULL, iJIT_DLL_is_missing=1, return value = 0
+ */
+static int loadiJIT_Funcs(void);
+
+/* global flag set when the BistroJavaCollector cannot be loaded */
+static int iJIT_DLL_is_missing = 0;
+
+/* Virtual stack - the struct is used as a virtual stack for each thread.
+ * Every thread initializes with a stack of size INIT_TOP_Stack.
+ * Every method entry decreases the current stack pointer,
+ * and when a thread's stack reaches its top of stack (return from the global
+ * function), the top of stack and the current stack increase. Notice that
+ * when returning from a function the stack pointer is the address of
+ * the function return.
+*/
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+static DWORD threadLocalStorageHandle = 0;
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+static pthread_key_t threadLocalStorageHandle = (pthread_key_t)0;
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+#define INIT_TOP_Stack 10000
+
+typedef struct
+{
+ unsigned int TopStack;
+ unsigned int CurrentStack;
+} ThreadStack, *pThreadStack;
+
+/* end of virtual stack. */
+
+/*
+ * The function for reporting virtual-machine related events to VTune.
+ * Note: when reporting iJVM_EVENT_TYPE_ENTER_NIDS, there is no need to fill
+ * in the stack_id field in the iJIT_Method_NIDS structure, as VTune fills it.
+ * The return value for iJVM_EVENT_TYPE_ENTER_NIDS and
+ * iJVM_EVENT_TYPE_LEAVE_NIDS events will be 0 in case of failure.
+ * For an iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED event
+ * it will be -1 if EventSpecificData == 0, otherwise 0.
+*/
+
+ITT_EXTERN_C int JITAPI
+iJIT_NotifyEvent(iJIT_JVM_EVENT event_type, void *EventSpecificData)
+{
+ int ReturnValue;
+
+ /*
+ * This section is for debugging outside of VTune.
+ * It creates the environment variable that indicates call graph mode.
+ * If running outside of VTune, uncomment the code below.
+ *
+ *
+ * static int firstTime = 1;
+ * char DoCallGraph[12] = "DoCallGraph";
+ * if (firstTime)
+ * {
+ * firstTime = 0;
+ * SetEnvironmentVariable( "BISTRO_COLLECTORS_DO_CALLGRAPH", DoCallGraph);
+ * }
+ *
+ * end of section.
+ */
+
+ /* initialization part - the functions have not been loaded yet. This part
+ * will load the functions, and check if we are in Call Graph mode.
+ * (for special treatment).
+ */
+ if (!FUNC_NotifyEvent)
+ {
+ if (iJIT_DLL_is_missing)
+ return 0;
+
+ /* load the functions from the DLL */
+ if (!loadiJIT_Funcs())
+ return 0;
+
+ /* Call Graph initialization. */
+ }
+
+ /* If the event is method entry/exit, check that in the current mode
+ * VTune is allowed to receive it
+ */
+ if ((event_type == iJVM_EVENT_TYPE_ENTER_NIDS ||
+ event_type == iJVM_EVENT_TYPE_LEAVE_NIDS) &&
+ (executionMode != iJIT_CALLGRAPH_ON))
+ {
+ return 0;
+ }
+ /* This section is performed when a method-entry event occurs.
+ * It updates the virtual stack, or creates it if this is the first
+ * method entry in the thread. The stack pointer is decreased.
+ */
+ if (event_type == iJVM_EVENT_TYPE_ENTER_NIDS)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ pThreadStack threadStack =
+ (pThreadStack)TlsGetValue (threadLocalStorageHandle);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pThreadStack threadStack =
+ (pThreadStack)pthread_getspecific(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ /* check for use of reserved method IDs */
+ if ( ((piJIT_Method_NIDS) EventSpecificData)->method_id <= 999 )
+ return 0;
+
+ if (!threadStack)
+ {
+ /* initialize the stack. */
+ threadStack = (pThreadStack) calloc (sizeof(ThreadStack), 1);
+ threadStack->TopStack = INIT_TOP_Stack;
+ threadStack->CurrentStack = INIT_TOP_Stack;
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ TlsSetValue(threadLocalStorageHandle,(void*)threadStack);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pthread_setspecific(threadLocalStorageHandle,(void*)threadStack);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ }
+
+ /* decrease the stack. */
+ ((piJIT_Method_NIDS) EventSpecificData)->stack_id =
+ (threadStack->CurrentStack)--;
+ }
+
+ /* This section is performed when a method-leave event occurs.
+ * It updates the virtual stack.
+ * Increases the stack pointer.
+ * If the stack pointer reaches the top (the global function was left),
+ * increase the pointer and the top pointer.
+ */
+ if (event_type == iJVM_EVENT_TYPE_LEAVE_NIDS)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ pThreadStack threadStack =
+ (pThreadStack)TlsGetValue (threadLocalStorageHandle);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pThreadStack threadStack =
+ (pThreadStack)pthread_getspecific(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ /* check for use of reserved method IDs */
+ if ( ((piJIT_Method_NIDS) EventSpecificData)->method_id <= 999 )
+ return 0;
+
+ if (!threadStack)
+ {
+ /* Error: first report in this thread is method exit */
+ exit (1);
+ }
+
+ ((piJIT_Method_NIDS) EventSpecificData)->stack_id =
+ ++(threadStack->CurrentStack) + 1;
+
+ if (((piJIT_Method_NIDS) EventSpecificData)->stack_id
+ > threadStack->TopStack)
+ ((piJIT_Method_NIDS) EventSpecificData)->stack_id =
+ (unsigned int)-1;
+ }
+
+ if (event_type == iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED)
+ {
+ /* check for use of reserved method IDs */
+ if ( ((piJIT_Method_Load) EventSpecificData)->method_id <= 999 )
+ return 0;
+ }
+
+ ReturnValue = (int)FUNC_NotifyEvent(event_type, EventSpecificData);
+
+ return ReturnValue;
+}
+
+/* The new mode call back routine */
+ITT_EXTERN_C void JITAPI
+iJIT_RegisterCallbackEx(void *userdata, iJIT_ModeChangedEx
+ NewModeCallBackFuncEx)
+{
+ /* is it already missing... or the load of functions from the DLL failed */
+ if (iJIT_DLL_is_missing || !loadiJIT_Funcs())
+ {
+ /* then do not bother with notifications */
+ NewModeCallBackFuncEx(userdata, iJIT_NO_NOTIFICATIONS);
+ /* Error: could not load JIT functions. */
+ return;
+ }
+ /* nothing to do with the callback */
+}
+
+/*
+ * This function allows the user to query in which mode, if at all,
+ * VTune is running.
+ */
+ITT_EXTERN_C iJIT_IsProfilingActiveFlags JITAPI iJIT_IsProfilingActive()
+{
+ if (!iJIT_DLL_is_missing)
+ {
+ loadiJIT_Funcs();
+ }
+
+ return executionMode;
+}
+
+/* this function loads the collector dll (BistroJavaCollector)
+ * and the relevant functions.
+ * on success: all functions are loaded, iJIT_DLL_is_missing = 0, return value = 1
+ * on failure: all functions are NULL, iJIT_DLL_is_missing = 1, return value = 0
+ */
+static int loadiJIT_Funcs()
+{
+ static int bDllWasLoaded = 0;
+ char *dllName = (char*)rcsid; /* !! Just to avoid unused code elimination */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ DWORD dNameLength = 0;
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ if(bDllWasLoaded)
+ {
+ /* dll was already loaded, no need to do it for the second time */
+ return 1;
+ }
+
+ /* Assumes that the DLL will not be found */
+ iJIT_DLL_is_missing = 1;
+ FUNC_NotifyEvent = NULL;
+
+ if (m_libHandle)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ FreeLibrary(m_libHandle);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ dlclose(m_libHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ m_libHandle = NULL;
+ }
+
+ /* Try to get the dll name from the environment */
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ dNameLength = GetEnvironmentVariableA(NEW_DLL_ENVIRONMENT_VAR, NULL, 0);
+ if (dNameLength)
+ {
+ DWORD envret = 0;
+ dllName = (char*)malloc(sizeof(char) * (dNameLength + 1));
+ envret = GetEnvironmentVariableA(NEW_DLL_ENVIRONMENT_VAR,
+ dllName, dNameLength);
+ if (envret)
+ {
+ /* Try to load the dll from the PATH... */
+ m_libHandle = LoadLibraryExA(dllName,
+ NULL, LOAD_WITH_ALTERED_SEARCH_PATH);
+ }
+ free(dllName);
+ } else {
+ /* Try to use old VS_PROFILER variable */
+ dNameLength = GetEnvironmentVariableA(DLL_ENVIRONMENT_VAR, NULL, 0);
+ if (dNameLength)
+ {
+ DWORD envret = 0;
+ dllName = (char*)malloc(sizeof(char) * (dNameLength + 1));
+ envret = GetEnvironmentVariableA(DLL_ENVIRONMENT_VAR,
+ dllName, dNameLength);
+ if (envret)
+ {
+ /* Try to load the dll from the PATH... */
+ m_libHandle = LoadLibraryA(dllName);
+ }
+ free(dllName);
+ }
+ }
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ dllName = getenv(NEW_DLL_ENVIRONMENT_VAR);
+ if (!dllName)
+ dllName = getenv(DLL_ENVIRONMENT_VAR);
+#ifdef ANDROID
+ if (!dllName)
+ dllName = ANDROID_JIT_AGENT_PATH;
+#endif
+ if (dllName)
+ {
+ /* Try to load the dll from the PATH... */
+ m_libHandle = dlopen(dllName, RTLD_LAZY);
+ }
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+
+ if (!m_libHandle)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ m_libHandle = LoadLibraryA(DEFAULT_DLLNAME);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ m_libHandle = dlopen(DEFAULT_DLLNAME, RTLD_LAZY);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ }
+
+ /* if the dll wasn't loaded - exit. */
+ if (!m_libHandle)
+ {
+ iJIT_DLL_is_missing = 1; /* don't try to initialize
+ * JIT agent the second time
+ */
+ return 0;
+ }
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ FUNC_NotifyEvent = (TPNotify)GetProcAddress(m_libHandle, "NotifyEvent");
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ FUNC_NotifyEvent = (TPNotify)dlsym(m_libHandle, "NotifyEvent");
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ if (!FUNC_NotifyEvent)
+ {
+ FUNC_Initialize = NULL;
+ return 0;
+ }
+
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ FUNC_Initialize = (TPInitialize)GetProcAddress(m_libHandle, "Initialize");
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ FUNC_Initialize = (TPInitialize)dlsym(m_libHandle, "Initialize");
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ if (!FUNC_Initialize)
+ {
+ FUNC_NotifyEvent = NULL;
+ return 0;
+ }
+
+ executionMode = (iJIT_IsProfilingActiveFlags)FUNC_Initialize();
+
+ bDllWasLoaded = 1;
+ iJIT_DLL_is_missing = 0; /* DLL is ok. */
+
+ /*
+ * Call Graph mode: init the thread local storage
+ * (need to store the virtual stack there).
+ */
+ if ( executionMode == iJIT_CALLGRAPH_ON )
+ {
+ /* Allocate a thread local storage slot for the thread "stack" */
+ if (!threadLocalStorageHandle)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ threadLocalStorageHandle = TlsAlloc();
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pthread_key_create(&threadLocalStorageHandle, NULL);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ }
+
+ return 1;
+}
+
+/*
+ * This function should be called by the user whenever a thread ends,
+ * to free the thread "virtual stack" storage
+ */
+ITT_EXTERN_C void JITAPI FinalizeThread()
+{
+ if (threadLocalStorageHandle)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ pThreadStack threadStack =
+ (pThreadStack)TlsGetValue (threadLocalStorageHandle);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pThreadStack threadStack =
+ (pThreadStack)pthread_getspecific(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ if (threadStack)
+ {
+ free (threadStack);
+ threadStack = NULL;
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ TlsSetValue (threadLocalStorageHandle, threadStack);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pthread_setspecific(threadLocalStorageHandle, threadStack);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ }
+ }
+}
+
+/*
+ * This function should be called by the user when the process ends,
+ * to free the local storage index
+*/
+ITT_EXTERN_C void JITAPI FinalizeProcess()
+{
+ if (m_libHandle)
+ {
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ FreeLibrary(m_libHandle);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ dlclose(m_libHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ m_libHandle = NULL;
+ }
+
+ if (threadLocalStorageHandle)
+#if ITT_PLATFORM==ITT_PLATFORM_WIN
+ TlsFree (threadLocalStorageHandle);
+#else /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+ pthread_key_delete(threadLocalStorageHandle);
+#endif /* ITT_PLATFORM==ITT_PLATFORM_WIN */
+}
+
+/*
+ * This function should be called by the user once for each method.
+ * It returns a unique method ID; the user should maintain
+ * the ID for each method.
+ */
+ITT_EXTERN_C unsigned int JITAPI iJIT_GetNewMethodID()
+{
+ static unsigned int methodID = 0x100000;
+
+ if (methodID == 0)
+ return 0; /* ERROR : this is not a valid value */
+
+ return methodID++;
+}
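
Per the comments above, the embedding JIT is expected to call the two finalizers itself. A minimal sketch of the expected teardown hooks, assuming jitprofiling.h is on the include path (the hook names are illustrative):

    #include "jitprofiling.h"

    void onWorkerThreadExit() {
      FinalizeThread();    // releases the thread's "virtual stack" slot
    }

    void onJitShutdown() {
      FinalizeProcess();   // unloads the collector library and frees the TLS key
    }
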
diff --git a/lib/ExecutionEngine/IntelJITEvents/jitprofiling.h b/lib/ExecutionEngine/IntelJITEvents/jitprofiling.h
new file mode 100644
index 000000000000..f08e2870dcef
--- /dev/null
+++ b/lib/ExecutionEngine/IntelJITEvents/jitprofiling.h
@@ -0,0 +1,259 @@
+/*===-- jitprofiling.h - JIT Profiling API-------------------------*- C -*-===*
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file provides Intel(R) Performance Analyzer JIT (Just-In-Time)
+ * Profiling API declaration.
+ *
+ * NOTE: This file comes in a style different from the rest of LLVM
+ * source base since this is a piece of code shared from Intel(R)
+ * products. Please do not reformat / re-style this code to make
+ * subsequent merges and contributions from the original source base easier.
+ *
+ *===----------------------------------------------------------------------===*/
+#ifndef __JITPROFILING_H__
+#define __JITPROFILING_H__
+
+/*
+ * Various constants used by functions
+ */
+
+/* event notification */
+typedef enum iJIT_jvm_event
+{
+
+ /* shutdown */
+
+ /*
+ * Program is exiting. EventSpecificData: NA
+ */
+ iJVM_EVENT_TYPE_SHUTDOWN = 2,
+
+ /* JIT profiling */
+
+ /*
+ * issued after method code is jitted into memory but before the code is executed
+ * EventSpecificData is an iJIT_Method_Load
+ */
+ iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED=13,
+
+ /* issued before unload. Method code will no longer be executed, but code
+ * and info are still in memory. The VTune profiler may capture method
+ * code only at this point. EventSpecificData is iJIT_Method_Id
+ */
+ iJVM_EVENT_TYPE_METHOD_UNLOAD_START,
+
+ /* Method Profiling */
+
+ /* method name, ID and stack are supplied;
+ * issued when a method is about to be entered. EventSpecificData is
+ * iJIT_Method_NIDS
+ */
+ iJVM_EVENT_TYPE_ENTER_NIDS = 19,
+
+ /* method name, ID and stack are supplied;
+ * issued when a method is about to be left. EventSpecificData is
+ * iJIT_Method_NIDS
+ */
+ iJVM_EVENT_TYPE_LEAVE_NIDS
+} iJIT_JVM_EVENT;
+
+typedef enum _iJIT_ModeFlags
+{
+ /* No need to Notify VTune, since VTune is not running */
+ iJIT_NO_NOTIFICATIONS = 0x0000,
+
+ /* when turned on the jit must call
+ * iJIT_NotifyEvent
+ * (
+ * iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED,
+ * )
+ * for all the methods already jitted
+ */
+ iJIT_BE_NOTIFY_ON_LOAD = 0x0001,
+
+ /* when turned on the jit must call
+ * iJIT_NotifyEvent
+ * (
+ * iJVM_EVENT_TYPE_METHOD_UNLOAD_FINISHED,
+ * ) for all the methods that are unloaded
+ */
+ iJIT_BE_NOTIFY_ON_UNLOAD = 0x0002,
+
+ /* when turned on the jit must instrument all
+ * the currently jitted code with calls on
+ * method entries
+ */
+ iJIT_BE_NOTIFY_ON_METHOD_ENTRY = 0x0004,
+
+ /* when turned on the jit must instrument all
+ * the currently jitted code with calls
+ * on method exit
+ */
+ iJIT_BE_NOTIFY_ON_METHOD_EXIT = 0x0008
+
+} iJIT_ModeFlags;
+
+
+ /* Flags used by iJIT_IsProfilingActive() */
+typedef enum _iJIT_IsProfilingActiveFlags
+{
+ /* No profiler is running. Currently not used */
+ iJIT_NOTHING_RUNNING = 0x0000,
+
+ /* Sampling is running. This is the default value
+ * returned by iJIT_IsProfilingActive()
+ */
+ iJIT_SAMPLING_ON = 0x0001,
+
+ /* Call Graph is running */
+ iJIT_CALLGRAPH_ON = 0x0002
+
+} iJIT_IsProfilingActiveFlags;
+
+/* Enumerator for the environment of methods */
+typedef enum _iJDEnvironmentType
+{
+ iJDE_JittingAPI = 2
+} iJDEnvironmentType;
+
+/**********************************
+ * Data structures for the events *
+ **********************************/
+
+/* structure for the events:
+ * iJVM_EVENT_TYPE_METHOD_UNLOAD_START
+ */
+
+typedef struct _iJIT_Method_Id
+{
+ /* Id of the method (same as the one passed in
+ * the iJIT_Method_Load struct)
+ */
+ unsigned int method_id;
+
+} *piJIT_Method_Id, iJIT_Method_Id;
+
+
+/* structure for the events:
+ * iJVM_EVENT_TYPE_ENTER_NIDS,
+ * iJVM_EVENT_TYPE_LEAVE_NIDS,
+ * iJVM_EVENT_TYPE_EXCEPTION_OCCURRED_NIDS
+ */
+
+typedef struct _iJIT_Method_NIDS
+{
+ /* unique method ID */
+ unsigned int method_id;
+
+ /* NOTE: no need to fill this field, it's filled by VTune */
+ unsigned int stack_id;
+
+ /* method name (just the method, without the class) */
+ char* method_name;
+} *piJIT_Method_NIDS, iJIT_Method_NIDS;
+
+/* structures for the events:
+ * iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED
+ */
+
+typedef struct _LineNumberInfo
+{
+ /* x86 Offset from the beginning of the method */
+ unsigned int Offset;
+
+ /* source line number from the beginning of the source file */
+ unsigned int LineNumber;
+
+} *pLineNumberInfo, LineNumberInfo;
+
+typedef struct _iJIT_Method_Load
+{
+ /* unique method ID - can be any unique value (except 0 - 999) */
+ unsigned int method_id;
+
+ /* method name (can be with or without the class and signature, in any case
+ * the class name will be added to it)
+ */
+ char* method_name;
+
+ /* virtual address of that method - This determines the method range for the
+ * iJVM_EVENT_TYPE_ENTER/LEAVE_METHOD_ADDR events
+ */
+ void* method_load_address;
+
+ /* Size in memory - Must be exact */
+ unsigned int method_size;
+
+ /* Line Table size in number of entries - Zero if none */
+ unsigned int line_number_size;
+
+ /* Pointer to the beginning of the line numbers info array */
+ pLineNumberInfo line_number_table;
+
+ /* unique class ID */
+ unsigned int class_id;
+
+ /* class file name */
+ char* class_file_name;
+
+ /* source file name */
+ char* source_file_name;
+
+ /* bits supplied by the user for saving in the JIT file */
+ void* user_data;
+
+ /* the size of the user data buffer */
+ unsigned int user_data_size;
+
+ /* NOTE: no need to fill this field, it's filled by VTune */
+ iJDEnvironmentType env;
+
+} *piJIT_Method_Load, iJIT_Method_Load;
+
+/* API Functions */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef CDECL
+# if defined WIN32 || defined _WIN32
+# define CDECL __cdecl
+# else /* defined WIN32 || defined _WIN32 */
+# if defined _M_X64 || defined _M_AMD64 || defined __x86_64__
+# define CDECL /* not actual on x86_64 platform */
+# else /* _M_X64 || _M_AMD64 || __x86_64__ */
+# define CDECL __attribute__ ((cdecl))
+# endif /* _M_X64 || _M_AMD64 || __x86_64__ */
+# endif /* defined WIN32 || defined _WIN32 */
+#endif /* CDECL */
+
+#define JITAPI CDECL
+
+/* called when the settings are changed with new settings */
+typedef void (*iJIT_ModeChangedEx)(void *UserData, iJIT_ModeFlags Flags);
+
+int JITAPI iJIT_NotifyEvent(iJIT_JVM_EVENT event_type, void *EventSpecificData);
+
+/* The new mode call back routine */
+void JITAPI iJIT_RegisterCallbackEx(void *userdata,
+ iJIT_ModeChangedEx NewModeCallBackFuncEx);
+
+iJIT_IsProfilingActiveFlags JITAPI iJIT_IsProfilingActive(void);
+
+void JITAPI FinalizeThread(void);
+
+void JITAPI FinalizeProcess(void);
+
+unsigned int JITAPI iJIT_GetNewMethodID(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __JITPROFILING_H__ */
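
Taken together, the declarations above describe the sequence a JIT goes through when it finishes emitting a method, which is roughly what IntelJITEventListener::NotifyFunctionEmitted does in the listener diff earlier in this patch. A hedged sketch, with placeholder arguments for the name, code address, and size:

    #include <cstring>
    #include "jitprofiling.h"

    void reportMethodLoad(const char *Name, void *Code, unsigned Size) {
      if (iJIT_IsProfilingActive() != iJIT_SAMPLING_ON)
        return;                                  // VTune is not collecting

      iJIT_Method_Load Load;
      std::memset(&Load, 0, sizeof(Load));
      Load.method_id = iJIT_GetNewMethodID();    // IDs 0-999 are reserved
      Load.method_name = const_cast<char*>(Name);
      Load.method_load_address = Code;
      Load.method_size = Size;
      iJIT_NotifyEvent(iJVM_EVENT_TYPE_METHOD_LOAD_FINISHED, &Load);
    }
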
diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index 7a206ebf73d7..e16e2d112a99 100644
--- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -25,7 +25,7 @@
#include "llvm/Config/config.h" // Detect libffi
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/DynamicLibrary.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/ManagedStatic.h"
#include "llvm/Support/Mutex.h"
#include <csignal>
@@ -180,7 +180,7 @@ static void *ffiValueFor(Type *Ty, const GenericValue &AV,
static bool ffiInvoke(RawFunc Fn, Function *F,
const std::vector<GenericValue> &ArgVals,
- const TargetData *TD, GenericValue &Result) {
+ const DataLayout *TD, GenericValue &Result) {
ffi_cif cif;
FunctionType *FTy = F->getFunctionType();
const unsigned NumArgs = F->arg_size();
@@ -276,7 +276,7 @@ GenericValue Interpreter::callExternalFunction(Function *F,
FunctionsLock->release();
GenericValue Result;
- if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getTargetData(), Result))
+ if (RawFn != 0 && ffiInvoke(RawFn, F, ArgVals, getDataLayout(), Result))
return Result;
#endif // USE_LIBFFI
@@ -376,7 +376,7 @@ GenericValue lle_X_sprintf(FunctionType *FT,
case 'x': case 'X':
if (HowLong >= 1) {
if (HowLong == 1 &&
- TheInterpreter->getTargetData()->getPointerSizeInBits() == 64 &&
+ TheInterpreter->getDataLayout()->getPointerSizeInBits() == 64 &&
sizeof(long) < sizeof(int64_t)) {
// Make sure we use %lld with a 64 bit argument because we might be
// compiling LLI on a 32 bit compiler.
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.cpp b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
index 43e34533c7ba..55152dbbea11 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.cpp
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.cpp
@@ -48,7 +48,7 @@ Interpreter::Interpreter(Module *M)
: ExecutionEngine(M), TD(M) {
memset(&ExitValue.Untyped, 0, sizeof(ExitValue.Untyped));
- setTargetData(&TD);
+ setDataLayout(&TD);
// Initialize the "backend"
initializeExecutionEngine();
initializeExternalFunctions();
diff --git a/lib/ExecutionEngine/Interpreter/Interpreter.h b/lib/ExecutionEngine/Interpreter/Interpreter.h
index 28c5775ab468..72c42c15db30 100644
--- a/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -17,7 +17,7 @@
#include "llvm/Function.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
@@ -82,7 +82,7 @@ struct ExecutionContext {
//
class Interpreter : public ExecutionEngine, public InstVisitor<Interpreter> {
GenericValue ExitValue; // The return value of the called function
- TargetData TD;
+ DataLayout TD;
IntrinsicLowering *IL;
// The runtime stack of executing code. The top of the stack is the current
diff --git a/lib/ExecutionEngine/JIT/JIT.cpp b/lib/ExecutionEngine/JIT/JIT.cpp
index 97995ad95c82..1ad338203a2b 100644
--- a/lib/ExecutionEngine/JIT/JIT.cpp
+++ b/lib/ExecutionEngine/JIT/JIT.cpp
@@ -24,7 +24,7 @@
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetJITInfo.h"
#include "llvm/Support/Dwarf.h"
@@ -272,7 +272,7 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
: ExecutionEngine(M), TM(tm), TJI(tji),
JMM(jmm ? jmm : JITMemoryManager::CreateDefaultMemManager()),
AllocateGVsWithCode(GVsWithCode), isAlreadyCodeGenerating(false) {
- setTargetData(TM.getTargetData());
+ setDataLayout(TM.getDataLayout());
jitstate = new JITState(M);
@@ -285,7 +285,7 @@ JIT::JIT(Module *M, TargetMachine &tm, TargetJITInfo &tji,
// Add target data
MutexGuard locked(lock);
FunctionPassManager &PM = jitstate->getPM(locked);
- PM.add(new TargetData(*TM.getTargetData()));
+ PM.add(new DataLayout(*TM.getDataLayout()));
// Turn the machine code intermediate representation into bytes in memory that
// may be executed.
@@ -339,7 +339,7 @@ void JIT::addModule(Module *M) {
jitstate = new JITState(M);
FunctionPassManager &PM = jitstate->getPM(locked);
- PM.add(new TargetData(*TM.getTargetData()));
+ PM.add(new DataLayout(*TM.getDataLayout()));
// Turn the machine code intermediate representation into bytes in memory
// that may be executed.
@@ -370,7 +370,7 @@ bool JIT::removeModule(Module *M) {
jitstate = new JITState(Modules[0]);
FunctionPassManager &PM = jitstate->getPM(locked);
- PM.add(new TargetData(*TM.getTargetData()));
+ PM.add(new DataLayout(*TM.getDataLayout()));
// Turn the machine code intermediate representation into bytes in memory
// that may be executed.
@@ -815,8 +815,8 @@ char* JIT::getMemoryForGV(const GlobalVariable* GV) {
// through the memory manager which puts them near the code but not in the
// same buffer.
Type *GlobalType = GV->getType()->getElementType();
- size_t S = getTargetData()->getTypeAllocSize(GlobalType);
- size_t A = getTargetData()->getPreferredAlignment(GV);
+ size_t S = getDataLayout()->getTypeAllocSize(GlobalType);
+ size_t A = getDataLayout()->getPreferredAlignment(GV);
if (GV->isThreadLocal()) {
MutexGuard locked(lock);
Ptr = TJI.allocateThreadLocalMemory(S);
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
index 42a136e72d45..19c197903a63 100644
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.cpp
@@ -24,7 +24,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -42,7 +42,7 @@ unsigned char* JITDwarfEmitter::EmitDwarfTable(MachineFunction& F,
assert(MMI && "MachineModuleInfo not registered!");
const TargetMachine& TM = F.getTarget();
- TD = TM.getTargetData();
+ TD = TM.getDataLayout();
stackGrowthDirection = TM.getFrameLowering()->getStackGrowthDirection();
RI = TM.getRegisterInfo();
MAI = TM.getMCAsmInfo();
diff --git a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
index 8dc99abc4224..9cdbeac86ace 100644
--- a/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
+++ b/lib/ExecutionEngine/JIT/JITDwarfEmitter.h
@@ -23,12 +23,12 @@ class MachineFunction;
class MachineModuleInfo;
class MachineMove;
class MCAsmInfo;
-class TargetData;
+class DataLayout;
class TargetMachine;
class TargetRegisterInfo;
class JITDwarfEmitter {
- const TargetData* TD;
+ const DataLayout* TD;
JITCodeEmitter* JCE;
const TargetRegisterInfo* RI;
const MCAsmInfo *MAI;
diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp
index ff3a9dc23c5e..ecafda7286f6 100644
--- a/lib/ExecutionEngine/JIT/JITEmitter.cpp
+++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp
@@ -30,7 +30,7 @@
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetJITInfo.h"
#include "llvm/Target/TargetMachine.h"
@@ -384,11 +384,6 @@ namespace {
delete MemMgr;
}
- /// classof - Methods for support type inquiry through isa, cast, and
- /// dyn_cast:
- ///
- static inline bool classof(const MachineCodeEmitter*) { return true; }
-
JITResolver &getJITResolver() { return Resolver; }
virtual void startFunction(MachineFunction &F);
@@ -763,7 +758,7 @@ void JITEmitter::processDebugLoc(DebugLoc DL, bool BeforePrintingInsn) {
}
static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP,
- const TargetData *TD) {
+ const DataLayout *TD) {
const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
if (Constants.empty()) return 0;
@@ -780,7 +775,7 @@ static unsigned GetConstantPoolSizeInBytes(MachineConstantPool *MCP,
void JITEmitter::startFunction(MachineFunction &F) {
DEBUG(dbgs() << "JIT: Starting CodeGen of Function "
- << F.getFunction()->getName() << "\n");
+ << F.getName() << "\n");
uintptr_t ActualSize = 0;
// Set the memory writable, if it's not already
@@ -929,7 +924,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) {
PrevDL = DebugLoc();
DEBUG(dbgs() << "JIT: Finished CodeGen of [" << (void*)FnStart
- << "] Function: " << F.getFunction()->getName()
+ << "] Function: " << F.getName()
<< ": " << (FnEnd-FnStart) << " bytes of text, "
<< Relocations.size() << " relocations\n");
@@ -1058,7 +1053,7 @@ void JITEmitter::emitConstantPool(MachineConstantPool *MCP) {
const std::vector<MachineConstantPoolEntry> &Constants = MCP->getConstants();
if (Constants.empty()) return;
- unsigned Size = GetConstantPoolSizeInBytes(MCP, TheJIT->getTargetData());
+ unsigned Size = GetConstantPoolSizeInBytes(MCP, TheJIT->getDataLayout());
unsigned Align = MCP->getConstantPoolAlignment();
ConstantPoolBase = allocateSpace(Size, Align);
ConstantPool = MCP;
@@ -1087,7 +1082,7 @@ void JITEmitter::emitConstantPool(MachineConstantPool *MCP) {
dbgs().write_hex(CAddr) << "]\n");
Type *Ty = CPE.Val.ConstVal->getType();
- Offset += TheJIT->getTargetData()->getTypeAllocSize(Ty);
+ Offset += TheJIT->getDataLayout()->getTypeAllocSize(Ty);
}
}
@@ -1104,14 +1099,14 @@ void JITEmitter::initJumpTableInfo(MachineJumpTableInfo *MJTI) {
for (unsigned i = 0, e = JT.size(); i != e; ++i)
NumEntries += JT[i].MBBs.size();
- unsigned EntrySize = MJTI->getEntrySize(*TheJIT->getTargetData());
+ unsigned EntrySize = MJTI->getEntrySize(*TheJIT->getDataLayout());
// Just allocate space for all the jump tables now. We will fix up the actual
// MBB entries in the tables after we emit the code for each block, since then
// we will know the final locations of the MBBs in memory.
JumpTable = MJTI;
JumpTableBase = allocateSpace(NumEntries * EntrySize,
- MJTI->getEntryAlignment(*TheJIT->getTargetData()));
+ MJTI->getEntryAlignment(*TheJIT->getDataLayout()));
}
void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) {
@@ -1128,7 +1123,7 @@ void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) {
case MachineJumpTableInfo::EK_BlockAddress: {
// EK_BlockAddress - Each entry is a plain address of block, e.g.:
// .word LBB123
- assert(MJTI->getEntrySize(*TheJIT->getTargetData()) == sizeof(void*) &&
+ assert(MJTI->getEntrySize(*TheJIT->getDataLayout()) == sizeof(void*) &&
"Cross JIT'ing?");
// For each jump table, map each target in the jump table to the address of
@@ -1148,7 +1143,7 @@ void JITEmitter::emitJumpTableInfo(MachineJumpTableInfo *MJTI) {
case MachineJumpTableInfo::EK_Custom32:
case MachineJumpTableInfo::EK_GPRel32BlockAddress:
case MachineJumpTableInfo::EK_LabelDifference32: {
- assert(MJTI->getEntrySize(*TheJIT->getTargetData()) == 4&&"Cross JIT'ing?");
+ assert(MJTI->getEntrySize(*TheJIT->getDataLayout()) == 4&&"Cross JIT'ing?");
// For each jump table, place the offset from the beginning of the table
// to the target address.
int *SlotPtr = (int*)JumpTableBase;
@@ -1224,7 +1219,7 @@ uintptr_t JITEmitter::getJumpTableEntryAddress(unsigned Index) const {
const std::vector<MachineJumpTableEntry> &JT = JumpTable->getJumpTables();
assert(Index < JT.size() && "Invalid jump table index!");
- unsigned EntrySize = JumpTable->getEntrySize(*TheJIT->getTargetData());
+ unsigned EntrySize = JumpTable->getEntrySize(*TheJIT->getDataLayout());
unsigned Offset = 0;
for (unsigned i = 0; i < Index; ++i)
@@ -1265,15 +1260,13 @@ void *JIT::getPointerToFunctionOrStub(Function *F) {
return Addr;
// Get a stub if the target supports it.
- assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
- JITEmitter *JE = cast<JITEmitter>(getCodeEmitter());
+ JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
return JE->getJITResolver().getLazyFunctionStub(F);
}
void JIT::updateFunctionStub(Function *F) {
// Get the empty stub we generated earlier.
- assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
- JITEmitter *JE = cast<JITEmitter>(getCodeEmitter());
+ JITEmitter *JE = static_cast<JITEmitter*>(getCodeEmitter());
void *Stub = JE->getJITResolver().getLazyFunctionStub(F);
void *Addr = getPointerToGlobalIfAvailable(F);
assert(Addr != Stub && "Function must have non-stub address to be updated.");
@@ -1294,6 +1287,5 @@ void JIT::freeMachineCodeForFunction(Function *F) {
updateGlobalMapping(F, 0);
// Free the actual memory for the function body and related stuff.
- assert(isa<JITEmitter>(JCE) && "Unexpected MCE?");
- cast<JITEmitter>(JCE)->deallocateMemForFunction(F);
+ static_cast<JITEmitter*>(JCE)->deallocateMemForFunction(F);
}
diff --git a/lib/ExecutionEngine/MCJIT/CMakeLists.txt b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
index fef71768b493..2911a5077220 100644
--- a/lib/ExecutionEngine/MCJIT/CMakeLists.txt
+++ b/lib/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -1,4 +1,3 @@
add_llvm_library(LLVMMCJIT
MCJIT.cpp
- MCJITMemoryManager.cpp
)
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 739ffd7d85da..752c5b73ea32 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -8,18 +8,20 @@
//===----------------------------------------------------------------------===//
#include "MCJIT.h"
-#include "MCJITMemoryManager.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/MutexGuard.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
@@ -44,24 +46,20 @@ ExecutionEngine *MCJIT::createJIT(Module *M,
// FIXME: Don't do this here.
sys::DynamicLibrary::LoadLibraryPermanently(0, NULL);
- // If the target supports JIT code generation, create the JIT.
- if (TargetJITInfo *TJ = TM->getJITInfo())
- return new MCJIT(M, TM, *TJ, new MCJITMemoryManager(JMM), GVsWithCode);
-
- if (ErrorStr)
- *ErrorStr = "target does not support JIT code generation";
- return 0;
+ return new MCJIT(M, TM, JMM, GVsWithCode);
}
-MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji,
- RTDyldMemoryManager *MM, bool AllocateGVsWithCode)
- : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(MM), Dyld(MM),
- isCompiled(false), M(m), OS(Buffer) {
+MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
+ bool AllocateGVsWithCode)
+ : ExecutionEngine(m), TM(tm), Ctx(0), MemMgr(MM), Dyld(MM),
+ isCompiled(false), M(m) {
- setTargetData(TM->getTargetData());
+ setDataLayout(TM->getDataLayout());
}
MCJIT::~MCJIT() {
+ if (LoadedObject)
+ NotifyFreeingObject(*LoadedObject.get());
delete MemMgr;
delete TM;
}
@@ -69,7 +67,7 @@ MCJIT::~MCJIT() {
void MCJIT::emitObject(Module *m) {
/// Currently, MCJIT only supports a single module and the module passed to
/// this function call is expected to be the contained module. The module
- /// is passed as a parameter here to prepare for multiple module support in
+ /// is passed as a parameter here to prepare for multiple module support in
/// the future.
assert(M == m);
@@ -84,41 +82,65 @@ void MCJIT::emitObject(Module *m) {
PassManager PM;
- PM.add(new TargetData(*TM->getTargetData()));
+ PM.add(new DataLayout(*TM->getDataLayout()));
+
+ // The RuntimeDyld will take ownership of this shortly
+ OwningPtr<ObjectBufferStream> Buffer(new ObjectBufferStream());
// Turn the machine code intermediate representation into bytes in memory
// that may be executed.
- if (TM->addPassesToEmitMC(PM, Ctx, OS, false)) {
+ if (TM->addPassesToEmitMC(PM, Ctx, Buffer->getOStream(), false)) {
report_fatal_error("Target does not support MC emission!");
}
// Initialize passes.
- // FIXME: When we support multiple modules, we'll want to move the code
- // gen and finalization out of the constructor here and do it more
- // on-demand as part of getPointerToFunction().
PM.run(*m);
- // Flush the output buffer so the SmallVector gets its data.
- OS.flush();
+ // Flush the output buffer to get the generated code into memory
+ Buffer->flush();
// Load the object into the dynamic linker.
- MemoryBuffer* MB = MemoryBuffer::getMemBuffer(StringRef(Buffer.data(),
- Buffer.size()),
- "", false);
- if (Dyld.loadObject(MB))
+ // handing off ownership of the buffer
+ LoadedObject.reset(Dyld.loadObject(Buffer.take()));
+ if (!LoadedObject)
report_fatal_error(Dyld.getErrorString());
// Resolve any relocations.
Dyld.resolveRelocations();
+ // FIXME: Make this optional, maybe even move it to a JIT event listener
+ LoadedObject->registerWithDebugger();
+
+ NotifyObjectEmitted(*LoadedObject);
+
// FIXME: Add support for per-module compilation state
isCompiled = true;
}
+// FIXME: Add a parameter to identify which object is being finalized when
+// MCJIT supports multiple modules.
+void MCJIT::finalizeObject() {
+ // If the module hasn't been compiled, just do that.
+ if (!isCompiled) {
+ // If the call to Dyld.resolveRelocations() is removed from emitObject()
+ // we'll need to do that here.
+ emitObject(M);
+ return;
+ }
+
+ // Resolve any relocations.
+ Dyld.resolveRelocations();
+}
+
void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
report_fatal_error("not yet implemented");
}
void *MCJIT::getPointerToFunction(Function *F) {
+ // FIXME: This should really return a uint64_t since it's a pointer in the
+ // target address space, not our local address space. That's part of the
+ // ExecutionEngine interface, though. Fix that when the old JIT finally
+ // dies.
+
// FIXME: Add support for per-module compilation state
if (!isCompiled)
emitObject(M);
@@ -132,10 +154,13 @@ void *MCJIT::getPointerToFunction(Function *F) {
// FIXME: Should the Dyld be retaining module information? Probably not.
// FIXME: Should we be using the mangler for this? Probably.
+ //
+ // This is the accessor for the target address, so make sure to check the
+ // load address of the symbol, not the local address.
StringRef BaseName = F->getName();
if (BaseName[0] == '\1')
- return (void*)Dyld.getSymbolAddress(BaseName.substr(1));
- return (void*)Dyld.getSymbolAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+ return (void*)Dyld.getSymbolLoadAddress(BaseName.substr(1));
+ return (void*)Dyld.getSymbolLoadAddress((TM->getMCAsmInfo()->getGlobalPrefix()
+ BaseName).str());
}
@@ -270,3 +295,33 @@ void *MCJIT::getPointerToNamedFunction(const std::string &Name,
}
return 0;
}
+
+void MCJIT::RegisterJITEventListener(JITEventListener *L) {
+ if (L == NULL)
+ return;
+ MutexGuard locked(lock);
+ EventListeners.push_back(L);
+}
+void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
+ if (L == NULL)
+ return;
+ MutexGuard locked(lock);
+ SmallVector<JITEventListener*, 2>::reverse_iterator I=
+ std::find(EventListeners.rbegin(), EventListeners.rend(), L);
+ if (I != EventListeners.rend()) {
+ std::swap(*I, EventListeners.back());
+ EventListeners.pop_back();
+ }
+}
+void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
+ MutexGuard locked(lock);
+ for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+ EventListeners[I]->NotifyObjectEmitted(Obj);
+ }
+}
+void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
+ MutexGuard locked(lock);
+ for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+ EventListeners[I]->NotifyFreeingObject(Obj);
+ }
+}
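
The listener plumbing added above is easiest to see from the client side. Below is a minimal sketch, not part of this patch, of a listener observing the new notifications; it assumes the JITEventListener interface declares the ObjectImage-based NotifyObjectEmitted/NotifyFreeingObject hooks that MCJIT invokes here, and PrintingListener is an illustrative name.

#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/Support/raw_ostream.h"

namespace {
// Illustrative listener: report when MCJIT emits or is about to free an object.
class PrintingListener : public llvm::JITEventListener {
public:
  virtual void NotifyObjectEmitted(const llvm::ObjectImage &Obj) {
    llvm::errs() << "MCJIT emitted an object image\n";
  }
  virtual void NotifyFreeingObject(const llvm::ObjectImage &Obj) {
    llvm::errs() << "MCJIT is freeing an object image\n";
  }
};
} // end anonymous namespace

// Usage, where EE is an ExecutionEngine created in MCJIT mode:
//   PrintingListener Listener;
//   EE->RegisterJITEventListener(&Listener);
//   EE->finalizeObject();            // emitObject() fires NotifyObjectEmitted
//   EE->UnregisterJITEventListener(&Listener);
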
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 1d272e9d9b60..571080d2bd22 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -11,33 +11,32 @@
#define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/raw_ostream.h"
namespace llvm {
+class ObjectImage;
+
// FIXME: This makes all kinds of horrible assumptions for the time being,
// like only having one module, not needing to worry about multi-threading,
// blah blah. Purely in get-it-up-and-limping mode for now.
class MCJIT : public ExecutionEngine {
- MCJIT(Module *M, TargetMachine *tm, TargetJITInfo &tji,
- RTDyldMemoryManager *MemMgr, bool AllocateGVsWithCode);
+ MCJIT(Module *M, TargetMachine *tm, RTDyldMemoryManager *MemMgr,
+ bool AllocateGVsWithCode);
TargetMachine *TM;
MCContext *Ctx;
RTDyldMemoryManager *MemMgr;
RuntimeDyld Dyld;
+ SmallVector<JITEventListener*, 2> EventListeners;
// FIXME: Add support for multiple modules
bool isCompiled;
Module *M;
-
- // FIXME: Move these to a single container which manages JITed objects
- SmallVector<char, 4096> Buffer; // Working buffer into which we JIT.
- raw_svector_ostream OS;
+ OwningPtr<ObjectImage> LoadedObject;
public:
~MCJIT();
@@ -45,6 +44,8 @@ public:
/// @name ExecutionEngine interface implementation
/// @{
+ virtual void finalizeObject();
+
virtual void *getPointerToBasicBlock(BasicBlock *BB);
virtual void *getPointerToFunction(Function *F);
@@ -71,10 +72,14 @@ public:
/// Map the address of a JIT section as returned from the memory manager
/// to the address in the target process as the running code will see it.
/// This is the address which will be used for relocation resolution.
- virtual void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress) {
+ virtual void mapSectionAddress(const void *LocalAddress,
+ uint64_t TargetAddress) {
Dyld.mapSectionAddress(LocalAddress, TargetAddress);
}
+ virtual void RegisterJITEventListener(JITEventListener *L);
+ virtual void UnregisterJITEventListener(JITEventListener *L);
+
/// @}
/// @name (Private) Registration Interfaces
/// @{
@@ -98,6 +103,9 @@ protected:
/// is passed as a parameter here to prepare for multiple module support in
/// the future.
void emitObject(Module *M);
+
+ void NotifyObjectEmitted(const ObjectImage& Obj);
+ void NotifyFreeingObject(const ObjectImage& Obj);
};
} // End llvm namespace
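
For context, the new finalizeObject() entry point is meant to be driven from client code roughly as sketched below; this is an illustration assuming the EngineBuilder API of this tree (setUseMCJIT/setJITMemoryManager) and a caller-owned Module* named Mod, not code from the patch.

#include "llvm/Module.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"

// Mod is an llvm::Module* the caller already owns.
std::string Error;
llvm::ExecutionEngine *EE =
    llvm::EngineBuilder(Mod)
        .setErrorStr(&Error)
        .setUseMCJIT(true)
        .setJITMemoryManager(llvm::JITMemoryManager::CreateDefaultMemManager())
        .create();
EE->finalizeObject();                          // compile and relocate up front
void *Main = EE->getPointerToFunction(Mod->getFunction("main"));
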
diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.cpp b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.cpp
deleted file mode 100644
index 457fe5e3ef06..000000000000
--- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.cpp
+++ /dev/null
@@ -1,14 +0,0 @@
-//==-- MCJITMemoryManager.cpp - Definition for the Memory Manager -*-C++ -*-==//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCJITMemoryManager.h"
-
-using namespace llvm;
-
-void MCJITMemoryManager::anchor() { }
diff --git a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h b/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
deleted file mode 100644
index 441aaeb5ecac..000000000000
--- a/lib/ExecutionEngine/MCJIT/MCJITMemoryManager.h
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- MCJITMemoryManager.h - Definition for the Memory Manager ---C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_EXECUTIONENGINE_MCJITMEMORYMANAGER_H
-#define LLVM_LIB_EXECUTIONENGINE_MCJITMEMORYMANAGER_H
-
-#include "llvm/Module.h"
-#include "llvm/ExecutionEngine/JITMemoryManager.h"
-#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include <assert.h>
-
-namespace llvm {
-
-// The MCJIT memory manager is a layer between the standard JITMemoryManager
-// and the RuntimeDyld interface that maps objects, by name, onto their
-// matching LLVM IR counterparts in the module(s) being compiled.
-class MCJITMemoryManager : public RTDyldMemoryManager {
- virtual void anchor();
- OwningPtr<JITMemoryManager> JMM;
-
-public:
- MCJITMemoryManager(JITMemoryManager *jmm) :
- JMM(jmm?jmm:JITMemoryManager::CreateDefaultMemManager()) {}
-
- uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID) {
- return JMM->allocateDataSection(Size, Alignment, SectionID);
- }
-
- uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
- unsigned SectionID) {
- return JMM->allocateCodeSection(Size, Alignment, SectionID);
- }
-
- virtual void *getPointerToNamedFunction(const std::string &Name,
- bool AbortOnFailure = true) {
- return JMM->getPointerToNamedFunction(Name, AbortOnFailure);
- }
-
-};
-
-} // End llvm namespace
-
-#endif
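
With the adapter above removed, the JITMemoryManager is handed to RuntimeDyld directly, so a client that wants custom allocation now implements RTDyldMemoryManager itself, using the three hooks the deleted header forwarded. The sketch below is illustrative only: TrivialMemoryManager and the RWX allocation strategy are assumptions, and a real manager would track sections, fix page permissions, and release memory.

#include "llvm/ExecutionEngine/RuntimeDyld.h"
#include "llvm/Support/Memory.h"
#include <string>

// Hypothetical allocator: hands out RWX pages for both code and data and
// resolves no external symbols.
class TrivialMemoryManager : public llvm::RTDyldMemoryManager {
public:
  virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
                                       unsigned SectionID) {
    return (uint8_t*)llvm::sys::Memory::AllocateRWX(Size, 0, 0).base();
  }
  virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
                                       unsigned SectionID) {
    return (uint8_t*)llvm::sys::Memory::AllocateRWX(Size, 0, 0).base();
  }
  virtual void *getPointerToNamedFunction(const std::string &Name,
                                          bool AbortOnFailure = true) {
    return 0;  // resolve externals here
  }
};
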

diff --git a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
index 8b5010142241..50cd0724ea4f 100644
--- a/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/GDBRegistrar.cpp
@@ -78,12 +78,12 @@ public:
/// Creates an entry in the JIT registry for the buffer @p Object,
/// which must contain an object file in executable memory with any
/// debug information for the debugger.
- void registerObject(const MemoryBuffer &Object);
+ void registerObject(const ObjectBuffer &Object);
/// Removes the internal registration of @p Object, and
/// frees associated resources.
/// Returns true if @p Object was found in ObjectBufferMap.
- bool deregisterObject(const MemoryBuffer &Object);
+ bool deregisterObject(const ObjectBuffer &Object);
private:
/// Deregister the debug info for the given object file from the debugger
@@ -124,7 +124,7 @@ GDBJITRegistrar::~GDBJITRegistrar() {
ObjectBufferMap.clear();
}
-void GDBJITRegistrar::registerObject(const MemoryBuffer &Object) {
+void GDBJITRegistrar::registerObject(const ObjectBuffer &Object) {
const char *Buffer = Object.getBufferStart();
size_t Size = Object.getBufferSize();
@@ -147,7 +147,7 @@ void GDBJITRegistrar::registerObject(const MemoryBuffer &Object) {
}
}
-bool GDBJITRegistrar::deregisterObject(const MemoryBuffer& Object) {
+bool GDBJITRegistrar::deregisterObject(const ObjectBuffer& Object) {
const char *Buffer = Object.getBufferStart();
RegisteredObjectBufferMap::iterator I = ObjectBufferMap.find(Buffer);
diff --git a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
index f964bc61829b..69e9dbe490d6 100644
--- a/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
+++ b/lib/ExecutionEngine/RuntimeDyld/JITRegistrar.h
@@ -10,7 +10,7 @@
#ifndef LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
#define LLVM_EXECUTION_ENGINE_JIT_REGISTRAR_H
-#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
namespace llvm {
@@ -27,12 +27,12 @@ public:
/// Creates an entry in the JIT registry for the buffer @p Object,
/// which must contain an object file in executable memory with any
/// debug information for the debugger.
- virtual void registerObject(const MemoryBuffer &Object) = 0;
+ virtual void registerObject(const ObjectBuffer &Object) = 0;
/// Removes the internal registration of @p Object, and
/// frees associated resources.
/// Returns true if @p Object was previously registered.
- virtual bool deregisterObject(const MemoryBuffer &Object) = 0;
+ virtual bool deregisterObject(const ObjectBuffer &Object) = 0;
/// Returns a reference to a GDB JIT registrar singleton
static JITRegistrar& getGDBRegistrar();
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
index c3e3572f3bcc..17f3a2146492 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImageCommon.h
@@ -1,59 +1,76 @@
-//===---- ObjectImage.h - Format independent executuable object image -----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares a file format independent ObjectImage class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
-#define LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
-
-#include "llvm/Object/ObjectFile.h"
-
-namespace llvm {
-
-class ObjectImage {
- ObjectImage(); // = delete
- ObjectImage(const ObjectImage &other); // = delete
-protected:
- object::ObjectFile *ObjFile;
-
-public:
- ObjectImage(object::ObjectFile *Obj) { ObjFile = Obj; }
- virtual ~ObjectImage() {}
-
- virtual object::symbol_iterator begin_symbols() const
- { return ObjFile->begin_symbols(); }
- virtual object::symbol_iterator end_symbols() const
- { return ObjFile->end_symbols(); }
-
- virtual object::section_iterator begin_sections() const
- { return ObjFile->begin_sections(); }
- virtual object::section_iterator end_sections() const
- { return ObjFile->end_sections(); }
-
- virtual /* Triple::ArchType */ unsigned getArch() const
- { return ObjFile->getArch(); }
-
- // Subclasses can override these methods to update the image with loaded
- // addresses for sections and common symbols
- virtual void updateSectionAddress(const object::SectionRef &Sec,
- uint64_t Addr) {}
- virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr)
- {}
-
- // Subclasses can override these methods to provide JIT debugging support
- virtual void registerWithDebugger() {}
- virtual void deregisterWithDebugger() {}
-};
-
-} // end namespace llvm
-
-#endif // LLVM_RUNTIMEDYLD_OBJECT_IMAGE_H
-
+//===-- ObjectImageCommon.h - Format independent executable object image --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares a file format independent implementation of the
+// ObjectImage class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+#define LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
+
+namespace llvm {
+
+class ObjectImageCommon : public ObjectImage {
+ ObjectImageCommon(); // = delete
+ ObjectImageCommon(const ObjectImageCommon &other); // = delete
+
+protected:
+ object::ObjectFile *ObjFile;
+
+ // This form of the constructor allows subclasses to use
+ // format-specific subclasses of ObjectFile directly
+ ObjectImageCommon(ObjectBuffer *Input, object::ObjectFile *Obj)
+ : ObjectImage(Input), // saves Input as Buffer and takes ownership
+ ObjFile(Obj)
+ {
+ }
+
+public:
+ ObjectImageCommon(ObjectBuffer* Input)
+ : ObjectImage(Input) // saves Input as Buffer and takes ownership
+ {
+ ObjFile = object::ObjectFile::createObjectFile(Buffer->getMemBuffer());
+ }
+ virtual ~ObjectImageCommon() { delete ObjFile; }
+
+ virtual object::symbol_iterator begin_symbols() const
+ { return ObjFile->begin_symbols(); }
+ virtual object::symbol_iterator end_symbols() const
+ { return ObjFile->end_symbols(); }
+
+ virtual object::section_iterator begin_sections() const
+ { return ObjFile->begin_sections(); }
+ virtual object::section_iterator end_sections() const
+ { return ObjFile->end_sections(); }
+
+ virtual /* Triple::ArchType */ unsigned getArch() const
+ { return ObjFile->getArch(); }
+
+ virtual StringRef getData() const { return ObjFile->getData(); }
+
+ // Subclasses can override these methods to update the image with loaded
+ // addresses for sections and common symbols
+ virtual void updateSectionAddress(const object::SectionRef &Sec,
+ uint64_t Addr) {}
+ virtual void updateSymbolAddress(const object::SymbolRef &Sym, uint64_t Addr)
+ {}
+
+ // Subclasses can override these methods to provide JIT debugging support
+ virtual void registerWithDebugger() {}
+ virtual void deregisterWithDebugger() {}
+};
+
+} // end namespace llvm
+
+#endif // LLVM_RUNTIMEDYLD_OBJECTIMAGECOMMON_H
+
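
ObjectImageCommon mostly forwards to the underlying ObjectFile, so a client can walk a loaded image with the usual object::iterator idiom. The snippet below is a sketch, assuming Image is the ObjectImage* returned by RuntimeDyld::loadObject() later in this patch; dumpSymbols is an illustrative name.

#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/system_error.h"

void dumpSymbols(llvm::ObjectImage *Image) {
  llvm::error_code err;
  for (llvm::object::symbol_iterator I = Image->begin_symbols(),
                                     E = Image->end_symbols();
       I != E; I.increment(err)) {
    llvm::StringRef Name;
    I->getName(Name);
    // Addresses observed here already reflect any updateSymbolAddress()
    // calls the dynamic linker made while loading the image.
  }
}
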
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index b4640404f602..f6dccb106d9b 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -12,10 +12,12 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "dyld"
+#include "ObjectImageCommon.h"
#include "RuntimeDyldImpl.h"
#include "RuntimeDyldELF.h"
#include "RuntimeDyldMachO.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
using namespace llvm::object;
@@ -26,16 +28,6 @@ RuntimeDyldImpl::~RuntimeDyldImpl() {}
namespace llvm {
-namespace {
- // Helper for extensive error checking in debug builds.
- error_code Check(error_code Err) {
- if (Err) {
- report_fatal_error(Err.message());
- }
- return Err;
- }
-} // end anonymous namespace
-
// Resolve the relocations for all symbols we currently know about.
void RuntimeDyldImpl::resolveRelocations() {
// First, resolve relocations associated with external symbols.
@@ -44,11 +36,15 @@ void RuntimeDyldImpl::resolveRelocations() {
// Just iterate over the sections we have and resolve all the relocations
// in them. Gross overkill, but it gets the job done.
for (int i = 0, e = Sections.size(); i != e; ++i) {
- reassignSectionAddress(i, Sections[i].LoadAddress);
+ uint64_t Addr = Sections[i].LoadAddress;
+ DEBUG(dbgs() << "Resolving relocations Section #" << i
+ << "\t" << format("%p", (uint8_t *)Addr)
+ << "\n");
+ resolveRelocationList(Relocations[i], Addr);
}
}
-void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress,
+void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
uint64_t TargetAddress) {
for (unsigned i = 0, e = Sections.size(); i != e; ++i) {
if (Sections[i].Address == LocalAddress) {
@@ -61,14 +57,11 @@ void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress,
// Subclasses can implement this method to create specialized image instances.
// The caller owns the pointer that is returned.
-ObjectImage *RuntimeDyldImpl::createObjectImage(const MemoryBuffer *InputBuffer) {
- ObjectFile *ObjFile = ObjectFile::createObjectFile(const_cast<MemoryBuffer*>
- (InputBuffer));
- ObjectImage *Obj = new ObjectImage(ObjFile);
- return Obj;
+ObjectImage *RuntimeDyldImpl::createObjectImage(ObjectBuffer *InputBuffer) {
+ return new ObjectImageCommon(InputBuffer);
}
-bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
+ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) {
OwningPtr<ObjectImage> obj(createObjectImage(InputBuffer));
if (!obj)
report_fatal_error("Unable to create object image from memory buffer!");
@@ -80,9 +73,9 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
// Used sections from the object file
ObjSectionToIDMap LocalSections;
- // Common symbols requiring allocation, and the total size required to
- // allocate all common symbols.
+ // Common symbols requiring allocation, with their sizes and alignments
CommonSymbolMap CommonSymbols;
+ // Maximum required total memory to allocate all common symbols
uint64_t CommonSize = 0;
error_code err;
@@ -102,13 +95,15 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
bool isCommon = flags & SymbolRef::SF_Common;
if (isCommon) {
// Add the common symbols to a list. We'll allocate them all below.
+ uint64_t Align = getCommonSymbolAlignment(*i);
uint64_t Size = 0;
Check(i->getSize(Size));
- CommonSize += Size;
- CommonSymbols[*i] = Size;
+ CommonSize += Size + Align;
+ CommonSymbols[*i] = CommonSymbolInfo(Size, Align);
} else {
if (SymType == object::SymbolRef::ST_Function ||
- SymType == object::SymbolRef::ST_Data) {
+ SymType == object::SymbolRef::ST_Data ||
+ SymType == object::SymbolRef::ST_Unknown) {
uint64_t FileOffset;
StringRef SectionData;
section_iterator si = obj->end_sections();
@@ -177,9 +172,7 @@ bool RuntimeDyldImpl::loadObject(const MemoryBuffer *InputBuffer) {
}
}
- handleObjectLoaded(obj.take());
-
- return false;
+ return obj.take();
}
void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
@@ -193,7 +186,7 @@ void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
if (!Addr)
report_fatal_error("Unable to allocate memory for common symbols!");
uint64_t Offset = 0;
- Sections.push_back(SectionEntry(Addr, TotalSize, TotalSize, 0));
+ Sections.push_back(SectionEntry(StringRef(), Addr, TotalSize, TotalSize, 0));
memset(Addr, 0, TotalSize);
DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID
@@ -204,11 +197,20 @@ void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj,
// Assign the address of each symbol
for (CommonSymbolMap::const_iterator it = CommonSymbols.begin(),
itEnd = CommonSymbols.end(); it != itEnd; it++) {
+ uint64_t Size = it->second.first;
+ uint64_t Align = it->second.second;
StringRef Name;
it->first.getName(Name);
+ if (Align) {
+ // This symbol has an alignment requirement.
+ uint64_t AlignOffset = OffsetToAlignment((uint64_t)Addr, Align);
+ Addr += AlignOffset;
+ Offset += AlignOffset;
+ DEBUG(dbgs() << "Allocating common symbol " << Name << " address " <<
+ format("%p\n", Addr));
+ }
Obj.updateSymbolAddress(it->first, (uint64_t)Addr);
SymbolTable[Name.data()] = SymbolLoc(SectionID, Offset);
- uint64_t Size = it->second;
Offset += Size;
Addr += Size;
}
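
The padding math in emitCommonSymbols() above is easier to follow with concrete numbers; here is a worked example with hypothetical values (OffsetToAlignment comes from Support/MathExtras.h, included by this patch).

#include "llvm/Support/MathExtras.h"

uint64_t Addr  = 0x1006;                               // current cursor
uint64_t Align = 8;                                    // symbol's requirement
uint64_t Pad   = llvm::OffsetToAlignment(Addr, Align); // == 2
Addr += Pad;                                           // == 0x1008, aligned
// Reserving Size + Align bytes per common symbol in loadObject() guarantees
// this padding always fits inside the common section.
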
@@ -236,10 +238,12 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
bool IsVirtual;
bool IsZeroInit;
uint64_t DataSize;
+ StringRef Name;
Check(Section.isRequiredForExecution(IsRequired));
Check(Section.isVirtual(IsVirtual));
Check(Section.isZeroInit(IsZeroInit));
Check(Section.getSize(DataSize));
+ Check(Section.getName(Name));
unsigned Allocate;
unsigned SectionID = Sections.size();
@@ -267,6 +271,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
memcpy(Addr, pData, DataSize);
DEBUG(dbgs() << "emitSection SectionID: " << SectionID
+ << " Name: " << Name
<< " obj addr: " << format("%p", pData)
<< " new addr: " << format("%p", Addr)
<< " DataSize: " << DataSize
@@ -282,6 +287,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
Allocate = 0;
Addr = 0;
DEBUG(dbgs() << "emitSection SectionID: " << SectionID
+ << " Name: " << Name
<< " obj addr: " << format("%p", data.data())
<< " new addr: 0"
<< " DataSize: " << DataSize
@@ -290,7 +296,8 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj,
<< "\n");
}
- Sections.push_back(SectionEntry(Addr, Allocate, DataSize,(uintptr_t)pData));
+ Sections.push_back(SectionEntry(Name, Addr, Allocate, DataSize,
+ (uintptr_t)pData));
return SectionID;
}
@@ -333,15 +340,49 @@ void RuntimeDyldImpl::addRelocationForSymbol(const RelocationEntry &RE,
}
uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) {
- // TODO: There is only ARM far stub now. We should add the Thumb stub,
- // and stubs for branches Thumb - ARM and ARM - Thumb.
if (Arch == Triple::arm) {
+ // TODO: There is only ARM far stub now. We should add the Thumb stub,
+ // and stubs for branches Thumb - ARM and ARM - Thumb.
uint32_t *StubAddr = (uint32_t*)Addr;
*StubAddr = 0xe51ff004; // ldr pc,<label>
return (uint8_t*)++StubAddr;
- }
- else
+ } else if (Arch == Triple::mipsel) {
+ uint32_t *StubAddr = (uint32_t*)Addr;
+ // 0: 3c190000 lui t9,%hi(addr).
+ // 4: 27390000 addiu t9,t9,%lo(addr).
+ // 8: 03200008 jr t9.
+ // c: 00000000 nop.
+ const unsigned LuiT9Instr = 0x3c190000, AdduiT9Instr = 0x27390000;
+ const unsigned JrT9Instr = 0x03200008, NopInstr = 0x0;
+
+ *StubAddr = LuiT9Instr;
+ StubAddr++;
+ *StubAddr = AdduiT9Instr;
+ StubAddr++;
+ *StubAddr = JrT9Instr;
+ StubAddr++;
+ *StubAddr = NopInstr;
return Addr;
+ } else if (Arch == Triple::ppc64) {
+ // PowerPC64 stub: the address points to a function descriptor
+ // instead of the function itself. Load the function address
+ // into r11 and move it to the count register; also load the
+ // function's TOC into r2 and its environment pointer into r11.
+ writeInt32BE(Addr, 0x3D800000); // lis r12, highest(addr)
+ writeInt32BE(Addr+4, 0x618C0000); // ori r12, higher(addr)
+ writeInt32BE(Addr+8, 0x798C07C6); // sldi r12, r12, 32
+ writeInt32BE(Addr+12, 0x658C0000); // oris r12, r12, h(addr)
+ writeInt32BE(Addr+16, 0x618C0000); // ori r12, r12, l(addr)
+ writeInt32BE(Addr+20, 0xF8410028); // std r2, 40(r1)
+ writeInt32BE(Addr+24, 0xE96C0000); // ld r11, 0(r12)
+ writeInt32BE(Addr+28, 0xE84C0008); // ld r2, 0(r12)
+ writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11
+ writeInt32BE(Addr+36, 0xE96C0010); // ld r11, 16(r2)
+ writeInt32BE(Addr+40, 0x4E800420); // bctr
+
+ return Addr;
+ }
+ return Addr;
}
// Assign an address to a symbol name and resolve all the relocations
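
For context on how these stubs get their targets: createStubFunction() only lays down the instruction template, and the caller records a relocation against the returned address so the branch target is filled in later. The sketch below illustrates the ARM case with locally named variables; the real call sites are in processRelocationRef() further down this patch.

uint8_t *StubAddr    = Section.Address + Section.StubOffset;
uint8_t *LiteralAddr = createStubFunction(StubAddr);  // writes "ldr pc, [pc, #-4]"
// LiteralAddr == StubAddr + 4: the word after the load receives the absolute
// branch target, typically via an ABS32-style relocation recorded against it.
RelocationEntry RE(SectionID, LiteralAddr - Section.Address,
                   ELF::R_ARM_ABS32, /*Addend=*/0);
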
@@ -350,32 +391,30 @@ void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID,
uint64_t Addr) {
// The address to use for relocation resolution is not
// the address of the local section buffer. We must be doing
- // a remote execution environment of some sort. Re-apply any
- // relocations referencing this section with the given address.
+ // a remote execution environment of some sort. Relocations can't
+ // be applied until all the sections have been moved. The client must
+ // trigger this with a call to MCJIT::finalize() or
+ // RuntimeDyld::resolveRelocations().
//
// Addr is a uint64_t because we can't assume the pointer width
// of the target is the same as that of the host. Just use a generic
// "big enough" type.
Sections[SectionID].LoadAddress = Addr;
- DEBUG(dbgs() << "Resolving relocations Section #" << SectionID
- << "\t" << format("%p", (uint8_t *)Addr)
- << "\n");
- resolveRelocationList(Relocations[SectionID], Addr);
}
void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE,
uint64_t Value) {
- // Ignore relocations for sections that were not loaded
- if (Sections[RE.SectionID].Address != 0) {
- uint8_t *Target = Sections[RE.SectionID].Address + RE.Offset;
- DEBUG(dbgs() << "\tSectionID: " << RE.SectionID
- << " + " << RE.Offset << " (" << format("%p", Target) << ")"
- << " RelType: " << RE.RelType
- << " Addend: " << RE.Addend
- << "\n");
+ // Ignore relocations for sections that were not loaded
+ if (Sections[RE.SectionID].Address != 0) {
+ DEBUG(dbgs() << "\tSectionID: " << RE.SectionID
+ << " + " << RE.Offset << " ("
+ << format("%p", Sections[RE.SectionID].Address + RE.Offset) << ")"
+ << " RelType: " << RE.RelType
+ << " Addend: " << RE.Addend
+ << "\n");
- resolveRelocation(Target, Sections[RE.SectionID].LoadAddress + RE.Offset,
- Value, RE.RelType, RE.Addend);
+ resolveRelocation(Sections[RE.SectionID], RE.Offset,
+ Value, RE.RelType, RE.Addend);
}
}
@@ -420,7 +459,7 @@ RuntimeDyld::~RuntimeDyld() {
delete Dyld;
}
-bool RuntimeDyld::loadObject(MemoryBuffer *InputBuffer) {
+ObjectImage *RuntimeDyld::loadObject(ObjectBuffer *InputBuffer) {
if (!Dyld) {
sys::LLVMFileType type = sys::IdentifyFileType(
InputBuffer->getBufferStart(),
@@ -462,6 +501,10 @@ void *RuntimeDyld::getSymbolAddress(StringRef Name) {
return Dyld->getSymbolAddress(Name);
}
+uint64_t RuntimeDyld::getSymbolLoadAddress(StringRef Name) {
+ return Dyld->getSymbolLoadAddress(Name);
+}
+
void RuntimeDyld::resolveRelocations() {
Dyld->resolveRelocations();
}
@@ -471,7 +514,7 @@ void RuntimeDyld::reassignSectionAddress(unsigned SectionID,
Dyld->reassignSectionAddress(SectionID, Addr);
}
-void RuntimeDyld::mapSectionAddress(void *LocalAddress,
+void RuntimeDyld::mapSectionAddress(const void *LocalAddress,
uint64_t TargetAddress) {
Dyld->mapSectionAddress(LocalAddress, TargetAddress);
}
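
Taken together, the const mapSectionAddress(), the ObjectBuffer-based loadObject(), and the new getSymbolLoadAddress() support the remote-execution flow sketched below. This is client-side illustration only; MemMgr, Buffer, LocalSectionPtr, and the target address are placeholders.

llvm::RuntimeDyld Dyld(MemMgr);
llvm::ObjectImage *Image = Dyld.loadObject(Buffer);   // Dyld takes the buffer
// Sections still live at host-side addresses; tell the linker where each one
// will actually run, then resolve and query the *load* (target) addresses.
Dyld.mapSectionAddress(LocalSectionPtr, 0x40000000ULL);
Dyld.resolveRelocations();
uint64_t Entry = Dyld.getSymbolLoadAddress("main");   // target address, not host
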
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 75bb586ef2cb..1ebcaf7ba822 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -12,21 +12,32 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "dyld"
+#include "RuntimeDyldELF.h"
+#include "JITRegistrar.h"
+#include "ObjectImageCommon.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/IntervalMap.h"
-#include "RuntimeDyldELF.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/Support/ELF.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Object/ELF.h"
-#include "JITRegistrar.h"
using namespace llvm;
using namespace llvm::object;
namespace {
+static inline
+error_code check(error_code Err) {
+ if (Err) {
+ report_fatal_error(Err.message());
+ }
+ return Err;
+}
+
template<support::endianness target_endianness, bool is64Bits>
class DyldELFObject : public ELFObjectFile<target_endianness, is64Bits> {
LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits)
@@ -36,25 +47,17 @@ class DyldELFObject : public ELFObjectFile<target_endianness, is64Bits> {
typedef Elf_Rel_Impl<target_endianness, is64Bits, false> Elf_Rel;
typedef Elf_Rel_Impl<target_endianness, is64Bits, true> Elf_Rela;
- typedef typename ELFObjectFile<target_endianness, is64Bits>::
- Elf_Ehdr Elf_Ehdr;
+ typedef Elf_Ehdr_Impl<target_endianness, is64Bits> Elf_Ehdr;
typedef typename ELFDataTypeTypedefHelper<
target_endianness, is64Bits>::value_type addr_type;
-protected:
- // This duplicates the 'Data' member in the 'Binary' base class
- // but it is necessary to workaround a bug in gcc 4.2
- MemoryBuffer *InputData;
-
public:
- DyldELFObject(MemoryBuffer *Object, error_code &ec);
+ DyldELFObject(MemoryBuffer *Wrapper, error_code &ec);
void updateSectionAddress(const SectionRef &Sec, uint64_t Addr);
void updateSymbolAddress(const SymbolRef &Sym, uint64_t Addr);
- const MemoryBuffer& getBuffer() const { return *InputData; }
-
// Methods for type inquiry through isa, cast and dyn_cast
static inline bool classof(const Binary *v) {
return (isa<ELFObjectFile<target_endianness, is64Bits> >(v)
@@ -64,20 +67,18 @@ public:
const ELFObjectFile<target_endianness, is64Bits> *v) {
return v->isDyldType();
}
- static inline bool classof(const DyldELFObject *v) {
- return true;
- }
};
template<support::endianness target_endianness, bool is64Bits>
-class ELFObjectImage : public ObjectImage {
+class ELFObjectImage : public ObjectImageCommon {
protected:
DyldELFObject<target_endianness, is64Bits> *DyldObj;
bool Registered;
public:
- ELFObjectImage(DyldELFObject<target_endianness, is64Bits> *Obj)
- : ObjectImage(Obj),
+ ELFObjectImage(ObjectBuffer *Input,
+ DyldELFObject<target_endianness, is64Bits> *Obj)
+ : ObjectImageCommon(Input, Obj),
DyldObj(Obj),
Registered(false) {}
@@ -100,20 +101,22 @@ class ELFObjectImage : public ObjectImage {
virtual void registerWithDebugger()
{
- JITRegistrar::getGDBRegistrar().registerObject(DyldObj->getBuffer());
+ JITRegistrar::getGDBRegistrar().registerObject(*Buffer);
Registered = true;
}
virtual void deregisterWithDebugger()
{
- JITRegistrar::getGDBRegistrar().deregisterObject(DyldObj->getBuffer());
+ JITRegistrar::getGDBRegistrar().deregisterObject(*Buffer);
}
};
+// The MemoryBuffer passed into this constructor is just a wrapper around the
+// actual memory. Ultimately, the Binary parent class will take ownership of
+// this MemoryBuffer object but not the underlying memory.
template<support::endianness target_endianness, bool is64Bits>
-DyldELFObject<target_endianness, is64Bits>::DyldELFObject(MemoryBuffer *Object,
+DyldELFObject<target_endianness, is64Bits>::DyldELFObject(MemoryBuffer *Wrapper,
error_code &ec)
- : ELFObjectFile<target_endianness, is64Bits>(Object, ec),
- InputData(Object) {
+ : ELFObjectFile<target_endianness, is64Bits>(Wrapper, ec) {
this->isDyldELFObject = true;
}
@@ -149,50 +152,43 @@ void DyldELFObject<target_endianness, is64Bits>::updateSymbolAddress(
namespace llvm {
-ObjectImage *RuntimeDyldELF::createObjectImage(
- const MemoryBuffer *ConstInputBuffer) {
- MemoryBuffer *InputBuffer = const_cast<MemoryBuffer*>(ConstInputBuffer);
- std::pair<unsigned char, unsigned char> Ident = getElfArchType(InputBuffer);
+ObjectImage *RuntimeDyldELF::createObjectImage(ObjectBuffer *Buffer) {
+ if (Buffer->getBufferSize() < ELF::EI_NIDENT)
+ llvm_unreachable("Unexpected ELF object size");
+ std::pair<unsigned char, unsigned char> Ident = std::make_pair(
+ (uint8_t)Buffer->getBufferStart()[ELF::EI_CLASS],
+ (uint8_t)Buffer->getBufferStart()[ELF::EI_DATA]);
error_code ec;
if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2LSB) {
DyldELFObject<support::little, false> *Obj =
- new DyldELFObject<support::little, false>(InputBuffer, ec);
- return new ELFObjectImage<support::little, false>(Obj);
+ new DyldELFObject<support::little, false>(Buffer->getMemBuffer(), ec);
+ return new ELFObjectImage<support::little, false>(Buffer, Obj);
}
else if (Ident.first == ELF::ELFCLASS32 && Ident.second == ELF::ELFDATA2MSB) {
DyldELFObject<support::big, false> *Obj =
- new DyldELFObject<support::big, false>(InputBuffer, ec);
- return new ELFObjectImage<support::big, false>(Obj);
+ new DyldELFObject<support::big, false>(Buffer->getMemBuffer(), ec);
+ return new ELFObjectImage<support::big, false>(Buffer, Obj);
}
else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2MSB) {
DyldELFObject<support::big, true> *Obj =
- new DyldELFObject<support::big, true>(InputBuffer, ec);
- return new ELFObjectImage<support::big, true>(Obj);
+ new DyldELFObject<support::big, true>(Buffer->getMemBuffer(), ec);
+ return new ELFObjectImage<support::big, true>(Buffer, Obj);
}
else if (Ident.first == ELF::ELFCLASS64 && Ident.second == ELF::ELFDATA2LSB) {
DyldELFObject<support::little, true> *Obj =
- new DyldELFObject<support::little, true>(InputBuffer, ec);
- return new ELFObjectImage<support::little, true>(Obj);
+ new DyldELFObject<support::little, true>(Buffer->getMemBuffer(), ec);
+ return new ELFObjectImage<support::little, true>(Buffer, Obj);
}
else
llvm_unreachable("Unexpected ELF format");
}
-void RuntimeDyldELF::handleObjectLoaded(ObjectImage *Obj)
-{
- Obj->registerWithDebugger();
- // Save the loaded object. It will deregister itself when deleted
- LoadedObject = Obj;
-}
-
RuntimeDyldELF::~RuntimeDyldELF() {
- if (LoadedObject)
- delete LoadedObject;
}
-void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+void RuntimeDyldELF::resolveX86_64Relocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend) {
@@ -201,8 +197,10 @@ void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress,
llvm_unreachable("Relocation type not implemented yet!");
break;
case ELF::R_X86_64_64: {
- uint64_t *Target = (uint64_t*)(LocalAddress);
+ uint64_t *Target = reinterpret_cast<uint64_t*>(Section.Address + Offset);
*Target = Value + Addend;
+ DEBUG(dbgs() << "Writing " << format("%p", (Value + Addend))
+ << " at " << format("%p\n",Target));
break;
}
case ELF::R_X86_64_32:
@@ -212,37 +210,52 @@ void RuntimeDyldELF::resolveX86_64Relocation(uint8_t *LocalAddress,
(Type == ELF::R_X86_64_32S &&
((int64_t)Value <= INT32_MAX && (int64_t)Value >= INT32_MIN)));
uint32_t TruncatedAddr = (Value & 0xFFFFFFFF);
- uint32_t *Target = reinterpret_cast<uint32_t*>(LocalAddress);
+ uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
*Target = TruncatedAddr;
+ DEBUG(dbgs() << "Writing " << format("%p", TruncatedAddr)
+ << " at " << format("%p\n",Target));
break;
}
case ELF::R_X86_64_PC32: {
- uint32_t *Placeholder = reinterpret_cast<uint32_t*>(LocalAddress);
+ // Get the placeholder value from the generated object since
+ // a previous relocation attempt may have overwritten the loaded version
+ uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress
+ + Offset);
+ uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+ uint64_t FinalAddress = Section.LoadAddress + Offset;
int64_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
assert(RealOffset <= INT32_MAX && RealOffset >= INT32_MIN);
int32_t TruncOffset = (RealOffset & 0xFFFFFFFF);
- *Placeholder = TruncOffset;
+ *Target = TruncOffset;
break;
}
}
}
-void RuntimeDyldELF::resolveX86Relocation(uint8_t *LocalAddress,
- uint32_t FinalAddress,
+void RuntimeDyldELF::resolveX86Relocation(const SectionEntry &Section,
+ uint64_t Offset,
uint32_t Value,
uint32_t Type,
int32_t Addend) {
switch (Type) {
case ELF::R_386_32: {
- uint32_t *Target = (uint32_t*)(LocalAddress);
- uint32_t Placeholder = *Target;
- *Target = Placeholder + Value + Addend;
+ // Get the placeholder value from the generated object since
+ // a previous relocation attempt may have overwritten the loaded version
+ uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress
+ + Offset);
+ uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+ *Target = *Placeholder + Value + Addend;
break;
}
case ELF::R_386_PC32: {
- uint32_t *Placeholder = reinterpret_cast<uint32_t*>(LocalAddress);
+ // Get the placeholder value from the generated object since
+ // a previous relocation attempt may have overwritten the loaded version
+ uint32_t *Placeholder = reinterpret_cast<uint32_t*>(Section.ObjAddress
+ + Offset);
+ uint32_t *Target = reinterpret_cast<uint32_t*>(Section.Address + Offset);
+ uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF);
uint32_t RealOffset = *Placeholder + Value + Addend - FinalAddress;
- *Placeholder = RealOffset;
+ *Target = RealOffset;
break;
}
default:
@@ -253,16 +266,18 @@ void RuntimeDyldELF::resolveX86Relocation(uint8_t *LocalAddress,
}
}
-void RuntimeDyldELF::resolveARMRelocation(uint8_t *LocalAddress,
- uint32_t FinalAddress,
+void RuntimeDyldELF::resolveARMRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint32_t Value,
uint32_t Type,
int32_t Addend) {
// TODO: Add Thumb relocations.
- uint32_t* TargetPtr = (uint32_t*)LocalAddress;
+ uint32_t* TargetPtr = (uint32_t*)(Section.Address + Offset);
+ uint32_t FinalAddress = ((Section.LoadAddress + Offset) & 0xFFFFFFFF);
Value += Addend;
- DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: " << LocalAddress
+ DEBUG(dbgs() << "resolveARMRelocation, LocalAddress: "
+ << Section.Address + Offset
<< " FinalAddress: " << format("%p",FinalAddress)
<< " Value: " << format("%x",Value)
<< " Type: " << format("%x",Type)
@@ -273,14 +288,19 @@ void RuntimeDyldELF::resolveARMRelocation(uint8_t *LocalAddress,
default:
llvm_unreachable("Not implemented relocation type!");
- // Just write 32bit value to relocation address
+ // Write a 32-bit value to the relocation address, taking into account
+ // the implicit addend already encoded in the target.
case ELF::R_ARM_ABS32 :
- *TargetPtr = Value;
+ *TargetPtr += Value;
break;
// Write first 16 bit of 32 bit value to the mov instruction.
// Last 4 bit should be shifted.
case ELF::R_ARM_MOVW_ABS_NC :
+ // We are not expecting any other addend in the relocation address.
+ // Using 0x000F0FFF because MOVW has its 16 bit immediate split into 2
+ // non-contiguous fields.
+ assert((*TargetPtr & 0x000F0FFF) == 0);
Value = Value & 0xFFFF;
*TargetPtr |= Value & 0xFFF;
*TargetPtr |= ((Value >> 12) & 0xF) << 16;
@@ -289,6 +309,9 @@ void RuntimeDyldELF::resolveARMRelocation(uint8_t *LocalAddress,
// Write last 16 bit of 32 bit value to the mov instruction.
// Last 4 bit should be shifted.
case ELF::R_ARM_MOVT_ABS :
+ // We are not expecting any other addend in the relocation address.
+ // Use 0x000F0FFF for the same reason as R_ARM_MOVW_ABS_NC.
+ assert((*TargetPtr & 0x000F0FFF) == 0);
Value = (Value >> 16) & 0xFFFF;
*TargetPtr |= Value & 0xFFF;
*TargetPtr |= ((Value >> 12) & 0xF) << 16;
@@ -306,26 +329,250 @@ void RuntimeDyldELF::resolveARMRelocation(uint8_t *LocalAddress,
}
}
-void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+void RuntimeDyldELF::resolveMIPSRelocation(const SectionEntry &Section,
+ uint64_t Offset,
+ uint32_t Value,
+ uint32_t Type,
+ int32_t Addend) {
+ uint32_t* TargetPtr = (uint32_t*)(Section.Address + Offset);
+ Value += Addend;
+
+ DEBUG(dbgs() << "resolveMipselocation, LocalAddress: "
+ << Section.Address + Offset
+ << " FinalAddress: "
+ << format("%p",Section.LoadAddress + Offset)
+ << " Value: " << format("%x",Value)
+ << " Type: " << format("%x",Type)
+ << " Addend: " << format("%x",Addend)
+ << "\n");
+
+ switch(Type) {
+ default:
+ llvm_unreachable("Not implemented relocation type!");
+ break;
+ case ELF::R_MIPS_32:
+ *TargetPtr = Value + (*TargetPtr);
+ break;
+ case ELF::R_MIPS_26:
+ *TargetPtr = ((*TargetPtr) & 0xfc000000) | (( Value & 0x0fffffff) >> 2);
+ break;
+ case ELF::R_MIPS_HI16:
+ // Get the higher 16-bits. Also add 1 if bit 15 is 1.
+ Value += ((*TargetPtr) & 0x0000ffff) << 16;
+ *TargetPtr = ((*TargetPtr) & 0xffff0000) |
+ (((Value + 0x8000) >> 16) & 0xffff);
+ break;
+ case ELF::R_MIPS_LO16:
+ Value += ((*TargetPtr) & 0x0000ffff);
+ *TargetPtr = ((*TargetPtr) & 0xffff0000) | (Value & 0xffff);
+ break;
+ }
+}
+
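
A worked example of the HI16 rounding above, using a hypothetical target address: the low half is sign-extended by the instruction that consumes it, so the high half must be biased by 0x8000.

uint32_t Target = 0x00428765;
uint16_t Lo = Target & 0xffff;             // 0x8765
uint16_t Hi = (Target + 0x8000) >> 16;     // 0x0043, not 0x0042
// Reassembly: (Hi << 16) + (int16_t)Lo == 0x00430000 - 0x789B == 0x00428765
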
+// Return the .TOC. section address for R_PPC64_TOC relocations.
+uint64_t RuntimeDyldELF::findPPC64TOC() const {
+ // The TOC consists of sections .got, .toc, .tocbss, .plt in that
+ // order. The TOC starts where the first of these sections starts.
+ SectionList::const_iterator it = Sections.begin();
+ SectionList::const_iterator ite = Sections.end();
+ for (; it != ite; ++it) {
+ if (it->Name == ".got" ||
+ it->Name == ".toc" ||
+ it->Name == ".tocbss" ||
+ it->Name == ".plt")
+ break;
+ }
+ if (it == ite) {
+ // This may happen for
+ // * references to TOC base base (sym@toc, .odp relocation) without
+ // a .toc directive.
+ // In this case just use the first section (which is usually
+ // the .odp) since the code won't reference the .toc base
+ // directly.
+ it = Sections.begin();
+ }
+ assert (it != ite);
+ // Per the ppc64-elf-linux ABI, the TOC base is the TOC value plus 0x8000,
+ // thus permitting a full 64 Kbyte segment.
+ return it->LoadAddress + 0x8000;
+}
+
+// Returns the sections and offset associated with the ODP entry referenced
+// by Symbol.
+void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel) {
+ // Get the ELF symbol value (st_value) to compare with Relocation offset in
+ // .opd entries
+
+ error_code err;
+ for (section_iterator si = Obj.begin_sections(),
+ se = Obj.end_sections(); si != se; si.increment(err)) {
+ StringRef SectionName;
+ check(si->getName(SectionName));
+ if (SectionName != ".opd")
+ continue;
+
+ for (relocation_iterator i = si->begin_relocations(),
+ e = si->end_relocations(); i != e;) {
+ check(err);
+
+ // The R_PPC64_ADDR64 relocation indicates the first field
+ // of a .opd entry
+ uint64_t TypeFunc;
+ check(i->getType(TypeFunc));
+ if (TypeFunc != ELF::R_PPC64_ADDR64) {
+ i.increment(err);
+ continue;
+ }
+
+ SymbolRef TargetSymbol;
+ uint64_t TargetSymbolOffset;
+ int64_t TargetAdditionalInfo;
+ check(i->getSymbol(TargetSymbol));
+ check(i->getOffset(TargetSymbolOffset));
+ check(i->getAdditionalInfo(TargetAdditionalInfo));
+
+ i = i.increment(err);
+ if (i == e)
+ break;
+ check(err);
+
+ // Just check if following relocation is a R_PPC64_TOC
+ uint64_t TypeTOC;
+ check(i->getType(TypeTOC));
+ if (TypeTOC != ELF::R_PPC64_TOC)
+ continue;
+
+ // Finally compares the Symbol value and the target symbol offset
+ // to check if this .opd entry refers to the symbol the relocation
+ // points to.
+ if (Rel.Addend != (intptr_t)TargetSymbolOffset)
+ continue;
+
+ section_iterator tsi(Obj.end_sections());
+ check(TargetSymbol.getSection(tsi));
+ Rel.SectionID = findOrEmitSection(Obj, (*tsi), true, LocalSections);
+ Rel.Addend = (intptr_t)TargetAdditionalInfo;
+ return;
+ }
+ }
+ llvm_unreachable("Attempting to get address of ODP entry!");
+}
+
+// Relocation masks following the #lo(value), #hi(value), #higher(value),
+// and #highest(value) macros defined in section 4.5.1 (Relocation Types)
+// of the PPC64 ELF ABI document.
+//
+static inline
+uint16_t applyPPClo (uint64_t value)
+{
+ return value & 0xffff;
+}
+
+static inline
+uint16_t applyPPChi (uint64_t value)
+{
+ return (value >> 16) & 0xffff;
+}
+
+static inline
+uint16_t applyPPChigher (uint64_t value)
+{
+ return (value >> 32) & 0xffff;
+}
+
+static inline
+uint16_t applyPPChighest (uint64_t value)
+{
+ return (value >> 48) & 0xffff;
+}
+
+void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section,
+ uint64_t Offset,
+ uint64_t Value,
+ uint32_t Type,
+ int64_t Addend) {
+ uint8_t* LocalAddress = Section.Address + Offset;
+ switch (Type) {
+ default:
+ llvm_unreachable("Relocation type not implemented yet!");
+ break;
+ case ELF::R_PPC64_ADDR16_LO :
+ writeInt16BE(LocalAddress, applyPPClo (Value + Addend));
+ break;
+ case ELF::R_PPC64_ADDR16_HI :
+ writeInt16BE(LocalAddress, applyPPChi (Value + Addend));
+ break;
+ case ELF::R_PPC64_ADDR16_HIGHER :
+ writeInt16BE(LocalAddress, applyPPChigher (Value + Addend));
+ break;
+ case ELF::R_PPC64_ADDR16_HIGHEST :
+ writeInt16BE(LocalAddress, applyPPChighest (Value + Addend));
+ break;
+ case ELF::R_PPC64_ADDR14 : {
+ assert(((Value + Addend) & 3) == 0);
+ // Preserve the AA/LK bits in the branch instruction
+ uint8_t aalk = *(LocalAddress+3);
+ writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) & 0xfffc));
+ } break;
+ case ELF::R_PPC64_REL24 : {
+ uint64_t FinalAddress = (Section.LoadAddress + Offset);
+ int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend);
+ if (SignExtend32<24>(delta) != delta)
+ llvm_unreachable("Relocation R_PPC64_REL24 overflow");
+ // Generates a 'bl <address>' instruction
+ writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC));
+ } break;
+ case ELF::R_PPC64_ADDR64 :
+ writeInt64BE(LocalAddress, Value + Addend);
+ break;
+ case ELF::R_PPC64_TOC :
+ writeInt64BE(LocalAddress, findPPC64TOC());
+ break;
+ case ELF::R_PPC64_TOC16 : {
+ uint64_t TOCStart = findPPC64TOC();
+ Value = applyPPClo((Value + Addend) - TOCStart);
+ writeInt16BE(LocalAddress, applyPPClo(Value));
+ } break;
+ case ELF::R_PPC64_TOC16_DS : {
+ uint64_t TOCStart = findPPC64TOC();
+ Value = ((Value + Addend) - TOCStart);
+ writeInt16BE(LocalAddress, applyPPClo(Value));
+ } break;
+ }
+}
+
+
+void RuntimeDyldELF::resolveRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend) {
switch (Arch) {
case Triple::x86_64:
- resolveX86_64Relocation(LocalAddress, FinalAddress, Value, Type, Addend);
+ resolveX86_64Relocation(Section, Offset, Value, Type, Addend);
break;
case Triple::x86:
- resolveX86Relocation(LocalAddress, (uint32_t)(FinalAddress & 0xffffffffL),
+ resolveX86Relocation(Section, Offset,
(uint32_t)(Value & 0xffffffffL), Type,
(uint32_t)(Addend & 0xffffffffL));
break;
case Triple::arm: // Fall through.
case Triple::thumb:
- resolveARMRelocation(LocalAddress, (uint32_t)(FinalAddress & 0xffffffffL),
+ resolveARMRelocation(Section, Offset,
(uint32_t)(Value & 0xffffffffL), Type,
(uint32_t)(Addend & 0xffffffffL));
break;
+ case Triple::mips: // Fall through.
+ case Triple::mipsel:
+ resolveMIPSRelocation(Section, Offset,
+ (uint32_t)(Value & 0xffffffffL), Type,
+ (uint32_t)(Addend & 0xffffffffL));
+ break;
+ case Triple::ppc64:
+ resolvePPC64Relocation(Section, Offset, Value, Type, Addend);
+ break;
default: llvm_unreachable("Unsupported CPU type!");
}
}
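
A worked example of the applyPPC* helpers used above, on a hypothetical 64-bit value; these are exactly the four halves the PPC64 far-call stub materializes with lis/ori/sldi/oris/ori.

uint64_t V = 0x0000123480005678ULL;
uint16_t lo      = applyPPClo(V);       // 0x5678
uint16_t hi      = applyPPChi(V);       // 0x8000
uint16_t higher  = applyPPChigher(V);   // 0x1234
uint16_t highest = applyPPChighest(V);  // 0x0000
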
@@ -350,6 +597,8 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
RelocationValueRef Value;
// First search for the symbol in the local symbol table
SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data());
+ SymbolRef::Type SymType;
+ Symbol.getType(SymType);
if (lsi != Symbols.end()) {
Value.SectionID = lsi->second.first;
Value.Addend = lsi->second.second;
@@ -361,8 +610,6 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
Value.SectionID = gsi->second.first;
Value.Addend = gsi->second.second;
} else {
- SymbolRef::Type SymType;
- Symbol.getType(SymType);
switch (SymType) {
case SymbolRef::ST_Debug: {
// TODO: Now ELF SymbolRef::ST_Debug = STT_SECTION, it's not obviously
@@ -373,7 +620,13 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
if (si == Obj.end_sections())
llvm_unreachable("Symbol section not found, bad object file format!");
DEBUG(dbgs() << "\t\tThis is section symbol\n");
- Value.SectionID = findOrEmitSection(Obj, (*si), true, ObjSectionToID);
+ // Default to 'true' in case isText fails (though it never does).
+ bool isCode = true;
+ si->isText(isCode);
+ Value.SectionID = findOrEmitSection(Obj,
+ (*si),
+ isCode,
+ ObjSectionToID);
Value.Addend = Addend;
break;
}
@@ -398,13 +651,12 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
// This is an ARM branch relocation, need to use a stub function.
DEBUG(dbgs() << "\t\tThis is an ARM branch relocation.");
SectionEntry &Section = Sections[Rel.SectionID];
- uint8_t *Target = Section.Address + Rel.Offset;
- // Look up for existing stub.
+ // Look for an existing stub.
StubMap::const_iterator i = Stubs.find(Value);
if (i != Stubs.end()) {
- resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address +
- i->second, RelType, 0);
+ resolveRelocation(Section, Rel.Offset,
+ (uint64_t)Section.Address + i->second, RelType, 0);
DEBUG(dbgs() << " Stub function found\n");
} else {
// Create a new stub function.
@@ -419,10 +671,145 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
else
addRelocationForSection(RE, Value.SectionID);
- resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address +
- Section.StubOffset, RelType, 0);
+ resolveRelocation(Section, Rel.Offset,
+ (uint64_t)Section.Address + Section.StubOffset,
+ RelType, 0);
Section.StubOffset += getMaxStubSize();
}
+ } else if (Arch == Triple::mipsel && RelType == ELF::R_MIPS_26) {
+ // This is a MIPS branch relocation, need to use a stub function.
+ DEBUG(dbgs() << "\t\tThis is a Mips branch relocation.");
+ SectionEntry &Section = Sections[Rel.SectionID];
+ uint8_t *Target = Section.Address + Rel.Offset;
+ uint32_t *TargetAddress = (uint32_t *)Target;
+
+ // Extract the addend from the instruction.
+ uint32_t Addend = ((*TargetAddress) & 0x03ffffff) << 2;
+
+ Value.Addend += Addend;
+
+ // Look for an existing stub.
+ StubMap::const_iterator i = Stubs.find(Value);
+ if (i != Stubs.end()) {
+ resolveRelocation(Section, Rel.Offset,
+ (uint64_t)Section.Address + i->second, RelType, 0);
+ DEBUG(dbgs() << " Stub function found\n");
+ } else {
+ // Create a new stub function.
+ DEBUG(dbgs() << " Create a new stub function\n");
+ Stubs[Value] = Section.StubOffset;
+ uint8_t *StubTargetAddr = createStubFunction(Section.Address +
+ Section.StubOffset);
+
+ // Creating Hi and Lo relocations for the filled stub instructions.
+ RelocationEntry REHi(Rel.SectionID,
+ StubTargetAddr - Section.Address,
+ ELF::R_MIPS_HI16, Value.Addend);
+ RelocationEntry RELo(Rel.SectionID,
+ StubTargetAddr - Section.Address + 4,
+ ELF::R_MIPS_LO16, Value.Addend);
+
+ if (Value.SymbolName) {
+ addRelocationForSymbol(REHi, Value.SymbolName);
+ addRelocationForSymbol(RELo, Value.SymbolName);
+ } else {
+ addRelocationForSection(REHi, Value.SectionID);
+ addRelocationForSection(RELo, Value.SectionID);
+ }
+
+ resolveRelocation(Section, Rel.Offset,
+ (uint64_t)Section.Address + Section.StubOffset,
+ RelType, 0);
+ Section.StubOffset += getMaxStubSize();
+ }
+ } else if (Arch == Triple::ppc64) {
+ if (RelType == ELF::R_PPC64_REL24) {
+ // A PPC branch relocation will need a stub function if the target is
+ // an external symbol (SymbolRef::ST_Unknown) or if the target address
+ // is not within the signed 24-bit branch range.
+ SectionEntry &Section = Sections[Rel.SectionID];
+ uint8_t *Target = Section.Address + Rel.Offset;
+ bool RangeOverflow = false;
+ if (SymType != SymbolRef::ST_Unknown) {
+ // A function call may point to the .opd entry, so the final symbol value
+ // is calculated based on the relocation values in the .opd section.
+ findOPDEntrySection(Obj, ObjSectionToID, Value);
+ uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend;
+ int32_t delta = static_cast<int32_t>(Target - RelocTarget);
+ // If it is within 24-bits branch range, just set the branch target
+ if (SignExtend32<24>(delta) == delta) {
+ RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+ if (Value.SymbolName)
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ } else {
+ RangeOverflow = true;
+ }
+ }
+ if (SymType == SymbolRef::ST_Unknown || RangeOverflow == true) {
+ // It is an external symbol (SymbolRef::ST_Unknown) or the branch target
+ // is outside the signed 24-bit range.
+ StubMap::const_iterator i = Stubs.find(Value);
+ if (i != Stubs.end()) {
+ // Symbol function stub already created, just relocate to it
+ resolveRelocation(Section, Rel.Offset,
+ (uint64_t)Section.Address + i->second, RelType, 0);
+ DEBUG(dbgs() << " Stub function found\n");
+ } else {
+ // Create a new stub function.
+ DEBUG(dbgs() << " Create a new stub function\n");
+ Stubs[Value] = Section.StubOffset;
+ uint8_t *StubTargetAddr = createStubFunction(Section.Address +
+ Section.StubOffset);
+ RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address,
+ ELF::R_PPC64_ADDR64, Value.Addend);
+
+ // Generate the 64-bit address load sequence described in section
+ // 4.5.1 of the PPC64 ELF ABI.
+ RelocationEntry REhst(Rel.SectionID,
+ StubTargetAddr - Section.Address + 2,
+ ELF::R_PPC64_ADDR16_HIGHEST, Value.Addend);
+ RelocationEntry REhr(Rel.SectionID,
+ StubTargetAddr - Section.Address + 6,
+ ELF::R_PPC64_ADDR16_HIGHER, Value.Addend);
+ RelocationEntry REh(Rel.SectionID,
+ StubTargetAddr - Section.Address + 14,
+ ELF::R_PPC64_ADDR16_HI, Value.Addend);
+ RelocationEntry REl(Rel.SectionID,
+ StubTargetAddr - Section.Address + 18,
+ ELF::R_PPC64_ADDR16_LO, Value.Addend);
+
+ if (Value.SymbolName) {
+ addRelocationForSymbol(REhst, Value.SymbolName);
+ addRelocationForSymbol(REhr, Value.SymbolName);
+ addRelocationForSymbol(REh, Value.SymbolName);
+ addRelocationForSymbol(REl, Value.SymbolName);
+ } else {
+ addRelocationForSection(REhst, Value.SectionID);
+ addRelocationForSection(REhr, Value.SectionID);
+ addRelocationForSection(REh, Value.SectionID);
+ addRelocationForSection(REl, Value.SectionID);
+ }
+
+ resolveRelocation(Section, Rel.Offset,
+ (uint64_t)Section.Address + Section.StubOffset,
+ RelType, 0);
+ if (SymType == SymbolRef::ST_Unknown)
+ // Restore the TOC for external calls
+ writeInt32BE(Target+4, 0xE8410028); // ld r2,40(r1)
+ Section.StubOffset += getMaxStubSize();
+ }
+ }
+ } else {
+ RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
+ // Extra check to avoid relocations against empty symbols (usually
+ // the R_PPC64_TOC).
+ if (Value.SymbolName && !TargetName.empty())
+ addRelocationForSymbol(RE, Value.SymbolName);
+ else
+ addRelocationForSection(RE, Value.SectionID);
+ }
} else {
RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend);
if (Value.SymbolName)
@@ -432,8 +819,16 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel,
}
}
-bool RuntimeDyldELF::isCompatibleFormat(const MemoryBuffer *InputBuffer) const {
- StringRef Magic = InputBuffer->getBuffer().slice(0, ELF::EI_NIDENT);
- return (memcmp(Magic.data(), ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
+unsigned RuntimeDyldELF::getCommonSymbolAlignment(const SymbolRef &Sym) {
+ // In ELF, the value of an SHN_COMMON symbol is its alignment requirement.
+ uint64_t Align;
+ Check(Sym.getValue(Align));
+ return Align;
+}
+
+bool RuntimeDyldELF::isCompatibleFormat(const ObjectBuffer *Buffer) const {
+ if (Buffer->getBufferSize() < strlen(ELF::ElfMagic))
+ return false;
+ return (memcmp(Buffer->getBufferStart(), ELF::ElfMagic, strlen(ELF::ElfMagic))) == 0;
}
} // namespace llvm
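
For reference, the e_ident dispatch in createObjectImage() above reduces to two bytes of the ELF header. The snippet below is a worked illustration for a typical x86-64 Linux object, using the ELF constants from Support/ELF.h.

const char *Ident = Buffer->getBufferStart();
uint8_t Class = (uint8_t)Ident[llvm::ELF::EI_CLASS];  // ELFCLASS64  == 2
uint8_t Data  = (uint8_t)Ident[llvm::ELF::EI_DATA];   // ELFDATA2LSB == 1
// With this pair, createObjectImage() builds a
// DyldELFObject<support::little, true> wrapped in an
// ELFObjectImage<support::little, true>.
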
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index e413f780f54a..07e704b45930 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -18,32 +18,52 @@
using namespace llvm;
-
namespace llvm {
+
+namespace {
+ // Helper for extensive error checking in debug builds.
+ error_code Check(error_code Err) {
+ if (Err) {
+ report_fatal_error(Err.message());
+ }
+ return Err;
+ }
+} // end anonymous namespace
+
class RuntimeDyldELF : public RuntimeDyldImpl {
protected:
- ObjectImage *LoadedObject;
-
- void resolveX86_64Relocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+ void resolveX86_64Relocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend);
- void resolveX86Relocation(uint8_t *LocalAddress,
- uint32_t FinalAddress,
+ void resolveX86Relocation(const SectionEntry &Section,
+ uint64_t Offset,
uint32_t Value,
uint32_t Type,
int32_t Addend);
- void resolveARMRelocation(uint8_t *LocalAddress,
- uint32_t FinalAddress,
+ void resolveARMRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint32_t Value,
uint32_t Type,
int32_t Addend);
- virtual void resolveRelocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+ void resolveMIPSRelocation(const SectionEntry &Section,
+ uint64_t Offset,
+ uint32_t Value,
+ uint32_t Type,
+ int32_t Addend);
+
+ void resolvePPC64Relocation(const SectionEntry &Section,
+ uint64_t Offset,
+ uint64_t Value,
+ uint32_t Type,
+ int64_t Addend);
+
+ virtual void resolveRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend);
@@ -54,16 +74,22 @@ protected:
const SymbolTableMap &Symbols,
StubMap &Stubs);
- virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer);
- virtual void handleObjectLoaded(ObjectImage *Obj);
+ unsigned getCommonSymbolAlignment(const SymbolRef &Sym);
+
+ virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
+
+ uint64_t findPPC64TOC() const;
+ void findOPDEntrySection(ObjectImage &Obj,
+ ObjSectionToIDMap &LocalSections,
+ RelocationValueRef &Rel);
public:
RuntimeDyldELF(RTDyldMemoryManager *mm)
- : RuntimeDyldImpl(mm), LoadedObject(0) {}
+ : RuntimeDyldImpl(mm) {}
virtual ~RuntimeDyldELF();
- bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const;
+ bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index c38ca696f27b..829fd6c4c9a9 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -14,8 +14,8 @@
#ifndef LLVM_RUNTIME_DYLD_IMPL_H
#define LLVM_RUNTIME_DYLD_IMPL_H
-#include "ObjectImage.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -24,6 +24,8 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
#include <map>
@@ -33,7 +35,7 @@ using namespace llvm::object;
namespace llvm {
-class MemoryBuffer;
+class ObjectBuffer;
class Twine;
@@ -41,6 +43,9 @@ class Twine;
/// linker.
class SectionEntry {
public:
+ /// Name - section name.
+ StringRef Name;
+
/// Address - address in the linker's memory where the section resides.
uint8_t *Address;
@@ -61,9 +66,9 @@ public:
/// for calculating relocations in some object formats (like MachO).
uintptr_t ObjAddress;
- SectionEntry(uint8_t *address, size_t size, uintptr_t stubOffset,
- uintptr_t objAddress)
- : Address(address), Size(size), LoadAddress((uintptr_t)address),
+ SectionEntry(StringRef name, uint8_t *address, size_t size,
+ uintptr_t stubOffset, uintptr_t objAddress)
+ : Name(name), Address(address), Size(size), LoadAddress((uintptr_t)address),
StubOffset(stubOffset), ObjAddress(objAddress) {}
};
@@ -135,8 +140,10 @@ protected:
typedef StringMap<SymbolLoc> SymbolTableMap;
SymbolTableMap GlobalSymbolTable;
- // Keep a map of common symbols to their sizes
- typedef std::map<SymbolRef, unsigned> CommonSymbolMap;
+ // Pair representing the size and alignment requirement for a common symbol.
+ typedef std::pair<unsigned, unsigned> CommonSymbolInfo;
+ // Keep a map of common symbols to their info pairs
+ typedef std::map<SymbolRef, CommonSymbolInfo> CommonSymbolMap;
// For each symbol, keep a list of relocations based on it. Anytime
// its address is reassigned (the JIT re-compiled the function, e.g.),
@@ -161,6 +168,10 @@ protected:
inline unsigned getMaxStubSize() {
if (Arch == Triple::arm || Arch == Triple::thumb)
return 8; // 32-bit instruction and 32-bit address
+ else if (Arch == Triple::mipsel)
+ return 16;
+ else if (Arch == Triple::ppc64)
+ return 44;
else
return 0;
}
@@ -175,10 +186,50 @@ protected:
return true;
}
+ uint64_t getSectionLoadAddress(unsigned SectionID) {
+ return Sections[SectionID].LoadAddress;
+ }
+
uint8_t *getSectionAddress(unsigned SectionID) {
return (uint8_t*)Sections[SectionID].Address;
}
+ // Subclasses can override this method to get the alignment requirement of
+ // a common symbol. Returns no alignment requirement if not implemented.
+ virtual unsigned getCommonSymbolAlignment(const SymbolRef &Sym) {
+ return 0;
+ }
+
+
+ void writeInt16BE(uint8_t *Addr, uint16_t Value) {
+ if (sys::isLittleEndianHost())
+ Value = sys::SwapByteOrder(Value);
+ *Addr = (Value >> 8) & 0xFF;
+ *(Addr+1) = Value & 0xFF;
+ }
+
+ void writeInt32BE(uint8_t *Addr, uint32_t Value) {
+ if (sys::isLittleEndianHost())
+ Value = sys::SwapByteOrder(Value);
+ *Addr = (Value >> 24) & 0xFF;
+ *(Addr+1) = (Value >> 16) & 0xFF;
+ *(Addr+2) = (Value >> 8) & 0xFF;
+ *(Addr+3) = Value & 0xFF;
+ }
+
+ void writeInt64BE(uint8_t *Addr, uint64_t Value) {
+ if (sys::isLittleEndianHost())
+ Value = sys::SwapByteOrder(Value);
+ *Addr = (Value >> 56) & 0xFF;
+ *(Addr+1) = (Value >> 48) & 0xFF;
+ *(Addr+2) = (Value >> 40) & 0xFF;
+ *(Addr+3) = (Value >> 32) & 0xFF;
+ *(Addr+4) = (Value >> 24) & 0xFF;
+ *(Addr+5) = (Value >> 16) & 0xFF;
+ *(Addr+6) = (Value >> 8) & 0xFF;
+ *(Addr+7) = Value & 0xFF;
+ }
+
/// \brief Given the common symbols discovered in the object file, emit a
/// new section for them and update the symbol mappings in the object and
/// symbol table.
@@ -221,13 +272,14 @@ protected:
void resolveRelocationEntry(const RelocationEntry &RE, uint64_t Value);
/// \brief A object file specific relocation resolver
- /// \param Address Address to apply the relocation action
+ /// \param Section The section where the relocation is being applied
+ /// \param Offset The offset into the section for this relocation
/// \param Value Target symbol address to apply the relocation action
/// \param Type object file specific relocation type
/// \param Addend A constant addend used to compute the value to be stored
/// into the relocatable field
- virtual void resolveRelocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+ virtual void resolveRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend) = 0;
@@ -242,19 +294,13 @@ protected:
/// \brief Resolve relocations to external symbols.
void resolveExternalSymbols();
- virtual ObjectImage *createObjectImage(const MemoryBuffer *InputBuffer);
- virtual void handleObjectLoaded(ObjectImage *Obj)
- {
- // Subclasses may choose to retain this image if they have a use for it
- delete Obj;
- }
-
+ virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer);
public:
RuntimeDyldImpl(RTDyldMemoryManager *mm) : MemMgr(mm), HasError(false) {}
virtual ~RuntimeDyldImpl();
- bool loadObject(const MemoryBuffer *InputBuffer);
+ ObjectImage *loadObject(ObjectBuffer *InputBuffer);
void *getSymbolAddress(StringRef Name) {
// FIXME: Just look up as a function for now. Overly simple of course.
@@ -265,11 +311,20 @@ public:
return getSectionAddress(Loc.first) + Loc.second;
}
+ uint64_t getSymbolLoadAddress(StringRef Name) {
+ // FIXME: Just look up as a function for now. Overly simple of course.
+ // Work in progress.
+ if (GlobalSymbolTable.find(Name) == GlobalSymbolTable.end())
+ return 0;
+ SymbolLoc Loc = GlobalSymbolTable.lookup(Name);
+ return getSectionLoadAddress(Loc.first) + Loc.second;
+ }
+
void resolveRelocations();
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
- void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress);
+ void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress);
// Is the linker in an error state?
bool hasError() { return HasError; }
@@ -280,8 +335,7 @@ public:
// Get the error message.
StringRef getErrorString() { return ErrorStr; }
- virtual bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const = 0;
-
+ virtual bool isCompatibleFormat(const ObjectBuffer *Buffer) const = 0;
};
} // end namespace llvm
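A note on the signature change that runs through these headers: the old (LocalAddress, FinalAddress) pair is now recomputed on demand from (Section, Offset), which is what lets reassignSectionAddress take effect after relocations have been recorded. A minimal sketch of the derivation follows; the struct is a stand-in that mirrors the two SectionEntry fields involved, not the real class.

#include <cstdint>

// Stand-in for SectionEntry with only the two fields the derivation needs.
struct SectionEntrySketch {
  uint8_t *Address;      // where the section bytes live in the JIT's memory
  uint64_t LoadAddress;  // where the section will reside in the target process
};

// Recover the two addresses the old resolveRelocation() signature took
// explicitly: LocalAddress is the byte to patch, FinalAddress feeds any
// PC-relative calculation.
static void relocationAddresses(const SectionEntrySketch &Section,
                                uint64_t Offset,
                                uint8_t *&LocalAddress,
                                uint64_t &FinalAddress) {
  LocalAddress = Section.Address + Offset;
  FinalAddress = Section.LoadAddress + Offset;
}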
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 0e3a9d4af50d..987c0c3afc26 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -21,11 +21,13 @@ using namespace llvm::object;
namespace llvm {
-void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+void RuntimeDyldMachO::resolveRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend) {
+ uint8_t *LocalAddress = Section.Address + Offset;
+ uint64_t FinalAddress = Section.LoadAddress + Offset;
bool isPCRel = (Type >> 24) & 1;
unsigned MachoType = (Type >> 28) & 0xf;
unsigned Size = 1 << ((Type >> 25) & 3);
@@ -57,7 +59,7 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress,
FinalAddress,
(uintptr_t)Value,
isPCRel,
- Type,
+ MachoType,
Size,
Addend);
break;
@@ -211,7 +213,6 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
uint32_t RelType = (uint32_t) (Rel.Type & 0xffffffffL);
RelocationValueRef Value;
SectionEntry &Section = Sections[Rel.SectionID];
- uint8_t *Target = Section.Address + Rel.Offset;
bool isExtern = (RelType >> 27) & 1;
if (isExtern) {
@@ -246,7 +247,12 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
}
assert(si != se && "No section containing relocation!");
Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID);
- Value.Addend = *(const intptr_t *)Target;
+ Value.Addend = 0;
+ // FIXME: The size and type of the relocation determines if we can
+ // encode an Addend in the target location itself, and if so, how many
+ // bytes we should read in order to get it. We don't yet support doing
+ // that, and just assuming it's sizeof(intptr_t) is blatantly wrong.
+ //Value.Addend = *(const intptr_t *)Target;
if (Value.Addend) {
// The MachO addend is an offset from the current section. We need it
// to be an offset from the destination section
@@ -254,13 +260,13 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
}
}
- if (Arch == Triple::arm && RelType == macho::RIT_ARM_Branch24Bit) {
+ if (Arch == Triple::arm && (RelType & 0xf) == macho::RIT_ARM_Branch24Bit) {
// This is an ARM branch relocation, need to use a stub function.
// Look up for existing stub.
StubMap::const_iterator i = Stubs.find(Value);
if (i != Stubs.end())
- resolveRelocation(Target, (uint64_t)Target,
+ resolveRelocation(Section, Rel.Offset,
(uint64_t)Section.Address + i->second,
RelType, 0);
else {
@@ -274,7 +280,7 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
addRelocationForSymbol(RE, Value.SymbolName);
else
addRelocationForSection(RE, Value.SectionID);
- resolveRelocation(Target, (uint64_t)Target,
+ resolveRelocation(Section, Rel.Offset,
(uint64_t)Section.Address + Section.StubOffset,
RelType, 0);
Section.StubOffset += getMaxStubSize();
@@ -290,8 +296,10 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
bool RuntimeDyldMachO::isCompatibleFormat(
- const MemoryBuffer *InputBuffer) const {
- StringRef Magic = InputBuffer->getBuffer().slice(0, 4);
+ const ObjectBuffer *InputBuffer) const {
+ if (InputBuffer->getBufferSize() < 4)
+ return false;
+ StringRef Magic(InputBuffer->getBufferStart(), 4);
if (Magic == "\xFE\xED\xFA\xCE") return true;
if (Magic == "\xCE\xFA\xED\xFE") return true;
if (Magic == "\xFE\xED\xFA\xCF") return true;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
index 707664c73278..fe3539dff6f5 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.h
@@ -55,15 +55,15 @@ protected:
StubMap &Stubs);
public:
- virtual void resolveRelocation(uint8_t *LocalAddress,
- uint64_t FinalAddress,
+ virtual void resolveRelocation(const SectionEntry &Section,
+ uint64_t Offset,
uint64_t Value,
uint32_t Type,
int64_t Addend);
RuntimeDyldMachO(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {}
- bool isCompatibleFormat(const MemoryBuffer *InputBuffer) const;
+ bool isCompatibleFormat(const ObjectBuffer *Buffer) const;
};
} // end namespace llvm
diff --git a/lib/ExecutionEngine/TargetSelect.cpp b/lib/ExecutionEngine/TargetSelect.cpp
index 7cdd669028b8..8b6104fdca9c 100644
--- a/lib/ExecutionEngine/TargetSelect.cpp
+++ b/lib/ExecutionEngine/TargetSelect.cpp
@@ -26,7 +26,14 @@
using namespace llvm;
TargetMachine *EngineBuilder::selectTarget() {
- Triple TT(LLVM_HOSTTRIPLE);
+ Triple TT;
+
+ // MCJIT can generate code for remote targets, but the old JIT and Interpreter
+ // must use the host architecture.
+ if (UseMCJIT && WhichEngine != EngineKind::Interpreter && M)
+ TT.setTriple(M->getTargetTriple());
+ else
+ TT.setTriple(LLVM_HOSTTRIPLE);
return selectTarget(TT, MArch, MCPU, MAttrs);
}
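With the change above, a caller that wants cross-target code generation has to opt into MCJIT and put the desired triple on the module; otherwise the host triple is still used. A hedged usage sketch follows; the target triple and error handling are illustrative, not taken from this patch.

#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/Module.h"
#include <string>

llvm::ExecutionEngine *createRemoteMCJIT(llvm::Module *M, std::string &Err) {
  // The module's triple, not the host's, now drives selectTarget() under MCJIT.
  M->setTargetTriple("powerpc64-unknown-linux-gnu");
  return llvm::EngineBuilder(M)
      .setEngineKind(llvm::EngineKind::JIT)
      .setUseMCJIT(true)
      .setErrorStr(&Err)
      .create();
}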
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 7203b9a4bfab..eda062376edc 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -133,6 +133,11 @@ class ELFObjectWriter : public MCObjectWriter {
bool IsPCRel) const {
return TargetObjectWriter->ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel);
}
+ const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ return TargetObjectWriter->undefinedExplicitRelSym(Target, Fixup, IsPCRel);
+ }
bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
bool hasRelocationAddend() const {
@@ -270,9 +275,10 @@ class ELFObjectWriter : public MCObjectWriter {
/// ComputeSymbolTable - Compute the symbol table data
///
- /// \param StringTable [out] - The string table data.
- /// \param StringIndexMap [out] - Map from symbol names to offsets in the
- /// string table.
+ /// \param Asm - The assembler.
+ /// \param SectionIndexMap - Maps a section to its index.
+ /// \param RevGroupMap - Maps a signature symbol to the group section.
+ /// \param NumRegularSections - Number of non-relocation sections.
void ComputeSymbolTable(MCAssembler &Asm,
const SectionIndexMapTy &SectionIndexMap,
RevGroupMapTy RevGroupMap,
@@ -638,7 +644,7 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm,
if (ASymbol.isUndefined()) {
if (Renamed)
return Renamed;
- return &ASymbol;
+ return undefinedExplicitRelSym(Target, Fixup, IsPCRel);
}
if (SD.isExternal()) {
@@ -720,10 +726,13 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
MCSymbolData &SD = Asm.getSymbolData(ASymbol);
MCFragment *F = SD.getFragment();
- Index = F->getParent()->getOrdinal() + 1;
-
- // Offset of the symbol in the section
- Value += Layout.getSymbolOffset(&SD);
+ if (F) {
+ Index = F->getParent()->getOrdinal() + 1;
+ // Offset of the symbol in the section
+ Value += Layout.getSymbolOffset(&SD);
+ } else {
+ Index = 0;
+ }
} else {
if (Asm.getSymbolData(Symbol).getFlags() & ELF_Other_Weakref)
WeakrefUsedInReloc.insert(RelocSymbol);
@@ -732,8 +741,7 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm,
Index = -1;
}
Addend = Value;
- // Compensate for the addend on i386.
- if (is64Bit())
+ if (hasRelocationAddend())
Value = 0;
}
diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index 2e447b05a41a..53960e7980ae 100644
--- a/lib/MC/MCAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -12,12 +12,9 @@
using namespace llvm;
MCAsmBackend::MCAsmBackend()
- : HasReliableSymbolDifference(false)
-{
-}
+ : HasReliableSymbolDifference(false), HasDataInCodeSupport(false) {}
-MCAsmBackend::~MCAsmBackend() {
-}
+MCAsmBackend::~MCAsmBackend() {}
const MCFixupKindInfo &
MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 8da2e0e5839a..7ea0f3b85a53 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -68,8 +68,8 @@ MCAsmInfo::MCAsmInfo() {
GlobalDirective = "\t.globl\t";
HasSetDirective = true;
HasAggressiveSymbolFolding = true;
- LCOMMDirectiveType = LCOMM::None;
COMMDirectiveAlignmentIsInBytes = true;
+ LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
HasDotTypeDotSizeDirective = true;
HasSingleParameterDotFile = true;
HasNoDeadStrip = false;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 678e75af5dab..fd79193073df 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -19,8 +19,10 @@ void MCAsmInfoCOFF::anchor() { }
MCAsmInfoCOFF::MCAsmInfoCOFF() {
GlobalPrefix = "_";
+ // MinGW 4.5 and later support .comm with log2 alignment, but .lcomm uses byte
+ // alignment.
COMMDirectiveAlignmentIsInBytes = false;
- LCOMMDirectiveType = LCOMM::ByteAlignment;
+ LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
HasDotTypeDotSizeDirective = false;
HasSingleParameterDotFile = false;
PrivateGlobalPrefix = "L"; // Prefix for private global symbols
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index 8e0ac23efced..a0e3ebad5e2b 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -32,6 +32,7 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
AlignmentIsInBytes = false;
COMMDirectiveAlignmentIsInBytes = false;
+ LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
InlineAsmStart = " InlineAsm Start";
InlineAsmEnd = " InlineAsm End";
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 373df4b2bf72..17a6323d0e76 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -166,7 +166,7 @@ public:
///
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
- /// @param Size - The alignment of the common symbol in bytes.
+ /// @param ByteAlignment - The alignment of the common symbol in bytes.
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment);
@@ -251,6 +251,7 @@ public:
virtual void EmitPad(int64_t Offset);
virtual void EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool);
+ virtual void EmitTCEntry(const MCSymbol &S);
virtual void EmitInstruction(const MCInst &Inst);
@@ -517,13 +518,19 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
/// @param Size - The size of the common symbol.
void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlign) {
- assert(MAI.getLCOMMDirectiveType() != LCOMM::None &&
- "Doesn't have .lcomm, can't emit it!");
OS << "\t.lcomm\t" << *Symbol << ',' << Size;
if (ByteAlign > 1) {
- assert(MAI.getLCOMMDirectiveType() == LCOMM::ByteAlignment &&
- "Alignment not supported on .lcomm!");
- OS << ',' << ByteAlign;
+ switch (MAI.getLCOMMDirectiveAlignmentType()) {
+ case LCOMM::NoAlignment:
+ llvm_unreachable("alignment not supported on .lcomm!");
+ case LCOMM::ByteAlignment:
+ OS << ',' << ByteAlign;
+ break;
+ case LCOMM::Log2Alignment:
+ assert(isPowerOf2_32(ByteAlign) && "alignment must be a power of 2");
+ OS << ',' << Log2_32(ByteAlign);
+ break;
+ }
}
EmitEOL();
}
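As a concrete illustration of the three cases above, requesting a 16-byte alignment for a hypothetical local common symbol buf of size 64 yields ".lcomm buf,64,16" under LCOMM::ByteAlignment and ".lcomm buf,64,4" under LCOMM::Log2Alignment, while LCOMM::NoAlignment must never be asked to print an alignment at all. A tiny standalone sketch of the arithmetic:

#include <cstdio>

int main() {
  unsigned ByteAlign = 16;                        // hypothetical request
  unsigned Log2 = 0;
  while ((1u << Log2) < ByteAlign) ++Log2;        // Log2_32 for a power of two
  std::printf("\t.lcomm\tbuf,64,%u\n", ByteAlign); // LCOMM::ByteAlignment
  std::printf("\t.lcomm\tbuf,64,%u\n", Log2);      // LCOMM::Log2Alignment
  return 0;
}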
@@ -1293,6 +1300,14 @@ void MCAsmStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
EmitEOL();
}
+void MCAsmStreamer::EmitTCEntry(const MCSymbol &S) {
+ OS << "\t.tc ";
+ OS << S.getName();
+ OS << "[TC],";
+ OS << S.getName();
+ EmitEOL();
+}
+
void MCAsmStreamer::EmitInstruction(const MCInst &Inst) {
assert(getCurrentSection() && "Cannot emit contents before setting section!");
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 05519b56ffec..726ec5aba512 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -199,8 +199,7 @@ MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_,
MCCodeEmitter &Emitter_, MCObjectWriter &Writer_,
raw_ostream &OS_)
: Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
- OS(OS_), RelaxAll(false), NoExecStack(false), SubsectionsViaSymbols(false)
-{
+ OS(OS_), RelaxAll(false), NoExecStack(false), SubsectionsViaSymbols(false) {
}
MCAssembler::~MCAssembler() {
@@ -325,6 +324,12 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout,
const MCAlignFragment &AF = cast<MCAlignFragment>(F);
unsigned Offset = Layout.getFragmentOffset(&AF);
unsigned Size = OffsetToAlignment(Offset, AF.getAlignment());
+ // If we are padding with nops, force the padding to be larger than the
+ // minimum nop size.
+ if (Size > 0 && AF.hasEmitNops()) {
+ while (Size % getBackend().getMinimumNopSize())
+ Size += AF.getAlignment();
+ }
if (Size > AF.getMaxBytesToEmit())
return 0;
return Size;
@@ -375,7 +380,7 @@ void MCAsmLayout::LayoutFragment(MCFragment *F) {
LastValidFragment[F->getParent()] = F;
}
-/// WriteFragmentData - Write the \arg F data to the output file.
+/// WriteFragmentData - Write the \p F data to the output file.
static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment &F) {
MCObjectWriter *OW = &Asm.getWriter();
@@ -527,7 +532,7 @@ void MCAssembler::writeSectionData(const MCSectionData *SD,
}
uint64_t Start = getWriter().getStream().tell();
- (void) Start;
+ (void)Start;
for (MCSectionData::const_iterator it = SD->begin(),
ie = SD->end(); it != ie; ++it)
@@ -824,6 +829,7 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCFragment::dump() {
raw_ostream &OS = llvm::errs();
@@ -964,6 +970,7 @@ void MCAssembler::dump() {
}
OS << "]>\n";
}
+#endif
// anchors for MC*Fragment vtables
void MCDataFragment::anchor() { }
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index b5b14b95f667..477bd17c0d57 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -153,6 +153,12 @@ MCSymbol *MCContext::LookupSymbol(StringRef Name) const {
return Symbols.lookup(Name);
}
+MCSymbol *MCContext::LookupSymbol(const Twine &Name) const {
+ SmallString<128> NameSV;
+ Name.toVector(NameSV);
+ return LookupSymbol(NameSV.str());
+}
+
//===----------------------------------------------------------------------===//
// Section Management
//===----------------------------------------------------------------------===//
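The new Twine overload above spares callers from assembling the name into a temporary string themselves. A hedged usage sketch; the numbered-symbol naming scheme is made up for illustration.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/MC/MCContext.h"

// Returns the symbol if it has already been created, or null otherwise.
llvm::MCSymbol *lookupNumberedSymbol(llvm::MCContext &Ctx,
                                     llvm::StringRef Base, unsigned N) {
  return Ctx.LookupSymbol(Base + llvm::Twine(N));
}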
diff --git a/lib/MC/MCDisassembler/Disassembler.cpp b/lib/MC/MCDisassembler/Disassembler.cpp
index 35f675dc6d1b..5189c9daeed6 100644
--- a/lib/MC/MCDisassembler/Disassembler.cpp
+++ b/lib/MC/MCDisassembler/Disassembler.cpp
@@ -184,3 +184,17 @@ size_t LLVMDisasmInstruction(LLVMDisasmContextRef DCR, uint8_t *Bytes,
}
llvm_unreachable("Invalid DecodeStatus!");
}
+
+//
+// LLVMSetDisasmOptions() sets the disassembler's options. It returns 1 if it
+// can set all the Options and 0 otherwise.
+//
+int LLVMSetDisasmOptions(LLVMDisasmContextRef DCR, uint64_t Options){
+ if (Options & LLVMDisassembler_Option_UseMarkup){
+ LLVMDisasmContext *DC = (LLVMDisasmContext *)DCR;
+ MCInstPrinter *IP = DC->getIP();
+ IP->setUseMarkup(1);
+ Options &= ~LLVMDisassembler_Option_UseMarkup;
+ }
+ return (Options == 0);
+}
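Usage of the new C API entry point is straightforward: pass the option bits, and a zero return means at least one requested option was not understood. A sketch, assuming the disassembler context was created with LLVMCreateDisasm elsewhere:

#include "llvm-c/Disassembler.h"

// Enable marked-up disassembly output on an existing context; returns 1 on
// success, 0 if the option was not accepted.
int enableMarkup(LLVMDisasmContextRef DC) {
  return LLVMSetDisasmOptions(DC, LLVMDisassembler_Option_UseMarkup);
}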
diff --git a/lib/MC/MCDisassembler/EDDisassembler.cpp b/lib/MC/MCDisassembler/EDDisassembler.cpp
index 1226f1a2e324..eed7a771b97e 100644
--- a/lib/MC/MCDisassembler/EDDisassembler.cpp
+++ b/lib/MC/MCDisassembler/EDDisassembler.cpp
@@ -366,8 +366,9 @@ int EDDisassembler::parseInst(SmallVectorImpl<MCParsedAsmOperand*> &operands,
instName = OpcodeToken.getString();
instLoc = OpcodeToken.getLoc();
+ ParseInstructionInfo Info;
if (NextToken.isNot(AsmToken::Eof) &&
- TargetParser->ParseInstruction(instName, instLoc, operands))
+ TargetParser->ParseInstruction(Info, instName, instLoc, operands))
ret = -1;
} else {
ret = -1;
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index 4c63e434d23f..f71b266ad632 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -425,9 +425,11 @@ void MCDwarfFile::print(raw_ostream &OS) const {
OS << '"' << getName() << '"';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCDwarfFile::dump() const {
print(dbgs());
}
+#endif
// Utility function to write a tuple for .debug_abbrev.
static void EmitAbbrev(MCStreamer *MCOS, uint64_t Name, uint64_t Form) {
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index 6eb6914f4b1f..74cd042a0f8c 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -9,6 +9,8 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
using namespace llvm;
@@ -35,6 +37,12 @@ const MCSymbol *MCELFObjectTargetWriter::ExplicitRelSym(const MCAssembler &Asm,
return NULL;
}
+const MCSymbol *MCELFObjectTargetWriter::undefinedExplicitRelSym(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ const MCSymbol &Symbol = Target.getSymA()->getSymbol();
+ return &Symbol.AliasedSymbol();
+}
void MCELFObjectTargetWriter::adjustFixupOffset(const MCFixup &Fixup,
uint64_t &RelocOffset) {
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 2d342dccfe25..14fbc1ec8391 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -98,17 +98,13 @@ public:
uint64_t Size, unsigned ByteAlignment = 0) {
llvm_unreachable("ELF doesn't support this directive");
}
- virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
- virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
- unsigned ValueSize = 1,
- unsigned MaxBytesToEmit = 0);
- virtual void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit = 0);
virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
unsigned AddrSpace);
virtual void EmitFileDirective(StringRef Filename);
+ virtual void EmitTCEntry(const MCSymbol &S);
+
virtual void FinishImpl();
private:
@@ -247,7 +243,6 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
switch (Attribute) {
case MCSA_LazyReference:
case MCSA_Reference:
- case MCSA_NoDeadStrip:
case MCSA_SymbolResolver:
case MCSA_PrivateExtern:
case MCSA_WeakDefinition:
@@ -256,6 +251,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_IndirectSymbol:
llvm_unreachable("Invalid symbol attribute for ELF!");
+ case MCSA_NoDeadStrip:
case MCSA_ELF_TypeGnuUniqueObject:
// Ignore for now.
break;
@@ -355,42 +351,6 @@ void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
EmitCommonSymbol(Symbol, Size, ByteAlignment);
}
-void MCELFStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
- // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
- // MCObjectStreamer.
- getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
-}
-
-void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
- int64_t Value, unsigned ValueSize,
- unsigned MaxBytesToEmit) {
- // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
- // MCObjectStreamer.
- if (MaxBytesToEmit == 0)
- MaxBytesToEmit = ByteAlignment;
- new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
- getCurrentSectionData());
-
- // Update the maximum alignment on the current section if necessary.
- if (ByteAlignment > getCurrentSectionData()->getAlignment())
- getCurrentSectionData()->setAlignment(ByteAlignment);
-}
-
-void MCELFStreamer::EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit) {
- // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
- // MCObjectStreamer.
- if (MaxBytesToEmit == 0)
- MaxBytesToEmit = ByteAlignment;
- MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit,
- getCurrentSectionData());
- F->setEmitNops(true);
-
- // Update the maximum alignment on the current section if necessary.
- if (ByteAlignment > getCurrentSectionData()->getAlignment())
- getCurrentSectionData()->setAlignment(ByteAlignment);
-}
-
void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
unsigned AddrSpace) {
fixSymbolsInTLSFixups(Value);
@@ -511,6 +471,12 @@ void MCELFStreamer::FinishImpl() {
this->MCObjectStreamer::FinishImpl();
}
+void MCELFStreamer::EmitTCEntry(const MCSymbol &S)
+{
+ // Creates an R_PPC64_TOC relocation
+ MCObjectStreamer::EmitSymbolValue(&S, 8, 0);
+}
+
MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
raw_ostream &OS, MCCodeEmitter *CE,
bool RelaxAll, bool NoExecStack) {
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index 0eb7fcce684b..e0336342d6d1 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -60,7 +60,8 @@ void MCExpr::print(raw_ostream &OS) const {
SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTOFF ||
SRE.getKind() == MCSymbolRefExpr::VK_ARM_TPOFF ||
SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF ||
- SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1)
+ SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1 ||
+ SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2)
OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
@@ -136,10 +137,12 @@ void MCExpr::print(raw_ostream &OS) const {
llvm_unreachable("Invalid expression kind!");
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCExpr::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
/* *** */
@@ -197,7 +200,9 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_ARM_GOTTPOFF: return "(gottpoff)";
case VK_ARM_TLSGD: return "(tlsgd)";
case VK_ARM_TARGET1: return "(target1)";
- case VK_PPC_TOC: return "toc";
+ case VK_ARM_TARGET2: return "(target2)";
+ case VK_PPC_TOC: return "tocbase";
+ case VK_PPC_TOC_ENTRY: return "toc";
case VK_PPC_DARWIN_HA16: return "ha16";
case VK_PPC_DARWIN_LO16: return "lo16";
case VK_PPC_GAS_HA16: return "ha";
@@ -264,7 +269,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
/* *** */
-void MCTargetExpr::Anchor() {}
+void MCTargetExpr::anchor() {}
/* *** */
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index 7bbfd2efa136..124cc149beb6 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -32,10 +32,12 @@ void MCOperand::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << ">";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCOperand::dump() const {
print(dbgs(), 0);
dbgs() << "\n";
}
+#endif
void MCInst::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << "<MCInst " << getOpcode();
@@ -62,7 +64,9 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI,
OS << ">";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCInst::dump() const {
print(dbgs(), 0);
dbgs() << "\n";
}
+#endif
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 847bcc0a1604..41d90abeeb63 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -36,3 +36,17 @@ void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
OS << " " << MAI.getCommentString() << " " << Annot;
}
}
+
+/// Utility functions to make adding markups simpler.
+StringRef MCInstPrinter::markup(StringRef s) const {
+ if (getUseMarkup())
+ return s;
+ else
+ return "";
+}
+StringRef MCInstPrinter::markup(StringRef a, StringRef b) const {
+ if (getUseMarkup())
+ return a;
+ else
+ return b;
+}
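A target instruction printer would typically wrap operand text with these helpers so the tags appear only when markup is requested, e.g. an immediate becomes "<imm:#42>". A hedged sketch of such a call site; the tag spelling follows the marked-up disassembly scheme and the free function here is illustrative.

#include "llvm/MC/MCInstPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

void printImmWithMarkup(const llvm::MCInstPrinter &IP, llvm::raw_ostream &OS,
                        int64_t Imm) {
  // markup() returns the tag text when markup is enabled and "" otherwise.
  OS << IP.markup("<imm:") << '#' << Imm << IP.markup(">");
}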
diff --git a/lib/MC/MCLabel.cpp b/lib/MC/MCLabel.cpp
index 9c0fc92e6c05..1d3022a93e86 100644
--- a/lib/MC/MCLabel.cpp
+++ b/lib/MC/MCLabel.cpp
@@ -16,6 +16,8 @@ void MCLabel::print(raw_ostream &OS) const {
OS << '"' << getInstance() << '"';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCLabel::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index b75fe2c3a7f1..04b0e86aed61 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -70,19 +70,11 @@ public:
llvm_unreachable("macho doesn't support this directive");
}
virtual void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
- unsigned ByteAlignment) {
- llvm_unreachable("macho doesn't support this directive");
- }
+ unsigned ByteAlignment);
virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0,
uint64_t Size = 0, unsigned ByteAlignment = 0);
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment = 0);
- virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
- virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
- unsigned ValueSize = 1,
- unsigned MaxBytesToEmit = 0);
- virtual void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit = 0);
virtual void EmitFileDirective(StringRef Filename) {
// FIXME: Just ignore the .file; it isn't important enough to fail the
@@ -141,6 +133,8 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
}
void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
+ if (!getAssembler().getBackend().hasDataInCodeSupport())
+ return;
// Create a temporary label to mark the start of the data region.
MCSymbol *Start = getContext().CreateTempSymbol();
EmitLabel(Start);
@@ -151,6 +145,8 @@ void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
}
void MCMachOStreamer::EmitDataRegionEnd() {
+ if (!getAssembler().getBackend().hasDataInCodeSupport())
+ return;
std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
assert(Regions.size() && "Mismatched .end_data_region!");
DataRegionData &Data = Regions.back();
@@ -325,6 +321,15 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
SD.setCommon(Size, ByteAlignment);
}
+void MCMachOStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ unsigned ByteAlignment) {
+ // '.lcomm' is equivalent to '.zerofill'.
+ return EmitZerofill(getContext().getMachOSection("__DATA", "__bss",
+ MCSectionMachO::S_ZEROFILL,
+ 0, SectionKind::getBSS()),
+ Symbol, Size, ByteAlignment);
+}
+
void MCMachOStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
MCSectionData &SectData = getAssembler().getOrCreateSectionData(*Section);
@@ -361,42 +366,6 @@ void MCMachOStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
return;
}
-void MCMachOStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
- // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
- // MCObjectStreamer.
- getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
-}
-
-void MCMachOStreamer::EmitValueToAlignment(unsigned ByteAlignment,
- int64_t Value, unsigned ValueSize,
- unsigned MaxBytesToEmit) {
- // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
- // MCObjectStreamer.
- if (MaxBytesToEmit == 0)
- MaxBytesToEmit = ByteAlignment;
- new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
- getCurrentSectionData());
-
- // Update the maximum alignment on the current section if necessary.
- if (ByteAlignment > getCurrentSectionData()->getAlignment())
- getCurrentSectionData()->setAlignment(ByteAlignment);
-}
-
-void MCMachOStreamer::EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit) {
- // TODO: This is exactly the same as WinCOFFStreamer. Consider merging into
- // MCObjectStreamer.
- if (MaxBytesToEmit == 0)
- MaxBytesToEmit = ByteAlignment;
- MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit,
- getCurrentSectionData());
- F->setEmitNops(true);
-
- // Update the maximum alignment on the current section if necessary.
- if (ByteAlignment > getCurrentSectionData()->getAlignment())
- getCurrentSectionData()->setAlignment(ByteAlignment);
-}
-
void MCMachOStreamer::EmitInstToData(const MCInst &Inst) {
MCDataFragment *DF = getOrCreateDataFragment();
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 29b4a9465356..2e1604d6b506 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -392,6 +392,18 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) {
DwarfMacroInfoSection =
Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS, 0,
SectionKind::getMetadata());
+ DwarfAccelNamesSection =
+ Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfAccelObjCSection =
+ Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfAccelNamespaceSection =
+ Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
+ DwarfAccelTypesSection =
+ Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0,
+ SectionKind::getMetadata());
}
@@ -430,12 +442,20 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
}
- StaticDtorSection =
- Ctx->getCOFFSection(".dtors",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
+ if (T.getOS() == Triple::Win32) {
+ StaticDtorSection =
+ Ctx->getCOFFSection(".CRT$XTX",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ } else {
+ StaticDtorSection =
+ Ctx->getCOFFSection(".dtors",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+ }
// FIXME: We're emitting LSDA info into a readonly section on COFF, even
// though it contains relocatable pointers. In PIC mode, this is probably a
@@ -557,6 +577,7 @@ void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm,
Env = IsMachO;
InitMachOMCObjectFileInfo(T);
} else if ((Arch == Triple::x86 || Arch == Triple::x86_64) &&
+ (T.getEnvironment() != Triple::ELF) &&
(T.getOS() == Triple::MinGW32 || T.getOS() == Triple::Cygwin ||
T.getOS() == Triple::Win32)) {
Env = IsCOFF;
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index bad7cfe38a17..774632306d94 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -232,6 +232,31 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
new MCDwarfCallFrameFragment(*AddrDelta, getCurrentSectionData());
}
+void MCObjectStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
+ assert(AddrSpace == 0 && "Address space must be 0!");
+ getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
+}
+
+void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+ int64_t Value,
+ unsigned ValueSize,
+ unsigned MaxBytesToEmit) {
+ if (MaxBytesToEmit == 0)
+ MaxBytesToEmit = ByteAlignment;
+ new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
+ getCurrentSectionData());
+
+ // Update the maximum alignment on the current section if necessary.
+ if (ByteAlignment > getCurrentSectionData()->getAlignment())
+ getCurrentSectionData()->setAlignment(ByteAlignment);
+}
+
+void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+ unsigned MaxBytesToEmit) {
+ EmitValueToAlignment(ByteAlignment, 0, 1, MaxBytesToEmit);
+ cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true);
+}
+
bool MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset,
unsigned char Value) {
int64_t Res;
@@ -258,12 +283,26 @@ bool MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset,
void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
- DF->addFixup(MCFixup::Create(DF->getContents().size(),
- Value,
- FK_GPRel_4));
+ DF->addFixup(MCFixup::Create(DF->getContents().size(), Value, FK_GPRel_4));
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
+// Associate a GPRel fixup with the data and resize the data area by 8 bytes
+void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
+ MCDataFragment *DF = getOrCreateDataFragment();
+
+ DF->addFixup(MCFixup::Create(DF->getContents().size(), Value, FK_GPRel_4));
+ DF->getContents().resize(DF->getContents().size() + 8, 0);
+}
+
+void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue,
+ unsigned AddrSpace) {
+ assert(AddrSpace == 0 && "Address space must be 0!");
+ // FIXME: An MCFillFragment would be more memory efficient but MCExpr has
+ // problems evaluating expressions across multiple fragments.
+ getOrCreateDataFragment()->getContents().append(NumBytes, FillValue);
+}
+
void MCObjectStreamer::FinishImpl() {
// Dump out the dwarf file & directory tables and line tables.
const MCSymbol *LineSectionSymbol = NULL;
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index c76052d66e00..f93f685bf502 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -396,8 +396,17 @@ AsmToken AsmLexer::LexToken() {
case 0:
case ' ':
case '\t':
- // Ignore whitespace.
- return LexToken();
+ if (SkipSpace) {
+ // Ignore whitespace.
+ return LexToken();
+ } else {
+ int len = 1;
+ while (*CurPtr==' ' || *CurPtr=='\t') {
+ CurPtr++;
+ len++;
+ }
+ return AsmToken(AsmToken::Space, StringRef(TokStart, len));
+ }
case '\n': // FALL THROUGH.
case '\r':
isAtStartOfLine = true;
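The practical effect of the lexer change above: when a client turns off whitespace skipping (as the macro-argument parser further down does on non-Darwin hosts), a run of blanks or tabs comes back as a single AsmToken::Space token spanning the whole run instead of being silently consumed. A hedged sketch of a client driving it; lexer construction and input setup are omitted, and the setSkipSpace interface is assumed from its use elsewhere in this patch.

#include "llvm/MC/MCParser/AsmLexer.h"

// Assumes Lexer is already initialised and positioned on text like "a   + b".
void lexWithSpaces(llvm::AsmLexer &Lexer) {
  Lexer.setSkipSpace(false);                 // report whitespace tokens
  llvm::AsmToken Tok = Lexer.Lex();          // Identifier "a"
  Tok = Lexer.Lex();                         // one Space token covering "   "
  bool SawSpace = Tok.is(llvm::AsmToken::Space);
  (void)SawSpace;
  Lexer.setSkipSpace(true);                  // restore the default behaviour
}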
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index b67c7691138e..6f2e85e55335 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -19,6 +19,8 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCParser/AsmCond.h"
#include "llvm/MC/MCParser/AsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
@@ -35,6 +37,8 @@
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include <cctype>
+#include <set>
+#include <string>
#include <vector>
using namespace llvm;
@@ -42,12 +46,14 @@ static cl::opt<bool>
FatalAssemblerWarnings("fatal-assembler-warnings",
cl::desc("Consider warnings as error"));
+MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {}
+
namespace {
/// \brief Helper class for tracking macro definitions.
typedef std::vector<AsmToken> MacroArgument;
typedef std::vector<MacroArgument> MacroArguments;
-typedef StringRef MacroParameter;
+typedef std::pair<StringRef, MacroArgument> MacroParameter;
typedef std::vector<MacroParameter> MacroParameters;
struct Macro {
@@ -80,12 +86,34 @@ public:
MemoryBuffer *I);
};
+//struct AsmRewrite;
+struct ParseStatementInfo {
+ /// ParsedOperands - The parsed operands from the last parsed statement.
+ SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
+
+ /// Opcode - The opcode from the last parsed instruction.
+ unsigned Opcode;
+
+ SmallVectorImpl<AsmRewrite> *AsmRewrites;
+
+ ParseStatementInfo() : Opcode(~0U), AsmRewrites(0) {}
+ ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
+ : Opcode(~0), AsmRewrites(rewrites) {}
+
+ ~ParseStatementInfo() {
+ // Free any parsed operands.
+ for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i)
+ delete ParsedOperands[i];
+ ParsedOperands.clear();
+ }
+};
+
/// \brief The concrete assembly parser instance.
class AsmParser : public MCAsmParser {
friend class GenericAsmParser;
- AsmParser(const AsmParser &); // DO NOT IMPLEMENT
- void operator=(const AsmParser &); // DO NOT IMPLEMENT
+ AsmParser(const AsmParser &) LLVM_DELETED_FUNCTION;
+ void operator=(const AsmParser &) LLVM_DELETED_FUNCTION;
private:
AsmLexer Lexer;
MCContext &Ctx;
@@ -126,20 +154,27 @@ private:
StringRef CppHashFilename;
int64_t CppHashLineNumber;
SMLoc CppHashLoc;
+ int CppHashBuf;
/// AssemblerDialect. ~0U means unset; use the value provided by MAI.
unsigned AssemblerDialect;
+ /// IsDarwin - is Darwin compatibility enabled?
+ bool IsDarwin;
+
+ /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
+ bool ParsingInlineAsm;
+
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI);
- ~AsmParser();
+ virtual ~AsmParser();
virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false);
- void AddDirectiveHandler(MCAsmParserExtension *Object,
- StringRef Directive,
- DirectiveHandler Handler) {
+ virtual void AddDirectiveHandler(MCAsmParserExtension *Object,
+ StringRef Directive,
+ DirectiveHandler Handler) {
DirectiveMap[Directive] = std::make_pair(Object, Handler);
}
@@ -166,7 +201,19 @@ public:
virtual bool Error(SMLoc L, const Twine &Msg,
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
- const AsmToken &Lex();
+ virtual const AsmToken &Lex();
+
+ void setParsingInlineAsm(bool V) { ParsingInlineAsm = V; }
+ bool isParsingInlineAsm() { return ParsingInlineAsm; }
+
+ bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+ unsigned &NumOutputs, unsigned &NumInputs,
+ SmallVectorImpl<std::pair<void *,bool> > &OpDecls,
+ SmallVectorImpl<std::string> &Constraints,
+ SmallVectorImpl<std::string> &Clobbers,
+ const MCInstrInfo *MII,
+ const MCInstPrinter *IP,
+ MCAsmParserSemaCallback &SI);
bool ParseExpression(const MCExpr *&Res);
virtual bool ParseExpression(const MCExpr *&Res, SMLoc &EndLoc);
@@ -178,7 +225,7 @@ public:
private:
void CheckForValidSection();
- bool ParseStatement();
+ bool ParseStatement(ParseStatementInfo &Info);
void EatToEndOfLine();
bool ParseCppHashLineFilenameComment(const SMLoc &L);
@@ -202,26 +249,28 @@ private:
/// This returns true on failure.
bool ProcessIncbinFile(const std::string &Filename);
- /// \brief Reset the current lexer position to that given by \arg Loc. The
+ /// \brief Reset the current lexer position to that given by \p Loc. The
/// current token is not set; clients should ensure Lex() is called
/// subsequently.
void JumpToLoc(SMLoc Loc);
- void EatToEndOfStatement();
+ virtual void EatToEndOfStatement();
- bool ParseMacroArgument(MacroArgument &MA);
+ bool ParseMacroArgument(MacroArgument &MA,
+ AsmToken::TokenKind &ArgumentDelimiter);
bool ParseMacroArguments(const Macro *M, MacroArguments &A);
/// \brief Parse up to the end of statement and return the contents from the
/// current token until the end of the statement; the current token on exit
/// will be either the EndOfStatement or EOF.
- StringRef ParseStringToEndOfStatement();
+ virtual StringRef ParseStringToEndOfStatement();
/// \brief Parse until the end of a statement or a comma is encountered,
/// return the contents from the current token up to the end or comma.
StringRef ParseStringToComma();
- bool ParseAssignment(StringRef Name, bool allow_redef);
+ bool ParseAssignment(StringRef Name, bool allow_redef,
+ bool NoDeadStrip = false);
bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
@@ -229,8 +278,8 @@ private:
bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
/// ParseIdentifier - Parse an identifier or string (as a quoted identifier)
- /// and set \arg Res to the identifier contents.
- bool ParseIdentifier(StringRef &Res);
+ /// and set \p Res to the identifier contents.
+ virtual bool ParseIdentifier(StringRef &Res);
// Directive Parsing.
@@ -282,6 +331,9 @@ private:
bool ParseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
bool ParseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
bool ParseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
+
+ // "_emit"
+ bool ParseDirectiveEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info);
};
/// \brief Generic implementations of directive handling, etc. which is shared
@@ -406,8 +458,8 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
MCStreamer &_Out, const MCAsmInfo &_MAI)
: Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
GenericParser(new GenericAsmParser), PlatformParser(0),
- CurBuffer(0), MacrosEnabled(true), CppHashLineNumber(0),
- AssemblerDialect(~0U) {
+ CurBuffer(0), MacrosEnabled(true), CppHashLineNumber(0),
+ AssemblerDialect(~0U), IsDarwin(false), ParsingInlineAsm(false) {
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
SavedDiagContext = SrcMgr.getDiagContext();
@@ -428,6 +480,7 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
} else if (_MAI.hasSubsectionsViaSymbols()) {
PlatformParser = createDarwinAsmParser();
PlatformParser->Initialize(*this);
+ IsDarwin = true;
} else {
PlatformParser = createELFAsmParser();
PlatformParser->Initialize(*this);
@@ -545,7 +598,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
// While we have input, parse each statement.
while (Lexer.isNot(AsmToken::Eof)) {
- if (!ParseStatement()) continue;
+ ParseStatementInfo Info;
+ if (!ParseStatement(Info)) continue;
// We had an error, validate that one was emitted and recover by skipping to
// the next line.
@@ -598,7 +652,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
}
void AsmParser::CheckForValidSection() {
- if (!getStreamer().getCurrentSection()) {
+ if (!ParsingInlineAsm && !getStreamer().getCurrentSection()) {
TokError("expected section directive before assembly directive");
Out.SwitchSection(Ctx.getMachOSection(
"__TEXT", "__text",
@@ -1024,14 +1078,11 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
}
}
-
-
-
/// ParseStatement:
/// ::= EndOfStatement
/// ::= Label* Directive ...Operands... EndOfStatement
/// ::= Label* Identifier OperandList* EndOfStatement
-bool AsmParser::ParseStatement() {
+bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
if (Lexer.is(AsmToken::EndOfStatement)) {
Out.AddBlankLine();
Lex();
@@ -1150,7 +1201,7 @@ bool AsmParser::ParseStatement() {
return false;
}
- return ParseStatement();
+ return false;
}
case AsmToken::Equal:
@@ -1304,26 +1355,30 @@ bool AsmParser::ParseStatement() {
return Error(IDLoc, "unknown directive");
}
+ // _emit
+ if (ParsingInlineAsm && IDVal == "_emit")
+ return ParseDirectiveEmit(IDLoc, Info);
+
CheckForValidSection();
// Canonicalize the opcode to lower case.
- SmallString<128> Opcode;
+ SmallString<128> OpcodeStr;
for (unsigned i = 0, e = IDVal.size(); i != e; ++i)
- Opcode.push_back(tolower(IDVal[i]));
+ OpcodeStr.push_back(tolower(IDVal[i]));
- SmallVector<MCParsedAsmOperand*, 8> ParsedOperands;
- bool HadError = getTargetParser().ParseInstruction(Opcode.str(), IDLoc,
- ParsedOperands);
+ ParseInstructionInfo IInfo(Info.AsmRewrites);
+ bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr.str(),
+ IDLoc,Info.ParsedOperands);
// Dump the parsed representation, if requested.
if (getShowParsedOperands()) {
SmallString<256> Str;
raw_svector_ostream OS(Str);
OS << "parsed instruction: [";
- for (unsigned i = 0; i != ParsedOperands.size(); ++i) {
+ for (unsigned i = 0; i != Info.ParsedOperands.size(); ++i) {
if (i != 0)
OS << ", ";
- ParsedOperands[i]->print(OS);
+ Info.ParsedOperands[i]->print(OS);
}
OS << "]";
@@ -1335,21 +1390,38 @@ bool AsmParser::ParseStatement() {
// the instruction.
if (!HadError && getContext().getGenDwarfForAssembly() &&
getContext().getGenDwarfSection() == getStreamer().getCurrentSection() ) {
+
+ unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
+
+ // If we previously parsed a cpp hash file line comment, then make sure the
+ // current DWARF file is for the CppHashFilename; if not, emit the DWARF
+ // file table entry for it and adjust the line number for the .loc.
+ const std::vector<MCDwarfFile *> &MCDwarfFiles =
+ getContext().getMCDwarfFiles();
+ if (CppHashFilename.size() != 0) {
+ if(MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() !=
+ CppHashFilename)
+ getStreamer().EmitDwarfFileDirective(
+ getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename);
+
+ unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc,CppHashBuf);
+ Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
+ }
+
getStreamer().EmitDwarfLocDirective(getContext().getGenDwarfFileNumber(),
- SrcMgr.FindLineNumber(IDLoc, CurBuffer),
- 0, DWARF2_LINE_DEFAULT_IS_STMT ?
+ Line, 0, DWARF2_LINE_DEFAULT_IS_STMT ?
DWARF2_FLAG_IS_STMT : 0, 0, 0,
StringRef());
}
// If parsing succeeded, match the instruction.
- if (!HadError)
- HadError = getTargetParser().MatchAndEmitInstruction(IDLoc, ParsedOperands,
- Out);
-
- // Free any parsed operands.
- for (unsigned i = 0, e = ParsedOperands.size(); i != e; ++i)
- delete ParsedOperands[i];
+ if (!HadError) {
+ unsigned ErrorInfo;
+ HadError = getTargetParser().MatchAndEmitInstruction(IDLoc, Info.Opcode,
+ Info.ParsedOperands,
+ Out, ErrorInfo,
+ ParsingInlineAsm);
+ }
// Don't skip the rest of the line, the instruction parser is responsible for
// that.
@@ -1394,6 +1466,7 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
CppHashLoc = L;
CppHashFilename = Filename;
CppHashLineNumber = LineNumber;
+ CppHashBuf = CurBuffer;
// Ignore any trailing characters, they're just comment.
EatToEndOfLine();
@@ -1454,6 +1527,14 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
NewDiag.print(0, OS);
}
+// FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The
+// difference being that that function accepts '@' as part of identifiers and
+// we can't do that. AsmLexer.cpp should probably be changed to handle
+// '@' as a special case when needed.
+static bool isIdentifierChar(char c) {
+ return isalnum(c) || c == '_' || c == '$' || c == '.';
+}
+
bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
const MacroParameters &Parameters,
const MacroArguments &A,
@@ -1462,6 +1543,8 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
if (NParameters != 0 && NParameters != A.size())
return Error(L, "Wrong number of arguments");
+ // A macro without parameters is handled differently on Darwin:
+ // gas accepts no arguments and does no substitutions
while (!Body.empty()) {
// Scan for the next substitution.
std::size_t End = Body.size(), Pos = 0;
@@ -1518,25 +1601,33 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
Pos += 2;
} else {
unsigned I = Pos + 1;
- while (isalnum(Body[I]) && I + 1 != End)
+ while (isIdentifierChar(Body[I]) && I + 1 != End)
++I;
const char *Begin = Body.data() + Pos +1;
StringRef Argument(Begin, I - (Pos +1));
unsigned Index = 0;
for (; Index < NParameters; ++Index)
- if (Parameters[Index] == Argument)
+ if (Parameters[Index].first == Argument)
break;
- // FIXME: We should error at the macro definition.
- if (Index == NParameters)
- return Error(L, "Parameter not found");
-
- for (MacroArgument::const_iterator it = A[Index].begin(),
- ie = A[Index].end(); it != ie; ++it)
- OS << it->getString();
+ if (Index == NParameters) {
+ if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
+ Pos += 3;
+ else {
+ OS << '\\' << Argument;
+ Pos = I;
+ }
+ } else {
+ for (MacroArgument::const_iterator it = A[Index].begin(),
+ ie = A[Index].end(); it != ie; ++it)
+ if (it->getKind() == AsmToken::String)
+ OS << it->getStringContents();
+ else
+ OS << it->getString();
- Pos += 1 + Argument.size();
+ Pos += 1 + Argument.size();
+ }
}
// Update the scan point.
Body = Body.substr(Pos);
@@ -1551,24 +1642,97 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
{
}
+static bool IsOperator(AsmToken::TokenKind kind)
+{
+ switch (kind)
+ {
+ default:
+ return false;
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Tilde:
+ case AsmToken::Slash:
+ case AsmToken::Star:
+ case AsmToken::Dot:
+ case AsmToken::Equal:
+ case AsmToken::EqualEqual:
+ case AsmToken::Pipe:
+ case AsmToken::PipePipe:
+ case AsmToken::Caret:
+ case AsmToken::Amp:
+ case AsmToken::AmpAmp:
+ case AsmToken::Exclaim:
+ case AsmToken::ExclaimEqual:
+ case AsmToken::Percent:
+ case AsmToken::Less:
+ case AsmToken::LessEqual:
+ case AsmToken::LessLess:
+ case AsmToken::LessGreater:
+ case AsmToken::Greater:
+ case AsmToken::GreaterEqual:
+ case AsmToken::GreaterGreater:
+ return true;
+ }
+}
+
/// ParseMacroArgument - Extract AsmTokens for a macro argument.
/// This is used for both default macro parameter values and the
/// arguments in macro invocations
-bool AsmParser::ParseMacroArgument(MacroArgument &MA) {
+bool AsmParser::ParseMacroArgument(MacroArgument &MA,
+ AsmToken::TokenKind &ArgumentDelimiter) {
unsigned ParenLevel = 0;
+ unsigned AddTokens = 0;
- for (;;) {
- SMLoc LastTokenLoc;
+ // gas accepts arguments separated by whitespace, except on Darwin
+ if (!IsDarwin)
+ Lexer.setSkipSpace(false);
- if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
+ for (;;) {
+ if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal)) {
+ Lexer.setSkipSpace(true);
return TokError("unexpected token in macro instantiation");
+ }
+
+ if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) {
+ // Spaces and commas cannot be mixed to delimit parameters
+ if (ArgumentDelimiter == AsmToken::Eof)
+ ArgumentDelimiter = AsmToken::Comma;
+ else if (ArgumentDelimiter != AsmToken::Comma) {
+ Lexer.setSkipSpace(true);
+ return TokError("expected ' ' for macro argument separator");
+ }
+ break;
+ }
+
+ if (Lexer.is(AsmToken::Space)) {
+ Lex(); // Eat spaces
+
+ // Spaces can delimit parameters, but could also be part of an expression.
+ // If the token after a space is an operator, add the token and the next
+ // one into this argument
+ if (ArgumentDelimiter == AsmToken::Space ||
+ ArgumentDelimiter == AsmToken::Eof) {
+ if (IsOperator(Lexer.getKind())) {
+ // Check to see whether the token is used as an operator,
+ // or part of an identifier
+ const char *NextChar = getTok().getEndLoc().getPointer() + 1;
+ if (*NextChar == ' ')
+ AddTokens = 2;
+ }
+
+ if (!AddTokens && ParenLevel == 0) {
+ if (ArgumentDelimiter == AsmToken::Eof &&
+ !IsOperator(Lexer.getKind()))
+ ArgumentDelimiter = AsmToken::Space;
+ break;
+ }
+ }
+ }
// HandleMacroEntry relies on not advancing the lexer here
// to be able to fill in the remaining default parameter values
if (Lexer.is(AsmToken::EndOfStatement))
break;
- if (ParenLevel == 0 && Lexer.is(AsmToken::Comma))
- break;
// Adjust the current parentheses level.
if (Lexer.is(AsmToken::LParen))
@@ -1578,16 +1742,23 @@ bool AsmParser::ParseMacroArgument(MacroArgument &MA) {
// Append the token to the current argument list.
MA.push_back(getTok());
+ if (AddTokens)
+ AddTokens--;
Lex();
}
+
+ Lexer.setSkipSpace(true);
if (ParenLevel != 0)
- return TokError("unbalanced parenthesises in macro argument");
+ return TokError("unbalanced parentheses in macro argument");
return false;
}
// Parse the macro instantiation arguments.
bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) {
const unsigned NParameters = M ? M->Parameters.size() : 0;
+ // Argument delimiter is initially unknown. It will be set by
+ // ParseMacroArgument()
+ AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
// Parse two kinds of macro invocations:
// - macros defined without any parameters accept an arbitrary number of them
@@ -1596,13 +1767,30 @@ bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) {
++Parameter) {
MacroArgument MA;
- if (ParseMacroArgument(MA))
+ if (ParseMacroArgument(MA, ArgumentDelimiter))
return true;
- A.push_back(MA);
+ if (!MA.empty() || !NParameters)
+ A.push_back(MA);
+ else if (NParameters) {
+ if (!M->Parameters[Parameter].second.empty())
+ A.push_back(M->Parameters[Parameter].second);
+ }
- if (Lexer.is(AsmToken::EndOfStatement))
+ // At the end of the statement, fill in remaining arguments that have
+ // default values. If there aren't any, then the next argument is
+ // required but missing
+ if (Lexer.is(AsmToken::EndOfStatement)) {
+ if (NParameters && Parameter < NParameters - 1) {
+ if (M->Parameters[Parameter + 1].second.empty())
+ return TokError("macro argument '" +
+ Twine(M->Parameters[Parameter + 1].first) +
+ "' is missing");
+ else
+ continue;
+ }
return false;
+ }
if (Lexer.is(AsmToken::Comma))
Lex();
@@ -1691,7 +1879,8 @@ static bool IsUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
llvm_unreachable("Unknown expr kind!");
}
-bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) {
+bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
+ bool NoDeadStrip) {
// FIXME: Use better location, we should use proper tokens.
SMLoc EqualLoc = Lexer.getLoc();
@@ -1746,6 +1935,9 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) {
// Do the assignment.
Out.EmitAssignment(Sym, Value);
+ if (NoDeadStrip)
+ Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
+
return false;
}
@@ -1803,7 +1995,7 @@ bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
return TokError("unexpected token in '" + Twine(IDVal) + "'");
Lex();
- return ParseAssignment(Name, allow_redef);
+ return ParseAssignment(Name, allow_redef, true);
}
bool AsmParser::ParseEscapedString(std::string &Data) {
@@ -2274,8 +2466,13 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
if (ParseAbsoluteExpression(Pow2Alignment))
return true;
+ LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
+ if (IsLocal && LCOMM == LCOMM::NoAlignment)
+ return Error(Pow2AlignmentLoc, "alignment not supported on this target");
+
// If this target takes alignments in bytes (not log) validate and convert.
- if (Lexer.getMAI().getAlignmentIsInBytes()) {
+ if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
+ (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
if (!isPowerOf2_64(Pow2Alignment))
return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
Pow2Alignment = Log2_64(Pow2Alignment);
@@ -2303,13 +2500,9 @@ bool AsmParser::ParseDirectiveComm(bool IsLocal) {
if (!Sym->isUndefined())
return Error(IDLoc, "invalid symbol redefinition");
- // '.lcomm' is equivalent to '.zerofill'.
// Create the Symbol as a common or local common with Size and Pow2Alignment
if (IsLocal) {
- getStreamer().EmitZerofill(Ctx.getMachOSection(
- "__DATA", "__bss", MCSectionMachO::S_ZEROFILL,
- 0, SectionKind::getBSS()),
- Sym, Size, 1 << Pow2Alignment);
+ getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
return false;
}
@@ -3073,25 +3266,33 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
SMLoc DirectiveLoc) {
StringRef Name;
if (getParser().ParseIdentifier(Name))
- return TokError("expected identifier in directive");
+ return TokError("expected identifier in '.macro' directive");
MacroParameters Parameters;
+ // Argument delimiter is initially unknown. It will be set by
+ // ParseMacroArgument()
+ AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
- for(;;) {
- StringRef Parameter;
- if (getParser().ParseIdentifier(Parameter))
- return TokError("expected identifier in directive");
+ for (;;) {
+ MacroParameter Parameter;
+ if (getParser().ParseIdentifier(Parameter.first))
+ return TokError("expected identifier in '.macro' directive");
+
+ if (getLexer().is(AsmToken::Equal)) {
+ Lex();
+ if (getParser().ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+ return true;
+ }
+
Parameters.push_back(Parameter);
- if (getLexer().isNot(AsmToken::Comma))
+ if (getLexer().is(AsmToken::Comma))
+ Lex();
+ else if (getLexer().is(AsmToken::EndOfStatement))
break;
- Lex();
}
}
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.macro' directive");
-
// Eat the end of statement.
Lex();
@@ -3296,7 +3497,7 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
MacroParameters Parameters;
MacroParameter Parameter;
- if (ParseIdentifier(Parameter))
+ if (ParseIdentifier(Parameter.first))
return TokError("expected identifier in '.irp' directive");
Parameters.push_back(Parameter);
@@ -3323,9 +3524,8 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
SmallString<256> Buf;
raw_svector_ostream OS(Buf);
- for (std::vector<MacroArgument>::iterator i = A.begin(), e = A.end(); i != e;
- ++i) {
- std::vector<MacroArgument> Args;
+ for (MacroArguments::iterator i = A.begin(), e = A.end(); i != e; ++i) {
+ MacroArguments Args;
Args.push_back(*i);
if (expandMacro(OS, M->Body, Parameters, Args, getTok().getLoc()))
@@ -3343,7 +3543,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
MacroParameters Parameters;
MacroParameter Parameter;
- if (ParseIdentifier(Parameter))
+ if (ParseIdentifier(Parameter.first))
return TokError("expected identifier in '.irpc' directive");
Parameters.push_back(Parameter);
@@ -3393,7 +3593,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
if (ActiveMacros.empty())
- return TokError("unexpected '.endr' directive, no current .rept");
+ return TokError("unmatched '.endr' directive");
  // The only .rept blocks that should get here are the ones created by
// InstantiateMacroLikeBody.
@@ -3403,6 +3603,214 @@ bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
return false;
}
+bool AsmParser::ParseDirectiveEmit(SMLoc IDLoc, ParseStatementInfo &Info) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = getLexer().getLoc();
+ if (ParseExpression(Value))
+ return true;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(ExprLoc, "unexpected expression in _emit");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUIntN(8, IntValue) && !isIntN(8, IntValue))
+ return Error(ExprLoc, "literal value out of range for directive");
+
+ Info.AsmRewrites->push_back(AsmRewrite(AOK_Emit, IDLoc, 5));
+ return false;
+}
+
+bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+ unsigned &NumOutputs, unsigned &NumInputs,
+ SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+ SmallVectorImpl<std::string> &Constraints,
+ SmallVectorImpl<std::string> &Clobbers,
+ const MCInstrInfo *MII,
+ const MCInstPrinter *IP,
+ MCAsmParserSemaCallback &SI) {
+ SmallVector<void *, 4> InputDecls;
+ SmallVector<void *, 4> OutputDecls;
+ SmallVector<bool, 4> InputDeclsOffsetOf;
+ SmallVector<bool, 4> OutputDeclsOffsetOf;
+ SmallVector<std::string, 4> InputConstraints;
+ SmallVector<std::string, 4> OutputConstraints;
+ std::set<std::string> ClobberRegs;
+
+ SmallVector<struct AsmRewrite, 4> AsmStrRewrites;
+
+ // Prime the lexer.
+ Lex();
+
+ // While we have input, parse each statement.
+ unsigned InputIdx = 0;
+ unsigned OutputIdx = 0;
+ while (getLexer().isNot(AsmToken::Eof)) {
+ ParseStatementInfo Info(&AsmStrRewrites);
+ if (ParseStatement(Info))
+ return true;
+
+ if (Info.Opcode != ~0U) {
+ const MCInstrDesc &Desc = MII->get(Info.Opcode);
+
+ // Build the list of clobbers, outputs and inputs.
+ for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
+ MCParsedAsmOperand *Operand = Info.ParsedOperands[i];
+
+ // Immediate.
+ if (Operand->isImm()) {
+ if (Operand->needAsmRewrite())
+ AsmStrRewrites.push_back(AsmRewrite(AOK_ImmPrefix,
+ Operand->getStartLoc()));
+ continue;
+ }
+
+ // Register operand.
+ if (Operand->isReg() && !Operand->isOffsetOf()) {
+ unsigned NumDefs = Desc.getNumDefs();
+ // Clobber.
+ if (NumDefs && Operand->getMCOperandNum() < NumDefs) {
+ std::string Reg;
+ raw_string_ostream OS(Reg);
+ IP->printRegName(OS, Operand->getReg());
+ ClobberRegs.insert(StringRef(OS.str()));
+ }
+ continue;
+ }
+
+ // Expr/Input or Output.
+ unsigned Size;
+ void *OpDecl = SI.LookupInlineAsmIdentifier(Operand->getName(), AsmLoc,
+ Size);
+ if (OpDecl) {
+ bool isOutput = (i == 1) && Desc.mayStore();
+ if (!Operand->isOffsetOf() && Operand->needSizeDirective())
+ AsmStrRewrites.push_back(AsmRewrite(AOK_SizeDirective,
+ Operand->getStartLoc(),
+ /*Len*/0,
+ Operand->getMemSize()));
+ if (isOutput) {
+ std::string Constraint = "=";
+ ++InputIdx;
+ OutputDecls.push_back(OpDecl);
+ OutputDeclsOffsetOf.push_back(Operand->isOffsetOf());
+ Constraint += Operand->getConstraint().str();
+ OutputConstraints.push_back(Constraint);
+ AsmStrRewrites.push_back(AsmRewrite(AOK_Output,
+ Operand->getStartLoc(),
+ Operand->getNameLen()));
+ } else {
+ InputDecls.push_back(OpDecl);
+ InputDeclsOffsetOf.push_back(Operand->isOffsetOf());
+ InputConstraints.push_back(Operand->getConstraint().str());
+ AsmStrRewrites.push_back(AsmRewrite(AOK_Input,
+ Operand->getStartLoc(),
+ Operand->getNameLen()));
+ }
+ }
+ }
+ }
+ }
+
+ // Set the number of Outputs and Inputs.
+ NumOutputs = OutputDecls.size();
+ NumInputs = InputDecls.size();
+
+ // Set the unique clobbers.
+ for (std::set<std::string>::iterator I = ClobberRegs.begin(),
+ E = ClobberRegs.end(); I != E; ++I)
+ Clobbers.push_back(*I);
+
+  // Merge the various outputs and inputs. Outputs are expected first.
+ if (NumOutputs || NumInputs) {
+ unsigned NumExprs = NumOutputs + NumInputs;
+ OpDecls.resize(NumExprs);
+ Constraints.resize(NumExprs);
+ // FIXME: Constraints are hard coded to 'm', but we need an 'r'
+ // constraint for offsetof. This needs to be cleaned up!
+ for (unsigned i = 0; i < NumOutputs; ++i) {
+ OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsOffsetOf[i]);
+ Constraints[i] = OutputDeclsOffsetOf[i] ? "=r" : OutputConstraints[i];
+ }
+ for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) {
+ OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsOffsetOf[i]);
+ Constraints[j] = InputDeclsOffsetOf[i] ? "r" : InputConstraints[i];
+ }
+ }
+
+ // Build the IR assembly string.
+ std::string AsmStringIR;
+ AsmRewriteKind PrevKind = AOK_Imm;
+ raw_string_ostream OS(AsmStringIR);
+ const char *Start = SrcMgr.getMemoryBuffer(0)->getBufferStart();
+ for (SmallVectorImpl<struct AsmRewrite>::iterator
+ I = AsmStrRewrites.begin(), E = AsmStrRewrites.end(); I != E; ++I) {
+ const char *Loc = (*I).Loc.getPointer();
+
+ AsmRewriteKind Kind = (*I).Kind;
+
+ // Emit everything up to the immediate/expression. If the previous rewrite
+ // was a size directive, then this has already been done.
+ if (PrevKind != AOK_SizeDirective)
+ OS << StringRef(Start, Loc - Start);
+ PrevKind = Kind;
+
+ // Skip the original expression.
+ if (Kind == AOK_Skip) {
+ Start = Loc + (*I).Len;
+ continue;
+ }
+
+ // Rewrite expressions in $N notation.
+ switch (Kind) {
+ default: break;
+ case AOK_Imm:
+ OS << Twine("$$");
+ OS << (*I).Val;
+ break;
+ case AOK_ImmPrefix:
+ OS << Twine("$$");
+ break;
+ case AOK_Input:
+ OS << '$';
+ OS << InputIdx++;
+ break;
+ case AOK_Output:
+ OS << '$';
+ OS << OutputIdx++;
+ break;
+ case AOK_SizeDirective:
+ switch((*I).Val) {
+ default: break;
+ case 8: OS << "byte ptr "; break;
+ case 16: OS << "word ptr "; break;
+ case 32: OS << "dword ptr "; break;
+ case 64: OS << "qword ptr "; break;
+ case 80: OS << "xword ptr "; break;
+ case 128: OS << "xmmword ptr "; break;
+ case 256: OS << "ymmword ptr "; break;
+ }
+ break;
+ case AOK_Emit:
+ OS << ".byte";
+ break;
+ case AOK_DotOperator:
+ OS << (*I).Val;
+ break;
+ }
+
+ // Skip the original expression.
+ if (Kind != AOK_SizeDirective)
+ Start = Loc + (*I).Len;
+ }
+
+ // Emit the remainder of the asm string.
+ const char *AsmEnd = SrcMgr.getMemoryBuffer(0)->getBufferEnd();
+ if (Start != AsmEnd)
+ OS << StringRef(Start, AsmEnd - Start);
+
+ AsmString = OS.str();
+ return false;
+}
+
/// \brief Create an MCAsmParser instance.
MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM,
MCContext &C, MCStreamer &Out,
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 9316bb1c1cdb..d55de1f3fbe8 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -203,7 +203,7 @@ bool ELFAsmParser::ParseDirectiveSize(StringRef, SMLoc) {
StringRef Name;
if (getParser().ParseIdentifier(Name))
return TokError("expected identifier in directive");
- MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);;
+ MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
if (getLexer().isNot(AsmToken::Comma))
return TokError("unexpected token in directive");
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 3a3ff147117e..384b341bc730 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -12,7 +12,8 @@
using namespace llvm;
-MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()), TokStart(0) {
+MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()),
+ TokStart(0), SkipSpace(true) {
}
MCAsmLexer::~MCAsmLexer() {
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 3a825f03b776..6967feef2440 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -44,5 +44,7 @@ bool MCAsmParser::ParseExpression(const MCExpr *&Res) {
}
void MCParsedAsmOperand::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
+#endif
}
diff --git a/lib/MC/MCParser/MCTargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp
index 6fb1ba4216f8..60a3a3b59a3d 100644
--- a/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -11,7 +11,7 @@
using namespace llvm;
MCTargetAsmParser::MCTargetAsmParser()
- : AvailableFeatures(0)
+ : AvailableFeatures(0), ParsingInlineAsm(false)
{
}
diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp
index 4d1aff3e427e..5c71106c9017 100644
--- a/lib/MC/MCRegisterInfo.cpp
+++ b/lib/MC/MCRegisterInfo.cpp
@@ -24,6 +24,8 @@ unsigned MCRegisterInfo::getMatchingSuperReg(unsigned Reg, unsigned SubIdx,
}
unsigned MCRegisterInfo::getSubReg(unsigned Reg, unsigned Idx) const {
+ assert(Idx && Idx < getNumSubRegIndices() &&
+ "This is not a subregister index");
// Get a pointer to the corresponding SubRegIndices list. This list has the
// name of each sub-register in the same order as MCSubRegIterator.
const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
@@ -34,6 +36,7 @@ unsigned MCRegisterInfo::getSubReg(unsigned Reg, unsigned Idx) const {
}
unsigned MCRegisterInfo::getSubRegIndex(unsigned Reg, unsigned SubReg) const {
+ assert(SubReg && SubReg < getNumRegs() && "This is not a register");
// Get a pointer to the corresponding SubRegIndices list. This list has the
// name of each sub-register in the same order as MCSubRegIterator.
const uint16_t *SRI = SubRegIndices + get(Reg).SubRegIndices;
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 0bac24dc3a73..afece0ba5519 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -561,6 +561,10 @@ void MCStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList, bool) {
abort();
}
+void MCStreamer::EmitTCEntry(const MCSymbol &S) {
+ llvm_unreachable("Unsupported method");
+}
+
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 05c83f760a2d..80a1f02ce653 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -19,11 +19,28 @@ using namespace llvm;
MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors.
+/// InitMCProcessorInfo - Set or change the CPU (optionally supplemented
+/// with feature string). Recompute feature bits and scheduling model.
+void
+MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
+ SubtargetFeatures Features(FS);
+ FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
+ ProcFeatures, NumFeatures);
+
+ if (!CPU.empty())
+ CPUSchedModel = getSchedModelForCPU(CPU);
+ else
+ CPUSchedModel = &MCSchedModel::DefaultSchedModel;
+}
+
void
MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
const SubtargetFeatureKV *PF,
const SubtargetFeatureKV *PD,
const SubtargetInfoKV *ProcSched,
+ const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA,
const InstrStage *IS,
const unsigned *OC,
const unsigned *FP,
@@ -31,26 +48,18 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
TargetTriple = TT;
ProcFeatures = PF;
ProcDesc = PD;
- ProcSchedModel = ProcSched;
+ ProcSchedModels = ProcSched;
+ WriteProcResTable = WPR;
+ WriteLatencyTable = WL;
+ ReadAdvanceTable = RA;
+
Stages = IS;
OperandCycles = OC;
ForwardingPaths = FP;
NumFeatures = NF;
NumProcs = NP;
- SubtargetFeatures Features(FS);
- FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
- ProcFeatures, NumFeatures);
-}
-
-
-/// ReInitMCSubtargetInfo - Change CPU (and optionally supplemented with
-/// feature string) and recompute feature bits.
-uint64_t MCSubtargetInfo::ReInitMCSubtargetInfo(StringRef CPU, StringRef FS) {
- SubtargetFeatures Features(FS);
- FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
- ProcFeatures, NumFeatures);
- return FeatureBits;
+ InitMCProcessorInfo(CPU, FS);
}
/// ToggleFeature - Toggle a feature and returns the re-computed feature
@@ -70,13 +79,13 @@ uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) {
}
-MCSchedModel *
+const MCSchedModel *
MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
- assert(ProcSchedModel && "Processor machine model not available!");
+ assert(ProcSchedModels && "Processor machine model not available!");
#ifndef NDEBUG
for (size_t i = 1; i < NumProcs; i++) {
- assert(strcmp(ProcSchedModel[i - 1].Key, ProcSchedModel[i].Key) < 0 &&
+ assert(strcmp(ProcSchedModels[i - 1].Key, ProcSchedModels[i].Key) < 0 &&
"Processor machine model table is not sorted");
}
#endif
@@ -85,19 +94,25 @@ MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
SubtargetInfoKV KV;
KV.Key = CPU.data();
const SubtargetInfoKV *Found =
- std::lower_bound(ProcSchedModel, ProcSchedModel+NumProcs, KV);
- if (Found == ProcSchedModel+NumProcs || StringRef(Found->Key) != CPU) {
+ std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, KV);
+ if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
errs() << "'" << CPU
<< "' is not a recognized processor for this target"
<< " (ignoring processor)\n";
return &MCSchedModel::DefaultSchedModel;
}
assert(Found->Value && "Missing processor SchedModel value");
- return (MCSchedModel *)Found->Value;
+ return (const MCSchedModel *)Found->Value;
}
InstrItineraryData
MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
- MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
+ const MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
}
+
+/// Initialize an InstrItineraryData instance.
+void MCSubtargetInfo::initInstrItins(InstrItineraryData &InstrItins) const {
+ InstrItins =
+ InstrItineraryData(CPUSchedModel, Stages, OperandCycles, ForwardingPaths);
+}
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index f7f9184f03d0..b973c57f7b81 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -26,7 +26,7 @@ static bool isAcceptableChar(char C) {
return true;
}
-/// NameNeedsQuoting - Return true if the identifier \arg Str needs quotes to be
+/// NameNeedsQuoting - Return true if the identifier \p Str needs quotes to be
/// syntactically correct.
static bool NameNeedsQuoting(StringRef Str) {
assert(!Str.empty() && "Cannot create an empty MCSymbol");
@@ -76,6 +76,8 @@ void MCSymbol::print(raw_ostream &OS) const {
OS << '"' << getName() << '"';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCSymbol::dump() const {
print(dbgs());
}
+#endif
diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp
index c6ea16ce7b4d..4393777211e8 100644
--- a/lib/MC/MCValue.cpp
+++ b/lib/MC/MCValue.cpp
@@ -31,6 +31,8 @@ void MCValue::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << " + " << getConstant();
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCValue::dump() const {
print(dbgs(), 0);
}
+#endif
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index 5820a224c527..a94b2140227f 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -68,6 +68,11 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbolData* SD,
// If this is a variable, then recursively evaluate now.
if (S.isVariable()) {
+ if (const MCConstantExpr *C =
+ dyn_cast<const MCConstantExpr>(S.getVariableValue()))
+ return C->getValue();
+
+
MCValue Target;
if (!S.getVariableValue()->EvaluateAsRelocatable(Target, Layout))
report_fatal_error("unable to evaluate offset for variable '" +
@@ -140,8 +145,8 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
/// WriteSegmentLoadCommand - Write a segment load command.
///
-/// \arg NumSections - The number of sections in this segment.
-/// \arg SectionDataSize - The total size of the sections.
+/// \param NumSections The number of sections in this segment.
+/// \param SectionDataSize The total size of the sections.
void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
uint64_t VMSize,
uint64_t SectionDataStartOffset,
@@ -315,11 +320,7 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
// Compute the symbol address.
if (Symbol.isDefined()) {
- if (Symbol.isAbsolute()) {
- Address = cast<MCConstantExpr>(Symbol.getVariableValue())->getValue();
- } else {
- Address = getSymbolAddress(&Data, Layout);
- }
+ Address = getSymbolAddress(&Data, Layout);
} else if (Data.isCommon()) {
// Common symbols are encoded with the size in the address
// field, and their alignment in the flags.
@@ -396,8 +397,7 @@ void MachObjectWriter::BindIndirectSymbols(MCAssembler &Asm) {
continue;
// Initialize the section indirect symbol base, if necessary.
- if (!IndirectSymBase.count(it->SectionData))
- IndirectSymBase[it->SectionData] = IndirectIndex;
+ IndirectSymBase.insert(std::make_pair(it->SectionData, IndirectIndex));
Asm.getOrCreateSymbolData(*it->Symbol);
}
@@ -414,8 +414,7 @@ void MachObjectWriter::BindIndirectSymbols(MCAssembler &Asm) {
continue;
// Initialize the section indirect symbol base, if necessary.
- if (!IndirectSymBase.count(it->SectionData))
- IndirectSymBase[it->SectionData] = IndirectIndex;
+ IndirectSymBase.insert(std::make_pair(it->SectionData, IndirectIndex));
// Set the symbol type to undefined lazy, but only on construction.
//
@@ -559,6 +558,26 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
}
}
+void MachObjectWriter::markAbsoluteVariableSymbols(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ for (MCAssembler::symbol_iterator i = Asm.symbol_begin(),
+ e = Asm.symbol_end();
+ i != e; ++i) {
+ MCSymbolData &SD = *i;
+ if (!SD.getSymbol().isVariable())
+ continue;
+
+    // If the variable is a symbol difference (SA - SB + C) expression,
+ // and neither symbol is external, mark the variable as absolute.
+ const MCExpr *Expr = SD.getSymbol().getVariableValue();
+ MCValue Value;
+ if (Expr->EvaluateAsRelocatable(Value, Layout)) {
+ if (Value.getSymA() && Value.getSymB())
+ const_cast<MCSymbol*>(&SD.getSymbol())->setAbsolute();
+ }
+ }
+}
+
void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
computeSectionAddresses(Asm, Layout);
@@ -566,6 +585,10 @@ void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
// Create symbol data for any indirect symbols.
BindIndirectSymbols(Asm);
+ // Mark symbol difference expressions in variables (from .set or = directives)
+ // as absolute.
+ markAbsoluteVariableSymbols(Asm, Layout);
+
// Compute symbol table information and bind symbol indices.
ComputeSymbolTable(Asm, StringTable, LocalSymbolData, ExternalSymbolData,
UndefinedSymbolData);
@@ -797,8 +820,12 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
it = Asm.data_region_begin(), ie = Asm.data_region_end();
it != ie; ++it) {
const DataRegionData *Data = &(*it);
- uint64_t Start = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start), Layout);
- uint64_t End = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End), Layout);
+ uint64_t Start =
+ getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start),
+ Layout);
+ uint64_t End =
+ getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End),
+ Layout);
DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind
<< " start: " << Start << "(" << Data->Start->getName() << ")"
<< " end: " << End << "(" << Data->End->getName() << ")"
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index 0a44e7731be3..7625abd465fa 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -119,14 +119,15 @@ void SubtargetFeatures::AddFeature(const StringRef String,
}
/// Find KV in array using binary search.
-template<typename T> const T *Find(const StringRef S, const T *A, size_t L) {
+static const SubtargetFeatureKV *Find(StringRef S, const SubtargetFeatureKV *A,
+ size_t L) {
// Make the lower bound element we're looking for
- T KV;
+ SubtargetFeatureKV KV;
KV.Key = S.data();
// Determine the end of the array
- const T *Hi = A + L;
+ const SubtargetFeatureKV *Hi = A + L;
// Binary search the array
- const T *F = std::lower_bound(A, Hi, KV);
+ const SubtargetFeatureKV *F = std::lower_bound(A, Hi, KV);
// If not found then return NULL
if (F == Hi || StringRef(F->Key) != S) return NULL;
// Return the found array item
@@ -336,30 +337,6 @@ uint64_t SubtargetFeatures::getFeatureBits(const StringRef CPU,
return Bits;
}
-/// Get scheduling itinerary of a CPU.
-void *SubtargetFeatures::getItinerary(const StringRef CPU,
- const SubtargetInfoKV *Table,
- size_t TableSize) {
- assert(Table && "missing table");
-#ifndef NDEBUG
- for (size_t i = 1; i < TableSize; i++) {
- assert(strcmp(Table[i - 1].Key, Table[i].Key) < 0 && "Table is not sorted");
- }
-#endif
-
- // Find entry
- const SubtargetInfoKV *Entry = Find(CPU, Table, TableSize);
-
- if (Entry) {
- return Entry->Value;
- } else {
- errs() << "'" << CPU
- << "' is not a recognized processor for this target"
- << " (ignoring processor)\n";
- return NULL;
- }
-}
-
/// print - Print feature string.
///
void SubtargetFeatures::print(raw_ostream &OS) const {
@@ -368,11 +345,13 @@ void SubtargetFeatures::print(raw_ostream &OS) const {
OS << "\n";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump feature info.
///
void SubtargetFeatures::dump() const {
print(dbgs());
}
+#endif
/// getDefaultSubtargetFeatures - Return a string listing the features
/// associated with the target triple.
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index b026277ac624..702eec04ef1b 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -70,11 +70,6 @@ public:
uint64_t Size,unsigned ByteAlignment);
virtual void EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment);
- virtual void EmitBytes(StringRef Data, unsigned AddrSpace);
- virtual void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
- unsigned ValueSize, unsigned MaxBytesToEmit);
- virtual void EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit);
virtual void EmitFileDirective(StringRef Filename);
virtual void EmitInstruction(const MCInst &Instruction);
virtual void EmitWin64EHHandlerData();
@@ -333,43 +328,6 @@ void WinCOFFStreamer::EmitTBSSSymbol(const MCSection *Section, MCSymbol *Symbol,
llvm_unreachable("not implemented");
}
-void WinCOFFStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) {
- // TODO: This is copied exactly from the MachOStreamer. Consider merging into
- // MCObjectStreamer?
- getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end());
-}
-
-void WinCOFFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
- int64_t Value,
- unsigned ValueSize,
- unsigned MaxBytesToEmit) {
- // TODO: This is copied exactly from the MachOStreamer. Consider merging into
- // MCObjectStreamer?
- if (MaxBytesToEmit == 0)
- MaxBytesToEmit = ByteAlignment;
- new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit,
- getCurrentSectionData());
-
- // Update the maximum alignment on the current section if necessary.
- if (ByteAlignment > getCurrentSectionData()->getAlignment())
- getCurrentSectionData()->setAlignment(ByteAlignment);
-}
-
-void WinCOFFStreamer::EmitCodeAlignment(unsigned ByteAlignment,
- unsigned MaxBytesToEmit) {
- // TODO: This is copied exactly from the MachOStreamer. Consider merging into
- // MCObjectStreamer?
- if (MaxBytesToEmit == 0)
- MaxBytesToEmit = ByteAlignment;
- MCAlignFragment *F = new MCAlignFragment(ByteAlignment, 0, 1, MaxBytesToEmit,
- getCurrentSectionData());
- F->setEmitNops(true);
-
- // Update the maximum alignment on the current section if necessary.
- if (ByteAlignment > getCurrentSectionData()->getAlignment())
- getCurrentSectionData()->setAlignment(ByteAlignment);
-}
-
void WinCOFFStreamer::EmitFileDirective(StringRef Filename) {
// Ignore for now, linkers don't care, and proper debug
  // info will be a much larger effort.
diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp
index 8ab54c629504..0b7ee34c09af 100644
--- a/lib/Object/COFFObjectFile.cpp
+++ b/lib/Object/COFFObjectFile.cpp
@@ -288,6 +288,11 @@ error_code COFFObjectFile::getSymbolSection(DataRefImpl Symb,
return object_error::success;
}
+error_code COFFObjectFile::getSymbolValue(DataRefImpl Symb,
+ uint64_t &Val) const {
+ report_fatal_error("getSymbolValue unimplemented in COFFObjectFile");
+}
+
error_code COFFObjectFile::getSectionNext(DataRefImpl Sec,
SectionRef &Result) const {
const coff_section *sec = toSec(Sec);
@@ -372,7 +377,14 @@ error_code COFFObjectFile::isSectionVirtual(DataRefImpl Sec,
error_code COFFObjectFile::isSectionZeroInit(DataRefImpl Sec,
bool &Result) const {
- // FIXME: Unimplemented
+ // FIXME: Unimplemented.
+ Result = false;
+ return object_error::success;
+}
+
+error_code COFFObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Result) const {
+ // FIXME: Unimplemented.
Result = false;
return object_error::success;
}
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index d229671954f8..45aeaac6b831 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -363,6 +363,10 @@ error_code MachOObjectFile::getSymbolType(DataRefImpl Symb,
return object_error::success;
}
+error_code MachOObjectFile::getSymbolValue(DataRefImpl Symb,
+ uint64_t &Val) const {
+ report_fatal_error("getSymbolValue unimplemented in MachOObjectFile");
+}
symbol_iterator MachOObjectFile::begin_symbols() const {
// DRI.d.a = segment number; DRI.d.b = symbol index.
@@ -581,14 +585,14 @@ error_code MachOObjectFile::isSectionBSS(DataRefImpl DRI,
error_code MachOObjectFile::isSectionRequiredForExecution(DataRefImpl Sec,
bool &Result) const {
- // FIXME: Unimplemented
+ // FIXME: Unimplemented.
Result = true;
return object_error::success;
}
error_code MachOObjectFile::isSectionVirtual(DataRefImpl Sec,
- bool &Result) const {
- // FIXME: Unimplemented
+ bool &Result) const {
+ // FIXME: Unimplemented.
Result = false;
return object_error::success;
}
@@ -612,6 +616,17 @@ error_code MachOObjectFile::isSectionZeroInit(DataRefImpl DRI,
return object_error::success;
}
+error_code MachOObjectFile::isSectionReadOnlyData(DataRefImpl Sec,
+ bool &Result) const {
+ // Consider using the code from isSectionText to look for __const sections.
+ // Alternately, emit S_ATTR_PURE_INSTRUCTIONS and/or S_ATTR_SOME_INSTRUCTIONS
+ // to use section attributes to distinguish code from data.
+
+ // FIXME: Unimplemented.
+ Result = false;
+ return object_error::success;
+}
+
error_code MachOObjectFile::sectionContainsSymbol(DataRefImpl Sec,
DataRefImpl Symb,
bool &Result) const {
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index ed261a4194c9..7e8b4a3d0d29 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -46,22 +46,27 @@ namespace llvm {
/* Number of bits in the significand. This includes the integer
bit. */
unsigned int precision;
-
- /* True if arithmetic is supported. */
- unsigned int arithmeticOK;
};
- const fltSemantics APFloat::IEEEhalf = { 15, -14, 11, true };
- const fltSemantics APFloat::IEEEsingle = { 127, -126, 24, true };
- const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53, true };
- const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113, true };
- const fltSemantics APFloat::x87DoubleExtended = { 16383, -16382, 64, true };
- const fltSemantics APFloat::Bogus = { 0, 0, 0, true };
-
- // The PowerPC format consists of two doubles. It does not map cleanly
- // onto the usual format above. For now only storage of constants of
- // this type is supported, no arithmetic.
- const fltSemantics APFloat::PPCDoubleDouble = { 1023, -1022, 106, false };
+ const fltSemantics APFloat::IEEEhalf = { 15, -14, 11 };
+ const fltSemantics APFloat::IEEEsingle = { 127, -126, 24 };
+ const fltSemantics APFloat::IEEEdouble = { 1023, -1022, 53 };
+ const fltSemantics APFloat::IEEEquad = { 16383, -16382, 113 };
+ const fltSemantics APFloat::x87DoubleExtended = { 16383, -16382, 64 };
+ const fltSemantics APFloat::Bogus = { 0, 0, 0 };
+
+ /* The PowerPC format consists of two doubles. It does not map cleanly
+ onto the usual format above. It is approximated using twice the
+ mantissa bits. Note that for exponents near the double minimum,
+ we no longer can represent the full 106 mantissa bits, so those
+ will be treated as denormal numbers.
+
+ FIXME: While this approximation is equivalent to what GCC uses for
+ compile-time arithmetic on PPC double-double numbers, it is not able
+ to represent all possible values held by a PPC double-double number,
+ for example: (long double) 1.0 + (long double) 0x1p-106
+ Should this be replaced by a full emulation of PPC double-double? */
+ const fltSemantics APFloat::PPCDoubleDouble = { 1023, -1022 + 53, 53 + 53 };
/* A tight upper bound on number of parts required to hold the value
pow(5, power) is
@@ -116,12 +121,6 @@ hexDigitValue(unsigned int c)
return -1U;
}
-static inline void
-assertArithmeticOK(const llvm::fltSemantics &semantics) {
- assert(semantics.arithmeticOK &&
- "Compile-time arithmetic does not support these semantics");
-}
-
/* Return the value of a decimal exponent of the form
[+-]ddddddd.
@@ -196,8 +195,10 @@ totalExponent(StringRef::iterator p, StringRef::iterator end,
assert(value < 10U && "Invalid character in exponent");
unsignedExponent = unsignedExponent * 10 + value;
- if (unsignedExponent > 32767)
+ if (unsignedExponent > 32767) {
overflow = true;
+ break;
+ }
}
if (exponentAdjustment > 32767 || exponentAdjustment < -32768)
@@ -610,8 +611,6 @@ APFloat::assign(const APFloat &rhs)
sign = rhs.sign;
category = rhs.category;
exponent = rhs.exponent;
- sign2 = rhs.sign2;
- exponent2 = rhs.exponent2;
if (category == fcNormal || category == fcNaN)
copySignificand(rhs);
}
@@ -705,16 +704,10 @@ APFloat::bitwiseIsEqual(const APFloat &rhs) const {
category != rhs.category ||
sign != rhs.sign)
return false;
- if (semantics==(const llvm::fltSemantics*)&PPCDoubleDouble &&
- sign2 != rhs.sign2)
- return false;
if (category==fcZero || category==fcInfinity)
return true;
else if (category==fcNormal && exponent!=rhs.exponent)
return false;
- else if (semantics==(const llvm::fltSemantics*)&PPCDoubleDouble &&
- exponent2!=rhs.exponent2)
- return false;
else {
int i= partCount();
const integerPart* p=significandParts();
@@ -727,9 +720,7 @@ APFloat::bitwiseIsEqual(const APFloat &rhs) const {
}
}
-APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value)
- : exponent2(0), sign2(0) {
- assertArithmeticOK(ourSemantics);
+APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value) {
initialize(&ourSemantics);
sign = 0;
zeroSignificand();
@@ -738,24 +729,19 @@ APFloat::APFloat(const fltSemantics &ourSemantics, integerPart value)
normalize(rmNearestTiesToEven, lfExactlyZero);
}
-APFloat::APFloat(const fltSemantics &ourSemantics) : exponent2(0), sign2(0) {
- assertArithmeticOK(ourSemantics);
+APFloat::APFloat(const fltSemantics &ourSemantics) {
initialize(&ourSemantics);
category = fcZero;
sign = false;
}
-APFloat::APFloat(const fltSemantics &ourSemantics, uninitializedTag tag)
- : exponent2(0), sign2(0) {
- assertArithmeticOK(ourSemantics);
+APFloat::APFloat(const fltSemantics &ourSemantics, uninitializedTag tag) {
// Allocates storage if necessary but does not initialize it.
initialize(&ourSemantics);
}
APFloat::APFloat(const fltSemantics &ourSemantics,
- fltCategory ourCategory, bool negative)
- : exponent2(0), sign2(0) {
- assertArithmeticOK(ourSemantics);
+ fltCategory ourCategory, bool negative) {
initialize(&ourSemantics);
category = ourCategory;
sign = negative;
@@ -765,14 +751,12 @@ APFloat::APFloat(const fltSemantics &ourSemantics,
makeNaN();
}
-APFloat::APFloat(const fltSemantics &ourSemantics, StringRef text)
- : exponent2(0), sign2(0) {
- assertArithmeticOK(ourSemantics);
+APFloat::APFloat(const fltSemantics &ourSemantics, StringRef text) {
initialize(&ourSemantics);
convertFromString(text, rmNearestTiesToEven);
}
-APFloat::APFloat(const APFloat &rhs) : exponent2(0), sign2(0) {
+APFloat::APFloat(const APFloat &rhs) {
initialize(rhs.semantics);
assign(rhs);
}
@@ -1559,8 +1543,6 @@ APFloat::addOrSubtract(const APFloat &rhs, roundingMode rounding_mode,
{
opStatus fs;
- assertArithmeticOK(*semantics);
-
fs = addOrSubtractSpecials(rhs, subtract);
/* This return code means it was not a simple case. */
@@ -1605,7 +1587,6 @@ APFloat::multiply(const APFloat &rhs, roundingMode rounding_mode)
{
opStatus fs;
- assertArithmeticOK(*semantics);
sign ^= rhs.sign;
fs = multiplySpecials(rhs);
@@ -1625,7 +1606,6 @@ APFloat::divide(const APFloat &rhs, roundingMode rounding_mode)
{
opStatus fs;
- assertArithmeticOK(*semantics);
sign ^= rhs.sign;
fs = divideSpecials(rhs);
@@ -1647,7 +1627,6 @@ APFloat::remainder(const APFloat &rhs)
APFloat V = *this;
unsigned int origSign = sign;
- assertArithmeticOK(*semantics);
fs = V.divide(rhs, rmNearestTiesToEven);
if (fs == opDivByZero)
return fs;
@@ -1682,7 +1661,6 @@ APFloat::opStatus
APFloat::mod(const APFloat &rhs, roundingMode rounding_mode)
{
opStatus fs;
- assertArithmeticOK(*semantics);
fs = modSpecials(rhs);
if (category == fcNormal && rhs.category == fcNormal) {
@@ -1726,8 +1704,6 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
{
opStatus fs;
- assertArithmeticOK(*semantics);
-
/* Post-multiplication sign, before addition. */
sign ^= multiplicand.sign;
@@ -1768,12 +1744,11 @@ APFloat::fusedMultiplyAdd(const APFloat &multiplicand,
/* Rounding-mode correct round to integral value. */
APFloat::opStatus APFloat::roundToIntegral(roundingMode rounding_mode) {
opStatus fs;
- assertArithmeticOK(*semantics);
// If the exponent is large enough, we know that this value is already
// integral, and the arithmetic below would potentially cause it to saturate
// to +/-Inf. Bail out early instead.
- if (exponent+1 >= (int)semanticsPrecision(*semantics))
+ if (category == fcNormal && exponent+1 >= (int)semanticsPrecision(*semantics))
return opOK;
// The algorithm here is quite simple: we add 2^(p-1), where p is the
@@ -1815,7 +1790,6 @@ APFloat::compare(const APFloat &rhs) const
{
cmpResult result;
- assertArithmeticOK(*semantics);
assert(semantics == rhs.semantics);
switch (convolve(category, rhs.category)) {
@@ -1900,8 +1874,6 @@ APFloat::convert(const fltSemantics &toSemantics,
int shift;
const fltSemantics &fromSemantics = *semantics;
- assertArithmeticOK(fromSemantics);
- assertArithmeticOK(toSemantics);
lostFraction = lfExactlyZero;
newPartCount = partCountForBits(toSemantics.precision + 1);
oldPartCount = partCount();
@@ -1986,8 +1958,6 @@ APFloat::convertToSignExtendedInteger(integerPart *parts, unsigned int width,
const integerPart *src;
unsigned int dstPartsCount, truncatedBits;
- assertArithmeticOK(*semantics);
-
*isExact = false;
/* Handle the three special cases first. */
@@ -2149,7 +2119,6 @@ APFloat::convertFromUnsignedParts(const integerPart *src,
integerPart *dst;
lostFraction lost_fraction;
- assertArithmeticOK(*semantics);
category = fcNormal;
omsb = APInt::tcMSB(src, srcCount) + 1;
dst = significandParts();
@@ -2200,7 +2169,6 @@ APFloat::convertFromSignExtendedInteger(const integerPart *src,
{
opStatus status;
- assertArithmeticOK(*semantics);
if (isSigned &&
APInt::tcExtractBit(src, srcCount * integerPartWidth - 1)) {
integerPart *copy;
@@ -2334,7 +2302,7 @@ APFloat::roundSignificandWithExponent(const integerPart *decSigParts,
roundingMode rounding_mode)
{
unsigned int parts, pow5PartCount;
- fltSemantics calcSemantics = { 32767, -32767, 0, true };
+ fltSemantics calcSemantics = { 32767, -32767, 0 };
integerPart pow5Parts[maxPowerOfFiveParts];
bool isNearest;
@@ -2526,7 +2494,6 @@ APFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode)
APFloat::opStatus
APFloat::convertFromString(StringRef str, roundingMode rounding_mode)
{
- assertArithmeticOK(*semantics);
assert(!str.empty() && "Invalid string length");
/* Handle a leading minus sign. */
@@ -2578,8 +2545,6 @@ APFloat::convertToHexString(char *dst, unsigned int hexDigits,
{
char *p;
- assertArithmeticOK(*semantics);
-
p = dst;
if (sign)
*dst++ = '-';
@@ -2788,42 +2753,46 @@ APFloat::convertPPCDoubleDoubleAPFloatToAPInt() const
assert(semantics == (const llvm::fltSemantics*)&PPCDoubleDouble);
assert(partCount()==2);
- uint64_t myexponent, mysignificand, myexponent2, mysignificand2;
-
- if (category==fcNormal) {
- myexponent = exponent + 1023; //bias
- myexponent2 = exponent2 + 1023;
- mysignificand = significandParts()[0];
- mysignificand2 = significandParts()[1];
- if (myexponent==1 && !(mysignificand & 0x10000000000000LL))
- myexponent = 0; // denormal
- if (myexponent2==1 && !(mysignificand2 & 0x10000000000000LL))
- myexponent2 = 0; // denormal
- } else if (category==fcZero) {
- myexponent = 0;
- mysignificand = 0;
- myexponent2 = 0;
- mysignificand2 = 0;
- } else if (category==fcInfinity) {
- myexponent = 0x7ff;
- myexponent2 = 0;
- mysignificand = 0;
- mysignificand2 = 0;
+ uint64_t words[2];
+ opStatus fs;
+ bool losesInfo;
+
+ // Convert number to double. To avoid spurious underflows, we re-
+ // normalize against the "double" minExponent first, and only *then*
+ // truncate the mantissa. The result of that second conversion
+ // may be inexact, but should never underflow.
+ APFloat extended(*this);
+ fltSemantics extendedSemantics = *semantics;
+ extendedSemantics.minExponent = IEEEdouble.minExponent;
+ fs = extended.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+ (void)fs;
+
+ APFloat u(extended);
+ fs = u.convert(IEEEdouble, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK || fs == opInexact);
+ (void)fs;
+ words[0] = *u.convertDoubleAPFloatToAPInt().getRawData();
+
+ // If conversion was exact or resulted in a special case, we're done;
+ // just set the second double to zero. Otherwise, re-convert back to
+ // the extended format and compute the difference. This now should
+ // convert exactly to double.
+ if (u.category == fcNormal && losesInfo) {
+ fs = u.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+ (void)fs;
+
+ APFloat v(extended);
+ v.subtract(u, rmNearestTiesToEven);
+ fs = v.convert(IEEEdouble, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+ (void)fs;
+ words[1] = *v.convertDoubleAPFloatToAPInt().getRawData();
} else {
- assert(category == fcNaN && "Unknown category");
- myexponent = 0x7ff;
- mysignificand = significandParts()[0];
- myexponent2 = exponent2;
- mysignificand2 = significandParts()[1];
+ words[1] = 0;
}
- uint64_t words[2];
- words[0] = ((uint64_t)(sign & 1) << 63) |
- ((myexponent & 0x7ff) << 52) |
- (mysignificand & 0xfffffffffffffLL);
- words[1] = ((uint64_t)(sign2 & 1) << 63) |
- ((myexponent2 & 0x7ff) << 52) |
- (mysignificand2 & 0xfffffffffffffLL);
return APInt(128, words);
}
@@ -3043,47 +3012,23 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api)
assert(api.getBitWidth()==128);
uint64_t i1 = api.getRawData()[0];
uint64_t i2 = api.getRawData()[1];
- uint64_t myexponent = (i1 >> 52) & 0x7ff;
- uint64_t mysignificand = i1 & 0xfffffffffffffLL;
- uint64_t myexponent2 = (i2 >> 52) & 0x7ff;
- uint64_t mysignificand2 = i2 & 0xfffffffffffffLL;
+ opStatus fs;
+ bool losesInfo;
- initialize(&APFloat::PPCDoubleDouble);
- assert(partCount()==2);
+ // Get the first double and convert to our format.
+ initFromDoubleAPInt(APInt(64, i1));
+ fs = convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+ (void)fs;
- sign = static_cast<unsigned int>(i1>>63);
- sign2 = static_cast<unsigned int>(i2>>63);
- if (myexponent==0 && mysignificand==0) {
- // exponent, significand meaningless
- // exponent2 and significand2 are required to be 0; we don't check
- category = fcZero;
- } else if (myexponent==0x7ff && mysignificand==0) {
- // exponent, significand meaningless
- // exponent2 and significand2 are required to be 0; we don't check
- category = fcInfinity;
- } else if (myexponent==0x7ff && mysignificand!=0) {
- // exponent meaningless. So is the whole second word, but keep it
- // for determinism.
- category = fcNaN;
- exponent2 = myexponent2;
- significandParts()[0] = mysignificand;
- significandParts()[1] = mysignificand2;
- } else {
- category = fcNormal;
- // Note there is no category2; the second word is treated as if it is
- // fcNormal, although it might be something else considered by itself.
- exponent = myexponent - 1023;
- exponent2 = myexponent2 - 1023;
- significandParts()[0] = mysignificand;
- significandParts()[1] = mysignificand2;
- if (myexponent==0) // denormal
- exponent = -1022;
- else
- significandParts()[0] |= 0x10000000000000LL; // integer bit
- if (myexponent2==0)
- exponent2 = -1022;
- else
- significandParts()[1] |= 0x10000000000000LL; // integer bit
+ // Unless we have a special case, add in second double.
+ if (category == fcNormal) {
+ APFloat v(APInt(64, i2));
+ fs = v.convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+ (void)fs;
+
+ add(v, rmNearestTiesToEven);
}
}
@@ -3309,15 +3254,15 @@ APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) {
return Val;
}
-APFloat::APFloat(const APInt& api, bool isIEEE) : exponent2(0), sign2(0) {
+APFloat::APFloat(const APInt& api, bool isIEEE) {
initFromAPInt(api, isIEEE);
}
-APFloat::APFloat(float f) : exponent2(0), sign2(0) {
+APFloat::APFloat(float f) {
initFromAPInt(APInt::floatToBits(f));
}
-APFloat::APFloat(double d) : exponent2(0), sign2(0) {
+APFloat::APFloat(double d) {
initFromAPInt(APInt::doubleToBits(d));
}
@@ -3608,11 +3553,6 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
}
bool APFloat::getExactInverse(APFloat *inv) const {
- // We can only guarantee the existence of an exact inverse for IEEE floats.
- if (semantics != &IEEEhalf && semantics != &IEEEsingle &&
- semantics != &IEEEdouble && semantics != &IEEEquad)
- return false;
-
// Special floats and denormals have no exact inverse.
if (category != fcNormal)
return false;
diff --git a/lib/Support/Atomic.cpp b/lib/Support/Atomic.cpp
index 3001f6c468aa..9559ad729570 100644
--- a/lib/Support/Atomic.cpp
+++ b/lib/Support/Atomic.cpp
@@ -21,11 +21,15 @@ using namespace llvm;
#undef MemoryFence
#endif
+#if defined(__GNUC__) || (defined(__IBMCPP__) && __IBMCPP__ >= 1210)
+#define GNU_ATOMICS
+#endif
+
void sys::MemoryFence() {
#if LLVM_HAS_ATOMICS == 0
return;
#else
-# if defined(__GNUC__)
+# if defined(GNU_ATOMICS)
__sync_synchronize();
# elif defined(_MSC_VER)
MemoryBarrier();
@@ -43,7 +47,7 @@ sys::cas_flag sys::CompareAndSwap(volatile sys::cas_flag* ptr,
if (result == old_value)
*ptr = new_value;
return result;
-#elif defined(__GNUC__)
+#elif defined(GNU_ATOMICS)
return __sync_val_compare_and_swap(ptr, old_value, new_value);
#elif defined(_MSC_VER)
return InterlockedCompareExchange(ptr, new_value, old_value);
@@ -56,7 +60,7 @@ sys::cas_flag sys::AtomicIncrement(volatile sys::cas_flag* ptr) {
#if LLVM_HAS_ATOMICS == 0
++(*ptr);
return *ptr;
-#elif defined(__GNUC__)
+#elif defined(GNU_ATOMICS)
return __sync_add_and_fetch(ptr, 1);
#elif defined(_MSC_VER)
return InterlockedIncrement(ptr);
@@ -69,7 +73,7 @@ sys::cas_flag sys::AtomicDecrement(volatile sys::cas_flag* ptr) {
#if LLVM_HAS_ATOMICS == 0
--(*ptr);
return *ptr;
-#elif defined(__GNUC__)
+#elif defined(GNU_ATOMICS)
return __sync_sub_and_fetch(ptr, 1);
#elif defined(_MSC_VER)
return InterlockedDecrement(ptr);
@@ -82,7 +86,7 @@ sys::cas_flag sys::AtomicAdd(volatile sys::cas_flag* ptr, sys::cas_flag val) {
#if LLVM_HAS_ATOMICS == 0
*ptr += val;
return *ptr;
-#elif defined(__GNUC__)
+#elif defined(GNU_ATOMICS)
return __sync_add_and_fetch(ptr, val);
#elif defined(_MSC_VER)
return InterlockedExchangeAdd(ptr, val) + val;
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 83baf60d040c..6af0f4a6c938 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -1,9 +1,3 @@
-## FIXME: This only requires RTTI because tblgen uses it. Fix that.
-set(LLVM_REQUIRES_RTTI 1)
-if( MINGW )
- set(LLVM_REQUIRES_EH 1)
-endif()
-
add_llvm_library(LLVMSupport
APFloat.cpp
APInt.cpp
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 593315d1a72f..fc4f1891d95f 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -464,7 +464,7 @@ static void ParseCStringVector(std::vector<char *> &OutputVector,
/// an environment variable (whose name is given in ENVVAR).
///
void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
- const char *Overview, bool ReadResponseFiles) {
+ const char *Overview) {
// Check args.
assert(progName && "Program name not specified");
assert(envVar && "Environment variable name missing");
@@ -483,7 +483,7 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
// and hand it off to ParseCommandLineOptions().
ParseCStringVector(newArgv, envValue);
int newArgc = static_cast<int>(newArgv.size());
- ParseCommandLineOptions(newArgc, &newArgv[0], Overview, ReadResponseFiles);
+ ParseCommandLineOptions(newArgc, &newArgv[0], Overview);
// Free all the strdup()ed strings.
for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
@@ -529,7 +529,7 @@ static void ExpandResponseFiles(unsigned argc, const char*const* argv,
}
void cl::ParseCommandLineOptions(int argc, const char * const *argv,
- const char *Overview, bool ReadResponseFiles) {
+ const char *Overview) {
// Process all registered options.
SmallVector<Option*, 4> PositionalOpts;
SmallVector<Option*, 4> SinkOpts;
@@ -541,12 +541,10 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
// Expand response files.
std::vector<char*> newArgv;
- if (ReadResponseFiles) {
- newArgv.push_back(strdup(argv[0]));
- ExpandResponseFiles(argc, argv, newArgv);
- argv = &newArgv[0];
- argc = static_cast<int>(newArgv.size());
- }
+ newArgv.push_back(strdup(argv[0]));
+ ExpandResponseFiles(argc, argv, newArgv);
+ argv = &newArgv[0];
+ argc = static_cast<int>(newArgv.size());
// Copy the program name into ProgName, making sure not to overflow it.
std::string ProgName = sys::path::filename(argv[0]);
@@ -839,12 +837,10 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv,
MoreHelp->clear();
// Free the memory allocated by ExpandResponseFiles.
- if (ReadResponseFiles) {
- // Free all the strdup()ed strings.
- for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
- i != e; ++i)
- free(*i);
- }
+ // Free all the strdup()ed strings.
+ for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end();
+ i != e; ++i)
+ free(*i);
// If we had an error processing our arguments, don't let the program execute
if (ErrorParsing) exit(1);
diff --git a/lib/Support/DAGDeltaAlgorithm.cpp b/lib/Support/DAGDeltaAlgorithm.cpp
index 1e89c6ad2ff2..34e82cf44169 100644
--- a/lib/Support/DAGDeltaAlgorithm.cpp
+++ b/lib/Support/DAGDeltaAlgorithm.cpp
@@ -122,7 +122,7 @@ private:
DDA.UpdatedSearchState(Changes, Sets, Required);
}
- /// ExecuteOneTest - Execute a single test predicate on the change set \arg S.
+ /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
bool ExecuteOneTest(const changeset_ty &S) {
// Check dependencies invariant.
DEBUG({
@@ -143,8 +143,8 @@ public:
changeset_ty Run();
- /// GetTestResult - Get the test result for the active set \arg Changes with
- /// \arg Required changes from the cache, executing the test if necessary.
+ /// GetTestResult - Get the test result for the active set \p Changes with
+ /// \p Required changes from the cache, executing the test if necessary.
///
/// \param Changes - The set of active changes being minimized, which should
/// have their pred closure included in the test.
@@ -163,11 +163,11 @@ class DeltaActiveSetHelper : public DeltaAlgorithm {
protected:
/// UpdatedSearchState - Callback used when the search state changes.
virtual void UpdatedSearchState(const changeset_ty &Changes,
- const changesetlist_ty &Sets) {
+ const changesetlist_ty &Sets) LLVM_OVERRIDE {
DDAI.UpdatedSearchState(Changes, Sets, Required);
}
- virtual bool ExecuteOneTest(const changeset_ty &S) {
+ virtual bool ExecuteOneTest(const changeset_ty &S) LLVM_OVERRIDE {
return DDAI.GetTestResult(S, Required);
}
diff --git a/lib/Support/DataExtractor.cpp b/lib/Support/DataExtractor.cpp
index dc21155a0624..3d5cce05358c 100644
--- a/lib/Support/DataExtractor.cpp
+++ b/lib/Support/DataExtractor.cpp
@@ -139,7 +139,7 @@ uint64_t DataExtractor::getULEB128(uint32_t *offset_ptr) const {
while (isValidOffset(offset)) {
byte = Data[offset++];
- result |= (byte & 0x7f) << shift;
+ result |= uint64_t(byte & 0x7f) << shift;
shift += 7;
if ((byte & 0x80) == 0)
break;
@@ -160,7 +160,7 @@ int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const {
while (isValidOffset(offset)) {
byte = Data[offset++];
- result |= (byte & 0x7f) << shift;
+ result |= uint64_t(byte & 0x7f) << shift;
shift += 7;
if ((byte & 0x80) == 0)
break;
@@ -168,7 +168,7 @@ int64_t DataExtractor::getSLEB128(uint32_t *offset_ptr) const {
// Sign bit of byte is 2nd high order bit (0x40)
if (shift < 64 && (byte & 0x40))
- result |= -(1 << shift);
+ result |= -(1ULL << shift);
*offset_ptr = offset;
return result;
diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp
index 94d14a5e36b0..3a38e2a66b43 100644
--- a/lib/Support/DataStream.cpp
+++ b/lib/Support/DataStream.cpp
@@ -58,7 +58,7 @@ public:
virtual ~DataFileStreamer() {
close(Fd);
}
- virtual size_t GetBytes(unsigned char *buf, size_t len) {
+ virtual size_t GetBytes(unsigned char *buf, size_t len) LLVM_OVERRIDE {
NumStreamFetches++;
return read(Fd, buf, len);
}
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index fb02c07e4af8..45fec361c1a6 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -160,7 +160,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
// On linux we have a weird situation. The stderr/out/in symbols are both
// macros and global variables because of standards requirements. So, we
// boldly use the EXPLICIT_SYMBOL macro without checking for a #define first.
-#if defined(__linux__)
+#if defined(__linux__) and !defined(__ANDROID__)
{
EXPLICIT_SYMBOL(stderr);
EXPLICIT_SYMBOL(stdout);
diff --git a/lib/Support/Errno.cpp b/lib/Support/Errno.cpp
index dd218f60990e..730220f47d92 100644
--- a/lib/Support/Errno.cpp
+++ b/lib/Support/Errno.cpp
@@ -13,6 +13,7 @@
#include "llvm/Support/Errno.h"
#include "llvm/Config/config.h" // Get autoconf configuration settings
+#include "llvm/Support/raw_ostream.h"
#if HAVE_STRING_H
#include <string.h>
@@ -39,7 +40,7 @@ std::string StrError(int errnum) {
const int MaxErrStrLen = 2000;
char buffer[MaxErrStrLen];
buffer[0] = '\0';
- char* str = buffer;
+ std::string str;
#ifdef HAVE_STRERROR_R
// strerror_r is thread-safe.
if (errnum)
@@ -49,21 +50,25 @@ std::string StrError(int errnum) {
str = strerror_r(errnum,buffer,MaxErrStrLen-1);
# else
strerror_r(errnum,buffer,MaxErrStrLen-1);
+ str = buffer;
# endif
#elif HAVE_DECL_STRERROR_S // "Windows Secure API"
- if (errnum)
+ if (errnum) {
strerror_s(buffer, MaxErrStrLen - 1, errnum);
+ str = buffer;
+ }
#elif defined(HAVE_STRERROR)
// Copy the thread un-safe result of strerror into
// the buffer as fast as possible to minimize impact
// of collision of strerror in multiple threads.
if (errnum)
- strncpy(buffer,strerror(errnum),MaxErrStrLen-1);
- buffer[MaxErrStrLen-1] = '\0';
+ str = strerror(errnum);
#else
// Strange that this system doesn't even have strerror
// but, oh well, just use a generic message
- sprintf(buffer, "Error #%d", errnum);
+ raw_string_ostream stream(str);
+ stream << "Error #" << errnum;
+ stream.flush();
#endif
return str;
}
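
With this change StrError() assembles its result in a std::string regardless of which strerror variant the platform provides. A caller-side sketch (the helper name below is invented for illustration):

    #include "llvm/Support/Errno.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cerrno>
    #include <cstdio>

    // Report why an fopen() failed using the wrapper instead of raw strerror().
    static void reportOpenFailure(const char *Path) {
      if (std::FILE *F = std::fopen(Path, "rb")) {
        std::fclose(F);
        return;
      }
      llvm::errs() << "cannot open '" << Path << "': "
                   << llvm::sys::StrError(errno) << "\n";
    }
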
diff --git a/lib/Support/FoldingSet.cpp b/lib/Support/FoldingSet.cpp
index c6282c6ab2ab..4d489a88e55d 100644
--- a/lib/Support/FoldingSet.cpp
+++ b/lib/Support/FoldingSet.cpp
@@ -38,6 +38,14 @@ bool FoldingSetNodeIDRef::operator==(FoldingSetNodeIDRef RHS) const {
return memcmp(Data, RHS.Data, Size*sizeof(*Data)) == 0;
}
+/// Used to compare the "ordering" of two nodes as defined by the
+/// profiled bits and their ordering defined by memcmp().
+bool FoldingSetNodeIDRef::operator<(FoldingSetNodeIDRef RHS) const {
+ if (Size != RHS.Size)
+ return Size < RHS.Size;
+ return memcmp(Data, RHS.Data, Size*sizeof(*Data)) < 0;
+}
+
//===----------------------------------------------------------------------===//
// FoldingSetNodeID Implementation
@@ -152,6 +160,16 @@ bool FoldingSetNodeID::operator==(FoldingSetNodeIDRef RHS) const {
return FoldingSetNodeIDRef(Bits.data(), Bits.size()) == RHS;
}
+/// Used to compare the "ordering" of two nodes as defined by the
+/// profiled bits and their ordering defined by memcmp().
+bool FoldingSetNodeID::operator<(const FoldingSetNodeID &RHS)const{
+ return *this < FoldingSetNodeIDRef(RHS.Bits.data(), RHS.Bits.size());
+}
+
+bool FoldingSetNodeID::operator<(FoldingSetNodeIDRef RHS) const {
+ return FoldingSetNodeIDRef(Bits.data(), Bits.size()) < RHS;
+}
+
/// Intern - Copy this node's data to a memory region allocated from the
/// given allocator and return a FoldingSetNodeIDRef describing the
/// interned data.
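
The new operator< orders node IDs by the number of profiled words first and then by memcmp over the words, giving a total order. A small sketch of what that enables (keying an ordered container on a FoldingSetNodeID), assuming copying the ID is acceptable for the use case:

    #include "llvm/ADT/FoldingSet.h"
    #include <map>

    static void indexExample() {
      llvm::FoldingSetNodeID A, B;
      A.AddInteger(1u);
      A.AddString("foo");
      B.AddInteger(2u);
      B.AddString("bar");

      // Ordered first by profiled size, then by memcmp() over the bits.
      std::map<llvm::FoldingSetNodeID, int> Index;
      Index[A] = 1;
      Index[B] = 2;
    }
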
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 9a2c39d72e3f..34e32b817b36 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -234,6 +234,8 @@ std::string sys::getHostCPUName() {
case 37: // Intel Core i7, laptop version.
case 44: // Intel Core i7 processor and Intel Xeon processor. All
// processors are manufactured using the 32 nm process.
+ case 46: // Nehalem EX
+ case 47: // Westmere EX
return "corei7";
// SandyBridge:
@@ -303,6 +305,7 @@ std::string sys::getHostCPUName() {
case 8: return "k6-2";
case 9:
case 13: return "k6-3";
+ case 10: return "geode";
default: return "pentium";
}
case 6:
@@ -500,6 +503,7 @@ std::string sys::getHostCPUName() {
.Case("0xb76", "arm1176jz-s")
.Case("0xc08", "cortex-a8")
.Case("0xc09", "cortex-a9")
+ .Case("0xc0f", "cortex-a15")
.Case("0xc20", "cortex-m0")
.Case("0xc23", "cortex-m3")
.Case("0xc24", "cortex-m4")
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 64404a1a8e77..59bfcfcd254c 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -49,7 +49,7 @@ LockFileManager::readLockFile(StringRef LockFileName) {
}
bool LockFileManager::processStillExecuting(StringRef Hostname, int PID) {
-#if LLVM_ON_UNIX
+#if LLVM_ON_UNIX && !defined(__ANDROID__)
char MyHostname[256];
MyHostname[255] = 0;
MyHostname[0] = 0;
diff --git a/lib/Support/Makefile b/lib/Support/Makefile
index d68e500ca5f4..4a2185d589e5 100644
--- a/lib/Support/Makefile
+++ b/lib/Support/Makefile
@@ -11,9 +11,6 @@ LEVEL = ../..
LIBRARYNAME = LLVMSupport
BUILD_ARCHIVE = 1
-## FIXME: This only requires RTTI because tblgen uses it. Fix that.
-REQUIRES_RTTI = 1
-
EXTRA_DIST = Unix Win32 README.txt
include $(LEVEL)/Makefile.common
diff --git a/lib/Support/Memory.cpp b/lib/Support/Memory.cpp
index 22f74944865c..12f083822fd4 100644
--- a/lib/Support/Memory.cpp
+++ b/lib/Support/Memory.cpp
@@ -16,14 +16,6 @@
#include "llvm/Support/Valgrind.h"
#include "llvm/Config/config.h"
-#if defined(__mips__)
-#include <sys/cachectl.h>
-#endif
-
-namespace llvm {
-using namespace sys;
-}
-
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Memory.inc"
@@ -31,51 +23,3 @@ using namespace sys;
#ifdef LLVM_ON_WIN32
#include "Windows/Memory.inc"
#endif
-
-extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
-
-/// InvalidateInstructionCache - Before the JIT can run a block of code
-/// that has been emitted it must invalidate the instruction cache on some
-/// platforms.
-void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr,
- size_t Len) {
-
-// icache invalidation for PPC and ARM.
-#if defined(__APPLE__)
-
-# if (defined(__POWERPC__) || defined (__ppc__) || \
- defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
- sys_icache_invalidate(const_cast<void *>(Addr), Len);
-# endif
-
-#else
-
-# if (defined(__POWERPC__) || defined (__ppc__) || \
- defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
- const size_t LineSize = 32;
-
- const intptr_t Mask = ~(LineSize - 1);
- const intptr_t StartLine = ((intptr_t) Addr) & Mask;
- const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
-
- for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
- asm volatile("dcbf 0, %0" : : "r"(Line));
- asm volatile("sync");
-
- for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
- asm volatile("icbi 0, %0" : : "r"(Line));
- asm volatile("isync");
-# elif defined(__arm__) && defined(__GNUC__)
- // FIXME: Can we safely always call this for __GNUC__ everywhere?
- const char *Start = static_cast<const char *>(Addr);
- const char *End = Start + Len;
- __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
-# elif defined(__mips__)
- const char *Start = static_cast<const char *>(Addr);
- cacheflush(const_cast<char *>(Start), Len, BCACHE);
-# endif
-
-#endif // end apple
-
- ValgrindDiscardTranslations(Addr, Len);
-}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 992f03c52058..ec373e7f997c 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -33,6 +33,9 @@
#include <unistd.h>
#else
#include <io.h>
+#ifndef S_ISFIFO
+#define S_ISFIFO(x) (0)
+#endif
#endif
#include <fcntl.h>
using namespace llvm;
@@ -81,12 +84,12 @@ public:
init(InputData.begin(), InputData.end(), RequiresNullTerminator);
}
- virtual const char *getBufferIdentifier() const {
+ virtual const char *getBufferIdentifier() const LLVM_OVERRIDE {
// The name is stored after the class itself.
return reinterpret_cast<const char*>(this + 1);
}
-
- virtual BufferKind getBufferKind() const {
+
+ virtual BufferKind getBufferKind() const LLVM_OVERRIDE {
return MemoryBuffer_Malloc;
}
};
@@ -194,13 +197,34 @@ public:
sys::Path::UnMapFilePages(reinterpret_cast<const char*>(RealStart),
RealSize);
}
-
- virtual BufferKind getBufferKind() const {
+
+ virtual BufferKind getBufferKind() const LLVM_OVERRIDE {
return MemoryBuffer_MMap;
}
};
}
+static error_code getMemoryBufferForStream(int FD,
+ StringRef BufferName,
+ OwningPtr<MemoryBuffer> &result) {
+ const ssize_t ChunkSize = 4096*4;
+ SmallString<ChunkSize> Buffer;
+ ssize_t ReadBytes;
+ // Read into Buffer until we hit EOF.
+ do {
+ Buffer.reserve(Buffer.size() + ChunkSize);
+ ReadBytes = read(FD, Buffer.end(), ChunkSize);
+ if (ReadBytes == -1) {
+ if (errno == EINTR) continue;
+ return error_code(errno, posix_category());
+ }
+ Buffer.set_size(Buffer.size() + ReadBytes);
+ } while (ReadBytes != 0);
+
+ result.reset(MemoryBuffer::getMemBufferCopy(Buffer, BufferName));
+ return error_code::success();
+}
+
error_code MemoryBuffer::getFile(StringRef Filename,
OwningPtr<MemoryBuffer> &result,
int64_t FileSize,
@@ -297,6 +321,13 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
if (fstat(FD, &FileInfo) == -1) {
return error_code(errno, posix_category());
}
+
+ // If this is a named pipe, we can't trust the size. Create the memory
+ // buffer by copying off the stream.
+ if (S_ISFIFO(FileInfo.st_mode)) {
+ return getMemoryBufferForStream(FD, Filename, result);
+ }
+
FileSize = FileInfo.st_size;
}
MapSize = FileSize;
@@ -370,20 +401,5 @@ error_code MemoryBuffer::getSTDIN(OwningPtr<MemoryBuffer> &result) {
// fallback if it fails.
sys::Program::ChangeStdinToBinary();
- const ssize_t ChunkSize = 4096*4;
- SmallString<ChunkSize> Buffer;
- ssize_t ReadBytes;
- // Read into Buffer until we hit EOF.
- do {
- Buffer.reserve(Buffer.size() + ChunkSize);
- ReadBytes = read(0, Buffer.end(), ChunkSize);
- if (ReadBytes == -1) {
- if (errno == EINTR) continue;
- return error_code(errno, posix_category());
- }
- Buffer.set_size(Buffer.size() + ReadBytes);
- } while (ReadBytes != 0);
-
- result.reset(getMemBufferCopy(Buffer, "<stdin>"));
- return error_code::success();
+ return getMemoryBufferForStream(0, "<stdin>", result);
}
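
getOpenFile() now detects FIFOs and falls back to the same read loop that getSTDIN() uses, so a pipe's meaningless st_size is never trusted. A caller-side sketch (the helper name is invented for illustration):

    #include "llvm/ADT/OwningPtr.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/MemoryBuffer.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/Support/system_error.h"

    // Reads either a named file or stdin ("-"); both paths now work when the
    // underlying descriptor is a pipe.
    static bool loadInput(llvm::StringRef Path,
                          llvm::OwningPtr<llvm::MemoryBuffer> &Buf) {
      llvm::error_code EC = (Path == "-")
                                ? llvm::MemoryBuffer::getSTDIN(Buf)
                                : llvm::MemoryBuffer::getFile(Path, Buf);
      if (EC) {
        llvm::errs() << Path << ": " << EC.message() << "\n";
        return false;
      }
      return true;
    }
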
diff --git a/lib/Support/SmallVector.cpp b/lib/Support/SmallVector.cpp
index a89f14957635..f9c0e78270c9 100644
--- a/lib/Support/SmallVector.cpp
+++ b/lib/Support/SmallVector.cpp
@@ -16,14 +16,15 @@ using namespace llvm;
/// grow_pod - This is an implementation of the grow() method which only works
/// on POD-like datatypes and is out of line to reduce code duplication.
-void SmallVectorBase::grow_pod(size_t MinSizeInBytes, size_t TSize) {
+void SmallVectorBase::grow_pod(void *FirstEl, size_t MinSizeInBytes,
+ size_t TSize) {
size_t CurSizeBytes = size_in_bytes();
size_t NewCapacityInBytes = 2 * capacity_in_bytes() + TSize; // Always grow.
if (NewCapacityInBytes < MinSizeInBytes)
NewCapacityInBytes = MinSizeInBytes;
void *NewElts;
- if (this->isSmall()) {
+ if (BeginX == FirstEl) {
NewElts = malloc(NewCapacityInBytes);
// Copy the elements over. No need to run dtors on PODs.
@@ -37,4 +38,3 @@ void SmallVectorBase::grow_pod(size_t MinSizeInBytes, size_t TSize) {
this->BeginX = NewElts;
this->CapacityX = (char*)this->BeginX + NewCapacityInBytes;
}
-
diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp
index fe3752a77ad2..59e27a263e06 100644
--- a/lib/Support/StreamableMemoryObject.cpp
+++ b/lib/Support/StreamableMemoryObject.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/StreamableMemoryObject.h"
+#include "llvm/Support/Compiler.h"
#include <cassert>
#include <cstring>
@@ -23,18 +24,23 @@ public:
assert(LastChar >= FirstChar && "Invalid start/end range");
}
- virtual uint64_t getBase() const { return 0; }
- virtual uint64_t getExtent() const { return LastChar - FirstChar; }
- virtual int readByte(uint64_t address, uint8_t* ptr) const;
+ virtual uint64_t getBase() const LLVM_OVERRIDE { return 0; }
+ virtual uint64_t getExtent() const LLVM_OVERRIDE {
+ return LastChar - FirstChar;
+ }
+ virtual int readByte(uint64_t address, uint8_t* ptr) const LLVM_OVERRIDE;
virtual int readBytes(uint64_t address,
uint64_t size,
uint8_t* buf,
- uint64_t* copied) const;
- virtual const uint8_t *getPointer(uint64_t address, uint64_t size) const;
- virtual bool isValidAddress(uint64_t address) const {
+ uint64_t* copied) const LLVM_OVERRIDE;
+ virtual const uint8_t *getPointer(uint64_t address,
+ uint64_t size) const LLVM_OVERRIDE;
+ virtual bool isValidAddress(uint64_t address) const LLVM_OVERRIDE {
return validAddress(address);
}
- virtual bool isObjectEnd(uint64_t address) const {return objectEnd(address);}
+ virtual bool isObjectEnd(uint64_t address) const LLVM_OVERRIDE {
+ return objectEnd(address);
+ }
private:
const uint8_t* const FirstChar;
@@ -49,8 +55,8 @@ private:
return static_cast<ptrdiff_t>(address) == LastChar - FirstChar;
}
- RawMemoryObject(const RawMemoryObject&); // DO NOT IMPLEMENT
- void operator=(const RawMemoryObject&); // DO NOT IMPLEMENT
+ RawMemoryObject(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+ void operator=(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
};
int RawMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
diff --git a/lib/Support/StringMap.cpp b/lib/Support/StringMap.cpp
index c2fc261df3a6..9ac1f867fdd4 100644
--- a/lib/Support/StringMap.cpp
+++ b/lib/Support/StringMap.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Compiler.h"
#include <cassert>
using namespace llvm;
@@ -69,7 +70,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
while (1) {
StringMapEntryBase *BucketItem = TheTable[BucketNo];
// If we found an empty bucket, this key isn't in the table yet, return it.
- if (BucketItem == 0) {
+ if (LLVM_LIKELY(BucketItem == 0)) {
// If we found a tombstone, we want to reuse the tombstone instead of an
// empty bucket. This reduces probing.
if (FirstTombstone != -1) {
@@ -84,7 +85,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
if (BucketItem == getTombstoneVal()) {
// Skip over tombstones. However, remember the first one we see.
if (FirstTombstone == -1) FirstTombstone = BucketNo;
- } else if (HashTable[BucketNo] == FullHashValue) {
+ } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
// If the full hash value matches, check deeply for a match. The common
// case here is that we are only looking at the buckets (for item info
// being non-null and for the full hash value) not at the items. This
@@ -123,12 +124,12 @@ int StringMapImpl::FindKey(StringRef Key) const {
while (1) {
StringMapEntryBase *BucketItem = TheTable[BucketNo];
// If we found an empty bucket, this key isn't in the table yet, return.
- if (BucketItem == 0)
+ if (LLVM_LIKELY(BucketItem == 0))
return -1;
if (BucketItem == getTombstoneVal()) {
// Ignore tombstones.
- } else if (HashTable[BucketNo] == FullHashValue) {
+ } else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
// If the full hash value matches, check deeply for a match. The common
// case here is that we are only looking at the buckets (for item info
// being non-null and for the full hash value) not at the items. This
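
LLVM_LIKELY and LLVM_UNLIKELY replace the older BUILTIN_EXPECT spelling throughout this patch. Roughly (the authoritative definitions are in llvm/Support/Compiler.h), they lower to __builtin_expect on GCC-compatible compilers and to the bare expression elsewhere:

    // Approximation of the hint macros; see llvm/Support/Compiler.h.
    #if defined(__GNUC__)
    #define LLVM_LIKELY(EXPR)   __builtin_expect((bool)(EXPR), true)
    #define LLVM_UNLIKELY(EXPR) __builtin_expect((bool)(EXPR), false)
    #else
    #define LLVM_LIKELY(EXPR)   (EXPR)
    #define LLVM_UNLIKELY(EXPR) (EXPR)
    #endif

    // Example: hint that empty buckets are the common case in a probe loop,
    // mirroring the StringMap change above.
    static int findFirstEmpty(void *const *Buckets, int NumBuckets) {
      for (int i = 0; i < NumBuckets; ++i)
        if (LLVM_LIKELY(Buckets[i] == 0))
          return i;
      return -1;
    }
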
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index 8aab4b2760e7..f8e920846259 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -350,8 +350,8 @@ bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
unsigned long long PrevResult = Result;
Result = Result*Radix+CharVal;
- // Check for overflow.
- if (Result < PrevResult)
+ // Check for overflow by shifting back and seeing if bits were lost.
+ if (Result/Radix < PrevResult)
return true;
Str = Str.substr(1);
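
The old test `Result < PrevResult` misses overflows where the wrapped product still compares greater than the previous value (for example PrevResult = 0xAAAAAAAAAAAAAAAA with Radix = 3). Dividing back catches any lost bits. A standalone sketch of the per-digit step:

    #include <stdint.h>

    // Accumulate one digit; returns true if the multiply-add wrapped around.
    static bool accumulateDigit(uint64_t &Result, unsigned Radix,
                                unsigned CharVal) {
      uint64_t PrevResult = Result;
      Result = Result * Radix + CharVal;
      return Result / Radix < PrevResult;
    }
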
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index cca549dad567..c058c05595f1 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -42,6 +42,8 @@ const char *Triple::getArchTypeName(ArchType Kind) {
case nvptx64: return "nvptx64";
case le32: return "le32";
case amdil: return "amdil";
+ case spir: return "spir";
+ case spir64: return "spir64";
}
llvm_unreachable("Invalid ArchType!");
@@ -83,6 +85,8 @@ const char *Triple::getArchTypePrefix(ArchType Kind) {
case nvptx64: return "nvptx";
case le32: return "le32";
case amdil: return "amdil";
+ case spir: return "spir";
+ case spir64: return "spir";
}
}
@@ -95,6 +99,8 @@ const char *Triple::getVendorTypeName(VendorType Kind) {
case SCEI: return "scei";
case BGP: return "bgp";
case BGQ: return "bgq";
+ case Freescale: return "fsl";
+ case IBM: return "ibm";
}
llvm_unreachable("Invalid VendorType!");
@@ -125,6 +131,7 @@ const char *Triple::getOSTypeName(OSType Kind) {
case NativeClient: return "nacl";
case CNK: return "cnk";
case Bitrig: return "bitrig";
+ case AIX: return "aix";
}
llvm_unreachable("Invalid OSType");
@@ -138,7 +145,8 @@ const char *Triple::getEnvironmentTypeName(EnvironmentType Kind) {
case GNUEABI: return "gnueabi";
case EABI: return "eabi";
case MachO: return "macho";
- case ANDROIDEABI: return "androideabi";
+ case Android: return "android";
+ case ELF: return "elf";
}
llvm_unreachable("Invalid EnvironmentType!");
@@ -170,40 +178,11 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) {
.Case("nvptx64", nvptx64)
.Case("le32", le32)
.Case("amdil", amdil)
+ .Case("spir", spir)
+ .Case("spir64", spir64)
.Default(UnknownArch);
}
-Triple::ArchType Triple::getArchTypeForDarwinArchName(StringRef Str) {
- // See arch(3) and llvm-gcc's driver-driver.c. We don't implement support for
- // archs which Darwin doesn't use.
-
- // The matching this routine does is fairly pointless, since it is neither the
- // complete architecture list, nor a reasonable subset. The problem is that
- // historically the driver driver accepts this and also ties its -march=
- // handling to the architecture name, so we need to be careful before removing
- // support for it.
-
- // This code must be kept in sync with Clang's Darwin specific argument
- // translation.
-
- return StringSwitch<ArchType>(Str)
- .Cases("ppc", "ppc601", "ppc603", "ppc604", "ppc604e", Triple::ppc)
- .Cases("ppc750", "ppc7400", "ppc7450", "ppc970", Triple::ppc)
- .Case("ppc64", Triple::ppc64)
- .Cases("i386", "i486", "i486SX", "i586", "i686", Triple::x86)
- .Cases("pentium", "pentpro", "pentIIm3", "pentIIm5", "pentium4",
- Triple::x86)
- .Case("x86_64", Triple::x86_64)
- // This is derived from the driver driver.
- .Cases("arm", "armv4t", "armv5", "armv6", Triple::arm)
- .Cases("armv7", "armv7f", "armv7k", "armv7s", "xscale", Triple::arm)
- .Case("r600", Triple::r600)
- .Case("nvptx", Triple::nvptx)
- .Case("nvptx64", Triple::nvptx64)
- .Case("amdil", Triple::amdil)
- .Default(Triple::UnknownArch);
-}
-
// Returns architecture name that is understood by the target assembler.
const char *Triple::getArchNameForAssembler() {
if (!isOSDarwin() && getVendor() != Triple::Apple)
@@ -225,6 +204,8 @@ const char *Triple::getArchNameForAssembler() {
.Case("nvptx64", "nvptx64")
.Case("le32", "le32")
.Case("amdil", "amdil")
+ .Case("spir", "spir")
+ .Case("spir64", "spir64")
.Default(NULL);
}
@@ -259,6 +240,8 @@ static Triple::ArchType parseArch(StringRef ArchName) {
.Case("nvptx64", Triple::nvptx64)
.Case("le32", Triple::le32)
.Case("amdil", Triple::amdil)
+ .Case("spir", Triple::spir)
+ .Case("spir64", Triple::spir64)
.Default(Triple::UnknownArch);
}
@@ -269,6 +252,8 @@ static Triple::VendorType parseVendor(StringRef VendorName) {
.Case("scei", Triple::SCEI)
.Case("bgp", Triple::BGP)
.Case("bgq", Triple::BGQ)
+ .Case("fsl", Triple::Freescale)
+ .Case("ibm", Triple::IBM)
.Default(Triple::UnknownVendor);
}
@@ -295,6 +280,7 @@ static Triple::OSType parseOS(StringRef OSName) {
.StartsWith("nacl", Triple::NativeClient)
.StartsWith("cnk", Triple::CNK)
.StartsWith("bitrig", Triple::Bitrig)
+ .StartsWith("aix", Triple::AIX)
.Default(Triple::UnknownOS);
}
@@ -305,7 +291,8 @@ static Triple::EnvironmentType parseEnvironment(StringRef EnvironmentName) {
.StartsWith("gnueabi", Triple::GNUEABI)
.StartsWith("gnu", Triple::GNU)
.StartsWith("macho", Triple::MachO)
- .StartsWith("androideabi", Triple::ANDROIDEABI)
+ .StartsWith("android", Triple::Android)
+ .StartsWith("elf", Triple::ELF)
.Default(Triple::UnknownEnvironment);
}
@@ -690,6 +677,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::thumb:
case llvm::Triple::x86:
case llvm::Triple::xcore:
+ case llvm::Triple::spir:
return 32;
case llvm::Triple::mips64:
@@ -698,6 +686,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) {
case llvm::Triple::ppc64:
case llvm::Triple::sparcv9:
case llvm::Triple::x86_64:
+ case llvm::Triple::spir64:
return 64;
}
llvm_unreachable("Invalid architecture value");
@@ -724,6 +713,7 @@ Triple Triple::get32BitArchVariant() const {
break;
case Triple::amdil:
+ case Triple::spir:
case Triple::arm:
case Triple::cellspu:
case Triple::hexagon:
@@ -748,6 +738,7 @@ Triple Triple::get32BitArchVariant() const {
case Triple::ppc64: T.setArch(Triple::ppc); break;
case Triple::sparcv9: T.setArch(Triple::sparc); break;
case Triple::x86_64: T.setArch(Triple::x86); break;
+ case Triple::spir64: T.setArch(Triple::spir); break;
}
return T;
}
@@ -770,6 +761,7 @@ Triple Triple::get64BitArchVariant() const {
T.setArch(UnknownArch);
break;
+ case Triple::spir64:
case Triple::mips64:
case Triple::mips64el:
case Triple::nvptx64:
@@ -785,6 +777,7 @@ Triple Triple::get64BitArchVariant() const {
case Triple::ppc: T.setArch(Triple::ppc64); break;
case Triple::sparc: T.setArch(Triple::sparcv9); break;
case Triple::x86: T.setArch(Triple::x86_64); break;
+ case Triple::spir: T.setArch(Triple::spir64); break;
}
return T;
}
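
A quick exercise of the new SPIR entries, as a sketch: parse a spir64 triple, check the pointer width, and derive the 32-bit variant.

    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/raw_ostream.h"

    int main() {
      llvm::Triple T("spir64-unknown-unknown");
      llvm::outs() << T.getArchName() << " is "
                   << (T.isArch64Bit() ? 64 : 32) << "-bit\n";   // 64-bit
      llvm::outs() << T.get32BitArchVariant().str() << "\n";     // spir-unknown-unknown
      return 0;
    }
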
diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc
index 5a57a2870636..9a8abd27f158 100644
--- a/lib/Support/Unix/Memory.inc
+++ b/lib/Support/Unix/Memory.inc
@@ -13,6 +13,7 @@
#include "Unix.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
#ifdef HAVE_SYS_MMAN_H
@@ -23,14 +24,146 @@
#include <mach/mach.h>
#endif
+#if defined(__mips__)
+# if defined(__OpenBSD__)
+# include <mips64/sysarch.h>
+# else
+# include <sys/cachectl.h>
+# endif
+#endif
+
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+
+namespace {
+
+int getPosixProtectionFlags(unsigned Flags) {
+ switch (Flags) {
+ case llvm::sys::Memory::MF_READ:
+ return PROT_READ;
+ case llvm::sys::Memory::MF_WRITE:
+ return PROT_WRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_WRITE:
+ return PROT_READ | PROT_WRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC:
+ return PROT_READ | PROT_EXEC;
+ case llvm::sys::Memory::MF_READ |
+ llvm::sys::Memory::MF_WRITE |
+ llvm::sys::Memory::MF_EXEC:
+ return PROT_READ | PROT_WRITE | PROT_EXEC;
+ case llvm::sys::Memory::MF_EXEC:
+ return PROT_EXEC;
+ default:
+ llvm_unreachable("Illegal memory protection flag specified!");
+ }
+ // Provide a default return value as required by some compilers.
+ return PROT_NONE;
+}
+
+} // namespace
+
+namespace llvm {
+namespace sys {
+
+MemoryBlock
+Memory::allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned PFlags,
+ error_code &EC) {
+ EC = error_code::success();
+ if (NumBytes == 0)
+ return MemoryBlock();
+
+ static const size_t PageSize = Process::GetPageSize();
+ const size_t NumPages = (NumBytes+PageSize-1)/PageSize;
+
+ int fd = -1;
+#ifdef NEED_DEV_ZERO_FOR_MMAP
+ static int zero_fd = open("/dev/zero", O_RDWR);
+ if (zero_fd == -1) {
+ EC = error_code(errno, system_category());
+ return MemoryBlock();
+ }
+ fd = zero_fd;
+#endif
+
+ int MMFlags = MAP_PRIVATE |
+#ifdef HAVE_MMAP_ANONYMOUS
+ MAP_ANONYMOUS
+#else
+ MAP_ANON
+#endif
+ ; // Ends statement above
+
+ int Protect = getPosixProtectionFlags(PFlags);
+
+ // Use any near hint and the page size to set a page-aligned starting address
+ uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
+ NearBlock->size() : 0;
+ if (Start && Start % PageSize)
+ Start += PageSize - Start % PageSize;
+
+ void *Addr = ::mmap(reinterpret_cast<void*>(Start), PageSize*NumPages,
+ Protect, MMFlags, fd, 0);
+ if (Addr == MAP_FAILED) {
+ if (NearBlock) //Try again without a near hint
+ return allocateMappedMemory(NumBytes, 0, PFlags, EC);
+
+ EC = error_code(errno, system_category());
+ return MemoryBlock();
+ }
+
+ MemoryBlock Result;
+ Result.Address = Addr;
+ Result.Size = NumPages*PageSize;
+
+ if (PFlags & MF_EXEC)
+ Memory::InvalidateInstructionCache(Result.Address, Result.Size);
+
+ return Result;
+}
+
+error_code
+Memory::releaseMappedMemory(MemoryBlock &M) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
+ if (0 != ::munmap(M.Address, M.Size))
+ return error_code(errno, system_category());
+
+ M.Address = 0;
+ M.Size = 0;
+
+ return error_code::success();
+}
+
+error_code
+Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
+ if (!Flags)
+ return error_code(EINVAL, generic_category());
+
+ int Protect = getPosixProtectionFlags(Flags);
+
+ int Result = ::mprotect(M.Address, M.Size, Protect);
+ if (Result != 0)
+ return error_code(errno, system_category());
+
+ if (Flags & MF_EXEC)
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
+
+ return error_code::success();
+}
+
/// AllocateRWX - Allocate a slab of memory with read/write/execute
/// permissions. This is typically used for JIT applications where we want
/// to emit code to the memory then jump to it. Getting this type of memory
/// is very OS specific.
///
-llvm::sys::MemoryBlock
-llvm::sys::Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
- std::string *ErrMsg) {
+MemoryBlock
+Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
+ std::string *ErrMsg) {
if (NumBytes == 0) return MemoryBlock();
size_t pageSize = Process::GetPageSize();
@@ -78,7 +211,7 @@ llvm::sys::Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
if (KERN_SUCCESS != kr) {
MakeErrMsg(ErrMsg, "vm_protect max RX failed");
- return sys::MemoryBlock();
+ return MemoryBlock();
}
kr = vm_protect(mach_task_self(), (vm_address_t)pa,
@@ -86,7 +219,7 @@ llvm::sys::Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
VM_PROT_READ | VM_PROT_WRITE);
if (KERN_SUCCESS != kr) {
MakeErrMsg(ErrMsg, "vm_protect RW failed");
- return sys::MemoryBlock();
+ return MemoryBlock();
}
#endif
@@ -97,17 +230,17 @@ llvm::sys::Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
return result;
}
-bool llvm::sys::Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
if (M.Address == 0 || M.Size == 0) return false;
if (0 != ::munmap(M.Address, M.Size))
return MakeErrMsg(ErrMsg, "Can't release RWX Memory");
return false;
}
-bool llvm::sys::Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
+bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
#if defined(__APPLE__) && defined(__arm__)
if (M.Address == 0 || M.Size == 0) return false;
- sys::Memory::InvalidateInstructionCache(M.Address, M.Size);
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
(vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_WRITE);
return KERN_SUCCESS == kr;
@@ -116,10 +249,10 @@ bool llvm::sys::Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
#endif
}
-bool llvm::sys::Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
+bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
#if defined(__APPLE__) && defined(__arm__)
if (M.Address == 0 || M.Size == 0) return false;
- sys::Memory::InvalidateInstructionCache(M.Address, M.Size);
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
(vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
return KERN_SUCCESS == kr;
@@ -128,7 +261,7 @@ bool llvm::sys::Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
#endif
}
-bool llvm::sys::Memory::setRangeWritable(const void *Addr, size_t Size) {
+bool Memory::setRangeWritable(const void *Addr, size_t Size) {
#if defined(__APPLE__) && defined(__arm__)
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
(vm_size_t)Size, 0,
@@ -139,7 +272,7 @@ bool llvm::sys::Memory::setRangeWritable(const void *Addr, size_t Size) {
#endif
}
-bool llvm::sys::Memory::setRangeExecutable(const void *Addr, size_t Size) {
+bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
#if defined(__APPLE__) && defined(__arm__)
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
(vm_size_t)Size, 0,
@@ -149,3 +282,52 @@ bool llvm::sys::Memory::setRangeExecutable(const void *Addr, size_t Size) {
return true;
#endif
}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void Memory::InvalidateInstructionCache(const void *Addr,
+ size_t Len) {
+
+// icache invalidation for PPC and ARM.
+#if defined(__APPLE__)
+
+# if (defined(__POWERPC__) || defined (__ppc__) || \
+ defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
+ sys_icache_invalidate(const_cast<void *>(Addr), Len);
+# endif
+
+#else
+
+# if (defined(__POWERPC__) || defined (__ppc__) || \
+ defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
+ const size_t LineSize = 32;
+
+ const intptr_t Mask = ~(LineSize - 1);
+ const intptr_t StartLine = ((intptr_t) Addr) & Mask;
+ const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
+
+ for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+ asm volatile("dcbf 0, %0" : : "r"(Line));
+ asm volatile("sync");
+
+ for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+ asm volatile("icbi 0, %0" : : "r"(Line));
+ asm volatile("isync");
+# elif defined(__arm__) && defined(__GNUC__)
+ // FIXME: Can we safely always call this for __GNUC__ everywhere?
+ const char *Start = static_cast<const char *>(Addr);
+ const char *End = Start + Len;
+ __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
+# elif defined(__mips__)
+ const char *Start = static_cast<const char *>(Addr);
+ cacheflush(const_cast<char *>(Start), Len, BCACHE);
+# endif
+
+#endif // end apple
+
+ ValgrindDiscardTranslations(Addr, Len);
+}
+
+} // namespace sys
+} // namespace llvm
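
The new allocateMappedMemory/protectMappedMemory/releaseMappedMemory interface reports failures through error_code and lets callers request specific protections rather than always getting RWX. A hedged usage sketch (the helper name is invented):

    #include "llvm/Support/Memory.h"
    #include "llvm/Support/system_error.h"
    #include <cstring>

    using namespace llvm;
    using namespace llvm::sys;

    // Allocate writable memory, copy code into it, then drop write permission
    // before executing (W^X style), releasing the block afterwards.
    static bool emitAndRun(const void *Code, size_t Len) {
      error_code EC;
      MemoryBlock MB = Memory::allocateMappedMemory(
          Len, 0, Memory::MF_READ | Memory::MF_WRITE, EC);
      if (EC)
        return false;

      std::memcpy(MB.base(), Code, Len);

      // Flipping to RX also invalidates the instruction cache, per the
      // implementation above.
      EC = Memory::protectMappedMemory(MB, Memory::MF_READ | Memory::MF_EXEC);
      if (EC)
        return false;

      // ... call into MB.base() here ...

      return Memory::releaseMappedMemory(MB) == error_code::success();
    }
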
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index 6bddbdf7bdb8..6a5ebb8cd9c7 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -261,7 +261,8 @@ Path::GetCurrentDirectory() {
}
#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \
- defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__)
+ defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \
+ defined(__linux__) || defined(__CYGWIN__)
static int
test_dir(char buf[PATH_MAX], char ret[PATH_MAX],
const char *dir, const char *bin)
@@ -337,9 +338,17 @@ Path Path::GetMainExecutable(const char *argv0, void *MainAddr) {
return Path(exe_path);
#elif defined(__linux__) || defined(__CYGWIN__)
char exe_path[MAXPATHLEN];
- ssize_t len = readlink("/proc/self/exe", exe_path, sizeof(exe_path));
- if (len >= 0)
- return Path(StringRef(exe_path, len));
+ StringRef aPath("/proc/self/exe");
+ if (sys::fs::exists(aPath)) {
+ // /proc is not always mounted under Linux (chroot for example).
+ ssize_t len = readlink(aPath.str().c_str(), exe_path, sizeof(exe_path));
+ if (len >= 0)
+ return Path(StringRef(exe_path, len));
+ } else {
+ // Fall back to the classical detection.
+ if (getprogpath(exe_path, argv0) != NULL)
+ return Path(exe_path);
+ }
#elif defined(HAVE_DLFCN_H)
// Use dladdr to get executable path if available.
Dl_info DLInfo;
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 519511685dd6..9e94068c9c36 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -121,17 +121,29 @@ static void UnregisterHandlers() {
/// NB: This must be an async signal safe function. It cannot allocate or free
/// memory, even in debug builds.
static void RemoveFilesToRemove() {
- // Note: avoid iterators in case of debug iterators that allocate or release
+ // We avoid iterators in case of debug iterators that allocate or release
// memory.
for (unsigned i = 0, e = FilesToRemove.size(); i != e; ++i) {
- // Note that we don't want to use any external code here, and we don't care
- // about errors. We're going to try as hard as we can as often as we need
- // to to make these files go away. If these aren't files, too bad.
- //
- // We do however rely on a std::string implementation for which repeated
- // calls to 'c_str()' don't allocate memory. We pre-call 'c_str()' on all
- // of these strings to try to ensure this is safe.
- unlink(FilesToRemove[i].c_str());
+ // We rely on a std::string implementation for which repeated calls to
+ // 'c_str()' don't allocate memory. We pre-call 'c_str()' on all of these
+ // strings to try to ensure this is safe.
+ const char *path = FilesToRemove[i].c_str();
+
+ // Get the status so we can determine if it's a file or directory. If we
+ // can't stat the file, ignore it.
+ struct stat buf;
+ if (stat(path, &buf) != 0)
+ continue;
+
+ // If this is not a regular file, ignore it. We want to prevent removal of
+ // special files like /dev/null, even if the compiler is being run with the
+ // super-user permissions.
+ if (!S_ISREG(buf.st_mode))
+ continue;
+
+ // Otherwise, remove the file. We ignore any errors here as there is nothing
+ // else we can do.
+ unlink(path);
}
}
@@ -243,7 +255,7 @@ void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
// On glibc systems we have the 'backtrace' function, which works nicely, but
// doesn't demangle symbols.
static void PrintStackTrace(void *) {
-#ifdef HAVE_BACKTRACE
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACES)
static void* StackTrace[256];
// Use backtrace() to output a backtrace on Linux systems with glibc.
int depth = backtrace(StackTrace,
@@ -293,7 +305,7 @@ static void PrintStackTrace(void *) {
#endif
}
-/// PrintStackTraceOnErrorSignal - When an error signal (such as SIBABRT or
+/// PrintStackTraceOnErrorSignal - When an error signal (such as SIGABRT or
/// SIGSEGV) is delivered to the process, print a stack trace and then exit.
void llvm::sys::PrintStackTraceOnErrorSignal() {
AddSignalHandler(PrintStackTrace, 0);
@@ -305,10 +317,10 @@ void llvm::sys::PrintStackTraceOnErrorSignal() {
exception_mask_t mask = EXC_MASK_CRASH;
- kern_return_t ret = task_set_exception_ports(self,
+ kern_return_t ret = task_set_exception_ports(self,
mask,
MACH_PORT_NULL,
- EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES,
+ EXCEPTION_STATE_IDENTITY | MACH_EXCEPTION_CODES,
THREAD_STATE_NONE);
(void)ret;
}
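
RemoveFilesToRemove() now stats each registered path and only unlinks regular files, so registering something like /dev/null for cleanup is harmless even when running with elevated privileges. A caller-side sketch (the helper name is invented, and it assumes the sys::Path-based RemoveFileOnSignal signature in this tree):

    #include "llvm/Support/Path.h"
    #include "llvm/Support/Signals.h"
    #include "llvm/Support/raw_ostream.h"

    // Arrange for a temporary output to be deleted if we die on a signal.
    static bool armCleanup(const char *Name) {
      std::string Err;
      if (llvm::sys::RemoveFileOnSignal(llvm::sys::Path(Name), &Err)) {
        llvm::errs() << "cannot arm cleanup for " << Name << ": " << Err << "\n";
        return false;
      }
      return true;
    }
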
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index fcc72837c456..cb80f2817c02 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -12,51 +12,163 @@
//
//===----------------------------------------------------------------------===//
-#include "Windows.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
+#include "Windows.h"
+
+namespace {
+
+DWORD getWindowsProtectionFlags(unsigned Flags) {
+ switch (Flags) {
+ // Contrary to what you might expect, the Windows page protection flags
+ // are not a bitwise combination of RWX values
+ case llvm::sys::Memory::MF_READ:
+ return PAGE_READONLY;
+ case llvm::sys::Memory::MF_WRITE:
+ // Note: PAGE_WRITE is not supported by VirtualProtect
+ return PAGE_READWRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_WRITE:
+ return PAGE_READWRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE_READ;
+ case llvm::sys::Memory::MF_READ |
+ llvm::sys::Memory::MF_WRITE |
+ llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE_READWRITE;
+ case llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE;
+ default:
+ llvm_unreachable("Illegal memory protection flag specified!");
+ }
+ // Provide a default return value as required by some compilers.
+ return PAGE_NOACCESS;
+}
+
+size_t getAllocationGranularity() {
+ SYSTEM_INFO Info;
+ ::GetSystemInfo(&Info);
+ if (Info.dwPageSize > Info.dwAllocationGranularity)
+ return Info.dwPageSize;
+ else
+ return Info.dwAllocationGranularity;
+}
+
+} // namespace
namespace llvm {
-using namespace sys;
+namespace sys {
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only Win32 specific code
//=== and must not be UNIX code
//===----------------------------------------------------------------------===//
-MemoryBlock Memory::AllocateRWX(size_t NumBytes,
- const MemoryBlock *NearBlock,
- std::string *ErrMsg) {
- if (NumBytes == 0) return MemoryBlock();
+MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned Flags,
+ error_code &EC) {
+ EC = error_code::success();
+ if (NumBytes == 0)
+ return MemoryBlock();
+
+ // While we'd be happy to allocate single pages, the Windows allocation
+ // granularity may be larger than a single page (in practice, it is 64K)
+ // so mapping less than that will create an unreachable fragment of memory.
+ static const size_t Granularity = getAllocationGranularity();
+ const size_t NumBlocks = (NumBytes+Granularity-1)/Granularity;
- static const size_t pageSize = Process::GetPageSize();
- size_t NumPages = (NumBytes+pageSize-1)/pageSize;
+ uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
+ NearBlock->size()
+ : NULL;
- PVOID start = NearBlock ? static_cast<unsigned char *>(NearBlock->base()) +
- NearBlock->size() : NULL;
+ // If the requested address is not aligned to the allocation granularity,
+ // round up to get beyond NearBlock. VirtualAlloc would have rounded down.
+ if (Start && Start % Granularity != 0)
+ Start += Granularity - Start % Granularity;
- void *pa = VirtualAlloc(start, NumPages*pageSize, MEM_RESERVE | MEM_COMMIT,
- PAGE_EXECUTE_READWRITE);
- if (pa == NULL) {
+ DWORD Protect = getWindowsProtectionFlags(Flags);
+
+ void *PA = ::VirtualAlloc(reinterpret_cast<void*>(Start),
+ NumBlocks*Granularity,
+ MEM_RESERVE | MEM_COMMIT, Protect);
+ if (PA == NULL) {
if (NearBlock) {
// Try again without the NearBlock hint
- return AllocateRWX(NumBytes, NULL, ErrMsg);
+ return allocateMappedMemory(NumBytes, NULL, Flags, EC);
}
- MakeErrMsg(ErrMsg, "Can't allocate RWX Memory: ");
+ EC = error_code(::GetLastError(), system_category());
return MemoryBlock();
}
- MemoryBlock result;
- result.Address = pa;
- result.Size = NumPages*pageSize;
- return result;
+ MemoryBlock Result;
+ Result.Address = PA;
+ Result.Size = NumBlocks*Granularity;
+ ;
+ if (Flags & MF_EXEC)
+ Memory::InvalidateInstructionCache(Result.Address, Result.Size);
+
+ return Result;
}
-bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
- if (M.Address == 0 || M.Size == 0) return false;
+error_code Memory::releaseMappedMemory(MemoryBlock &M) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
if (!VirtualFree(M.Address, 0, MEM_RELEASE))
- return MakeErrMsg(ErrMsg, "Can't release RWX Memory: ");
- return false;
+ return error_code(::GetLastError(), system_category());
+
+ M.Address = 0;
+ M.Size = 0;
+
+ return error_code::success();
+}
+
+error_code Memory::protectMappedMemory(const MemoryBlock &M,
+ unsigned Flags) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
+ DWORD Protect = getWindowsProtectionFlags(Flags);
+
+ DWORD OldFlags;
+ if (!VirtualProtect(M.Address, M.Size, Protect, &OldFlags))
+ return error_code(::GetLastError(), system_category());
+
+ if (Flags & MF_EXEC)
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
+
+ return error_code::success();
+}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void Memory::InvalidateInstructionCache(
+ const void *Addr, size_t Len) {
+ FlushInstructionCache(GetCurrentProcess(), Addr, Len);
+}
+
+
+MemoryBlock Memory::AllocateRWX(size_t NumBytes,
+ const MemoryBlock *NearBlock,
+ std::string *ErrMsg) {
+ MemoryBlock MB;
+ error_code EC;
+ MB = allocateMappedMemory(NumBytes, NearBlock,
+ MF_READ|MF_WRITE|MF_EXEC, EC);
+ if (EC != error_code::success() && ErrMsg) {
+ MakeErrMsg(ErrMsg, EC.message());
+ }
+ return MB;
+}
+
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+ error_code EC = releaseMappedMemory(M);
+ if (EC == error_code::success())
+ return false;
+ MakeErrMsg(ErrMsg, EC.message());
+ return true;
}
static DWORD getProtection(const void *addr) {
@@ -93,7 +205,7 @@ bool Memory::setRangeWritable(const void *Addr, size_t Size) {
}
DWORD oldProt;
- sys::Memory::InvalidateInstructionCache(Addr, Size);
+ Memory::InvalidateInstructionCache(Addr, Size);
return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
== TRUE;
}
@@ -112,9 +224,10 @@ bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
}
DWORD oldProt;
- sys::Memory::InvalidateInstructionCache(Addr, Size);
+ Memory::InvalidateInstructionCache(Addr, Size);
return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
== TRUE;
}
-}
+} // namespace sys
+} // namespace llvm
diff --git a/lib/Support/Windows/PathV2.inc b/lib/Support/Windows/PathV2.inc
index 696768ba9dd1..3dfac66b77ce 100644
--- a/lib/Support/Windows/PathV2.inc
+++ b/lib/Support/Windows/PathV2.inc
@@ -794,7 +794,7 @@ mapped_file_region::mapped_file_region(const Twine &path,
SmallVector<wchar_t, 128> path_utf16;
// Convert path to UTF-16.
- if (ec = UTF8ToUTF16(path.toStringRef(path_storage), path_utf16))
+ if ((ec = UTF8ToUTF16(path.toStringRef(path_storage), path_utf16)))
return;
// Get file handle for creating a file mapping.
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 7c353c89bb84..34df636a72a0 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -903,6 +903,7 @@ bool Scanner::consume(uint32_t Expected) {
void Scanner::skip(uint32_t Distance) {
Current += Distance;
Column += Distance;
+ assert(Current <= End && "Skipped past the end");
}
bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
@@ -1239,6 +1240,12 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
}
}
}
+
+ if (Current == End) {
+ setError("Expected quote at end of scalar", Current);
+ return false;
+ }
+
skip(1); // Skip ending quote.
Token T;
T.Kind = Token::TK_Scalar;
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index fa69c2d3f538..7cd53648da35 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -266,8 +266,8 @@ void raw_ostream::flush_nonempty() {
raw_ostream &raw_ostream::write(unsigned char C) {
// Group exceptional cases into a single branch.
- if (BUILTIN_EXPECT(OutBufCur >= OutBufEnd, false)) {
- if (BUILTIN_EXPECT(!OutBufStart, false)) {
+ if (LLVM_UNLIKELY(OutBufCur >= OutBufEnd)) {
+ if (LLVM_UNLIKELY(!OutBufStart)) {
if (BufferMode == Unbuffered) {
write_impl(reinterpret_cast<char*>(&C), 1);
return *this;
@@ -286,8 +286,8 @@ raw_ostream &raw_ostream::write(unsigned char C) {
raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
// Group exceptional cases into a single branch.
- if (BUILTIN_EXPECT(size_t(OutBufEnd - OutBufCur) < Size, false)) {
- if (BUILTIN_EXPECT(!OutBufStart, false)) {
+ if (LLVM_UNLIKELY(size_t(OutBufEnd - OutBufCur) < Size)) {
+ if (LLVM_UNLIKELY(!OutBufStart)) {
if (BufferMode == Unbuffered) {
write_impl(Ptr, Size);
return *this;
@@ -302,7 +302,7 @@ raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
// If the buffer is empty at this point we have a string that is larger
// than the buffer. Directly write the chunk that is a multiple of the
// preferred buffer size and put the remainder in the buffer.
- if (BUILTIN_EXPECT(OutBufCur == OutBufStart, false)) {
+ if (LLVM_UNLIKELY(OutBufCur == OutBufStart)) {
size_t BytesToWrite = Size - (Size % NumBytes);
write_impl(Ptr, BytesToWrite);
copy_to_buffer(Ptr + BytesToWrite, Size - BytesToWrite);
@@ -523,7 +523,7 @@ void raw_fd_ostream::write_impl(const char *Ptr, size_t Size) {
ssize_t ret;
// Check whether we should attempt to use atomic writes.
- if (BUILTIN_EXPECT(!UseAtomicWrites, true)) {
+ if (LLVM_LIKELY(!UseAtomicWrites)) {
ret = ::write(FD, Ptr, Size);
} else {
// Use ::writev() where available.
diff --git a/lib/Support/regexec.c b/lib/Support/regexec.c
index 007861675ba1..bd5e72d4c522 100644
--- a/lib/Support/regexec.c
+++ b/lib/Support/regexec.c
@@ -69,7 +69,7 @@
#define SETUP(v) ((v) = 0)
#define onestate long
#define INIT(o, n) ((o) = (unsigned long)1 << (n))
-#define INC(o) ((o) <<= 1)
+#define INC(o) ((o) = (unsigned long)(o) << 1)
#define ISSTATEIN(v, o) (((v) & (o)) != 0)
/* some abbreviations; note that some of these know variable names! */
/* do "if I'm here, I can also be there" etc without branches */
diff --git a/lib/Support/system_error.cpp b/lib/Support/system_error.cpp
index 56898de31520..2df223ca718a 100644
--- a/lib/Support/system_error.cpp
+++ b/lib/Support/system_error.cpp
@@ -48,8 +48,8 @@ _do_message::message(int ev) const {
class _generic_error_category : public _do_message {
public:
- virtual const char* name() const;
- virtual std::string message(int ev) const;
+ virtual const char* name() const LLVM_OVERRIDE;
+ virtual std::string message(int ev) const LLVM_OVERRIDE;
};
const char*
@@ -74,9 +74,9 @@ generic_category() {
class _system_error_category : public _do_message {
public:
- virtual const char* name() const;
- virtual std::string message(int ev) const;
- virtual error_condition default_error_condition(int ev) const;
+ virtual const char* name() const LLVM_OVERRIDE;
+ virtual std::string message(int ev) const LLVM_OVERRIDE;
+ virtual error_condition default_error_condition(int ev) const LLVM_OVERRIDE;
};
const char*
diff --git a/lib/TableGen/CMakeLists.txt b/lib/TableGen/CMakeLists.txt
index ba7bf14e5dce..935d674a3603 100644
--- a/lib/TableGen/CMakeLists.txt
+++ b/lib/TableGen/CMakeLists.txt
@@ -1,13 +1,8 @@
-## FIXME: This only requires RTTI because tblgen uses it. Fix that.
-set(LLVM_REQUIRES_RTTI 1)
-set(LLVM_REQUIRES_EH 1)
-
add_llvm_library(LLVMTableGen
Error.cpp
Main.cpp
Record.cpp
StringMatcher.cpp
- TableGenAction.cpp
TableGenBackend.cpp
TGLexer.cpp
TGParser.cpp
diff --git a/lib/TableGen/Error.cpp b/lib/TableGen/Error.cpp
index 1463b68144a8..0bb86b0686a0 100644
--- a/lib/TableGen/Error.cpp
+++ b/lib/TableGen/Error.cpp
@@ -16,12 +16,25 @@
#include "llvm/ADT/Twine.h"
#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+
namespace llvm {
SourceMgr SrcMgr;
-void PrintWarning(SMLoc WarningLoc, const Twine &Msg) {
- SrcMgr.PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg);
+static void PrintMessage(ArrayRef<SMLoc> Loc, SourceMgr::DiagKind Kind,
+ const Twine &Msg) {
+ SMLoc NullLoc;
+ if (Loc.empty())
+ Loc = NullLoc;
+ SrcMgr.PrintMessage(Loc.front(), Kind, Msg);
+ for (unsigned i = 1; i < Loc.size(); ++i)
+ SrcMgr.PrintMessage(Loc[i], SourceMgr::DK_Note,
+ "instantiated from multiclass");
+}
+
+void PrintWarning(ArrayRef<SMLoc> WarningLoc, const Twine &Msg) {
+ PrintMessage(WarningLoc, SourceMgr::DK_Warning, Msg);
}
void PrintWarning(const char *Loc, const Twine &Msg) {
@@ -32,12 +45,8 @@ void PrintWarning(const Twine &Msg) {
errs() << "warning:" << Msg << "\n";
}
-void PrintWarning(const TGError &Warning) {
- PrintWarning(Warning.getLoc(), Warning.getMessage());
-}
-
-void PrintError(SMLoc ErrorLoc, const Twine &Msg) {
- SrcMgr.PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
+void PrintError(ArrayRef<SMLoc> ErrorLoc, const Twine &Msg) {
+ PrintMessage(ErrorLoc, SourceMgr::DK_Error, Msg);
}
void PrintError(const char *Loc, const Twine &Msg) {
@@ -48,8 +57,14 @@ void PrintError(const Twine &Msg) {
errs() << "error:" << Msg << "\n";
}
-void PrintError(const TGError &Error) {
- PrintError(Error.getLoc(), Error.getMessage());
+void PrintFatalError(const std::string &Msg) {
+ PrintError(Twine(Msg));
+ std::exit(1);
+}
+
+void PrintFatalError(ArrayRef<SMLoc> ErrorLoc, const std::string &Msg) {
+ PrintError(ErrorLoc, Msg);
+ std::exit(1);
}
} // end namespace llvm
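
PrintFatalError replaces the throw-based TGError reporting, and the ArrayRef<SMLoc> overloads let a def instantiated through multiclasses point at every instantiation site. A backend-side sketch (the field name and check are made up, and it assumes Record::getLoc() already returns the multi-location form):

    #include "llvm/TableGen/Error.h"
    #include "llvm/TableGen/Record.h"

    using namespace llvm;

    // Abort TableGen with a located diagnostic instead of throwing.
    static void checkRecord(const Record &R) {
      if (R.getValue("Size") == 0)
        PrintFatalError(R.getLoc(),
                        "record '" + R.getName() + "' is missing a Size field");
    }
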
diff --git a/lib/TableGen/Main.cpp b/lib/TableGen/Main.cpp
index 7aeef563b859..d0ca756016f2 100644
--- a/lib/TableGen/Main.cpp
+++ b/lib/TableGen/Main.cpp
@@ -22,8 +22,8 @@
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/Support/system_error.h"
#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/Main.h"
#include "llvm/TableGen/Record.h"
-#include "llvm/TableGen/TableGenAction.h"
#include <algorithm>
#include <cstdio>
using namespace llvm;
@@ -47,79 +47,79 @@ namespace {
cl::value_desc("directory"), cl::Prefix);
}
+/// \brief Create a dependency file for `-d` option.
+///
+/// This functionality is really only for the benefit of the build system.
+/// It is similar to GCC's `-M*` family of options.
+static int createDependencyFile(const TGParser &Parser, const char *argv0) {
+ if (OutputFilename == "-") {
+ errs() << argv0 << ": the option -d must be used together with -o\n";
+ return 1;
+ }
+ std::string Error;
+ tool_output_file DepOut(DependFilename.c_str(), Error);
+ if (!Error.empty()) {
+ errs() << argv0 << ": error opening " << DependFilename
+ << ":" << Error << "\n";
+ return 1;
+ }
+ DepOut.os() << OutputFilename << ":";
+ const std::vector<std::string> &Dependencies = Parser.getDependencies();
+ for (std::vector<std::string>::const_iterator I = Dependencies.begin(),
+ E = Dependencies.end();
+ I != E; ++I) {
+ DepOut.os() << " " << (*I);
+ }
+ DepOut.os() << "\n";
+ DepOut.keep();
+ return 0;
+}
+
namespace llvm {
-int TableGenMain(char *argv0, TableGenAction &Action) {
+int TableGenMain(char *argv0, TableGenMainFn *MainFn) {
RecordKeeper Records;
- try {
- // Parse the input file.
- OwningPtr<MemoryBuffer> File;
- if (error_code ec =
- MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
- errs() << "Could not open input file '" << InputFilename << "': "
- << ec.message() <<"\n";
- return 1;
- }
- MemoryBuffer *F = File.take();
-
- // Tell SrcMgr about this buffer, which is what TGParser will pick up.
- SrcMgr.AddNewSourceBuffer(F, SMLoc());
-
- // Record the location of the include directory so that the lexer can find
- // it later.
- SrcMgr.setIncludeDirs(IncludeDirs);
-
- TGParser Parser(SrcMgr, Records);
-
- if (Parser.ParseFile())
- return 1;
-
- std::string Error;
- tool_output_file Out(OutputFilename.c_str(), Error);
- if (!Error.empty()) {
- errs() << argv0 << ": error opening " << OutputFilename
- << ":" << Error << "\n";
- return 1;
- }
- if (!DependFilename.empty()) {
- if (OutputFilename == "-") {
- errs() << argv0 << ": the option -d must be used together with -o\n";
- return 1;
- }
- tool_output_file DepOut(DependFilename.c_str(), Error);
- if (!Error.empty()) {
- errs() << argv0 << ": error opening " << DependFilename
- << ":" << Error << "\n";
- return 1;
- }
- DepOut.os() << OutputFilename << ":";
- const std::vector<std::string> &Dependencies = Parser.getDependencies();
- for (std::vector<std::string>::const_iterator I = Dependencies.begin(),
- E = Dependencies.end();
- I != E; ++I) {
- DepOut.os() << " " << (*I);
- }
- DepOut.os() << "\n";
- DepOut.keep();
- }
-
- if (Action(Out.os(), Records))
- return 1;
-
- // Declare success.
- Out.keep();
- return 0;
-
- } catch (const TGError &Error) {
- PrintError(Error);
- } catch (const std::string &Error) {
- PrintError(Error);
- } catch (const char *Error) {
- PrintError(Error);
- } catch (...) {
- errs() << argv0 << ": Unknown unexpected exception occurred.\n";
+ // Parse the input file.
+ OwningPtr<MemoryBuffer> File;
+ if (error_code ec =
+ MemoryBuffer::getFileOrSTDIN(InputFilename.c_str(), File)) {
+ errs() << "Could not open input file '" << InputFilename << "': "
+ << ec.message() <<"\n";
+ return 1;
+ }
+ MemoryBuffer *F = File.take();
+
+ // Tell SrcMgr about this buffer, which is what TGParser will pick up.
+ SrcMgr.AddNewSourceBuffer(F, SMLoc());
+
+ // Record the location of the include directory so that the lexer can find
+ // it later.
+ SrcMgr.setIncludeDirs(IncludeDirs);
+
+ TGParser Parser(SrcMgr, Records);
+
+ if (Parser.ParseFile())
+ return 1;
+
+ std::string Error;
+ tool_output_file Out(OutputFilename.c_str(), Error);
+ if (!Error.empty()) {
+ errs() << argv0 << ": error opening " << OutputFilename
+ << ":" << Error << "\n";
+ return 1;
}
+ if (!DependFilename.empty()) {
+ if (int Ret = createDependencyFile(Parser, argv0))
+ return Ret;
+ }
+
+ if (MainFn(Out.os(), Records))
+ return 1;
+
+ // Declare success.
+ Out.keep();
+ return 0;
return 1;
}
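
With TableGenAction gone, a tool now passes a plain function matching TableGenMainFn. A minimal sketch of a driver built on the new entry point (the backend body is made up):

    #include "llvm/Support/CommandLine.h"
    #include "llvm/Support/PrettyStackTrace.h"
    #include "llvm/Support/Signals.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/TableGen/Main.h"
    #include "llvm/TableGen/Record.h"

    using namespace llvm;

    // Matches TableGenMainFn; returning false signals success.
    static bool EmitMyBackend(raw_ostream &OS, RecordKeeper &Records) {
      OS << "// " << Records.getDefs().size() << " defs parsed\n";
      return false;
    }

    int main(int argc, char **argv) {
      sys::PrintStackTraceOnErrorSignal();
      PrettyStackTraceProgram X(argc, argv);
      cl::ParseCommandLineOptions(argc, argv);
      return TableGenMain(argv[0], &EmitMyBackend);
    }
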
diff --git a/lib/TableGen/Makefile b/lib/TableGen/Makefile
index 44724389e1d0..345db3465cfe 100644
--- a/lib/TableGen/Makefile
+++ b/lib/TableGen/Makefile
@@ -11,8 +11,4 @@ LEVEL = ../..
LIBRARYNAME = LLVMTableGen
BUILD_ARCHIVE = 1
-## FIXME: This only requires RTTI because tblgen uses it. Fix that.
-REQUIRES_RTTI = 1
-REQUIRES_EH = 1
-
include $(LEVEL)/Makefile.common
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 99fdc1f6e994..11feb435421c 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -112,7 +112,8 @@ Init *BitRecTy::convertValue(IntInit *II) {
}
Init *BitRecTy::convertValue(TypedInit *VI) {
- if (dynamic_cast<BitRecTy*>(VI->getType()))
+ RecTy *Ty = VI->getType();
+ if (isa<BitRecTy>(Ty) || isa<BitsRecTy>(Ty) || isa<IntRecTy>(Ty))
return VI; // Accept variable if it is already of bit type!
return 0;
}
@@ -178,60 +179,15 @@ Init *BitsRecTy::convertValue(BitsInit *BI) {
}
Init *BitsRecTy::convertValue(TypedInit *VI) {
- if (BitsRecTy *BRT = dynamic_cast<BitsRecTy*>(VI->getType()))
- if (BRT->Size == Size) {
- SmallVector<Init *, 16> NewBits(Size);
-
- for (unsigned i = 0; i != Size; ++i)
- NewBits[i] = VarBitInit::get(VI, i);
- return BitsInit::get(NewBits);
- }
-
- if (Size == 1 && dynamic_cast<BitRecTy*>(VI->getType()))
+ if (Size == 1 && isa<BitRecTy>(VI->getType()))
return BitsInit::get(VI);
- if (TernOpInit *Tern = dynamic_cast<TernOpInit*>(VI)) {
- if (Tern->getOpcode() == TernOpInit::IF) {
- Init *LHS = Tern->getLHS();
- Init *MHS = Tern->getMHS();
- Init *RHS = Tern->getRHS();
-
- IntInit *MHSi = dynamic_cast<IntInit*>(MHS);
- IntInit *RHSi = dynamic_cast<IntInit*>(RHS);
-
- if (MHSi && RHSi) {
- int64_t MHSVal = MHSi->getValue();
- int64_t RHSVal = RHSi->getValue();
-
- if (canFitInBitfield(MHSVal, Size) && canFitInBitfield(RHSVal, Size)) {
- SmallVector<Init *, 16> NewBits(Size);
-
- for (unsigned i = 0; i != Size; ++i)
- NewBits[i] =
- TernOpInit::get(TernOpInit::IF, LHS,
- IntInit::get((MHSVal & (1LL << i)) ? 1 : 0),
- IntInit::get((RHSVal & (1LL << i)) ? 1 : 0),
- VI->getType());
-
- return BitsInit::get(NewBits);
- }
- } else {
- BitsInit *MHSbs = dynamic_cast<BitsInit*>(MHS);
- BitsInit *RHSbs = dynamic_cast<BitsInit*>(RHS);
-
- if (MHSbs && RHSbs) {
- SmallVector<Init *, 16> NewBits(Size);
-
- for (unsigned i = 0; i != Size; ++i)
- NewBits[i] = TernOpInit::get(TernOpInit::IF, LHS,
- MHSbs->getBit(i),
- RHSbs->getBit(i),
- VI->getType());
+ if (VI->getType()->typeIsConvertibleTo(this)) {
+ SmallVector<Init *, 16> NewBits(Size);
- return BitsInit::get(NewBits);
- }
- }
- }
+ for (unsigned i = 0; i != Size; ++i)
+ NewBits[i] = VarBitInit::get(VI, i);
+ return BitsInit::get(NewBits);
}
return 0;
@@ -244,7 +200,7 @@ Init *IntRecTy::convertValue(BitInit *BI) {
Init *IntRecTy::convertValue(BitsInit *BI) {
int64_t Result = 0;
for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i)
- if (BitInit *Bit = dynamic_cast<BitInit*>(BI->getBit(i))) {
+ if (BitInit *Bit = dyn_cast<BitInit>(BI->getBit(i))) {
Result |= Bit->getValue() << i;
} else {
return 0;
@@ -285,7 +241,7 @@ Init *StringRecTy::convertValue(BinOpInit *BO) {
Init *StringRecTy::convertValue(TypedInit *TI) {
- if (dynamic_cast<StringRecTy*>(TI->getType()))
+ if (isa<StringRecTy>(TI->getType()))
return TI; // Accept variable if already of the right type!
return 0;
}
@@ -305,17 +261,15 @@ Init *ListRecTy::convertValue(ListInit *LI) {
else
return 0;
- ListRecTy *LType = dynamic_cast<ListRecTy*>(LI->getType());
- if (LType == 0) {
+ if (!isa<ListRecTy>(LI->getType()))
return 0;
- }
return ListInit::get(Elements, this);
}
Init *ListRecTy::convertValue(TypedInit *TI) {
// Ensure that TI is compatible with our class.
- if (ListRecTy *LRT = dynamic_cast<ListRecTy*>(TI->getType()))
+ if (ListRecTy *LRT = dyn_cast<ListRecTy>(TI->getType()))
if (LRT->getElementType()->typeIsConvertibleTo(getElementType()))
return TI;
return 0;
@@ -351,7 +305,7 @@ Init *DagRecTy::convertValue(BinOpInit *BO) {
}
RecordRecTy *RecordRecTy::get(Record *R) {
- return &dynamic_cast<RecordRecTy&>(*R->getDefInit()->getType());
+ return dyn_cast<RecordRecTy>(R->getDefInit()->getType());
}
std::string RecordRecTy::getAsString() const {
@@ -367,7 +321,7 @@ Init *RecordRecTy::convertValue(DefInit *DI) {
Init *RecordRecTy::convertValue(TypedInit *TI) {
// Ensure that TI is compatible with Rec.
- if (RecordRecTy *RRT = dynamic_cast<RecordRecTy*>(TI->getType()))
+ if (RecordRecTy *RRT = dyn_cast<RecordRecTy>(TI->getType()))
if (RRT->getRecord()->isSubClassOf(getRecord()) ||
RRT->getRecord() == getRecord())
return TI;
@@ -386,57 +340,53 @@ bool RecordRecTy::baseClassOf(const RecordRecTy *RHS) const {
return false;
}
-
/// resolveTypes - Find a common type that T1 and T2 convert to.
/// Return 0 if no such type exists.
///
RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
- if (!T1->typeIsConvertibleTo(T2)) {
- if (!T2->typeIsConvertibleTo(T1)) {
- // If one is a Record type, check superclasses
- RecordRecTy *RecTy1 = dynamic_cast<RecordRecTy*>(T1);
- if (RecTy1) {
- // See if T2 inherits from a type T1 also inherits from
- const std::vector<Record *> &T1SuperClasses =
- RecTy1->getRecord()->getSuperClasses();
- for(std::vector<Record *>::const_iterator i = T1SuperClasses.begin(),
- iend = T1SuperClasses.end();
- i != iend;
- ++i) {
- RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i);
- RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
- if (NewType1 != 0) {
- if (NewType1 != SuperRecTy1) {
- delete SuperRecTy1;
- }
- return NewType1;
- }
+ if (T1->typeIsConvertibleTo(T2))
+ return T2;
+ if (T2->typeIsConvertibleTo(T1))
+ return T1;
+
+ // If one is a Record type, check superclasses
+ if (RecordRecTy *RecTy1 = dyn_cast<RecordRecTy>(T1)) {
+ // See if T2 inherits from a type T1 also inherits from
+ const std::vector<Record *> &T1SuperClasses =
+ RecTy1->getRecord()->getSuperClasses();
+ for(std::vector<Record *>::const_iterator i = T1SuperClasses.begin(),
+ iend = T1SuperClasses.end();
+ i != iend;
+ ++i) {
+ RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i);
+ RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
+ if (NewType1 != 0) {
+ if (NewType1 != SuperRecTy1) {
+ delete SuperRecTy1;
}
+ return NewType1;
}
- RecordRecTy *RecTy2 = dynamic_cast<RecordRecTy*>(T2);
- if (RecTy2) {
- // See if T1 inherits from a type T2 also inherits from
- const std::vector<Record *> &T2SuperClasses =
- RecTy2->getRecord()->getSuperClasses();
- for (std::vector<Record *>::const_iterator i = T2SuperClasses.begin(),
- iend = T2SuperClasses.end();
- i != iend;
- ++i) {
- RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i);
- RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
- if (NewType2 != 0) {
- if (NewType2 != SuperRecTy2) {
- delete SuperRecTy2;
- }
- return NewType2;
- }
+ }
+ }
+ if (RecordRecTy *RecTy2 = dyn_cast<RecordRecTy>(T2)) {
+ // See if T1 inherits from a type T2 also inherits from
+ const std::vector<Record *> &T2SuperClasses =
+ RecTy2->getRecord()->getSuperClasses();
+ for (std::vector<Record *>::const_iterator i = T2SuperClasses.begin(),
+ iend = T2SuperClasses.end();
+ i != iend;
+ ++i) {
+ RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i);
+ RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
+ if (NewType2 != 0) {
+ if (NewType2 != SuperRecTy2) {
+ delete SuperRecTy2;
}
+ return NewType2;
}
- return 0;
}
- return T2;
}
- return T1;
+ return 0;
}
@@ -519,6 +469,15 @@ std::string BitsInit::getAsString() const {
return Result + " }";
}
+// Fix bit initializer to preserve the behavior that a bit reference from an
+// unset bits initializer resolves into a VarBitInit, keeping the field name
+// and bit number used in targets with a fixed instruction length.
+static Init *fixBitInit(const RecordVal *RV, Init *Before, Init *After) {
+ if (RV || After != UnsetInit::get())
+ return After;
+ return Before;
+}
+
// resolveReferences - If there are any field references that refer to fields
// that have been filled in, we can propagate the values now.
//
@@ -526,16 +485,39 @@ Init *BitsInit::resolveReferences(Record &R, const RecordVal *RV) const {
bool Changed = false;
SmallVector<Init *, 16> NewBits(getNumBits());
- for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
- Init *B;
- Init *CurBit = getBit(i);
+ Init *CachedInit = 0;
+ Init *CachedBitVar = 0;
+ bool CachedBitVarChanged = false;
+
+ for (unsigned i = 0, e = getNumBits(); i != e; ++i) {
+ Init *CurBit = Bits[i];
+ Init *CurBitVar = CurBit->getBitVar();
- do {
- B = CurBit;
- CurBit = CurBit->resolveReferences(R, RV);
- Changed |= B != CurBit;
- } while (B != CurBit);
NewBits[i] = CurBit;
+
+ if (CurBitVar == CachedBitVar) {
+ if (CachedBitVarChanged) {
+ Init *Bit = CachedInit->getBit(CurBit->getBitNum());
+ NewBits[i] = fixBitInit(RV, CurBit, Bit);
+ }
+ continue;
+ }
+ CachedBitVar = CurBitVar;
+ CachedBitVarChanged = false;
+
+ Init *B;
+ do {
+ B = CurBitVar;
+ CurBitVar = CurBitVar->resolveReferences(R, RV);
+ CachedBitVarChanged |= B != CurBitVar;
+ Changed |= B != CurBitVar;
+ } while (B != CurBitVar);
+ CachedInit = CurBitVar;
+
+ if (CachedBitVarChanged) {
+ Init *Bit = CurBitVar->getBit(CurBit->getBitNum());
+ NewBits[i] = fixBitInit(RV, CurBit, Bit);
+ }
}
if (Changed)
@@ -613,7 +595,7 @@ ListInit *ListInit::get(ArrayRef<Init *> Range, RecTy *EltTy) {
}
void ListInit::Profile(FoldingSetNodeID &ID) const {
- ListRecTy *ListType = dynamic_cast<ListRecTy *>(getType());
+ ListRecTy *ListType = dyn_cast<ListRecTy>(getType());
assert(ListType && "Bad type for ListInit!");
RecTy *EltTy = ListType->getElementType();
@@ -633,8 +615,9 @@ ListInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
Record *ListInit::getElementAsRecord(unsigned i) const {
assert(i < Values.size() && "List element index out of range!");
- DefInit *DI = dynamic_cast<DefInit*>(Values[i]);
- if (DI == 0) throw "Expected record in list!";
+ DefInit *DI = dyn_cast<DefInit>(Values[i]);
+ if (DI == 0)
+ PrintFatalError("Expected record in list!");
return DI->getDef();
}
@@ -668,7 +651,7 @@ Init *ListInit::resolveListElementReference(Record &R, const RecordVal *IRV,
// If the element is set to some value, or if we are resolving a reference
// to a specific variable and that variable is explicitly unset, then
// replace the VarListElementInit with it.
- if (IRV || !dynamic_cast<UnsetInit*>(E))
+ if (IRV || !isa<UnsetInit>(E))
return E;
return 0;
}
@@ -682,30 +665,16 @@ std::string ListInit::getAsString() const {
return Result + "]";
}
-Init *OpInit::resolveBitReference(Record &R, const RecordVal *IRV,
- unsigned Bit) const {
- Init *Folded = Fold(&R, 0);
-
- if (Folded != this) {
- TypedInit *Typed = dynamic_cast<TypedInit *>(Folded);
- if (Typed) {
- return Typed->resolveBitReference(R, IRV, Bit);
- }
- }
-
- return 0;
-}
-
Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV,
unsigned Elt) const {
Init *Resolved = resolveReferences(R, IRV);
- OpInit *OResolved = dynamic_cast<OpInit *>(Resolved);
+ OpInit *OResolved = dyn_cast<OpInit>(Resolved);
if (OResolved) {
Resolved = OResolved->Fold(&R, 0);
}
if (Resolved != this) {
- TypedInit *Typed = dynamic_cast<TypedInit *>(Resolved);
+ TypedInit *Typed = dyn_cast<TypedInit>(Resolved);
assert(Typed && "Expected typed init for list reference");
if (Typed) {
Init *New = Typed->resolveListElementReference(R, IRV, Elt);
@@ -718,6 +687,12 @@ Init *OpInit::resolveListElementReference(Record &R, const RecordVal *IRV,
return 0;
}
+Init *OpInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<OpInit*>(this);
+ return VarBitInit::get(const_cast<OpInit*>(this), Bit);
+}
+
UnOpInit *UnOpInit::get(UnaryOp opc, Init *lhs, RecTy *Type) {
typedef std::pair<std::pair<unsigned, Init *>, RecTy *> Key;
@@ -735,30 +710,23 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
switch (getOpcode()) {
case CAST: {
if (getType()->getAsString() == "string") {
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
- if (LHSs) {
+ if (StringInit *LHSs = dyn_cast<StringInit>(LHS))
return LHSs;
- }
- DefInit *LHSd = dynamic_cast<DefInit*>(LHS);
- if (LHSd) {
+ if (DefInit *LHSd = dyn_cast<DefInit>(LHS))
return StringInit::get(LHSd->getDef()->getName());
- }
- IntInit *LHSi = dynamic_cast<IntInit*>(LHS);
- if (LHSi) {
+ if (IntInit *LHSi = dyn_cast<IntInit>(LHS))
return StringInit::get(LHSi->getAsString());
- }
} else {
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
- if (LHSs) {
+ if (StringInit *LHSs = dyn_cast<StringInit>(LHS)) {
std::string Name = LHSs->getValue();
// From TGParser::ParseIDValue
if (CurRec) {
if (const RecordVal *RV = CurRec->getValue(Name)) {
if (RV->getType() != getType())
- throw "type mismatch in cast";
+ PrintFatalError("type mismatch in cast");
return VarInit::get(Name, RV->getType());
}
@@ -770,7 +738,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
assert(RV && "Template arg doesn't exist??");
if (RV->getType() != getType())
- throw "type mismatch in cast";
+ PrintFatalError("type mismatch in cast");
return VarInit::get(TemplateArgName, RV->getType());
}
@@ -784,7 +752,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
assert(RV && "Template arg doesn't exist??");
if (RV->getType() != getType())
- throw "type mismatch in cast";
+ PrintFatalError("type mismatch in cast");
return VarInit::get(MCName, RV->getType());
}
@@ -793,14 +761,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
if (Record *D = (CurRec->getRecords()).getDef(Name))
return DefInit::get(D);
- throw TGError(CurRec->getLoc(), "Undefined reference:'" + Name + "'\n");
+ PrintFatalError(CurRec->getLoc(),
+ "Undefined reference:'" + Name + "'\n");
}
}
break;
}
case HEAD: {
- ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
- if (LHSl) {
+ if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
if (LHSl->getSize() == 0) {
assert(0 && "Empty list in car");
return 0;
@@ -810,8 +778,7 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
break;
}
case TAIL: {
- ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
- if (LHSl) {
+ if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
if (LHSl->getSize() == 0) {
assert(0 && "Empty list in cdr");
return 0;
@@ -828,16 +795,14 @@ Init *UnOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
break;
}
case EMPTY: {
- ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
- if (LHSl) {
+ if (ListInit *LHSl = dyn_cast<ListInit>(LHS)) {
if (LHSl->getSize() == 0) {
return IntInit::get(1);
} else {
return IntInit::get(0);
}
}
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
- if (LHSs) {
+ if (StringInit *LHSs = dyn_cast<StringInit>(LHS)) {
if (LHSs->getValue().empty()) {
return IntInit::get(1);
} else {
@@ -891,13 +856,13 @@ BinOpInit *BinOpInit::get(BinaryOp opc, Init *lhs,
Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
switch (getOpcode()) {
case CONCAT: {
- DagInit *LHSs = dynamic_cast<DagInit*>(LHS);
- DagInit *RHSs = dynamic_cast<DagInit*>(RHS);
+ DagInit *LHSs = dyn_cast<DagInit>(LHS);
+ DagInit *RHSs = dyn_cast<DagInit>(RHS);
if (LHSs && RHSs) {
- DefInit *LOp = dynamic_cast<DefInit*>(LHSs->getOperator());
- DefInit *ROp = dynamic_cast<DefInit*>(RHSs->getOperator());
+ DefInit *LOp = dyn_cast<DefInit>(LHSs->getOperator());
+ DefInit *ROp = dyn_cast<DefInit>(RHSs->getOperator());
if (LOp == 0 || ROp == 0 || LOp->getDef() != ROp->getDef())
- throw "Concated Dag operators do not match!";
+ PrintFatalError("Concated Dag operators do not match!");
std::vector<Init*> Args;
std::vector<std::string> ArgNames;
for (unsigned i = 0, e = LHSs->getNumArgs(); i != e; ++i) {
@@ -913,8 +878,8 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
break;
}
case STRCONCAT: {
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
- StringInit *RHSs = dynamic_cast<StringInit*>(RHS);
+ StringInit *LHSs = dyn_cast<StringInit>(LHS);
+ StringInit *RHSs = dyn_cast<StringInit>(RHS);
if (LHSs && RHSs)
return StringInit::get(LHSs->getValue() + RHSs->getValue());
break;
@@ -922,16 +887,16 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
case EQ: {
// try to fold eq comparison for 'bit' and 'int', otherwise fallback
// to string objects.
- IntInit* L =
- dynamic_cast<IntInit*>(LHS->convertInitializerTo(IntRecTy::get()));
- IntInit* R =
- dynamic_cast<IntInit*>(RHS->convertInitializerTo(IntRecTy::get()));
+ IntInit *L =
+ dyn_cast_or_null<IntInit>(LHS->convertInitializerTo(IntRecTy::get()));
+ IntInit *R =
+ dyn_cast_or_null<IntInit>(RHS->convertInitializerTo(IntRecTy::get()));
if (L && R)
return IntInit::get(L->getValue() == R->getValue());
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
- StringInit *RHSs = dynamic_cast<StringInit*>(RHS);
+ StringInit *LHSs = dyn_cast<StringInit>(LHS);
+ StringInit *RHSs = dyn_cast<StringInit>(RHS);
// Make sure we've resolved
if (LHSs && RHSs)
@@ -942,8 +907,8 @@ Init *BinOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
case SHL:
case SRA:
case SRL: {
- IntInit *LHSi = dynamic_cast<IntInit*>(LHS);
- IntInit *RHSi = dynamic_cast<IntInit*>(RHS);
+ IntInit *LHSi = dyn_cast<IntInit>(LHS);
+ IntInit *RHSi = dyn_cast<IntInit>(RHS);
if (LHSi && RHSi) {
int64_t LHSv = LHSi->getValue(), RHSv = RHSi->getValue();
int64_t Result;
@@ -1016,7 +981,7 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
MultiClass *CurMultiClass) {
std::vector<Init *> NewOperands;
- TypedInit *TArg = dynamic_cast<TypedInit*>(Arg);
+ TypedInit *TArg = dyn_cast<TypedInit>(Arg);
// If this is a dag, recurse
if (TArg && TArg->getType()->getAsString() == "dag") {
@@ -1030,7 +995,7 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
}
for (int i = 0; i < RHSo->getNumOperands(); ++i) {
- OpInit *RHSoo = dynamic_cast<OpInit*>(RHSo->getOperand(i));
+ OpInit *RHSoo = dyn_cast<OpInit>(RHSo->getOperand(i));
if (RHSoo) {
Init *Result = EvaluateOperation(RHSoo, LHS, Arg,
@@ -1058,25 +1023,21 @@ static Init *EvaluateOperation(OpInit *RHSo, Init *LHS, Init *Arg,
static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
Record *CurRec, MultiClass *CurMultiClass) {
- DagInit *MHSd = dynamic_cast<DagInit*>(MHS);
- ListInit *MHSl = dynamic_cast<ListInit*>(MHS);
+ DagInit *MHSd = dyn_cast<DagInit>(MHS);
+ ListInit *MHSl = dyn_cast<ListInit>(MHS);
- DagRecTy *DagType = dynamic_cast<DagRecTy*>(Type);
- ListRecTy *ListType = dynamic_cast<ListRecTy*>(Type);
-
- OpInit *RHSo = dynamic_cast<OpInit*>(RHS);
+ OpInit *RHSo = dyn_cast<OpInit>(RHS);
if (!RHSo) {
- throw TGError(CurRec->getLoc(), "!foreach requires an operator\n");
+ PrintFatalError(CurRec->getLoc(), "!foreach requires an operator\n");
}
- TypedInit *LHSt = dynamic_cast<TypedInit*>(LHS);
+ TypedInit *LHSt = dyn_cast<TypedInit>(LHS);
- if (!LHSt) {
- throw TGError(CurRec->getLoc(), "!foreach requires typed variable\n");
- }
+ if (!LHSt)
+ PrintFatalError(CurRec->getLoc(), "!foreach requires typed variable\n");
- if ((MHSd && DagType) || (MHSl && ListType)) {
+ if ((MHSd && isa<DagRecTy>(Type)) || (MHSl && isa<ListRecTy>(Type))) {
if (MHSd) {
Init *Val = MHSd->getOperator();
Init *Result = EvaluateOperation(RHSo, LHS, Val,
@@ -1139,17 +1100,17 @@ static Init *ForeachHelper(Init *LHS, Init *MHS, Init *RHS, RecTy *Type,
Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
switch (getOpcode()) {
case SUBST: {
- DefInit *LHSd = dynamic_cast<DefInit*>(LHS);
- VarInit *LHSv = dynamic_cast<VarInit*>(LHS);
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
+ DefInit *LHSd = dyn_cast<DefInit>(LHS);
+ VarInit *LHSv = dyn_cast<VarInit>(LHS);
+ StringInit *LHSs = dyn_cast<StringInit>(LHS);
- DefInit *MHSd = dynamic_cast<DefInit*>(MHS);
- VarInit *MHSv = dynamic_cast<VarInit*>(MHS);
- StringInit *MHSs = dynamic_cast<StringInit*>(MHS);
+ DefInit *MHSd = dyn_cast<DefInit>(MHS);
+ VarInit *MHSv = dyn_cast<VarInit>(MHS);
+ StringInit *MHSs = dyn_cast<StringInit>(MHS);
- DefInit *RHSd = dynamic_cast<DefInit*>(RHS);
- VarInit *RHSv = dynamic_cast<VarInit*>(RHS);
- StringInit *RHSs = dynamic_cast<StringInit*>(RHS);
+ DefInit *RHSd = dyn_cast<DefInit>(RHS);
+ VarInit *RHSv = dyn_cast<VarInit>(RHS);
+ StringInit *RHSs = dyn_cast<StringInit>(RHS);
if ((LHSd && MHSd && RHSd)
|| (LHSv && MHSv && RHSv)
@@ -1197,9 +1158,9 @@ Init *TernOpInit::Fold(Record *CurRec, MultiClass *CurMultiClass) const {
}
case IF: {
- IntInit *LHSi = dynamic_cast<IntInit*>(LHS);
+ IntInit *LHSi = dyn_cast<IntInit>(LHS);
if (Init *I = LHS->convertInitializerTo(IntRecTy::get()))
- LHSi = dynamic_cast<IntInit*>(I);
+ LHSi = dyn_cast<IntInit>(I);
if (LHSi) {
if (LHSi->getValue()) {
return MHS;
@@ -1219,9 +1180,9 @@ Init *TernOpInit::resolveReferences(Record &R,
Init *lhs = LHS->resolveReferences(R, RV);
if (Opc == IF && lhs != LHS) {
- IntInit *Value = dynamic_cast<IntInit*>(lhs);
+ IntInit *Value = dyn_cast<IntInit>(lhs);
if (Init *I = lhs->convertInitializerTo(IntRecTy::get()))
- Value = dynamic_cast<IntInit*>(I);
+ Value = dyn_cast<IntInit>(I);
if (Value != 0) {
// Short-circuit
if (Value->getValue()) {
@@ -1257,19 +1218,15 @@ std::string TernOpInit::getAsString() const {
}
RecTy *TypedInit::getFieldType(const std::string &FieldName) const {
- RecordRecTy *RecordType = dynamic_cast<RecordRecTy *>(getType());
- if (RecordType) {
- RecordVal *Field = RecordType->getRecord()->getValue(FieldName);
- if (Field) {
+ if (RecordRecTy *RecordType = dyn_cast<RecordRecTy>(getType()))
+ if (RecordVal *Field = RecordType->getRecord()->getValue(FieldName))
return Field->getType();
- }
- }
return 0;
}
Init *
TypedInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
- BitsRecTy *T = dynamic_cast<BitsRecTy*>(getType());
+ BitsRecTy *T = dyn_cast<BitsRecTy>(getType());
if (T == 0) return 0; // Cannot subscript a non-bits variable.
unsigned NumBits = T->getNumBits();
@@ -1285,7 +1242,7 @@ TypedInit::convertInitializerBitRange(const std::vector<unsigned> &Bits) const {
Init *
TypedInit::convertInitListSlice(const std::vector<unsigned> &Elements) const {
- ListRecTy *T = dynamic_cast<ListRecTy*>(getType());
+ ListRecTy *T = dyn_cast<ListRecTy>(getType());
if (T == 0) return 0; // Cannot subscript a non-list variable.
if (Elements.size() == 1)
@@ -1318,31 +1275,15 @@ VarInit *VarInit::get(Init *VN, RecTy *T) {
}
const std::string &VarInit::getName() const {
- StringInit *NameString =
- dynamic_cast<StringInit *>(getNameInit());
+ StringInit *NameString = dyn_cast<StringInit>(getNameInit());
assert(NameString && "VarInit name is not a string!");
return NameString->getValue();
}
-Init *VarInit::resolveBitReference(Record &R, const RecordVal *IRV,
- unsigned Bit) const {
- if (R.isTemplateArg(getNameInit())) return 0;
- if (IRV && IRV->getNameInit() != getNameInit()) return 0;
-
- RecordVal *RV = R.getValue(getNameInit());
- assert(RV && "Reference to a non-existent variable?");
- assert(dynamic_cast<BitsInit*>(RV->getValue()));
- BitsInit *BI = (BitsInit*)RV->getValue();
-
- assert(Bit < BI->getNumBits() && "Bit reference out of range!");
- Init *B = BI->getBit(Bit);
-
- // If the bit is set to some value, or if we are resolving a reference to a
- // specific variable and that variable is explicitly unset, then replace the
- // VarBitInit with it.
- if (IRV || !dynamic_cast<UnsetInit*>(B))
- return B;
- return 0;
+Init *VarInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<VarInit*>(this);
+ return VarBitInit::get(const_cast<VarInit*>(this), Bit);
}
Init *VarInit::resolveListElementReference(Record &R,
@@ -1353,9 +1294,9 @@ Init *VarInit::resolveListElementReference(Record &R,
RecordVal *RV = R.getValue(getNameInit());
assert(RV && "Reference to a non-existent variable?");
- ListInit *LI = dynamic_cast<ListInit*>(RV->getValue());
+ ListInit *LI = dyn_cast<ListInit>(RV->getValue());
if (!LI) {
- TypedInit *VI = dynamic_cast<TypedInit*>(RV->getValue());
+ TypedInit *VI = dyn_cast<TypedInit>(RV->getValue());
assert(VI && "Invalid list element!");
return VarListElementInit::get(VI, Elt);
}
@@ -1366,14 +1307,14 @@ Init *VarInit::resolveListElementReference(Record &R,
// If the element is set to some value, or if we are resolving a reference
// to a specific variable and that variable is explicitly unset, then
// replace the VarListElementInit with it.
- if (IRV || !dynamic_cast<UnsetInit*>(E))
+ if (IRV || !isa<UnsetInit>(E))
return E;
return 0;
}
RecTy *VarInit::getFieldType(const std::string &FieldName) const {
- if (RecordRecTy *RTy = dynamic_cast<RecordRecTy*>(getType()))
+ if (RecordRecTy *RTy = dyn_cast<RecordRecTy>(getType()))
if (const RecordVal *RV = RTy->getRecord()->getValue(FieldName))
return RV->getType();
return 0;
@@ -1381,9 +1322,9 @@ RecTy *VarInit::getFieldType(const std::string &FieldName) const {
Init *VarInit::getFieldInit(Record &R, const RecordVal *RV,
const std::string &FieldName) const {
- if (dynamic_cast<RecordRecTy*>(getType()))
+ if (isa<RecordRecTy>(getType()))
if (const RecordVal *Val = R.getValue(VarName)) {
- if (RV != Val && (RV || dynamic_cast<UnsetInit*>(Val->getValue())))
+ if (RV != Val && (RV || isa<UnsetInit>(Val->getValue())))
return 0;
Init *TheInit = Val->getValue();
assert(TheInit != this && "Infinite loop detected!");
@@ -1402,7 +1343,7 @@ Init *VarInit::getFieldInit(Record &R, const RecordVal *RV,
///
Init *VarInit::resolveReferences(Record &R, const RecordVal *RV) const {
if (RecordVal *Val = R.getValue(VarName))
- if (RV == Val || (RV == 0 && !dynamic_cast<UnsetInit*>(Val->getValue())))
+ if (RV == Val || (RV == 0 && !isa<UnsetInit>(Val->getValue())))
return Val->getValue();
return const_cast<VarInit *>(this);
}
@@ -1425,9 +1366,11 @@ std::string VarBitInit::getAsString() const {
}
Init *VarBitInit::resolveReferences(Record &R, const RecordVal *RV) const {
- if (Init *I = getVariable()->resolveBitReference(R, RV, getBitNum()))
- return I;
- return const_cast<VarBitInit *>(this);
+ Init *I = TI->resolveReferences(R, RV);
+ if (TI != I)
+ return I->getBit(getBitNum());
+
+ return const_cast<VarBitInit*>(this);
}
VarListElementInit *VarListElementInit::get(TypedInit *T,
@@ -1456,11 +1399,10 @@ VarListElementInit::resolveReferences(Record &R, const RecordVal *RV) const {
return const_cast<VarListElementInit *>(this);
}
-Init *VarListElementInit::resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- // FIXME: This should be implemented, to support references like:
- // bit B = AA[0]{1};
- return 0;
+Init *VarListElementInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<VarListElementInit*>(this);
+ return VarBitInit::get(const_cast<VarListElementInit*>(this), Bit);
}
Init *VarListElementInit:: resolveListElementReference(Record &R,
@@ -1469,8 +1411,7 @@ Init *VarListElementInit:: resolveListElementReference(Record &R,
Init *Result = TI->resolveListElementReference(R, RV, Element);
if (Result) {
- TypedInit *TInit = dynamic_cast<TypedInit *>(Result);
- if (TInit) {
+ if (TypedInit *TInit = dyn_cast<TypedInit>(Result)) {
Init *Result2 = TInit->resolveListElementReference(R, RV, Elt);
if (Result2) return Result2;
return new VarListElementInit(TInit, Elt);
@@ -1513,30 +1454,23 @@ FieldInit *FieldInit::get(Init *R, const std::string &FN) {
return I;
}
-Init *FieldInit::resolveBitReference(Record &R, const RecordVal *RV,
- unsigned Bit) const {
- if (Init *BitsVal = Rec->getFieldInit(R, RV, FieldName))
- if (BitsInit *BI = dynamic_cast<BitsInit*>(BitsVal)) {
- assert(Bit < BI->getNumBits() && "Bit reference out of range!");
- Init *B = BI->getBit(Bit);
-
- if (dynamic_cast<BitInit*>(B)) // If the bit is set.
- return B; // Replace the VarBitInit with it.
- }
- return 0;
+Init *FieldInit::getBit(unsigned Bit) const {
+ if (getType() == BitRecTy::get())
+ return const_cast<FieldInit*>(this);
+ return VarBitInit::get(const_cast<FieldInit*>(this), Bit);
}
Init *FieldInit::resolveListElementReference(Record &R, const RecordVal *RV,
unsigned Elt) const {
if (Init *ListVal = Rec->getFieldInit(R, RV, FieldName))
- if (ListInit *LI = dynamic_cast<ListInit*>(ListVal)) {
+ if (ListInit *LI = dyn_cast<ListInit>(ListVal)) {
if (Elt >= LI->getSize()) return 0;
Init *E = LI->getElement(Elt);
// If the element is set to some value, or if we are resolving a
// reference to a specific variable and that variable is explicitly
// unset, then replace the VarListElementInit with it.
- if (RV || !dynamic_cast<UnsetInit*>(E))
+ if (RV || !isa<UnsetInit>(E))
return E;
}
return 0;
@@ -1665,7 +1599,7 @@ RecordVal::RecordVal(const std::string &N, RecTy *T, unsigned P)
}
const std::string &RecordVal::getName() const {
- StringInit *NameString = dynamic_cast<StringInit *>(Name);
+ StringInit *NameString = dyn_cast<StringInit>(Name);
assert(NameString && "RecordVal name is not a string!");
return NameString->getValue();
}
@@ -1695,12 +1629,11 @@ void Record::init() {
void Record::checkName() {
// Ensure the record name has string type.
- const TypedInit *TypedName = dynamic_cast<const TypedInit *>(Name);
+ const TypedInit *TypedName = dyn_cast<const TypedInit>(Name);
assert(TypedName && "Record name is not typed!");
RecTy *Type = TypedName->getType();
- if (dynamic_cast<StringRecTy *>(Type) == 0) {
- throw TGError(getLoc(), "Record name is not a string!");
- }
+ if (!isa<StringRecTy>(Type))
+ PrintFatalError(getLoc(), "Record name is not a string!");
}
DefInit *Record::getDefInit() {
@@ -1710,8 +1643,7 @@ DefInit *Record::getDefInit() {
}
const std::string &Record::getName() const {
- const StringInit *NameString =
- dynamic_cast<const StringInit *>(Name);
+ const StringInit *NameString = dyn_cast<StringInit>(Name);
assert(NameString && "Record name is not a string!");
return NameString->getValue();
}
@@ -1751,7 +1683,15 @@ void Record::resolveReferencesTo(const RecordVal *RV) {
if (RV == &Values[i]) // Skip resolve the same field as the given one
continue;
if (Init *V = Values[i].getValue())
- Values[i].setValue(V->resolveReferences(*this, RV));
+ if (Values[i].setValue(V->resolveReferences(*this, RV)))
+ PrintFatalError(getLoc(), "Invalid value is found when setting '"
+ + Values[i].getNameInitAsString()
+ + "' after resolving references"
+ + (RV ? " against '" + RV->getNameInitAsString()
+ + "' of ("
+ + RV->getValue()->getAsUnquotedString() + ")"
+ : "")
+ + "\n");
}
Init *OldName = getNameInit();
Init *NewName = Name->resolveReferences(*this, RV);
@@ -1799,184 +1739,201 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const Record &R) {
}
/// getValueInit - Return the initializer for a value with the specified name,
-/// or throw an exception if the field does not exist.
+/// or abort if the field does not exist.
///
Init *Record::getValueInit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
return R->getValue();
}
/// getValueAsString - This method looks up the specified field and returns its
-/// value as a string, throwing an exception if the field does not exist or if
+/// value as a string, aborting if the field does not exist or if
/// the value is not a string.
///
std::string Record::getValueAsString(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (StringInit *SI = dynamic_cast<StringInit*>(R->getValue()))
+ if (StringInit *SI = dyn_cast<StringInit>(R->getValue()))
return SI->getValue();
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a string initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a string initializer!");
}
/// getValueAsBitsInit - This method looks up the specified field and returns
-/// its value as a BitsInit, throwing an exception if the field does not exist
-/// or if the value is not the right type.
+/// its value as a BitsInit, aborting if the field does not exist or if
+/// the value is not the right type.
///
BitsInit *Record::getValueAsBitsInit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (BitsInit *BI = dynamic_cast<BitsInit*>(R->getValue()))
+ if (BitsInit *BI = dyn_cast<BitsInit>(R->getValue()))
return BI;
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a BitsInit initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a BitsInit initializer!");
}
/// getValueAsListInit - This method looks up the specified field and returns
-/// its value as a ListInit, throwing an exception if the field does not exist
-/// or if the value is not the right type.
+/// its value as a ListInit, aborting if the field does not exist or if
+/// the value is not the right type.
///
ListInit *Record::getValueAsListInit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (ListInit *LI = dynamic_cast<ListInit*>(R->getValue()))
+ if (ListInit *LI = dyn_cast<ListInit>(R->getValue()))
return LI;
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a list initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a list initializer!");
}
/// getValueAsListOfDefs - This method looks up the specified field and returns
-/// its value as a vector of records, throwing an exception if the field does
-/// not exist or if the value is not the right type.
+/// its value as a vector of records, aborting if the field does not exist
+/// or if the value is not the right type.
///
std::vector<Record*>
Record::getValueAsListOfDefs(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
std::vector<Record*> Defs;
for (unsigned i = 0; i < List->getSize(); i++) {
- if (DefInit *DI = dynamic_cast<DefInit*>(List->getElement(i))) {
+ if (DefInit *DI = dyn_cast<DefInit>(List->getElement(i))) {
Defs.push_back(DI->getDef());
} else {
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' list is not entirely DefInit!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' list is not entirely DefInit!");
}
}
return Defs;
}
/// getValueAsInt - This method looks up the specified field and returns its
-/// value as an int64_t, throwing an exception if the field does not exist or if
-/// the value is not the right type.
+/// value as an int64_t, aborting if the field does not exist or if the value
+/// is not the right type.
///
int64_t Record::getValueAsInt(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (IntInit *II = dynamic_cast<IntInit*>(R->getValue()))
+ if (IntInit *II = dyn_cast<IntInit>(R->getValue()))
return II->getValue();
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have an int initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have an int initializer!");
}
/// getValueAsListOfInts - This method looks up the specified field and returns
-/// its value as a vector of integers, throwing an exception if the field does
-/// not exist or if the value is not the right type.
+/// its value as a vector of integers, aborting if the field does not exist or
+/// if the value is not the right type.
///
std::vector<int64_t>
Record::getValueAsListOfInts(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
std::vector<int64_t> Ints;
for (unsigned i = 0; i < List->getSize(); i++) {
- if (IntInit *II = dynamic_cast<IntInit*>(List->getElement(i))) {
+ if (IntInit *II = dyn_cast<IntInit>(List->getElement(i))) {
Ints.push_back(II->getValue());
} else {
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a list of ints initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a list of ints initializer!");
}
}
return Ints;
}
/// getValueAsListOfStrings - This method looks up the specified field and
-/// returns its value as a vector of strings, throwing an exception if the
-/// field does not exist or if the value is not the right type.
+/// returns its value as a vector of strings, aborting if the field does not
+/// exist or if the value is not the right type.
///
std::vector<std::string>
Record::getValueAsListOfStrings(StringRef FieldName) const {
ListInit *List = getValueAsListInit(FieldName);
std::vector<std::string> Strings;
for (unsigned i = 0; i < List->getSize(); i++) {
- if (StringInit *II = dynamic_cast<StringInit*>(List->getElement(i))) {
+ if (StringInit *II = dyn_cast<StringInit>(List->getElement(i))) {
Strings.push_back(II->getValue());
} else {
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a list of strings initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a list of strings initializer!");
}
}
return Strings;
}
/// getValueAsDef - This method looks up the specified field and returns its
-/// value as a Record, throwing an exception if the field does not exist or if
-/// the value is not the right type.
+/// value as a Record, aborting if the field does not exist or if the value
+/// is not the right type.
///
Record *Record::getValueAsDef(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (DefInit *DI = dynamic_cast<DefInit*>(R->getValue()))
+ if (DefInit *DI = dyn_cast<DefInit>(R->getValue()))
return DI->getDef();
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a def initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a def initializer!");
}
/// getValueAsBit - This method looks up the specified field and returns its
-/// value as a bit, throwing an exception if the field does not exist or if
-/// the value is not the right type.
+/// value as a bit, aborting if the field does not exist or if the value is
+/// not the right type.
///
bool Record::getValueAsBit(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (BitInit *BI = dynamic_cast<BitInit*>(R->getValue()))
+ if (BitInit *BI = dyn_cast<BitInit>(R->getValue()))
return BI->getValue();
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a bit initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a bit initializer!");
+}
+
+bool Record::getValueAsBitOrUnset(StringRef FieldName, bool &Unset) const {
+ const RecordVal *R = getValue(FieldName);
+ if (R == 0 || R->getValue() == 0)
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
+
+ if (R->getValue() == UnsetInit::get()) {
+ Unset = true;
+ return false;
+ }
+ Unset = false;
+ if (BitInit *BI = dyn_cast<BitInit>(R->getValue()))
+ return BI->getValue();
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a bit initializer!");
}
/// getValueAsDag - This method looks up the specified field and returns its
-/// value as an Dag, throwing an exception if the field does not exist or if
-/// the value is not the right type.
+/// value as a Dag, aborting if the field does not exist or if the value is
+/// not the right type.
///
DagInit *Record::getValueAsDag(StringRef FieldName) const {
const RecordVal *R = getValue(FieldName);
if (R == 0 || R->getValue() == 0)
- throw "Record `" + getName() + "' does not have a field named `" +
- FieldName.str() + "'!\n";
+ PrintFatalError(getLoc(), "Record `" + getName() +
+ "' does not have a field named `" + FieldName.str() + "'!\n");
- if (DagInit *DI = dynamic_cast<DagInit*>(R->getValue()))
+ if (DagInit *DI = dyn_cast<DagInit>(R->getValue()))
return DI;
- throw "Record `" + getName() + "', field `" + FieldName.str() +
- "' does not have a dag initializer!";
+ PrintFatalError(getLoc(), "Record `" + getName() + "', field `" +
+ FieldName.str() + "' does not have a dag initializer!");
}
@@ -2019,7 +1976,7 @@ std::vector<Record*>
RecordKeeper::getAllDerivedDefinitions(const std::string &ClassName) const {
Record *Class = getClass(ClassName);
if (!Class)
- throw "ERROR: Couldn't find the `" + ClassName + "' class!\n";
+ PrintFatalError("ERROR: Couldn't find the `" + ClassName + "' class!\n");
std::vector<Record*> Defs;
for (std::map<std::string, Record*>::const_iterator I = getDefs().begin(),
@@ -2034,7 +1991,7 @@ RecordKeeper::getAllDerivedDefinitions(const std::string &ClassName) const {
/// to CurRec's name.
Init *llvm::QualifyName(Record &CurRec, MultiClass *CurMultiClass,
Init *Name, const std::string &Scoper) {
- RecTy *Type = dynamic_cast<TypedInit *>(Name)->getType();
+ RecTy *Type = dyn_cast<TypedInit>(Name)->getType();
BinOpInit *NewName =
BinOpInit::get(BinOpInit::STRCONCAT,
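Two themes run through the Record.cpp changes above: the getValueAs* accessors now report problems through PrintFatalError (which prints a diagnostic at the record's location and exits) instead of throwing strings, and the new getValueAsBitOrUnset() lets callers tell an explicitly unset bit from a 0. A hedged usage sketch from a hypothetical backend; the field names "Size" and "isPseudo" are examples only:

  #include "llvm/TableGen/Record.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  static void emitEntry(const Record &R, raw_ostream &OS) {
    // Aborts tblgen with a located diagnostic if the field is missing or
    // mistyped; no exception reaches the caller, so no try/catch is needed.
    int64_t Size = R.getValueAsInt("Size");

    bool Unset = false;
    bool IsPseudo = R.getValueAsBitOrUnset("isPseudo", Unset); // 'bit b = ?;' case
    OS << R.getName() << " " << Size << " "
       << (Unset ? "?" : (IsPseudo ? "1" : "0")) << "\n";
  }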
diff --git a/lib/TableGen/TGParser.cpp b/lib/TableGen/TGParser.cpp
index b9c7ff694d7f..b1f9f724efd3 100644
--- a/lib/TableGen/TGParser.cpp
+++ b/lib/TableGen/TGParser.cpp
@@ -93,7 +93,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
// Do not allow assignments like 'X = X'. This will just cause infinite loops
// in the resolution machinery.
if (BitList.empty())
- if (VarInit *VI = dynamic_cast<VarInit*>(V))
+ if (VarInit *VI = dyn_cast<VarInit>(V))
if (VI->getNameInit() == ValName)
return false;
@@ -102,7 +102,7 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
// initializer.
//
if (!BitList.empty()) {
- BitsInit *CurVal = dynamic_cast<BitsInit*>(RV->getValue());
+ BitsInit *CurVal = dyn_cast<BitsInit>(RV->getValue());
if (CurVal == 0)
return Error(Loc, "Value '" + ValName->getAsUnquotedString()
+ "' is not a bits type");
@@ -110,12 +110,11 @@ bool TGParser::SetValue(Record *CurRec, SMLoc Loc, Init *ValName,
// Convert the incoming value to a bits type of the appropriate size...
Init *BI = V->convertInitializerTo(BitsRecTy::get(BitList.size()));
if (BI == 0) {
- V->convertInitializerTo(BitsRecTy::get(BitList.size()));
return Error(Loc, "Initializer is not compatible with bit range");
}
// We should have a BitsInit type now.
- BitsInit *BInit = dynamic_cast<BitsInit*>(BI);
+ BitsInit *BInit = dyn_cast<BitsInit>(BI);
assert(BInit != 0);
SmallVector<Init *, 16> NewBits(CurVal->getNumBits());
@@ -311,7 +310,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
if (IterVals.size() != Loops.size()) {
assert(IterVals.size() < Loops.size());
ForeachLoop &CurLoop = Loops[IterVals.size()];
- ListInit *List = dynamic_cast<ListInit *>(CurLoop.ListValue);
+ ListInit *List = dyn_cast<ListInit>(CurLoop.ListValue);
if (List == 0) {
Error(Loc, "Loop list is not a list");
return true;
@@ -336,7 +335,7 @@ bool TGParser::ProcessForeachDefs(Record *CurRec, SMLoc Loc, IterSet &IterVals){
// Set the iterator values now.
for (unsigned i = 0, e = IterVals.size(); i != e; ++i) {
VarInit *IterVar = IterVals[i].IterVar;
- TypedInit *IVal = dynamic_cast<TypedInit *>(IterVals[i].IterValue);
+ TypedInit *IVal = dyn_cast<TypedInit>(IterVals[i].IterValue);
if (IVal == 0) {
Error(Loc, "foreach iterator value is untyped");
return true;
@@ -407,8 +406,7 @@ Init *TGParser::ParseObjectName(MultiClass *CurMultiClass) {
RecTy *Type = 0;
if (CurRec) {
- const TypedInit *CurRecName =
- dynamic_cast<const TypedInit *>(CurRec->getNameInit());
+ const TypedInit *CurRecName = dyn_cast<TypedInit>(CurRec->getNameInit());
if (!CurRecName) {
TokError("Record name is not typed!");
return 0;
@@ -781,7 +779,7 @@ Init *TGParser::ParseIDValue(Record *CurRec,
for (LoopVector::iterator i = Loops.begin(), iend = Loops.end();
i != iend;
++i) {
- VarInit *IterVar = dynamic_cast<VarInit *>(i->IterVar);
+ VarInit *IterVar = dyn_cast<VarInit>(i->IterVar);
if (IterVar && IterVar->getName() == Name)
return IterVar;
}
@@ -856,16 +854,16 @@ Init *TGParser::ParseOperation(Record *CurRec) {
if (Code == UnOpInit::HEAD
|| Code == UnOpInit::TAIL
|| Code == UnOpInit::EMPTY) {
- ListInit *LHSl = dynamic_cast<ListInit*>(LHS);
- StringInit *LHSs = dynamic_cast<StringInit*>(LHS);
- TypedInit *LHSt = dynamic_cast<TypedInit*>(LHS);
+ ListInit *LHSl = dyn_cast<ListInit>(LHS);
+ StringInit *LHSs = dyn_cast<StringInit>(LHS);
+ TypedInit *LHSt = dyn_cast<TypedInit>(LHS);
if (LHSl == 0 && LHSs == 0 && LHSt == 0) {
TokError("expected list or string type argument in unary operator");
return 0;
}
if (LHSt) {
- ListRecTy *LType = dynamic_cast<ListRecTy*>(LHSt->getType());
- StringRecTy *SType = dynamic_cast<StringRecTy*>(LHSt->getType());
+ ListRecTy *LType = dyn_cast<ListRecTy>(LHSt->getType());
+ StringRecTy *SType = dyn_cast<StringRecTy>(LHSt->getType());
if (LType == 0 && SType == 0) {
TokError("expected list or string type argumnet in unary operator");
return 0;
@@ -885,7 +883,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
}
if (LHSl) {
Init *Item = LHSl->getElement(0);
- TypedInit *Itemt = dynamic_cast<TypedInit*>(Item);
+ TypedInit *Itemt = dyn_cast<TypedInit>(Item);
if (Itemt == 0) {
TokError("untyped list element in unary operator");
return 0;
@@ -897,7 +895,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
}
} else {
assert(LHSt && "expected list type argument in unary operator");
- ListRecTy *LType = dynamic_cast<ListRecTy*>(LHSt->getType());
+ ListRecTy *LType = dyn_cast<ListRecTy>(LHSt->getType());
if (LType == 0) {
TokError("expected list type argumnet in unary operator");
return 0;
@@ -1044,35 +1042,28 @@ Init *TGParser::ParseOperation(Record *CurRec) {
switch (LexCode) {
default: llvm_unreachable("Unhandled code!");
case tgtok::XIf: {
- // FIXME: The `!if' operator doesn't handle non-TypedInit well at
- // all. This can be made much more robust.
- TypedInit *MHSt = dynamic_cast<TypedInit*>(MHS);
- TypedInit *RHSt = dynamic_cast<TypedInit*>(RHS);
-
RecTy *MHSTy = 0;
RecTy *RHSTy = 0;
- if (MHSt == 0 && RHSt == 0) {
- BitsInit *MHSbits = dynamic_cast<BitsInit*>(MHS);
- BitsInit *RHSbits = dynamic_cast<BitsInit*>(RHS);
-
- if (MHSbits && RHSbits &&
- MHSbits->getNumBits() == RHSbits->getNumBits()) {
- Type = BitRecTy::get();
- break;
- } else {
- BitInit *MHSbit = dynamic_cast<BitInit*>(MHS);
- BitInit *RHSbit = dynamic_cast<BitInit*>(RHS);
-
- if (MHSbit && RHSbit) {
- Type = BitRecTy::get();
- break;
- }
- }
- } else if (MHSt != 0 && RHSt != 0) {
+ if (TypedInit *MHSt = dyn_cast<TypedInit>(MHS))
MHSTy = MHSt->getType();
+ if (BitsInit *MHSbits = dyn_cast<BitsInit>(MHS))
+ MHSTy = BitsRecTy::get(MHSbits->getNumBits());
+ if (isa<BitInit>(MHS))
+ MHSTy = BitRecTy::get();
+
+ if (TypedInit *RHSt = dyn_cast<TypedInit>(RHS))
RHSTy = RHSt->getType();
- }
+ if (BitsInit *RHSbits = dyn_cast<BitsInit>(RHS))
+ RHSTy = BitsRecTy::get(RHSbits->getNumBits());
+ if (isa<BitInit>(RHS))
+ RHSTy = BitRecTy::get();
+
+ // For UnsetInit, take the type from the other operand.
+ if (isa<UnsetInit>(MHS))
+ MHSTy = RHSTy;
+ if (isa<UnsetInit>(RHS))
+ RHSTy = MHSTy;
if (!MHSTy || !RHSTy) {
TokError("could not get type for !if");
@@ -1090,7 +1081,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
break;
}
case tgtok::XForEach: {
- TypedInit *MHSt = dynamic_cast<TypedInit *>(MHS);
+ TypedInit *MHSt = dyn_cast<TypedInit>(MHS);
if (MHSt == 0) {
TokError("could not get type for !foreach");
return 0;
@@ -1099,7 +1090,7 @@ Init *TGParser::ParseOperation(Record *CurRec) {
break;
}
case tgtok::XSubst: {
- TypedInit *RHSt = dynamic_cast<TypedInit *>(RHS);
+ TypedInit *RHSt = dyn_cast<TypedInit>(RHS);
if (RHSt == 0) {
TokError("could not get type for !subst");
return 0;
@@ -1278,7 +1269,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
ListRecTy *GivenListTy = 0;
if (ItemType != 0) {
- ListRecTy *ListType = dynamic_cast<ListRecTy*>(ItemType);
+ ListRecTy *ListType = dyn_cast<ListRecTy>(ItemType);
if (ListType == 0) {
std::stringstream s;
s << "Type mismatch for list, expected list type, got "
@@ -1323,7 +1314,7 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
for (std::vector<Init *>::iterator i = Vals.begin(), ie = Vals.end();
i != ie;
++i) {
- TypedInit *TArg = dynamic_cast<TypedInit*>(*i);
+ TypedInit *TArg = dyn_cast<TypedInit>(*i);
if (TArg == 0) {
TokError("Untyped list element");
return 0;
@@ -1506,7 +1497,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
// Create a !strconcat() operation, first casting each operand to
// a string if necessary.
- TypedInit *LHS = dynamic_cast<TypedInit *>(Result);
+ TypedInit *LHS = dyn_cast<TypedInit>(Result);
if (!LHS) {
Error(PasteLoc, "LHS of paste is not typed!");
return 0;
@@ -1533,7 +1524,7 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
default:
Init *RHSResult = ParseValue(CurRec, ItemType, ParseNameMode);
- RHS = dynamic_cast<TypedInit *>(RHSResult);
+ RHS = dyn_cast<TypedInit>(RHSResult);
if (!RHS) {
Error(PasteLoc, "RHS of paste is not typed!");
return 0;
@@ -1724,13 +1715,13 @@ VarInit *TGParser::ParseForeachDeclaration(ListInit *&ForeachListValue) {
default: TokError("Unknown token when expecting a range list"); return 0;
case tgtok::l_square: { // '[' ValueList ']'
Init *List = ParseSimpleValue(0, 0, ParseForeachMode);
- ForeachListValue = dynamic_cast<ListInit*>(List);
+ ForeachListValue = dyn_cast<ListInit>(List);
if (ForeachListValue == 0) {
TokError("Expected a Value list");
return 0;
}
RecTy *ValueType = ForeachListValue->getType();
- ListRecTy *ListType = dynamic_cast<ListRecTy *>(ValueType);
+ ListRecTy *ListType = dyn_cast<ListRecTy>(ValueType);
if (ListType == 0) {
TokError("Value list is not of list type");
return 0;
@@ -2265,7 +2256,7 @@ InstantiateMulticlassDef(MultiClass &MC,
Init *DefName = DefProto->getNameInit();
- StringInit *DefNameString = dynamic_cast<StringInit *>(DefName);
+ StringInit *DefNameString = dyn_cast<StringInit>(DefName);
if (DefNameString != 0) {
// We have a fully expanded string so there are no operators to
@@ -2277,7 +2268,10 @@ InstantiateMulticlassDef(MultiClass &MC,
DefName, StringRecTy::get())->Fold(DefProto, &MC);
}
- Record *CurRec = new Record(DefName, DefmPrefixLoc, Records);
+ // Make a trail of SMLocs from the multiclass instantiations.
+ SmallVector<SMLoc, 4> Locs(1, DefmPrefixLoc);
+ Locs.append(DefProto->getLoc().begin(), DefProto->getLoc().end());
+ Record *CurRec = new Record(DefName, Locs, Records);
SubClassReference Ref;
Ref.RefLoc = DefmPrefixLoc;
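The XIf hunk above changes how the parser types a !if: TypedInit operands keep their declared type, BitsInit/BitInit literals are typed as bits<n>/bit, and an uninitialized operand ('?') borrows the type of the other arm. A hedged restatement of that rule as a free function; pickIfType is an illustrative name, the real logic sits inline in TGParser::ParseOperation:

  #include "llvm/TableGen/Record.h"
  using namespace llvm;

  static RecTy *pickIfType(Init *MHS, Init *RHS) {
    RecTy *MHSTy = 0, *RHSTy = 0;
    if (TypedInit *T = dyn_cast<TypedInit>(MHS)) MHSTy = T->getType();
    if (BitsInit *B = dyn_cast<BitsInit>(MHS))   MHSTy = BitsRecTy::get(B->getNumBits());
    if (isa<BitInit>(MHS))                       MHSTy = BitRecTy::get();

    if (TypedInit *T = dyn_cast<TypedInit>(RHS)) RHSTy = T->getType();
    if (BitsInit *B = dyn_cast<BitsInit>(RHS))   RHSTy = BitsRecTy::get(B->getNumBits());
    if (isa<BitInit>(RHS))                       RHSTy = BitRecTy::get();

    if (isa<UnsetInit>(MHS)) MHSTy = RHSTy;      // '?' takes the other arm's type
    if (isa<UnsetInit>(RHS)) RHSTy = MHSTy;

    if (!MHSTy || !RHSTy)
      return 0;                                  // "could not get type for !if"
    return resolveTypes(MHSTy, RHSTy);           // common type, or 0 if incompatible
  }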
diff --git a/lib/TableGen/TGParser.h b/lib/TableGen/TGParser.h
index 3d2c72cf773a..9c2ad43c426e 100644
--- a/lib/TableGen/TGParser.h
+++ b/lib/TableGen/TGParser.h
@@ -30,7 +30,7 @@ namespace llvm {
struct MultiClass;
struct SubClassReference;
struct SubMultiClassReference;
-
+
struct LetRecord {
std::string Name;
std::vector<unsigned> Bits;
@@ -41,7 +41,7 @@ namespace llvm {
: Name(N), Bits(B), Value(V), Loc(L) {
}
};
-
+
/// ForeachLoop - Record the iteration state associated with a for loop.
/// This is used to instantiate items in the loop body.
struct ForeachLoop {
@@ -56,13 +56,13 @@ class TGParser {
TGLexer Lex;
std::vector<std::vector<LetRecord> > LetStack;
std::map<std::string, MultiClass*> MultiClasses;
-
+
/// Loops - Keep track of any foreach loops we are within.
///
typedef std::vector<ForeachLoop> LoopVector;
LoopVector Loops;
- /// CurMultiClass - If we are parsing a 'multiclass' definition, this is the
+ /// CurMultiClass - If we are parsing a 'multiclass' definition, this is the
/// current value.
MultiClass *CurMultiClass;
@@ -82,13 +82,13 @@ class TGParser {
};
public:
- TGParser(SourceMgr &SrcMgr, RecordKeeper &records) :
+ TGParser(SourceMgr &SrcMgr, RecordKeeper &records) :
Lex(SrcMgr), CurMultiClass(0), Records(records) {}
-
+
/// ParseFile - Main entrypoint for parsing a tblgen file. These parser
/// routines return true on error, or false on success.
bool ParseFile();
-
+
bool Error(SMLoc L, const Twine &Msg) const {
PrintError(L, Msg);
return true;
@@ -102,9 +102,9 @@ public:
private: // Semantic analysis methods.
bool AddValue(Record *TheRec, SMLoc Loc, const RecordVal &RV);
- bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName,
+ bool SetValue(Record *TheRec, SMLoc Loc, Init *ValName,
const std::vector<unsigned> &BitList, Init *V);
- bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName,
+ bool SetValue(Record *TheRec, SMLoc Loc, const std::string &ValName,
const std::vector<unsigned> &BitList, Init *V) {
return SetValue(TheRec, Loc, StringInit::get(ValName), BitList, V);
}
@@ -170,7 +170,8 @@ private: // Parser methods.
IDParseMode Mode = ParseValueMode);
Init *ParseValue(Record *CurRec, RecTy *ItemType = 0,
IDParseMode Mode = ParseValueMode);
- std::vector<Init*> ParseValueList(Record *CurRec, Record *ArgsRec = 0, RecTy *EltTy = 0);
+ std::vector<Init*> ParseValueList(Record *CurRec, Record *ArgsRec = 0,
+ RecTy *EltTy = 0);
std::vector<std::pair<llvm::Init*, std::string> > ParseDagArgList(Record *);
bool ParseOptionalRangeList(std::vector<unsigned> &Ranges);
bool ParseOptionalBitList(std::vector<unsigned> &Ranges);
@@ -184,7 +185,7 @@ private: // Parser methods.
MultiClass *ParseMultiClassID();
Record *ParseDefmID();
};
-
+
} // end namespace llvm
#endif
diff --git a/lib/TableGen/TableGenAction.cpp b/lib/TableGen/TableGenAction.cpp
deleted file mode 100644
index 54e508309457..000000000000
--- a/lib/TableGen/TableGenAction.cpp
+++ /dev/null
@@ -1,15 +0,0 @@
-//===- TableGenAction.cpp - defines TableGenAction --------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/TableGen/TableGenAction.h"
-
-using namespace llvm;
-
-void TableGenAction::anchor() { }
-
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 2a1e8e4d3079..1446bbbb8e7c 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -37,6 +37,7 @@ FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMGlobalBaseRegPass();
FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 69e2346dc0b8..23974ad9052c 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -32,9 +32,6 @@ def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true",
def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true",
"Enable VFP3 instructions",
[FeatureVFP2]>;
-def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
- "Enable VFP4 instructions",
- [FeatureVFP3]>;
def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
"Enable NEON instructions",
[FeatureVFP3]>;
@@ -44,10 +41,16 @@ def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true",
"Does not support ARM mode execution">;
def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true",
"Enable half-precision floating point">;
+def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true",
+ "Enable VFP4 instructions",
+ [FeatureVFP3, FeatureFP16]>;
def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true",
"Restrict VFP3 to 16 double registers">;
def FeatureHWDiv : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
"Enable divide instructions">;
+def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm",
+ "HasHardwareDivideInARM", "true",
+ "Enable divide instructions in ARM mode">;
def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
"Enable Thumb2 extract and pack instructions">;
def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true",
@@ -139,6 +142,18 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
[FeatureVMLxForwarding,
FeatureT2XtPk, FeatureFP16,
FeatureAvoidPartialCPSR]>;
+def ProcSwift : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
+ "Swift ARM processors",
+ [FeatureNEONForFP, FeatureT2XtPk,
+ FeatureVFP4, FeatureMP, FeatureHWDiv,
+ FeatureHWDivARM, FeatureAvoidPartialCPSR,
+ FeatureHasSlowFPVMLx]>;
+
+// FIXME: It has not been determined if A15 has these features.
+def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+ "Cortex-A15 ARM processors",
+ [FeatureT2XtPk, FeatureFP16,
+ FeatureAvoidPartialCPSR]>;
class ProcNoItin<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
@@ -214,6 +229,10 @@ def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureMP,
FeatureHasRAS]>;
+// FIXME: A15 currently has the same ProcessorModel as A9.
+def : ProcessorModel<"cortex-a15", CortexA9Model,
+ [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
+ FeatureDSPThumb2, FeatureHasRAS]>;
// V7M Processors.
def : ProcNoItin<"cortex-m3", [HasV7Ops,
@@ -227,6 +246,12 @@ def : ProcNoItin<"cortex-m4", [HasV7Ops,
FeatureT2XtPk, FeatureVFP4,
FeatureVFPOnlySP, FeatureMClass]>;
+// Swift uArch Processors.
+def : ProcessorModel<"swift", SwiftModel,
+ [ProcSwift, HasV7Ops, FeatureNEON,
+ FeatureDB, FeatureDSPThumb2,
+ FeatureHasRAS]>;
+
//===----------------------------------------------------------------------===//
// Register File Description
//===----------------------------------------------------------------------===//
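The .td changes above (FeatureVFP4 now implying FeatureFP16, plus the new Swift and Cortex-A15 processor definitions) surface in C++ as boolean fields on ARMSubtarget with matching accessors, which instruction predicates and lowering code query. A hedged illustration only; the helper below is not real backend code, and the accessor spellings are the ones declared in ARMSubtarget.h for the HasVFPv4/HasFP16 fields:

  #include "ARMSubtarget.h"

  static bool supportsVFP4HalfConversions(const ARMSubtarget &ST) {
    // Because FeatureVFP4 lists FeatureFP16 as an implied feature, any core
    // that reports hasVFP4() should also report hasFP16().
    return ST.hasVFP4() && ST.hasFP16();
  }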
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index e9e2803ad579..d439d1d7cb7e 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -23,6 +23,8 @@
#include "InstPrinter/ARMInstPrinter.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "MCTargetDesc/ARMMCExpr.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/Constants.h"
#include "llvm/DebugInfo.h"
#include "llvm/Module.h"
@@ -40,9 +42,8 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -302,7 +303,7 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() {
}
void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
- uint64_t Size = TM.getTargetData()->getTypeAllocSize(CV->getType());
+ uint64_t Size = TM.getDataLayout()->getTypeAllocSize(CV->getType());
assert(Size && "C++ constructor pointer had zero size!");
const GlobalValue *GV = dyn_cast<GlobalValue>(CV->stripPointerCasts());
@@ -389,16 +390,6 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
//===--------------------------------------------------------------------===//
MCSymbol *ARMAsmPrinter::
-GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
- const MachineBasicBlock *MBB) const {
- SmallString<60> Name;
- raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix()
- << getFunctionNumber() << '_' << uid << '_' << uid2
- << "_set_" << MBB->getNumber();
- return OutContext.GetOrCreateSymbol(Name.str());
-}
-
-MCSymbol *ARMAsmPrinter::
GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
SmallString<60> Name;
raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI"
@@ -592,9 +583,24 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
const TargetLoweringObjectFileMachO &TLOFMacho =
static_cast<const TargetLoweringObjectFileMachO &>(
getObjFileLowering());
- OutStreamer.SwitchSection(TLOFMacho.getTextSection());
- OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
- OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection());
+
+ // Collect the set of sections our functions will go into.
+ SetVector<const MCSection *, SmallVector<const MCSection *, 8>,
+ SmallPtrSet<const MCSection *, 8> > TextSections;
+ // Default text section comes first.
+ TextSections.insert(TLOFMacho.getTextSection());
+ // Now any user-defined text sections from function attributes.
+ for (Module::iterator F = M.begin(), e = M.end(); F != e; ++F)
+ if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage())
+ TextSections.insert(TLOFMacho.SectionForGlobal(F, Mang, TM));
+ // Now the coalescable sections.
+ TextSections.insert(TLOFMacho.getTextCoalSection());
+ TextSections.insert(TLOFMacho.getConstTextCoalSection());
+
+ // Emit the sections in the .s file header to fix the order.
+ for (unsigned i = 0, e = TextSections.size(); i != e; ++i)
+ OutStreamer.SwitchSection(TextSections[i]);
+
if (RelocM == Reloc::DynamicNoPIC) {
const MCSection *sect =
OutContext.getMachOSection("__TEXT", "__symbol_stub4",
@@ -743,13 +749,28 @@ void ARMAsmPrinter::emitAttributes() {
AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
ARMBuildAttrs::Allowed);
} else if (CPUString == "generic") {
- // FIXME: Why these defaults?
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+ // For a generic CPU, we assume a standard v7a architecture in Subtarget.
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
ARMBuildAttrs::Allowed);
AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::Allowed);
- }
+ ARMBuildAttrs::AllowThumb32);
+ } else if (Subtarget->hasV7Ops()) {
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumb32);
+ } else if (Subtarget->hasV6T2Ops())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+ else if (Subtarget->hasV6Ops())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+ else if (Subtarget->hasV5TEOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+ else if (Subtarget->hasV5TOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+ else if (Subtarget->hasV4TOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
if (Subtarget->hasNEON() && emitFPU) {
/* NEON is not exactly a VFP architecture, but GAS emit one of
@@ -893,7 +914,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV) {
void ARMAsmPrinter::
EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
- int Size = TM.getTargetData()->getTypeAllocSize(MCPV->getType());
+ int Size = TM.getDataLayout()->getTypeAllocSize(MCPV->getType());
ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
@@ -1091,16 +1112,6 @@ static void populateADROperands(MCInst &Inst, unsigned Dest,
Inst.addOperand(MCOperand::CreateReg(ccreg));
}
-void ARMAsmPrinter::EmitPatchedInstruction(const MachineInstr *MI,
- unsigned Opcode) {
- MCInst TmpInst;
-
- // Emit the instruction as usual, just patch the opcode.
- LowerARMMachineInstrToMCInst(MI, TmpInst, *this);
- TmpInst.setOpcode(Opcode);
- OutStreamer.EmitInstruction(TmpInst);
-}
-
void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
assert(MI->getFlag(MachineInstr::FrameSetup) &&
"Only instruction which are involved into frame setup code are allowed");
@@ -1402,31 +1413,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
return;
}
- case ARM::t2BMOVPCB_CALL: {
- {
- MCInst TmpInst;
- TmpInst.setOpcode(ARM::tMOVr);
- TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
- TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
- // Add predicate operands.
- TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
- TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
- }
- {
- MCInst TmpInst;
- TmpInst.setOpcode(ARM::t2B);
- const GlobalValue *GV = MI->getOperand(0).getGlobal();
- MCSymbol *GVSym = Mang->getSymbol(GV);
- const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
- TmpInst.addOperand(MCOperand::CreateExpr(GVSymExpr));
- // Add predicate operands.
- TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
- TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
- }
- return;
- }
case ARM::MOVi16_ga_pcrel:
case ARM::t2MOVi16_ga_pcrel: {
MCInst TmpInst;
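
Aside on the EmitStartOfAsmFile change above: the functions' text sections are collected into a SetVector so each section is announced exactly once, in a fixed order (default text section, then per-function sections, then the coalesced sections), before any code is emitted. The standalone sketch below imitates that insertion-ordered de-duplication with standard-library containers only; it is a model of llvm::SetVector's behavior for this use, not the class itself.

// Sketch: insertion-ordered set, mirroring how the SetVector above
// deduplicates text sections while preserving emission order.
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

class OrderedSectionSet {
  std::vector<std::string> Order;        // emission order
  std::unordered_set<std::string> Seen;  // fast membership test
public:
  bool insert(const std::string &S) {
    if (!Seen.insert(S).second)
      return false;                      // duplicate: keep original position
    Order.push_back(S);
    return true;
  }
  const std::vector<std::string> &sections() const { return Order; }
};

int main() {
  OrderedSectionSet TextSections;
  TextSections.insert("__text");         // default text section first
  TextSections.insert("__custom_fn");    // per-function section next
  TextSections.insert("__text");         // ignored: already present
  TextSections.insert("__textcoal_nt");  // coalesced sections last

  for (const std::string &S : TextSections.sections())
    std::cout << S << "\n";              // __text __custom_fn __textcoal_nt
  return 0;
}
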
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 3555e8f50a0d..c875b2cbdffe 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -53,7 +53,7 @@ public:
Subtarget = &TM.getSubtarget<ARMSubtarget>();
}
- virtual const char *getPassName() const {
+ virtual const char *getPassName() const LLVM_OVERRIDE {
return "ARM Assembly Printer";
}
@@ -62,22 +62,24 @@ public:
virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &O);
+ raw_ostream &O) LLVM_OVERRIDE;
virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
- unsigned AsmVariant,
- const char *ExtraCode, raw_ostream &O);
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) LLVM_OVERRIDE;
void EmitJumpTable(const MachineInstr *MI);
void EmitJump2Table(const MachineInstr *MI);
- virtual void EmitInstruction(const MachineInstr *MI);
- bool runOnMachineFunction(MachineFunction &F);
+ virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
+ virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
- virtual void EmitConstantPool() {} // we emit constant pools customly!
- virtual void EmitFunctionBodyEnd();
- virtual void EmitFunctionEntryLabel();
- void EmitStartOfAsmFile(Module &M);
- void EmitEndOfAsmFile(Module &M);
- void EmitXXStructor(const Constant *CV);
+ virtual void EmitConstantPool() LLVM_OVERRIDE {
+ // We emit constant pools ourselves.
+ }
+ virtual void EmitFunctionBodyEnd() LLVM_OVERRIDE;
+ virtual void EmitFunctionEntryLabel() LLVM_OVERRIDE;
+ virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
+ virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
+ virtual void EmitXXStructor(const Constant *CV) LLVM_OVERRIDE;
// lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
@@ -101,12 +103,13 @@ private:
public:
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+ virtual MachineLocation
+ getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
/// EmitDwarfRegOp - Emit dwarf register operation.
- virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const;
+ virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const LLVM_OVERRIDE;
- virtual unsigned getISAEncoding() {
+ virtual unsigned getISAEncoding() LLVM_OVERRIDE {
// ARM/Darwin adds ISA to the DWARF info for each function.
if (!Subtarget->isTargetDarwin())
return 0;
@@ -114,18 +117,19 @@ public:
ARM::DW_ISA_ARM_thumb : ARM::DW_ISA_ARM_arm;
}
+private:
MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
- MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
- const MachineBasicBlock *MBB) const;
MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const;
MCSymbol *GetARMSJLJEHLabel(void) const;
MCSymbol *GetARMGVSymbol(const GlobalValue *GV);
+public:
/// EmitMachineConstantPoolValue - Print a machine constantpool value to
/// the .s file.
- virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV);
+ virtual void
+ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) LLVM_OVERRIDE;
};
} // end namespace llvm
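
Aside on the header changes above: every overriding virtual is annotated with LLVM_OVERRIDE so a signature drift against the base class becomes a compile error instead of a silent new virtual. The sketch below shows the idea; it uses the plain C++11 override keyword to stay self-contained, on the assumption that LLVM_OVERRIDE (from llvm/Support/Compiler.h) expands to exactly that on capable compilers.

// Sketch: why marking overriders helps.
#include <iostream>

struct AsmPrinterBase {
  virtual ~AsmPrinterBase() {}
  virtual const char *getPassName() const { return "Asm Printer"; }
};

struct ARMPrinter : AsmPrinterBase {
  // 'override' plays the role of LLVM_OVERRIDE in the patch above.
  const char *getPassName() const override { return "ARM Assembly Printer"; }

  // If the base signature changed (say, dropped the trailing const), this
  // would now fail to compile instead of quietly declaring a new method:
  // const char *getPassName() override { return "..."; }   // error
};

int main() {
  AsmPrinterBase *P = new ARMPrinter();
  std::cout << P->getPassName() << "\n"; // ARM Assembly Printer
  delete P;
  return 0;
}
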
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 1cc5a17cb029..3c7bb24f42f8 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -49,6 +49,11 @@ static cl::opt<bool>
WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true),
cl::desc("Widen ARM vmovs to vmovd when possible"));
+static cl::opt<unsigned>
+SwiftPartialUpdateClearance("swift-partial-update-clearance",
+ cl::Hidden, cl::init(12),
+ cl::desc("Clearance before partial register updates"));
+
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
uint16_t MLxOpc; // MLA / MLS opcode
@@ -683,7 +688,7 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Handle register classes that require multiple instructions.
unsigned BeginIdx = 0;
unsigned SubRegs = 0;
- unsigned Spacing = 1;
+ int Spacing = 1;
// Use VORRq when possible.
if (ARM::QQPRRegClass.contains(DestReg, SrcReg))
@@ -697,6 +702,8 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 3;
else if (ARM::DQuadRegClass.contains(DestReg, SrcReg))
Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4;
+ else if (ARM::GPRPairRegClass.contains(DestReg, SrcReg))
+ Opc = ARM::MOVr, BeginIdx = ARM::gsub_0, SubRegs = 2;
else if (ARM::DPairSpcRegClass.contains(DestReg, SrcReg))
Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 2, Spacing = 2;
@@ -705,27 +712,38 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
else if (ARM::DQuadSpcRegClass.contains(DestReg, SrcReg))
Opc = ARM::VMOVD, BeginIdx = ARM::dsub_0, SubRegs = 4, Spacing = 2;
- if (Opc) {
- const TargetRegisterInfo *TRI = &getRegisterInfo();
- MachineInstrBuilder Mov;
- for (unsigned i = 0; i != SubRegs; ++i) {
- unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
- unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing);
- assert(Dst && Src && "Bad sub-register");
- Mov = AddDefaultPred(BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
- .addReg(Src));
- // VORR takes two source operands.
- if (Opc == ARM::VORRq)
- Mov.addReg(Src);
- }
- // Add implicit super-register defs and kills to the last instruction.
- Mov->addRegisterDefined(DestReg, TRI);
- if (KillSrc)
- Mov->addRegisterKilled(SrcReg, TRI);
- return;
- }
+ assert(Opc && "Impossible reg-to-reg copy");
+
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineInstrBuilder Mov;
- llvm_unreachable("Impossible reg-to-reg copy");
+ // Copy register tuples backward when the first Dest reg overlaps with SrcReg.
+ if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
+ BeginIdx = BeginIdx + ((SubRegs-1)*Spacing);
+ Spacing = -Spacing;
+ }
+#ifndef NDEBUG
+ SmallSet<unsigned, 4> DstRegs;
+#endif
+ for (unsigned i = 0; i != SubRegs; ++i) {
+ unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i*Spacing);
+ unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i*Spacing);
+ assert(Dst && Src && "Bad sub-register");
+#ifndef NDEBUG
+ assert(!DstRegs.count(Src) && "destructive vector copy");
+ DstRegs.insert(Dst);
+#endif
+ Mov = BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
+ .addReg(Src);
+ // VORR takes two source operands.
+ if (Opc == ARM::VORRq)
+ Mov.addReg(Src);
+ Mov = AddDefaultPred(Mov);
+ }
+ // Add implicit super-register defs and kills to the last instruction.
+ Mov->addRegisterDefined(DestReg, TRI);
+ if (KillSrc)
+ Mov->addRegisterKilled(SrcReg, TRI);
}
static const
@@ -775,6 +793,13 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STMIA))
+ .addFrameIndex(FI))
+ .addMemOperand(MMO);
+ MIB = AddDReg(MIB, SrcReg, ARM::gsub_0, getKillRegState(isKill), TRI);
+ AddDReg(MIB, SrcReg, ARM::gsub_1, 0, TRI);
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -922,6 +947,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
MachineFunction &MF = *MBB.getParent();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
MachineFrameInfo &MFI = *MF.getFrameInfo();
unsigned Align = MFI.getObjectAlignment(FI);
MachineMemOperand *MMO =
@@ -947,6 +973,15 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
if (ARM::DPRRegClass.hasSubClassEq(RC)) {
AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
.addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ } else if (ARM::GPRPairRegClass.hasSubClassEq(RC)) {
+ unsigned LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA : ARM::LDMIA;
+ MachineInstrBuilder MIB =
+ AddDefaultPred(BuildMI(MBB, I, DL, get(LdmOpc))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+ MIB = AddDReg(MIB, DestReg, ARM::gsub_0, RegState::DefineNoRead, TRI);
+ MIB = AddDReg(MIB, DestReg, ARM::gsub_1, RegState::DefineNoRead, TRI);
+ if (TargetRegisterInfo::isPhysicalRegister(DestReg))
+ MIB.addReg(DestReg, RegState::ImplicitDefine);
} else
llvm_unreachable("Unknown reg class!");
break;
@@ -1378,7 +1413,6 @@ bool ARMBaseInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case ARM::VLDRD:
case ARM::VLDRS:
case ARM::t2LDRi8:
- case ARM::t2LDRDi8:
case ARM::t2LDRSHi8:
case ARM::t2LDRi12:
case ARM::t2LDRSHi12:
@@ -1517,6 +1551,14 @@ isProfitableToIfCvt(MachineBasicBlock &TMBB,
return (TCycles + FCycles + TExtra + FExtra) <= UnpredCost;
}
+bool
+ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const {
+ // Reduce false anti-dependencies to let Swift's out-of-order execution
+ // engine do its thing.
+ return Subtarget.isSwift();
+}
+
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
@@ -1569,71 +1611,41 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
}
/// Identify instructions that can be folded into a MOVCC instruction, and
-/// return the corresponding opcode for the predicated pseudo-instruction.
-static unsigned canFoldIntoMOVCC(unsigned Reg, MachineInstr *&MI,
- const MachineRegisterInfo &MRI) {
+/// return the defining instruction.
+static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) {
if (!TargetRegisterInfo::isVirtualRegister(Reg))
return 0;
if (!MRI.hasOneNonDBGUse(Reg))
return 0;
- MI = MRI.getVRegDef(Reg);
+ MachineInstr *MI = MRI.getVRegDef(Reg);
if (!MI)
return 0;
+ // MI is folded into the MOVCC by predicating it.
+ if (!MI->isPredicable())
+ return 0;
// Check if MI has any non-dead defs or physreg uses. This also detects
// predicated instructions which will be reading CPSR.
for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
+ // Reject frame index operands; PEI can't handle the predicated pseudos.
+ if (MO.isFI() || MO.isCPI() || MO.isJTI())
+ return 0;
if (!MO.isReg())
continue;
+ // MI can't have any tied operands; that would conflict with predication.
+ if (MO.isTied())
+ return 0;
if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
return 0;
if (MO.isDef() && !MO.isDead())
return 0;
}
- switch (MI->getOpcode()) {
- default: return 0;
- case ARM::ANDri: return ARM::ANDCCri;
- case ARM::ANDrr: return ARM::ANDCCrr;
- case ARM::ANDrsi: return ARM::ANDCCrsi;
- case ARM::ANDrsr: return ARM::ANDCCrsr;
- case ARM::t2ANDri: return ARM::t2ANDCCri;
- case ARM::t2ANDrr: return ARM::t2ANDCCrr;
- case ARM::t2ANDrs: return ARM::t2ANDCCrs;
- case ARM::EORri: return ARM::EORCCri;
- case ARM::EORrr: return ARM::EORCCrr;
- case ARM::EORrsi: return ARM::EORCCrsi;
- case ARM::EORrsr: return ARM::EORCCrsr;
- case ARM::t2EORri: return ARM::t2EORCCri;
- case ARM::t2EORrr: return ARM::t2EORCCrr;
- case ARM::t2EORrs: return ARM::t2EORCCrs;
- case ARM::ORRri: return ARM::ORRCCri;
- case ARM::ORRrr: return ARM::ORRCCrr;
- case ARM::ORRrsi: return ARM::ORRCCrsi;
- case ARM::ORRrsr: return ARM::ORRCCrsr;
- case ARM::t2ORRri: return ARM::t2ORRCCri;
- case ARM::t2ORRrr: return ARM::t2ORRCCrr;
- case ARM::t2ORRrs: return ARM::t2ORRCCrs;
-
- // ARM ADD/SUB
- case ARM::ADDri: return ARM::ADDCCri;
- case ARM::ADDrr: return ARM::ADDCCrr;
- case ARM::ADDrsi: return ARM::ADDCCrsi;
- case ARM::ADDrsr: return ARM::ADDCCrsr;
- case ARM::SUBri: return ARM::SUBCCri;
- case ARM::SUBrr: return ARM::SUBCCrr;
- case ARM::SUBrsi: return ARM::SUBCCrsi;
- case ARM::SUBrsr: return ARM::SUBCCrsr;
-
- // Thumb2 ADD/SUB
- case ARM::t2ADDri: return ARM::t2ADDCCri;
- case ARM::t2ADDri12: return ARM::t2ADDCCri12;
- case ARM::t2ADDrr: return ARM::t2ADDCCrr;
- case ARM::t2ADDrs: return ARM::t2ADDCCrs;
- case ARM::t2SUBri: return ARM::t2SUBCCri;
- case ARM::t2SUBri12: return ARM::t2SUBCCri12;
- case ARM::t2SUBrr: return ARM::t2SUBCCrr;
- case ARM::t2SUBrs: return ARM::t2SUBCCrs;
- }
+ bool DontMoveAcrossStores = true;
+ if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores))
+ return 0;
+ return MI;
}
bool ARMBaseInstrInfo::analyzeSelect(const MachineInstr *MI,
@@ -1662,19 +1674,18 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
assert((MI->getOpcode() == ARM::MOVCCr || MI->getOpcode() == ARM::t2MOVCCr) &&
"Unknown select instruction");
const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
- MachineInstr *DefMI = 0;
- unsigned Opc = canFoldIntoMOVCC(MI->getOperand(2).getReg(), DefMI, MRI);
- bool Invert = !Opc;
- if (!Opc)
- Opc = canFoldIntoMOVCC(MI->getOperand(1).getReg(), DefMI, MRI);
- if (!Opc)
+ MachineInstr *DefMI = canFoldIntoMOVCC(MI->getOperand(2).getReg(), MRI, this);
+ bool Invert = !DefMI;
+ if (!DefMI)
+ DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
+ if (!DefMI)
return 0;
// Create a new predicated version of DefMI.
// Rfalse is the first use.
MachineInstrBuilder NewMI = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- get(Opc), MI->getOperand(0).getReg())
- .addOperand(MI->getOperand(Invert ? 2 : 1));
+ DefMI->getDesc(),
+ MI->getOperand(0).getReg());
// Copy all the DefMI operands, excluding its (null) predicate.
const MCInstrDesc &DefDesc = DefMI->getDesc();
@@ -1693,6 +1704,15 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
if (NewMI->hasOptionalDef())
AddDefaultCC(NewMI);
+ // The output register value when the predicate is false is an implicit
+ // register operand tied to the first def.
+ // The tie makes the register allocator ensure the FalseReg is allocated the
+ // same register as operand 0.
+ MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
+ FalseReg.setImplicit();
+ NewMI->addOperand(FalseReg);
+ NewMI->tieOperands(0, NewMI->getNumOperands() - 1);
+
// The caller will erase MI, but not DefMI.
DefMI->eraseFromParent();
return NewMI;
@@ -2039,13 +2059,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// Masked compares sometimes use the same register as the corresponding 'and'.
if (CmpMask != ~0) {
- if (!isSuitableForMask(MI, SrcReg, CmpMask, false)) {
+ if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
MI = 0;
for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(SrcReg),
UE = MRI->use_end(); UI != UE; ++UI) {
if (UI->getParent() != CmpInstr->getParent()) continue;
MachineInstr *PotentialAND = &*UI;
- if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true))
+ if (!isSuitableForMask(PotentialAND, SrcReg, CmpMask, true) ||
+ isPredicated(PotentialAND))
continue;
MI = PotentialAND;
break;
@@ -2111,6 +2132,10 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// The single candidate is called MI.
if (!MI) MI = Sub;
+ // We can't use a predicated instruction - it doesn't always write the flags.
+ if (isPredicated(MI))
+ return false;
+
switch (MI->getOpcode()) {
default: break;
case ARM::RSBrr:
@@ -2217,6 +2242,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
// Toggle the optional operand to CPSR.
MI->getOperand(5).setReg(ARM::CPSR);
MI->getOperand(5).setIsDef(true);
+ assert(!isPredicated(MI) && "Can't use flags from predicated instruction");
CmpInstr->eraseFromParent();
// Modify the condition code of operands in OperandsToUpdate.
@@ -2347,6 +2373,260 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
return true;
}
+static unsigned getNumMicroOpsSwiftLdSt(const InstrItineraryData *ItinData,
+ const MachineInstr *MI) {
+ switch (MI->getOpcode()) {
+ default: {
+ const MCInstrDesc &Desc = MI->getDesc();
+ int UOps = ItinData->getNumMicroOps(Desc.getSchedClass());
+ assert(UOps >= 0 && "bad # UOps");
+ return UOps;
+ }
+
+ case ARM::LDRrs:
+ case ARM::LDRBrs:
+ case ARM::STRrs:
+ case ARM::STRBrs: {
+ unsigned ShOpVal = MI->getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 1;
+ return 2;
+ }
+
+ case ARM::LDRH:
+ case ARM::STRH: {
+ if (!MI->getOperand(2).getReg())
+ return 1;
+
+ unsigned ShOpVal = MI->getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 1;
+ return 2;
+ }
+
+ case ARM::LDRSB:
+ case ARM::LDRSH:
+ return (ARM_AM::getAM3Op(MI->getOperand(3).getImm()) == ARM_AM::sub) ? 3:2;
+
+ case ARM::LDRSB_POST:
+ case ARM::LDRSH_POST: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ return (Rt == Rm) ? 4 : 3;
+ }
+
+ case ARM::LDR_PRE_REG:
+ case ARM::LDRB_PRE_REG: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rt == Rm)
+ return 3;
+ unsigned ShOpVal = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 2;
+ return 3;
+ }
+
+ case ARM::STR_PRE_REG:
+ case ARM::STRB_PRE_REG: {
+ unsigned ShOpVal = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 2;
+ return 3;
+ }
+
+ case ARM::LDRH_PRE:
+ case ARM::STRH_PRE: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (!Rm)
+ return 2;
+ if (Rt == Rm)
+ return 3;
+ return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub)
+ ? 3 : 2;
+ }
+
+ case ARM::LDR_POST_REG:
+ case ARM::LDRB_POST_REG:
+ case ARM::LDRH_POST: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ return (Rt == Rm) ? 3 : 2;
+ }
+
+ case ARM::LDR_PRE_IMM:
+ case ARM::LDRB_PRE_IMM:
+ case ARM::LDR_POST_IMM:
+ case ARM::LDRB_POST_IMM:
+ case ARM::STRB_POST_IMM:
+ case ARM::STRB_POST_REG:
+ case ARM::STRB_PRE_IMM:
+ case ARM::STRH_POST:
+ case ARM::STR_POST_IMM:
+ case ARM::STR_POST_REG:
+ case ARM::STR_PRE_IMM:
+ return 2;
+
+ case ARM::LDRSB_PRE:
+ case ARM::LDRSH_PRE: {
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rm == 0)
+ return 3;
+ unsigned Rt = MI->getOperand(0).getReg();
+ if (Rt == Rm)
+ return 4;
+ unsigned ShOpVal = MI->getOperand(4).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ return 3;
+ return 4;
+ }
+
+ case ARM::LDRD: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(2).getReg();
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return (Rt == Rn) ? 3 : 2;
+ }
+
+ case ARM::STRD: {
+ unsigned Rm = MI->getOperand(3).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(4).getImm()) == ARM_AM::sub) ?4:3;
+ return 2;
+ }
+
+ case ARM::LDRD_POST:
+ case ARM::t2LDRD_POST:
+ return 3;
+
+ case ARM::STRD_POST:
+ case ARM::t2STRD_POST:
+ return 4;
+
+ case ARM::LDRD_PRE: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(3).getReg();
+ unsigned Rm = MI->getOperand(4).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return (Rt == Rn) ? 4 : 3;
+ }
+
+ case ARM::t2LDRD_PRE: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(3).getReg();
+ return (Rt == Rn) ? 4 : 3;
+ }
+
+ case ARM::STRD_PRE: {
+ unsigned Rm = MI->getOperand(4).getReg();
+ if (Rm)
+ return (ARM_AM::getAM3Op(MI->getOperand(5).getImm()) == ARM_AM::sub) ?5:4;
+ return 3;
+ }
+
+ case ARM::t2STRD_PRE:
+ return 3;
+
+ case ARM::t2LDR_POST:
+ case ARM::t2LDRB_POST:
+ case ARM::t2LDRB_PRE:
+ case ARM::t2LDRSBi12:
+ case ARM::t2LDRSBi8:
+ case ARM::t2LDRSBpci:
+ case ARM::t2LDRSBs:
+ case ARM::t2LDRH_POST:
+ case ARM::t2LDRH_PRE:
+ case ARM::t2LDRSBT:
+ case ARM::t2LDRSB_POST:
+ case ARM::t2LDRSB_PRE:
+ case ARM::t2LDRSH_POST:
+ case ARM::t2LDRSH_PRE:
+ case ARM::t2LDRSHi12:
+ case ARM::t2LDRSHi8:
+ case ARM::t2LDRSHpci:
+ case ARM::t2LDRSHs:
+ return 2;
+
+ case ARM::t2LDRDi8: {
+ unsigned Rt = MI->getOperand(0).getReg();
+ unsigned Rn = MI->getOperand(2).getReg();
+ return (Rt == Rn) ? 3 : 2;
+ }
+
+ case ARM::t2STRB_POST:
+ case ARM::t2STRB_PRE:
+ case ARM::t2STRBs:
+ case ARM::t2STRDi8:
+ case ARM::t2STRH_POST:
+ case ARM::t2STRH_PRE:
+ case ARM::t2STRHs:
+ case ARM::t2STR_POST:
+ case ARM::t2STR_PRE:
+ case ARM::t2STRs:
+ return 2;
+ }
+}
+
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined, return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+ unsigned Size = 0;
+ for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+ E = MI->memoperands_end(); I != E; ++I) {
+ Size += (*I)->getSize();
+ }
+ return Size / 4;
+}
+
unsigned
ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MachineInstr *MI) const {
@@ -2356,8 +2636,12 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MCInstrDesc &Desc = MI->getDesc();
unsigned Class = Desc.getSchedClass();
int ItinUOps = ItinData->getNumMicroOps(Class);
- if (ItinUOps >= 0)
+ if (ItinUOps >= 0) {
+ if (Subtarget.isSwift() && (Desc.mayLoad() || Desc.mayStore()))
+ return getNumMicroOpsSwiftLdSt(ItinData, MI);
+
return ItinUOps;
+ }
unsigned Opc = MI->getOpcode();
switch (Opc) {
@@ -2426,7 +2710,43 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
case ARM::t2STMIA_UPD:
case ARM::t2STMDB_UPD: {
unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
- if (Subtarget.isCortexA8()) {
+ if (Subtarget.isSwift()) {
+ // rdar://8402126
+ int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
+ switch (Opc) {
+ default: break;
+ case ARM::VLDMDIA_UPD:
+ case ARM::VLDMDDB_UPD:
+ case ARM::VLDMSIA_UPD:
+ case ARM::VLDMSDB_UPD:
+ case ARM::VSTMDIA_UPD:
+ case ARM::VSTMDDB_UPD:
+ case ARM::VSTMSIA_UPD:
+ case ARM::VSTMSDB_UPD:
+ case ARM::LDMIA_UPD:
+ case ARM::LDMDA_UPD:
+ case ARM::LDMDB_UPD:
+ case ARM::LDMIB_UPD:
+ case ARM::STMIA_UPD:
+ case ARM::STMDA_UPD:
+ case ARM::STMDB_UPD:
+ case ARM::STMIB_UPD:
+ case ARM::tLDMIA_UPD:
+ case ARM::tSTMIA_UPD:
+ case ARM::t2LDMIA_UPD:
+ case ARM::t2LDMDB_UPD:
+ case ARM::t2STMIA_UPD:
+ case ARM::t2STMDB_UPD:
+ ++UOps; // One for base register writeback.
+ break;
+ case ARM::LDMIA_RET:
+ case ARM::tPOP_RET:
+ case ARM::t2LDMIA_RET:
+ UOps += 2; // One for base reg wb, one for write to pc.
+ break;
+ }
+ return UOps;
+ } else if (Subtarget.isCortexA8()) {
if (NumRegs < 4)
return 2;
// 4 registers would be issued: 2, 2.
@@ -2435,7 +2755,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
if (NumRegs % 2)
++A8UOps;
return A8UOps;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
int A9UOps = (NumRegs / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2468,7 +2788,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = RegNo / 2 + 1;
if (RegNo % 2)
++DefCycle;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
DefCycle = RegNo;
bool isSLoad = false;
@@ -2512,7 +2832,7 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = 1;
// Result latency is issue cycle + 2: E2.
DefCycle += 2;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
DefCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2543,7 +2863,7 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = RegNo / 2 + 1;
if (RegNo % 2)
++UseCycle;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
UseCycle = RegNo;
bool isSStore = false;
@@ -2584,7 +2904,7 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = 2;
// Read in E3.
UseCycle += 2;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9() || Subtarget.isSwift()) {
UseCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2769,7 +3089,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
const MachineInstr *DefMI,
const MCInstrDesc *DefMCID, unsigned DefAlign) {
int Adjust = 0;
- if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) {
+ if (Subtarget.isCortexA8() || Subtarget.isLikeA9()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID->getOpcode()) {
@@ -2794,9 +3114,40 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
break;
}
}
+ } else if (Subtarget.isSwift()) {
+ // FIXME: Properly handle all of the latency adjustments for address
+ // writeback.
+ switch (DefMCID->getOpcode()) {
+ default: break;
+ case ARM::LDRrs:
+ case ARM::LDRBrs: {
+ unsigned ShOpVal = DefMI->getOperand(3).getImm();
+ bool isSub = ARM_AM::getAM2Op(ShOpVal) == ARM_AM::sub;
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (!isSub &&
+ (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl)))
+ Adjust -= 2;
+ else if (!isSub &&
+ ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+ --Adjust;
+ break;
+ }
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSHs: {
+ // Thumb2 mode: lsl only.
+ unsigned ShAmt = DefMI->getOperand(3).getImm();
+ if (ShAmt == 0 || ShAmt == 1 || ShAmt == 2 || ShAmt == 3)
+ Adjust -= 2;
+ break;
+ }
+ }
}
- if (DefAlign < 8 && Subtarget.isCortexA9()) {
+ if (DefAlign < 8 && Subtarget.isLikeA9()) {
switch (DefMCID->getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -2954,7 +3305,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (Reg == ARM::CPSR) {
if (DefMI->getOpcode() == ARM::FMSTAT) {
// fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
- return Subtarget.isCortexA9() ? 1 : 20;
+ return Subtarget.isLikeA9() ? 1 : 20;
}
// CPSR set and branch can be paired in the same cycle.
@@ -2970,7 +3321,8 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
// instructions).
if (Latency > 0 && Subtarget.isThumb2()) {
const MachineFunction *MF = DefMI->getParent()->getParent();
- if (MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+ if (MF->getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize))
--Latency;
}
return Latency;
@@ -3020,7 +3372,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isCortexA9())
+ if (Subtarget.isLikeA9() || Subtarget.isSwift())
return Latency <= 2 ? 1 : Latency - 1;
else
return Latency <= 3 ? 1 : Latency - 2;
@@ -3037,7 +3389,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
UseMCID, UseIdx, UseAlign);
if (Latency > 1 &&
- (Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
+ (Subtarget.isCortexA8() || Subtarget.isLikeA9())) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID.getOpcode()) {
@@ -3064,9 +3416,36 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
break;
}
}
+ } else if (DefIdx == 0 && Latency > 2 && Subtarget.isSwift()) {
+ // FIXME: Properly handle all of the latency adjustments for address
+ // writeback.
+ switch (DefMCID.getOpcode()) {
+ default: break;
+ case ARM::LDRrs:
+ case ARM::LDRBrs: {
+ unsigned ShOpVal =
+ cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
+ unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
+ if (ShImm == 0 ||
+ ((ShImm == 1 || ShImm == 2 || ShImm == 3) &&
+ ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
+ Latency -= 2;
+ else if (ShImm == 1 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsr)
+ --Latency;
+ break;
+ }
+ case ARM::t2LDRs:
+ case ARM::t2LDRBs:
+ case ARM::t2LDRHs:
+ case ARM::t2LDRSHs: {
+ // Thumb2 mode: lsl 0-3 only.
+ Latency -= 2;
+ break;
+ }
+ }
}
- if (DefAlign < 8 && Subtarget.isCortexA9())
+ if (DefAlign < 8 && Subtarget.isLikeA9())
switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -3190,18 +3569,6 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
-unsigned
-ARMBaseInstrInfo::getOutputLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *DepMI) const {
- unsigned Reg = DefMI->getOperand(DefIdx).getReg();
- if (DepMI->readsRegister(Reg, &getRegisterInfo()) || !isPredicated(DepMI))
- return 1;
-
- // If the second MI is predicated, then there is an implicit use dependency.
- return getInstrLatency(ItinData, DefMI);
-}
-
unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
const MachineInstr *MI,
unsigned *PredCost) const {
@@ -3359,11 +3726,12 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
- // Cortex-A9 is particularly picky about mixing the two and wants these
+ // A9-like cores are particularly picky about mixing the two and want these
// converted.
- if (Subtarget.isCortexA9() && !isPredicated(MI) &&
+ if (Subtarget.isLikeA9() && !isPredicated(MI) &&
(MI->getOpcode() == ARM::VMOVRS ||
- MI->getOpcode() == ARM::VMOVSR))
+ MI->getOpcode() == ARM::VMOVSR ||
+ MI->getOpcode() == ARM::VMOVS))
return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
// No other instructions can be swizzled, so just determine their domain.
@@ -3383,13 +3751,70 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
return std::make_pair(ExeGeneric, 0);
}
+static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
+ unsigned SReg, unsigned &Lane) {
+ unsigned DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_0, &ARM::DPRRegClass);
+ Lane = 0;
+
+ if (DReg != ARM::NoRegister)
+ return DReg;
+
+ Lane = 1;
+ DReg = TRI->getMatchingSuperReg(SReg, ARM::ssub_1, &ARM::DPRRegClass);
+
+ assert(DReg && "S-register with no D super-register?");
+ return DReg;
+}
+
+/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane,
+/// set ImplicitSReg to a register number that must be marked as implicit-use or
+/// zero if no register needs to be defined as implicit-use.
+///
+/// If the function cannot determine if an SPR should be marked implicit use or
+/// not, it returns false.
+///
+/// This function handles cases where an instruction is being modified from taking
+/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict
+/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other
+/// lane of the DPR).
+///
+/// If the other SPR is defined, an implicit-use of it should be added.
+/// Otherwise (including the case where the DPR itself is defined), it should not.
+///
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+ MachineInstr *MI,
+ unsigned DReg, unsigned Lane,
+ unsigned &ImplicitSReg) {
+ // If the DPR is defined or used already, the other SPR lane will be chained
+ // correctly, so there is nothing to be done.
+ if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+ ImplicitSReg = 0;
+ return true;
+ }
+
+ // Otherwise we need to go searching to see if the SPR is set explicitly.
+ ImplicitSReg = TRI->getSubReg(DReg,
+ (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+
+ if (LQR == MachineBasicBlock::LQR_Live)
+ return true;
+ else if (LQR == MachineBasicBlock::LQR_Unknown)
+ return false;
+
+ // If the register is known not to be live, there is no need to add an
+ // implicit-use.
+ ImplicitSReg = 0;
+ return true;
+}
+
void
ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
unsigned DstReg, SrcReg, DReg;
unsigned Lane;
MachineInstrBuilder MIB(MI);
const TargetRegisterInfo *TRI = &getRegisterInfo();
- bool isKill;
switch (MI->getOpcode()) {
default:
llvm_unreachable("cannot handle opcode!");
@@ -3400,82 +3825,294 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// Zap the predicate operands.
assert(!isPredicated(MI) && "Cannot predicate a VORRd");
- MI->RemoveOperand(3);
- MI->RemoveOperand(2);
- // Change to a VORRd which requires two identical use operands.
- MI->setDesc(get(ARM::VORRd));
+ // Source instruction is %DDst = VMOVD %DSrc, 14, %noreg (; implicits)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
- // Add the extra source operand and new predicates.
- // This will go before any implicit ops.
- AddDefaultPred(MachineInstrBuilder(MI).addOperand(MI->getOperand(1)));
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
+
+ // Change to a %DDst = VORRd %DSrc, %DSrc, 14, %noreg (; implicits)
+ MI->setDesc(get(ARM::VORRd));
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(SrcReg)
+ .addReg(SrcReg));
break;
case ARM::VMOVRS:
if (Domain != ExeNEON)
break;
assert(!isPredicated(MI) && "Cannot predicate a VGETLN");
+ // Source instruction is %RDst = VMOVRS %SSrc, 14, %noreg (; implicits)
DstReg = MI->getOperand(0).getReg();
SrcReg = MI->getOperand(1).getReg();
- DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_0, &ARM::DPRRegClass);
- Lane = 0;
- if (DReg == ARM::NoRegister) {
- DReg = TRI->getMatchingSuperReg(SrcReg, ARM::ssub_1, &ARM::DPRRegClass);
- Lane = 1;
- assert(DReg && "S-register with no D super-register?");
- }
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
- MI->RemoveOperand(3);
- MI->RemoveOperand(2);
- MI->RemoveOperand(1);
+ DReg = getCorrespondingDRegAndLane(TRI, SrcReg, Lane);
+ // Convert to %RDst = VGETLNi32 %DSrc, Lane, 14, %noreg (; imps)
+ // Note that DSrc has been widened and the other lane may be undef, which
+ // contaminates the entire register.
MI->setDesc(get(ARM::VGETLNi32));
- MIB.addReg(DReg);
- MIB.addImm(Lane);
+ AddDefaultPred(MIB.addReg(DstReg, RegState::Define)
+ .addReg(DReg, RegState::Undef)
+ .addImm(Lane));
- MIB->getOperand(1).setIsUndef();
+ // The old source should be an implicit use, otherwise we might think it
+ // was dead before here.
MIB.addReg(SrcReg, RegState::Implicit);
-
- AddDefaultPred(MIB);
break;
- case ARM::VMOVSR:
+ case ARM::VMOVSR: {
if (Domain != ExeNEON)
break;
assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
+ // Source instruction is %SDst = VMOVSR %RSrc, 14, %noreg (; implicits)
DstReg = MI->getOperand(0).getReg();
SrcReg = MI->getOperand(1).getReg();
- DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_0, &ARM::DPRRegClass);
- Lane = 0;
- if (DReg == ARM::NoRegister) {
- DReg = TRI->getMatchingSuperReg(DstReg, ARM::ssub_1, &ARM::DPRRegClass);
- Lane = 1;
- assert(DReg && "S-register with no D super-register?");
- }
- isKill = MI->getOperand(0).isKill();
- MI->RemoveOperand(3);
- MI->RemoveOperand(2);
- MI->RemoveOperand(1);
- MI->RemoveOperand(0);
+ DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
+
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+ break;
+
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
+ // Convert to %DDst = VSETLNi32 %DDst, %RSrc, Lane, 14, %noreg (; imps)
+ // Again DDst may be undefined at the beginning of this instruction.
MI->setDesc(get(ARM::VSETLNi32));
- MIB.addReg(DReg);
- MIB.addReg(DReg);
- MIB.addReg(SrcReg);
- MIB.addImm(Lane);
+ MIB.addReg(DReg, RegState::Define)
+ .addReg(DReg, getUndefRegState(!MI->readsRegister(DReg, TRI)))
+ .addReg(SrcReg)
+ .addImm(Lane);
+ AddDefaultPred(MIB);
+
+ // The narrower destination must be marked as set to keep previous chains
+ // in place.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
+ }
+ case ARM::VMOVS: {
+ if (Domain != ExeNEON)
+ break;
+
+ // Source instruction is %SDst = VMOVS %SSrc, 14, %noreg (; implicits)
+ DstReg = MI->getOperand(0).getReg();
+ SrcReg = MI->getOperand(1).getReg();
+
+ unsigned DstLane = 0, SrcLane = 0, DDst, DSrc;
+ DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
+ DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
- MIB->getOperand(1).setIsUndef();
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+ break;
- if (isKill)
- MIB->addRegisterKilled(DstReg, TRI, true);
- MIB->addRegisterDefined(DstReg, TRI);
+ for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
+ MI->RemoveOperand(i-1);
+
+ if (DSrc == DDst) {
+ // Destination can be:
+ // %DDst = VDUPLN32d %DDst, Lane, 14, %noreg (; implicits)
+ MI->setDesc(get(ARM::VDUPLN32d));
+ MIB.addReg(DDst, RegState::Define)
+ .addReg(DDst, getUndefRegState(!MI->readsRegister(DDst, TRI)))
+ .addImm(SrcLane);
+ AddDefaultPred(MIB);
+
+ // Neither the source nor the destination is naturally represented any
+ // more, so add them in manually.
+ MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
+ MIB.addReg(SrcReg, RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
+ break;
+ }
+ // In general there's no single instruction that can perform an S <-> S
+ // move in NEON space, but a pair of VEXT instructions *can* do the
+ // job. It turns out that the VEXTs needed will only use DSrc once, with
+ // the position based purely on the combination of lane-0 and lane-1
+ // involved. For example
+ // vmov s0, s2 -> vext.32 d0, d0, d1, #1 vext.32 d0, d0, d0, #1
+ // vmov s1, s3 -> vext.32 d0, d1, d0, #1 vext.32 d0, d0, d0, #1
+ // vmov s0, s3 -> vext.32 d0, d0, d0, #1 vext.32 d0, d1, d0, #1
+ // vmov s1, s2 -> vext.32 d0, d0, d0, #1 vext.32 d0, d0, d1, #1
+ //
+ // Pattern of the MachineInstrs is:
+ // %DDst = VEXTd32 %DSrc1, %DSrc2, Lane, 14, %noreg (;implicits)
+ MachineInstrBuilder NewMIB;
+ NewMIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(ARM::VEXTd32), DDst);
+
+ // On the first instruction, both DSrc and DDst may be <undef> if present.
+ // Specifically when the original instruction didn't have them as an
+ // <imp-use>.
+ unsigned CurReg = SrcLane == 1 && DstLane == 1 ? DSrc : DDst;
+ bool CurUndef = !MI->readsRegister(CurReg, TRI);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ CurReg = SrcLane == 0 && DstLane == 0 ? DSrc : DDst;
+ CurUndef = !MI->readsRegister(CurReg, TRI);
+ NewMIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ NewMIB.addImm(1);
+ AddDefaultPred(NewMIB);
+
+ if (SrcLane == DstLane)
+ NewMIB.addReg(SrcReg, RegState::Implicit);
+
+ MI->setDesc(get(ARM::VEXTd32));
+ MIB.addReg(DDst, RegState::Define);
+
+ // On the second instruction, DDst has definitely been defined above, so
+ // it is not <undef>. DSrc, if present, can be <undef> as above.
+ CurReg = SrcLane == 1 && DstLane == 0 ? DSrc : DDst;
+ CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ CurReg = SrcLane == 0 && DstLane == 1 ? DSrc : DDst;
+ CurUndef = CurReg == DSrc && !MI->readsRegister(CurReg, TRI);
+ MIB.addReg(CurReg, getUndefRegState(CurUndef));
+
+ MIB.addImm(1);
AddDefaultPred(MIB);
+
+ if (SrcLane != DstLane)
+ MIB.addReg(SrcReg, RegState::Implicit);
+
+ // As before, the original destination is no longer represented, add it
+ // implicitly.
+ MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
+ }
+ }
+
+}
+
+//===----------------------------------------------------------------------===//
+// Partial register updates
+//===----------------------------------------------------------------------===//
+//
+// Swift renames NEON registers with 64-bit granularity. That means any
+// instruction writing an S-reg implicitly reads the containing D-reg. The
+// problem is mostly avoided by translating f32 operations to v2f32 operations
+// on D-registers, but f32 loads are still a problem.
+//
+// These instructions can load an f32 into a NEON register:
+//
+// VLDRS - Only writes S, partial D update.
+// VLD1LNd32 - Writes all D-regs, explicit partial D update, 2 uops.
+// VLD1DUPd32 - Writes all D-regs, no partial reg update, 2 uops.
+//
+// FCONSTD can be used as a dependency-breaking instruction.
+
+
+unsigned ARMBaseInstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI,
+ unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ // Only Swift has partial register update problems.
+ if (!SwiftPartialUpdateClearance || !Subtarget.isSwift())
+ return 0;
+
+ assert(TRI && "Need TRI instance");
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.readsReg())
+ return 0;
+ unsigned Reg = MO.getReg();
+ int UseOp = -1;
+
+ switch(MI->getOpcode()) {
+ // Normal instructions writing only an S-register.
+ case ARM::VLDRS:
+ case ARM::FCONSTS:
+ case ARM::VMOVSR:
+ // rdar://problem/8791586
+ case ARM::VMOVv8i8:
+ case ARM::VMOVv4i16:
+ case ARM::VMOVv2i32:
+ case ARM::VMOVv2f32:
+ case ARM::VMOVv1i64:
+ UseOp = MI->findRegisterUseOperandIdx(Reg, false, TRI);
+ break;
+
+ // Explicitly reads the dependency.
+ case ARM::VLD1LNd32:
+ UseOp = 1;
+ break;
+ default:
+ return 0;
+ }
+
+ // If this instruction actually reads a value from Reg, there is no unwanted
+ // dependency.
+ if (UseOp != -1 && MI->getOperand(UseOp).readsReg())
+ return 0;
+
+ // We must be able to clobber the whole D-reg.
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // Virtual register must be a foo:ssub_0<def,undef> operand.
+ if (!MO.getSubReg() || MI->readsVirtualRegister(Reg))
+ return 0;
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ // Physical register: MI must define the full D-reg.
+ unsigned DReg = TRI->getMatchingSuperReg(Reg, ARM::ssub_0,
+ &ARM::DPRRegClass);
+ if (!DReg || !MI->definesRegister(DReg, TRI))
+ return 0;
}
+ // MI has an unwanted D-register dependency.
+ // Avoid defs in the previous N instructions.
+ return SwiftPartialUpdateClearance;
+}
+
+// Break a partial register dependency after getPartialRegUpdateClearance
+// returned non-zero.
+void ARMBaseInstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI,
+ unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ assert(MI && OpNum < MI->getDesc().getNumDefs() && "OpNum is not a def");
+ assert(TRI && "Need TRI instance");
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ unsigned Reg = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "Can't break virtual register dependencies.");
+ unsigned DReg = Reg;
+
+ // If MI defines an S-reg, find the corresponding D super-register.
+ if (ARM::SPRRegClass.contains(Reg)) {
+ DReg = ARM::D0 + (Reg - ARM::S0) / 2;
+ assert(TRI->isSuperRegister(Reg, DReg) && "Register enums broken");
+ }
+
+ assert(ARM::DPRRegClass.contains(DReg) && "Can only break D-reg deps");
+ assert(MI->definesRegister(DReg, TRI) && "MI doesn't clobber full D-reg");
+
+ // FIXME: In some cases, VLDRS can be changed to a VLD1DUPd32 which defines
+ // the full D-register by loading the same value to both lanes. The
+ // instruction is micro-coded with 2 uops, so don't do this until we can
+ // properly schedule micro-coded instructions. The dispatcher stalls cause
+ // too big regressions.
+
+ // Insert the dependency-breaking FCONSTD before MI.
+ // 96 is the encoding of 0.5, but the actual value doesn't matter here.
+ AddDefaultPred(BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ get(ARM::FCONSTD), DReg).addImm(96));
+ MI->addRegisterKilled(DReg, TRI, true);
}
bool ARMBaseInstrInfo::hasNOP() const {
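
Aside on the copyPhysReg change above: the loop walks the sub-registers of a register tuple and reverses the walk direction (negative Spacing, starting from the last index) whenever the first destination sub-register overlaps the source, so no lane is clobbered before it has been read. The standalone sketch below models that with plain arrays; it is a simplified, hypothetical analogue (essentially memmove's overlap handling), not the MachineInstr-building code itself.

// Sketch: copy a "tuple" of SubRegs lanes from Src to Dst, choosing the copy
// direction so an overlapping destination never clobbers unread source lanes.
// Mirrors the BeginIdx/Spacing reversal in ARMBaseInstrInfo::copyPhysReg.
#include <cassert>
#include <iostream>
#include <vector>

static void copyTuple(std::vector<int> &RegFile, unsigned DstBegin,
                      unsigned SrcBegin, unsigned SubRegs) {
  assert(DstBegin + SubRegs <= RegFile.size() &&
         SrcBegin + SubRegs <= RegFile.size() && "tuple out of range");
  int Begin = 0, Spacing = 1;
  // If the first destination lane overlaps the source range, copy backward.
  if (DstBegin >= SrcBegin && DstBegin < SrcBegin + SubRegs) {
    Begin = SubRegs - 1;
    Spacing = -1;
  }
  for (unsigned i = 0; i != SubRegs; ++i) {
    int Lane = Begin + static_cast<int>(i) * Spacing;
    RegFile[DstBegin + Lane] = RegFile[SrcBegin + Lane];
  }
}

int main() {
  // Lanes 0..4 hold 10..14; copy the 3-lane tuple at lane 0 into lanes 1..3.
  std::vector<int> RegFile = {10, 11, 12, 13, 14};
  copyTuple(RegFile, /*DstBegin=*/1, /*SrcBegin=*/0, /*SubRegs=*/3);
  for (int V : RegFile)
    std::cout << V << ' ';   // 10 10 11 12 14 (backward copy, no clobber)
  std::cout << '\n';
  return 0;
}
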
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 92e5ee8dcbd3..6f38e35124eb 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -182,10 +182,13 @@ public:
virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
unsigned NumCycles,
const BranchProbability
- &Probability) const {
+ &Probability) const {
return NumCycles == 1;
}
+ virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+ MachineBasicBlock &FMBB) const;
+
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
@@ -226,15 +229,18 @@ public:
SDNode *DefNode, unsigned DefIdx,
SDNode *UseNode, unsigned UseIdx) const;
- virtual unsigned getOutputLatency(const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, unsigned DefIdx,
- const MachineInstr *DepMI) const;
-
/// VFP/NEON execution domains.
std::pair<uint16_t, uint16_t>
getExecutionDomain(const MachineInstr *MI) const;
void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ unsigned getPartialRegUpdateClearance(const MachineInstr*, unsigned,
+ const TargetRegisterInfo*) const;
+ void breakPartialRegDependency(MachineBasicBlock::iterator, unsigned,
+ const TargetRegisterInfo *TRI) const;
+ /// Get the number of addresses loaded by an LDM or VLDM, or zero if unknown.
+ unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+
private:
unsigned getInstBundleLength(const MachineInstr *MI) const;
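
Aside on the new hooks declared above: getPartialRegUpdateClearance reports a distance (in instructions) within which a stale D-register definition still causes an unwanted dependency, and breakPartialRegDependency inserts a full-register write to sever it. The sketch below is a loose, hypothetical model of how a pass might consume that pair of hooks; the consumer in LLVM is the execution-dependency fix pass, and none of the names or instruction strings here are its actual API.

// Hypothetical sketch of the clearance idea: if a register was last fully
// written fewer than 'Clearance' instructions ago, break the false dependency
// by inserting a cheap full-register write before the partial update.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Inst {
  std::string Name;
  std::string PartialDef;   // register only partially updated ("" if none)
  unsigned Clearance;       // 0 means "no partial-update hazard"
};

int main() {
  const unsigned SwiftClearance = 12;  // cf. -swift-partial-update-clearance
  std::vector<Inst> Block = {
      {"vadd.f64 d0, d1, d2", "", 0},
      {"vldr s0, [r0]", "d0", SwiftClearance},  // writes only half of d0
  };

  std::map<std::string, unsigned> LastDef;  // register -> index of last full def
  LastDef["d0"] = 0;                        // the vadd defined d0 at index 0

  for (unsigned i = 0; i != Block.size(); ++i) {
    const Inst &I = Block[i];
    if (!I.PartialDef.empty() && LastDef.count(I.PartialDef) &&
        i - LastDef[I.PartialDef] < I.Clearance) {
      // Analogue of breakPartialRegDependency(): emit a dependency breaker.
      std::cout << "vmov.f64 " << I.PartialDef << ", #0.5   ; break dep\n";
      LastDef[I.PartialDef] = i;
    }
    std::cout << I.Name << "\n";
  }
  return 0;
}
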
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 9deb96ea9e01..e5b300fc7792 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -84,6 +84,11 @@ ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
}
+const uint32_t*
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
BitVector ARMBaseRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -106,148 +111,12 @@ getReservedRegs(const MachineFunction &MF) const {
for (unsigned i = 0; i != 16; ++i)
Reserved.set(ARM::D16 + i);
}
- return Reserved;
-}
-
-bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF,
- unsigned Reg) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
- switch (Reg) {
- default: break;
- case ARM::SP:
- case ARM::PC:
- return true;
- case ARM::R6:
- if (hasBasePointer(MF))
- return true;
- break;
- case ARM::R7:
- case ARM::R11:
- if (FramePtr == Reg && TFI->hasFP(MF))
- return true;
- break;
- case ARM::R9:
- return STI.isR9Reserved();
- }
-
- return false;
-}
+ const TargetRegisterClass *RC = &ARM::GPRPairRegClass;
+ for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I)
+ for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI)
+ if (Reserved.test(*SI)) Reserved.set(*I);
-bool
-ARMBaseRegisterInfo::canCombineSubRegIndices(const TargetRegisterClass *RC,
- SmallVectorImpl<unsigned> &SubIndices,
- unsigned &NewSubIdx) const {
-
- unsigned Size = RC->getSize() * 8;
- if (Size < 6)
- return 0;
-
- NewSubIdx = 0; // Whole register.
- unsigned NumRegs = SubIndices.size();
- if (NumRegs == 8) {
- // 8 D registers -> 1 QQQQ register.
- return (Size == 512 &&
- SubIndices[0] == ARM::dsub_0 &&
- SubIndices[1] == ARM::dsub_1 &&
- SubIndices[2] == ARM::dsub_2 &&
- SubIndices[3] == ARM::dsub_3 &&
- SubIndices[4] == ARM::dsub_4 &&
- SubIndices[5] == ARM::dsub_5 &&
- SubIndices[6] == ARM::dsub_6 &&
- SubIndices[7] == ARM::dsub_7);
- } else if (NumRegs == 4) {
- if (SubIndices[0] == ARM::qsub_0) {
- // 4 Q registers -> 1 QQQQ register.
- return (Size == 512 &&
- SubIndices[1] == ARM::qsub_1 &&
- SubIndices[2] == ARM::qsub_2 &&
- SubIndices[3] == ARM::qsub_3);
- } else if (SubIndices[0] == ARM::dsub_0) {
- // 4 D registers -> 1 QQ register.
- if (Size >= 256 &&
- SubIndices[1] == ARM::dsub_1 &&
- SubIndices[2] == ARM::dsub_2 &&
- SubIndices[3] == ARM::dsub_3) {
- if (Size == 512)
- NewSubIdx = ARM::qqsub_0;
- return true;
- }
- } else if (SubIndices[0] == ARM::dsub_4) {
- // 4 D registers -> 1 QQ register (2nd).
- if (Size == 512 &&
- SubIndices[1] == ARM::dsub_5 &&
- SubIndices[2] == ARM::dsub_6 &&
- SubIndices[3] == ARM::dsub_7) {
- NewSubIdx = ARM::qqsub_1;
- return true;
- }
- } else if (SubIndices[0] == ARM::ssub_0) {
- // 4 S registers -> 1 Q register.
- if (Size >= 128 &&
- SubIndices[1] == ARM::ssub_1 &&
- SubIndices[2] == ARM::ssub_2 &&
- SubIndices[3] == ARM::ssub_3) {
- if (Size >= 256)
- NewSubIdx = ARM::qsub_0;
- return true;
- }
- }
- } else if (NumRegs == 2) {
- if (SubIndices[0] == ARM::qsub_0) {
- // 2 Q registers -> 1 QQ register.
- if (Size >= 256 && SubIndices[1] == ARM::qsub_1) {
- if (Size == 512)
- NewSubIdx = ARM::qqsub_0;
- return true;
- }
- } else if (SubIndices[0] == ARM::qsub_2) {
- // 2 Q registers -> 1 QQ register (2nd).
- if (Size == 512 && SubIndices[1] == ARM::qsub_3) {
- NewSubIdx = ARM::qqsub_1;
- return true;
- }
- } else if (SubIndices[0] == ARM::dsub_0) {
- // 2 D registers -> 1 Q register.
- if (Size >= 128 && SubIndices[1] == ARM::dsub_1) {
- if (Size >= 256)
- NewSubIdx = ARM::qsub_0;
- return true;
- }
- } else if (SubIndices[0] == ARM::dsub_2) {
- // 2 D registers -> 1 Q register (2nd).
- if (Size >= 256 && SubIndices[1] == ARM::dsub_3) {
- NewSubIdx = ARM::qsub_1;
- return true;
- }
- } else if (SubIndices[0] == ARM::dsub_4) {
- // 2 D registers -> 1 Q register (3rd).
- if (Size == 512 && SubIndices[1] == ARM::dsub_5) {
- NewSubIdx = ARM::qsub_2;
- return true;
- }
- } else if (SubIndices[0] == ARM::dsub_6) {
- // 2 D registers -> 1 Q register (3rd).
- if (Size == 512 && SubIndices[1] == ARM::dsub_7) {
- NewSubIdx = ARM::qsub_3;
- return true;
- }
- } else if (SubIndices[0] == ARM::ssub_0) {
- // 2 S registers -> 1 D register.
- if (SubIndices[1] == ARM::ssub_1) {
- if (Size >= 128)
- NewSubIdx = ARM::dsub_0;
- return true;
- }
- } else if (SubIndices[0] == ARM::ssub_2) {
- // 2 S registers -> 1 D register (2nd).
- if (Size >= 128 && SubIndices[1] == ARM::ssub_3) {
- NewSubIdx = ARM::dsub_1;
- return true;
- }
- }
- }
- return false;
+ return Reserved;
}
const TargetRegisterClass*
@@ -263,6 +132,7 @@ ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC)
case ARM::QPRRegClassID:
case ARM::QQPRRegClassID:
case ARM::QQQQPRRegClassID:
+ case ARM::GPRPairRegClassID:
return Super;
}
Super = *I++;
@@ -476,7 +346,7 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
bool
ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
// CortexA9 has a Write-after-write hazard for NEON registers.
- if (!STI.isCortexA9())
+ if (!STI.isLikeA9())
return false;
switch (RC->getID()) {
@@ -561,8 +431,9 @@ needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *F = MF.getFunction();
unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
- bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
- F->hasFnAttr(Attribute::StackAlignment));
+ bool requiresRealignment =
+ ((MFI->getMaxAlignment() > StackAlign) ||
+ F->getFnAttributes().hasAttribute(Attributes::StackAlignment));
return requiresRealignment && canRealignStack(MF);
}
@@ -595,6 +466,7 @@ unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
const MachineFunction &MF) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
switch (Reg) {
default: break;
// Return 0 if either register of the pair is a special register.
@@ -603,10 +475,10 @@ unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
case ARM::R3: return ARM::R2;
case ARM::R5: return ARM::R4;
case ARM::R7:
- return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6))
+ return (MRI.isReserved(ARM::R7) || MRI.isReserved(ARM::R6))
? 0 : ARM::R6;
- case ARM::R9: return isReservedReg(MF, ARM::R9) ? 0 :ARM::R8;
- case ARM::R11: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
+ case ARM::R9: return MRI.isReserved(ARM::R9) ? 0 :ARM::R8;
+ case ARM::R11: return MRI.isReserved(ARM::R11) ? 0 : ARM::R10;
case ARM::S1: return ARM::S0;
case ARM::S3: return ARM::S2;
@@ -648,6 +520,7 @@ unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
const MachineFunction &MF) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
switch (Reg) {
default: break;
// Return 0 if either register of the pair is a special register.
@@ -656,10 +529,10 @@ unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
case ARM::R2: return ARM::R3;
case ARM::R4: return ARM::R5;
case ARM::R6:
- return (isReservedReg(MF, ARM::R7) || isReservedReg(MF, ARM::R6))
+ return (MRI.isReserved(ARM::R7) || MRI.isReserved(ARM::R6))
? 0 : ARM::R7;
- case ARM::R8: return isReservedReg(MF, ARM::R9) ? 0 :ARM::R9;
- case ARM::R10: return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
+ case ARM::R8: return MRI.isReserved(ARM::R9) ? 0 :ARM::R9;
+ case ARM::R10: return MRI.isReserved(ARM::R11) ? 0 : ARM::R11;
case ARM::S0: return ARM::S1;
case ARM::S2: return ARM::S3;
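Note: the reserved-register query moves off the target (the removed isReservedReg) and onto the cached bit vector in MachineRegisterInfo, and getReservedRegs() now also marks a GPRPair register reserved whenever either of its halves is. A minimal sketch of how a client checks reservation after this change; the helper name is illustrative and not part of the patch:

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"

    // Illustrative helper: query the cached reserved set instead of a
    // target-specific isReservedReg(MF, Reg).
    static bool isOffLimits(const llvm::MachineFunction &MF, unsigned Reg) {
      const llvm::MachineRegisterInfo &MRI = MF.getRegInfo();
      return MRI.isReserved(Reg); // BitVector computed once via getReservedRegs()
    }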
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index da29f7e711d6..e2bdd046db57 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -96,19 +96,10 @@ public:
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ const uint32_t *getNoPreservedMask() const;
BitVector getReservedRegs(const MachineFunction &MF) const;
- /// canCombineSubRegIndices - Given a register class and a list of
- /// subregister indices, return true if it's possible to combine the
- /// subregister indices into one that corresponds to a larger
- /// subregister. Return the new subregister index by reference. Note the
- /// new index may be zero if the given subregisters can be combined to
- /// form the whole register.
- virtual bool canCombineSubRegIndices(const TargetRegisterClass *RC,
- SmallVectorImpl<unsigned> &SubIndices,
- unsigned &NewSubIdx) const;
-
const TargetRegisterClass*
getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
const TargetRegisterClass*
@@ -170,8 +161,6 @@ public:
unsigned MIFlags = MachineInstr::NoFlags)const;
/// Code Generation virtual methods...
- virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
-
virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index bda1517685b1..b378b9662682 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -190,6 +190,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Callee-saved register lists.
//===----------------------------------------------------------------------===//
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
(sequence "D%u", 15, 8))>;
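Note: CSR_NoRegs is an empty callee-saved list; TableGen emits the matching CSR_NoRegs_RegMask, which the new getNoPreservedMask() returns. A hedged sketch of how such a mask would typically be attached to a call-like instruction (the builder and TRI variables are assumed to be in scope, TRI being the ARM register info):

    // Sketch only: attach the "preserves nothing" mask so the register
    // allocator treats every register as clobbered across this instruction.
    const uint32_t *Mask = TRI->getNoPreservedMask();
    MIB.addRegMask(Mask);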
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index e81b4cc282c9..6adbf4f27e6e 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -47,7 +47,7 @@ namespace {
class ARMCodeEmitter : public MachineFunctionPass {
ARMJITInfo *JTI;
const ARMBaseInstrInfo *II;
- const TargetData *TD;
+ const DataLayout *TD;
const ARMSubtarget *Subtarget;
TargetMachine &TM;
JITCodeEmitter &MCE;
@@ -67,7 +67,7 @@ namespace {
ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
: MachineFunctionPass(ID), JTI(0),
II((const ARMBaseInstrInfo *)tm.getInstrInfo()),
- TD(tm.getTargetData()), TM(tm),
+ TD(tm.getDataLayout()), TM(tm),
MCE(mce), MCPEs(0), MJTEs(0),
IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {}
@@ -376,7 +376,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
"JIT relocation model must be set to static or default!");
JTI = ((ARMBaseTargetMachine &)MF.getTarget()).getJITInfo();
II = (const ARMBaseInstrInfo *)MF.getTarget().getInstrInfo();
- TD = MF.getTarget().getTargetData();
+ TD = MF.getTarget().getDataLayout();
Subtarget = &TM.getSubtarget<ARMSubtarget>();
MCPEs = &MF.getConstantPool()->getConstants();
MJTEs = 0;
@@ -389,7 +389,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
do {
DEBUG(errs() << "JITTing function '"
- << MF.getFunction()->getName() << "'\n");
+ << MF.getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
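Note: this file picks up the TargetData to DataLayout rename (the header lives at llvm/DataLayout.h at this point in the tree) and uses MF.getName() instead of going through the IR Function. A minimal usage sketch, assuming a TargetMachine TM and an llvm::Type *SomeTy in scope:

    #include "llvm/DataLayout.h"

    // Sketch: size queries now go through DataLayout; the call surface used
    // here is the same as the old TargetData.
    const llvm::DataLayout *DL = TM.getDataLayout();
    unsigned Bytes = DL->getTypeAllocSize(SomeTy);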
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index a9539850247f..a57368fdb5d8 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -528,7 +528,7 @@ ARMConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
// identity mapping of CPI's to CPE's.
const std::vector<MachineConstantPoolEntry> &CPs = MCP->getConstants();
- const TargetData &TD = *MF->getTarget().getTargetData();
+ const DataLayout &TD = *MF->getTarget().getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
assert(Size >= 4 && "Too small constant pool entry");
@@ -1388,10 +1388,9 @@ bool ARMConstantIslands::handleConstantPoolUser(unsigned CPUserIndex) {
// If the original WaterList entry was "new water" on this iteration,
// propagate that to the new island. This is just keeping NewWaterList
// updated to match the WaterList, which will be updated below.
- if (NewWaterList.count(WaterBB)) {
- NewWaterList.erase(WaterBB);
+ if (NewWaterList.erase(WaterBB))
NewWaterList.insert(NewIsland);
- }
+
// The new CPE goes before the following block (NewMBB).
NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
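Note: besides the DataLayout rename, the water-list update drops a redundant lookup: erase() already reports whether the element was present, so count()+erase() collapses into one call. Sketch of the idiom with generic names:

    // erase() returns whether anything was removed (bool for SmallPtrSet,
    // the removed count for std::set), so no separate membership test is needed.
    if (Worklist.erase(OldBB))
      Worklist.insert(NewBB);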
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
index 6b98d446b003..ae531c4ea888 100644
--- a/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -102,8 +102,6 @@ public:
virtual void print(raw_ostream &O) const;
void print(raw_ostream *O) const { if (O) print(*O); }
void dump() const;
-
- static bool classof(const ARMConstantPoolValue *) { return true; }
};
inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
@@ -158,7 +156,6 @@ public:
static bool classof(const ARMConstantPoolValue *APV) {
return APV->isGlobalValue() || APV->isBlockAddress() || APV->isLSDA();
}
- static bool classof(const ARMConstantPoolConstant *) { return true; }
};
/// ARMConstantPoolSymbol - ARM-specific constantpool values for external
@@ -192,7 +189,6 @@ public:
static bool classof(const ARMConstantPoolValue *ACPV) {
return ACPV->isExtSymbol();
}
- static bool classof(const ARMConstantPoolSymbol *) { return true; }
};
/// ARMConstantPoolMBB - ARM-specific constantpool value of a machine basic
@@ -225,7 +221,6 @@ public:
static bool classof(const ARMConstantPoolValue *ACPV) {
return ACPV->isMachineBasicBlock();
}
- static bool classof(const ARMConstantPoolMBB *) { return true; }
};
} // End llvm namespace
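Note: the dropped classof(const X *) { return true; } overloads are the identity cases that LLVM-style RTTI no longer needs; isa<> and dyn_cast<> only consult classof on the base-pointer type. A hedged usage sketch, assuming an ARMConstantPoolValue *ACPV in scope (use() is illustrative):

    // Dispatch after the cleanup; only the remaining base-pointer classof
    // overloads in this header are consulted by dyn_cast<>.
    if (const ARMConstantPoolConstant *CPC = dyn_cast<ARMConstantPoolConstant>(ACPV))
      use(CPC->getGV());
    else if (const ARMConstantPoolSymbol *Sym = dyn_cast<ARMConstantPoolSymbol>(ACPV))
      use(Sym->getSymbol());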
diff --git a/lib/Target/ARM/ARMELFWriterInfo.cpp b/lib/Target/ARM/ARMELFWriterInfo.cpp
deleted file mode 100644
index f671317d0948..000000000000
--- a/lib/Target/ARM/ARMELFWriterInfo.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-//===-- ARMELFWriterInfo.cpp - ELF Writer Info for the ARM backend --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the ARM backend.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARMELFWriterInfo.h"
-#include "ARMRelocations.h"
-#include "llvm/Function.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/ELF.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// Implementation of the ARMELFWriterInfo class
-//===----------------------------------------------------------------------===//
-
-ARMELFWriterInfo::ARMELFWriterInfo(TargetMachine &TM)
- : TargetELFWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64,
- TM.getTargetData()->isLittleEndian()) {
-}
-
-ARMELFWriterInfo::~ARMELFWriterInfo() {}
-
-unsigned ARMELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
- switch (MachineRelTy) {
- case ARM::reloc_arm_absolute:
- case ARM::reloc_arm_relative:
- case ARM::reloc_arm_cp_entry:
- case ARM::reloc_arm_vfp_cp_entry:
- case ARM::reloc_arm_machine_cp_entry:
- case ARM::reloc_arm_jt_base:
- case ARM::reloc_arm_pic_jt:
- llvm_unreachable("unsupported ARM relocation type");
-
- case ARM::reloc_arm_branch: return ELF::R_ARM_CALL;
- case ARM::reloc_arm_movt: return ELF::R_ARM_MOVT_ABS;
- case ARM::reloc_arm_movw: return ELF::R_ARM_MOVW_ABS_NC;
- default:
- llvm_unreachable("unknown ARM relocation type");
- }
-}
-
-long int ARMELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier) const {
- llvm_unreachable("ARMELFWriterInfo::getDefaultAddendForRelTy() not "
- "implemented");
-}
-
-unsigned ARMELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
- llvm_unreachable("ARMELFWriterInfo::getRelocationTySize() not implemented");
-}
-
-bool ARMELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
- llvm_unreachable("ARMELFWriterInfo::isPCRelativeRel() not implemented");
-}
-
-unsigned ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
- llvm_unreachable("ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not "
- "implemented");
-}
-
-long int ARMELFWriterInfo::computeRelocation(unsigned SymOffset,
- unsigned RelOffset,
- unsigned RelTy) const {
- llvm_unreachable("ARMELFWriterInfo::getAbsoluteLabelMachineRelTy() not "
- "implemented");
-}
diff --git a/lib/Target/ARM/ARMELFWriterInfo.h b/lib/Target/ARM/ARMELFWriterInfo.h
deleted file mode 100644
index 6a84f8ac4235..000000000000
--- a/lib/Target/ARM/ARMELFWriterInfo.h
+++ /dev/null
@@ -1,59 +0,0 @@
-//===-- ARMELFWriterInfo.h - ELF Writer Info for ARM ------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the ARM backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM_ELF_WRITER_INFO_H
-#define ARM_ELF_WRITER_INFO_H
-
-#include "llvm/Target/TargetELFWriterInfo.h"
-
-namespace llvm {
- class TargetMachine;
-
- class ARMELFWriterInfo : public TargetELFWriterInfo {
- public:
- ARMELFWriterInfo(TargetMachine &TM);
- virtual ~ARMELFWriterInfo();
-
- /// getRelocationType - Returns the target specific ELF Relocation type.
- /// 'MachineRelTy' contains the object code independent relocation type
- virtual unsigned getRelocationType(unsigned MachineRelTy) const;
-
- /// hasRelocationAddend - True if the target uses an addend in the
- /// ELF relocation entry.
- virtual bool hasRelocationAddend() const { return false; }
-
- /// getDefaultAddendForRelTy - Gets the default addend value for a
- /// relocation entry based on the target ELF relocation type.
- virtual long int getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier = 0) const;
-
- /// getRelTySize - Returns the size of relocatable field in bits
- virtual unsigned getRelocationTySize(unsigned RelTy) const;
-
- /// isPCRelativeRel - True if the relocation type is pc relative
- virtual bool isPCRelativeRel(unsigned RelTy) const;
-
- /// getJumpTableRelocationTy - Returns the machine relocation type used
- /// to reference a jumptable.
- virtual unsigned getAbsoluteLabelMachineRelTy() const;
-
- /// computeRelocation - Some relocatable fields could be relocated
- /// directly, avoiding the relocation symbol emission, compute the
- /// final relocation value for this symbol.
- virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
- unsigned RelTy) const;
- };
-
-} // end llvm namespace
-
-#endif // ARM_ELF_WRITER_INFO_H
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 15bb32eb149f..8c45e0b98d8e 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -103,9 +103,9 @@ namespace {
bool IsLoad;
bool isUpdating;
bool hasWritebackOperand;
- NEONRegSpacing RegSpacing;
- unsigned char NumRegs; // D registers loaded or stored
- unsigned char RegElts; // elements per D register; used for lane ops
+ uint8_t RegSpacing; // One of type NEONRegSpacing
+ uint8_t NumRegs; // D registers loaded or stored
+ uint8_t RegElts; // elements per D register; used for lane ops
// FIXME: Temporary flag to denote whether the real instruction takes
// a single register (like the encoding) or all of the registers in
// the list (like the asm syntax and the isel DAG). When all definitions
@@ -377,7 +377,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -442,7 +442,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -493,7 +493,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
unsigned RegElts = TableEntry->RegElts;
@@ -777,9 +777,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
- case ARM::Int_eh_sjlj_dispatchsetup:
- case ARM::Int_eh_sjlj_dispatchsetup_nofp:
- case ARM::tInt_eh_sjlj_dispatchsetup: {
+ case ARM::Int_eh_sjlj_dispatchsetup: {
MachineFunction &MF = *MI.getParent()->getParent();
const ARMBaseInstrInfo *AII =
static_cast<const ARMBaseInstrInfo*>(TII);
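Note: NEONLdStTableEntry shrinks by storing the register spacing in a byte-sized field; users cast back to the enum at the point of use. A small sketch of the packing idiom (enumerator names taken from this file):

    enum NEONRegSpacing { SingleSpc, EvenDblSpc, OddDblSpc };
    struct Entry { uint8_t RegSpacing, NumRegs, RegElts; };

    // At the use site, recover the enum from the packed byte.
    NEONRegSpacing Spc = (NEONRegSpacing)E.RegSpacing;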
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index bf9d16eea181..6611862ca071 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -40,7 +40,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetMachine.h"
@@ -100,51 +100,53 @@ class ARMFastISel : public FastISel {
}
// Code from FastISel.cpp.
- virtual unsigned FastEmitInst_(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC);
- virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill);
- virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill);
- virtual unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- unsigned Op2, bool Op2IsKill);
- virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- uint64_t Imm);
- virtual unsigned FastEmitInst_rf(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- const ConstantFP *FPImm);
- virtual unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- unsigned Op0, bool Op0IsKill,
- unsigned Op1, bool Op1IsKill,
- uint64_t Imm);
- virtual unsigned FastEmitInst_i(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- uint64_t Imm);
- virtual unsigned FastEmitInst_ii(unsigned MachineInstOpcode,
- const TargetRegisterClass *RC,
- uint64_t Imm1, uint64_t Imm2);
-
- virtual unsigned FastEmitInst_extractsubreg(MVT RetVT,
- unsigned Op0, bool Op0IsKill,
- uint32_t Idx);
+ private:
+ unsigned FastEmitInst_(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC);
+ unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill);
+ unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill);
+ unsigned FastEmitInst_rrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill,
+ unsigned Op2, bool Op2IsKill);
+ unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ uint64_t Imm);
+ unsigned FastEmitInst_rf(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ const ConstantFP *FPImm);
+ unsigned FastEmitInst_rri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ unsigned Op0, bool Op0IsKill,
+ unsigned Op1, bool Op1IsKill,
+ uint64_t Imm);
+ unsigned FastEmitInst_i(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ uint64_t Imm);
+ unsigned FastEmitInst_ii(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ uint64_t Imm1, uint64_t Imm2);
+
+ unsigned FastEmitInst_extractsubreg(MVT RetVT,
+ unsigned Op0, bool Op0IsKill,
+ uint32_t Idx);
// Backend specific FastISel code.
+ private:
virtual bool TargetSelectInstruction(const Instruction *I);
virtual unsigned TargetMaterializeConstant(const Constant *C);
virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI);
-
+ private:
#include "ARMGenFastISel.inc"
// Instruction selection routines.
@@ -192,6 +194,7 @@ class ARMFastISel : public FastISel {
unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg);
unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg);
unsigned ARMSelectCallOp(bool UseReg);
+ unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, EVT VT);
// Call handling routines.
private:
@@ -615,11 +618,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
if (VT != MVT::i32) return 0;
Reloc::Model RelocM = TM.getRelocationModel();
-
- // TODO: Need more magic for ARM PIC.
- if (!isThumb2 && (RelocM == Reloc::PIC_)) return 0;
-
- unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+ bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
+ const TargetRegisterClass *RC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
+ unsigned DestReg = createResultReg(RC);
// Use movw+movt when possible, it avoids constant pool entries.
// Darwin targets don't support movt with Reloc::Static, see
@@ -649,6 +652,9 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
Align = TD.getTypeAllocSize(GV->getType());
}
+ if (Subtarget->isTargetELF() && RelocM == Reloc::PIC_)
+ return ARMLowerPICELF(GV, Align, VT);
+
// Grab index.
unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 :
(Subtarget->isThumb() ? 4 : 8);
@@ -666,17 +672,30 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
.addConstantPoolIndex(Idx);
if (RelocM == Reloc::PIC_)
MIB.addImm(Id);
+ AddOptionalDefs(MIB);
} else {
// The extra immediate is for addrmode2.
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(ARM::LDRcp),
DestReg)
.addConstantPoolIndex(Idx)
.addImm(0);
+ AddOptionalDefs(MIB);
+
+ if (RelocM == Reloc::PIC_) {
+ unsigned Opc = IsIndirect ? ARM::PICLDR : ARM::PICADD;
+ unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
+
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(Opc), NewDestReg)
+ .addReg(DestReg)
+ .addImm(Id);
+ AddOptionalDefs(MIB);
+ return NewDestReg;
+ }
}
- AddOptionalDefs(MIB);
}
- if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
+ if (IsIndirect) {
MachineInstrBuilder MIB;
unsigned NewDestReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
@@ -1009,6 +1028,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
RC = &ARM::GPRRegClass;
break;
case MVT::i16:
+ if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
@@ -1021,6 +1043,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
RC = &ARM::GPRRegClass;
break;
case MVT::i32:
+ if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
Opc = ARM::t2LDRi8;
@@ -1127,6 +1152,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::i16:
+ if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
StrOpc = ARM::t2STRHi8;
@@ -1138,6 +1166,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::i32:
+ if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
StrOpc = ARM::t2STRi8;
@@ -1360,6 +1391,11 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
unsigned Opc = isThumb2 ? ARM::tBRIND : ARM::BX;
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc))
.addReg(AddrReg));
+
+ const IndirectBrInst *IB = cast<IndirectBrInst>(I);
+ for (unsigned i = 0, e = IB->getNumSuccessors(); i != e; ++i)
+ FuncInfo.MBB->addSuccessor(FuncInfo.MBBMap[IB->getSuccessor(i)]);
+
return true;
}
@@ -2210,25 +2246,17 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
unsigned CallOpc = ARMSelectCallOp(EnableARMLongCalls);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(CallOpc));
- if (isThumb2) {
- // Explicitly adding the predicate here.
+ // BL / BLX don't take a predicate, but tBL / tBLX do.
+ if (isThumb2)
AddDefaultPred(MIB);
- if (EnableARMLongCalls)
- MIB.addReg(CalleeReg);
- else
- MIB.addExternalSymbol(TLI.getLibcallName(Call));
- } else {
- if (EnableARMLongCalls)
- MIB.addReg(CalleeReg);
- else
- MIB.addExternalSymbol(TLI.getLibcallName(Call));
+ if (EnableARMLongCalls)
+ MIB.addReg(CalleeReg);
+ else
+ MIB.addExternalSymbol(TLI.getLibcallName(Call));
- // Explicitly adding the predicate here.
- AddDefaultPred(MIB);
- }
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i]);
+ MIB.addReg(RegArgs[i], RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2300,16 +2328,16 @@ bool ARMFastISel::SelectCall(const Instruction *I,
ISD::ArgFlagsTy Flags;
unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ if (CS.paramHasAttr(AttrInd, Attributes::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ if (CS.paramHasAttr(AttrInd, Attributes::ZExt))
Flags.setZExt();
// FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
- CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
- CS.paramHasAttr(AttrInd, Attribute::Nest) ||
- CS.paramHasAttr(AttrInd, Attribute::ByVal))
+ if (CS.paramHasAttr(AttrInd, Attributes::InReg) ||
+ CS.paramHasAttr(AttrInd, Attributes::StructRet) ||
+ CS.paramHasAttr(AttrInd, Attributes::Nest) ||
+ CS.paramHasAttr(AttrInd, Attributes::ByVal))
return false;
Type *ArgTy = (*i)->getType();
@@ -2356,30 +2384,20 @@ bool ARMFastISel::SelectCall(const Instruction *I,
unsigned CallOpc = ARMSelectCallOp(UseReg);
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
DL, TII.get(CallOpc));
- if(isThumb2) {
- // Explicitly adding the predicate here.
- AddDefaultPred(MIB);
- if (UseReg)
- MIB.addReg(CalleeReg);
- else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
- else
- MIB.addExternalSymbol(IntrMemName, 0);
- } else {
- if (UseReg)
- MIB.addReg(CalleeReg);
- else if (!IntrMemName)
- MIB.addGlobalAddress(GV, 0, 0);
- else
- MIB.addExternalSymbol(IntrMemName, 0);
- // Explicitly adding the predicate here.
+ // ARM calls don't take a predicate, but tBL / tBLX do.
+ if(isThumb2)
AddDefaultPred(MIB);
- }
+ if (UseReg)
+ MIB.addReg(CalleeReg);
+ else if (!IntrMemName)
+ MIB.addGlobalAddress(GV, 0, 0);
+ else
+ MIB.addExternalSymbol(IntrMemName, 0);
// Add implicit physical register uses to the call.
for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
- MIB.addReg(RegArgs[i]);
+ MIB.addReg(RegArgs[i], RegState::Implicit);
// Add a register mask with the call-preserved registers.
// Proper defs for return values will be added by setPhysRegsDeadExcept().
@@ -2648,7 +2666,7 @@ bool ARMFastISel::SelectShift(const Instruction *I,
unsigned Reg1 = getRegForValue(Src1Value);
if (Reg1 == 0) return false;
- unsigned Reg2;
+ unsigned Reg2 = 0;
if (Opc == ARM::MOVsr) {
Reg2 = getRegForValue(Src2Value);
if (Reg2 == 0) return false;
@@ -2790,6 +2808,47 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
return true;
}
+unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
+ unsigned Align, EVT VT) {
+ bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+ ARMConstantPoolConstant *CPV =
+ ARMConstantPoolConstant::Create(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT);
+ unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+
+ unsigned Opc;
+ unsigned DestReg1 = createResultReg(TLI.getRegClassFor(VT));
+ // Load value.
+ if (isThumb2) {
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL,
+ TII.get(ARM::t2LDRpci), DestReg1)
+ .addConstantPoolIndex(Idx));
+ Opc = UseGOTOFF ? ARM::t2ADDrr : ARM::t2LDRs;
+ } else {
+ // The extra immediate is for addrmode2.
+ AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(ARM::LDRcp), DestReg1)
+ .addConstantPoolIndex(Idx).addImm(0));
+ Opc = UseGOTOFF ? ARM::ADDrr : ARM::LDRrs;
+ }
+
+ unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+ if (GlobalBaseReg == 0) {
+ GlobalBaseReg = MRI.createVirtualRegister(TLI.getRegClassFor(VT));
+ AFI->setGlobalBaseReg(GlobalBaseReg);
+ }
+
+ unsigned DestReg2 = createResultReg(TLI.getRegClassFor(VT));
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt,
+ DL, TII.get(Opc), DestReg2)
+ .addReg(DestReg1)
+ .addReg(GlobalBaseReg);
+ if (!UseGOTOFF)
+ MIB.addImm(0);
+ AddOptionalDefs(MIB);
+
+ return DestReg2;
+}
+
namespace llvm {
FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo) {
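Note: the FastISel changes teach ARMMaterializeGV about ELF PIC. ARMLowerPICELF loads a GOT or GOTOFF constant-pool entry and combines it with the function's global base register (PICADD for GOTOFF, PICLDR for a GOT slot). A conceptual sketch of the computed address, not of the emitted machine-instruction sequence:

    #include <cstdint>

    // local/hidden symbol:  addr = GlobalBase + cpool[GOTOFF(GV)]
    // preemptible symbol:   addr = load(GlobalBase + cpool[GOT(GV)])
    uintptr_t lowerPICAddr(uintptr_t GlobalBase, uintptr_t CPEntry, bool UseGOTOFF) {
      return UseGOTOFF ? GlobalBase + CPEntry
                       : *reinterpret_cast<uintptr_t *>(GlobalBase + CPEntry);
    }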
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index aee72d21e2cc..9392497fd07d 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -153,7 +153,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
int FramePtrSpillFI = 0;
int D8SpillFI = 0;
- // All calls are tail calls in GHC calling conv, and functions have no prologue/epilogue.
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
return;
@@ -360,7 +361,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
int NumBytes = (int)MFI->getStackSize();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
- // All calls are tail calls in GHC calling conv, and functions have no prologue/epilogue.
+ // All calls are tail calls in GHC calling conv, and functions have no
+ // prologue/epilogue.
if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
return;
@@ -1151,7 +1153,7 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
return;
// Naked functions don't spill callee-saved registers.
- if (MF.getFunction()->hasFnAttr(Attribute::Naked))
+ if (MF.getFunction()->getFnAttributes().hasAttribute(Attributes::Naked))
return;
// We are planning to use NEON instructions vst1 / vld1.
@@ -1176,7 +1178,7 @@ static void checkNumAlignedDPRCS2Regs(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned NumSpills = 0;
for (; NumSpills < 8; ++NumSpills)
- if (!MRI.isPhysRegOrOverlapUsed(ARM::D8 + NumSpills))
+ if (!MRI.isPhysRegUsed(ARM::D8 + NumSpills))
break;
// Don't do this for just one d-register. It's not worth it.
@@ -1209,6 +1211,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
*static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
MachineFrameInfo *MFI = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned FramePtr = RegInfo->getFrameRegister(MF);
// Spill R4 if Thumb2 function requires stack realignment - it will be used as
@@ -1218,12 +1221,12 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
if (AFI->isThumb2Function() &&
(MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(MF)))
- MF.getRegInfo().setPhysRegUsed(ARM::R4);
+ MRI.setPhysRegUsed(ARM::R4);
if (AFI->isThumb1OnlyFunction()) {
// Spill LR if Thumb1 function uses variable length argument lists.
if (AFI->getVarArgsRegSaveSize() > 0)
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ MRI.setPhysRegUsed(ARM::LR);
// Spill R4 if Thumb1 epilogue has to restore SP from FP. We don't know
// for sure what the stack size will be, but for this, an estimate is good
@@ -1233,7 +1236,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// FIXME: It will be better just to find spare register here.
unsigned StackSize = estimateStackSize(MF);
if (MFI->hasVarSizedObjects() || StackSize > 508)
- MF.getRegInfo().setPhysRegUsed(ARM::R4);
+ MRI.setPhysRegUsed(ARM::R4);
}
// See if we can spill vector registers to aligned stack.
@@ -1241,7 +1244,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Spill the BasePtr if it's used.
if (RegInfo->hasBasePointer(MF))
- MF.getRegInfo().setPhysRegUsed(RegInfo->getBaseRegister());
+ MRI.setPhysRegUsed(RegInfo->getBaseRegister());
// Don't spill FP if the frame can be eliminated. This is determined
// by scanning the callee-save registers to see if any is used.
@@ -1249,7 +1252,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
for (unsigned i = 0; CSRegs[i]; ++i) {
unsigned Reg = CSRegs[i];
bool Spilled = false;
- if (MF.getRegInfo().isPhysRegOrOverlapUsed(Reg)) {
+ if (MRI.isPhysRegUsed(Reg)) {
Spilled = true;
CanEliminateFrame = false;
}
@@ -1338,7 +1341,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
// Spill LR as well so we can fold BX_RET to the registers restore (LDM).
if (!LRSpilled && CS1Spilled) {
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ MRI.setPhysRegUsed(ARM::LR);
NumGPRSpills++;
UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
@@ -1347,7 +1350,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
if (hasFP(MF)) {
- MF.getRegInfo().setPhysRegUsed(FramePtr);
+ MRI.setPhysRegUsed(FramePtr);
NumGPRSpills++;
}
@@ -1362,16 +1365,16 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Don't spill high register if the function is thumb1
if (!AFI->isThumb1OnlyFunction() ||
isARMLowRegister(Reg) || Reg == ARM::LR) {
- MF.getRegInfo().setPhysRegUsed(Reg);
- if (!RegInfo->isReservedReg(MF, Reg))
+ MRI.setPhysRegUsed(Reg);
+ if (!MRI.isReserved(Reg))
ExtraCSSpill = true;
break;
}
}
} else if (!UnspilledCS2GPRs.empty() && !AFI->isThumb1OnlyFunction()) {
unsigned Reg = UnspilledCS2GPRs.front();
- MF.getRegInfo().setPhysRegUsed(Reg);
- if (!RegInfo->isReservedReg(MF, Reg))
+ MRI.setPhysRegUsed(Reg);
+ if (!MRI.isReserved(Reg))
ExtraCSSpill = true;
}
}
@@ -1389,7 +1392,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
while (NumExtras && !UnspilledCS1GPRs.empty()) {
unsigned Reg = UnspilledCS1GPRs.back();
UnspilledCS1GPRs.pop_back();
- if (!RegInfo->isReservedReg(MF, Reg) &&
+ if (!MRI.isReserved(Reg) &&
(!AFI->isThumb1OnlyFunction() || isARMLowRegister(Reg) ||
Reg == ARM::LR)) {
Extras.push_back(Reg);
@@ -1401,7 +1404,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
while (NumExtras && !UnspilledCS2GPRs.empty()) {
unsigned Reg = UnspilledCS2GPRs.back();
UnspilledCS2GPRs.pop_back();
- if (!RegInfo->isReservedReg(MF, Reg)) {
+ if (!MRI.isReserved(Reg)) {
Extras.push_back(Reg);
NumExtras--;
}
@@ -1409,7 +1412,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
if (Extras.size() && NumExtras == 0) {
for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
- MF.getRegInfo().setPhysRegUsed(Extras[i]);
+ MRI.setPhysRegUsed(Extras[i]);
}
} else if (!AFI->isThumb1OnlyFunction()) {
// note: Thumb1 functions spill to R12, not the stack. Reserve a slot
@@ -1423,7 +1426,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
}
if (ForceLRSpill) {
- MF.getRegInfo().setPhysRegUsed(ARM::LR);
+ MRI.setPhysRegUsed(ARM::LR);
AFI->setLRIsSpilledForFarJump(true);
}
}
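Note: frame lowering now hoists MachineRegisterInfo into a local and routes both register queries through it, replacing the removed target hook and the isPhysRegOrOverlapUsed() call. A minimal sketch of the query pattern, assuming a MachineFunction MF in scope:

    // Both queries now come from MachineRegisterInfo.
    llvm::MachineRegisterInfo &MRI = MF.getRegInfo();
    bool Used     = MRI.isPhysRegUsed(ARM::D8); // replaces isPhysRegOrOverlapUsed
    bool Reserved = MRI.isReserved(ARM::R9);    // replaces RegInfo->isReservedReg(MF, Reg)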
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index a5fd15b6bb97..1240169e84ed 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -47,7 +47,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
// Skip over one non-VFP / NEON instruction.
if (!LastMI->isBarrier() &&
// On A9, AGU and NEON/FPU are muxed.
- !(STI.isCortexA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
+ !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index a3a6c3176bea..efd6d2b8399e 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -239,7 +239,6 @@ private:
/// SelectCMOVOp - Select CMOV instructions for ARM.
SDNode *SelectCMOVOp(SDNode *N);
- SDNode *SelectConditionalOp(SDNode *N);
SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
ARMCC::CondCodes CCVal, SDValue CCR,
SDValue InFlag);
@@ -306,7 +305,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
}
/// \brief Check whether a particular node is a constant value representable as
-/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax).
+/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
///
/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
static bool isScaledConstantInRange(SDValue Node, int Scale,
@@ -337,7 +336,8 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (!CheckVMLxHazard)
return true;
- if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
+ if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() &&
+ !Subtarget->isSwift())
return true;
if (!N->hasOneUse())
@@ -375,12 +375,13 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
ARM_AM::ShiftOpc ShOpcVal,
unsigned ShAmt) {
- if (!Subtarget->isCortexA9())
+ if (!Subtarget->isLikeA9() && !Subtarget->isSwift())
return true;
if (Shift.hasOneUse())
return true;
// R << 2 is free.
- return ShOpcVal == ARM_AM::lsl && ShAmt == 2;
+ return ShOpcVal == ARM_AM::lsl &&
+ (ShAmt == 2 || (Subtarget->isSwift() && ShAmt == 1));
}
bool ARMDAGToDAGISel::SelectImmShifterOperand(SDValue N,
@@ -487,7 +488,7 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
SDValue &Opc) {
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isCortexA9() || N.hasOneUse())) {
+ ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -551,7 +552,8 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+ N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
@@ -585,7 +587,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
SDValue &Offset,
SDValue &Opc) {
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isCortexA9() || N.hasOneUse())) {
+ (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -651,7 +653,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
}
}
- if (Subtarget->isCortexA9() && !N.hasOneUse()) {
+ if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) {
// Compute R +/- (R << N) and reuse it.
Base = N;
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -689,7 +691,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || Subtarget->isSwift() ||
+ N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
@@ -2363,121 +2366,6 @@ SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
}
-SDNode *ARMDAGToDAGISel::SelectConditionalOp(SDNode *N) {
- SDValue FalseVal = N->getOperand(0);
- SDValue TrueVal = N->getOperand(1);
- ARMCC::CondCodes CCVal =
- (ARMCC::CondCodes)cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
- SDValue CCR = N->getOperand(3);
- assert(CCR.getOpcode() == ISD::Register);
- SDValue InFlag = N->getOperand(4);
- SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
- SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
-
- if (Subtarget->isThumb()) {
- SDValue CPTmp0;
- SDValue CPTmp1;
- if (SelectT2ShifterOperandReg(TrueVal, CPTmp0, CPTmp1)) {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::t2ANDCCrs; break;
- case ARMISD::COR: Opc = ARM::t2ORRCCrs; break;
- case ARMISD::CXOR: Opc = ARM::t2EORCCrs; break;
- }
- SDValue Ops[] = {
- FalseVal, FalseVal, CPTmp0, CPTmp1, CC, CCR, Reg0, InFlag
- };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 8);
- }
-
- ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
- if (T) {
- unsigned TrueImm = T->getZExtValue();
- if (is_t2_so_imm(TrueImm)) {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::t2ANDCCri; break;
- case ARMISD::COR: Opc = ARM::t2ORRCCri; break;
- case ARMISD::CXOR: Opc = ARM::t2EORCCri; break;
- }
- SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
- SDValue Ops[] = { FalseVal, FalseVal, True, CC, CCR, Reg0, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 7);
- }
- }
-
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::t2ANDCCrr; break;
- case ARMISD::COR: Opc = ARM::t2ORRCCrr; break;
- case ARMISD::CXOR: Opc = ARM::t2EORCCrr; break;
- }
- SDValue Ops[] = { FalseVal, FalseVal, TrueVal, CC, CCR, Reg0, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 7);
- }
-
- SDValue CPTmp0;
- SDValue CPTmp1;
- SDValue CPTmp2;
- if (SelectImmShifterOperand(TrueVal, CPTmp0, CPTmp2)) {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::ANDCCrsi; break;
- case ARMISD::COR: Opc = ARM::ORRCCrsi; break;
- case ARMISD::CXOR: Opc = ARM::EORCCrsi; break;
- }
- SDValue Ops[] = {
- FalseVal, FalseVal, CPTmp0, CPTmp2, CC, CCR, Reg0, InFlag
- };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 8);
- }
-
- if (SelectRegShifterOperand(TrueVal, CPTmp0, CPTmp1, CPTmp2)) {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::ANDCCrsr; break;
- case ARMISD::COR: Opc = ARM::ORRCCrsr; break;
- case ARMISD::CXOR: Opc = ARM::EORCCrsr; break;
- }
- SDValue Ops[] = {
- FalseVal, FalseVal, CPTmp0, CPTmp1, CPTmp2, CC, CCR, Reg0, InFlag
- };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 9);
- }
-
- ConstantSDNode *T = dyn_cast<ConstantSDNode>(TrueVal);
- if (T) {
- unsigned TrueImm = T->getZExtValue();
- if (is_so_imm(TrueImm)) {
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::ANDCCri; break;
- case ARMISD::COR: Opc = ARM::ORRCCri; break;
- case ARMISD::CXOR: Opc = ARM::EORCCri; break;
- }
- SDValue True = CurDAG->getTargetConstant(TrueImm, MVT::i32);
- SDValue Ops[] = { FalseVal, FalseVal, True, CC, CCR, Reg0, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 7);
- }
- }
-
- unsigned Opc;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ARMISD::CAND: Opc = ARM::ANDCCrr; break;
- case ARMISD::COR: Opc = ARM::ORRCCrr; break;
- case ARMISD::CXOR: Opc = ARM::EORCCrr; break;
- }
- SDValue Ops[] = { FalseVal, FalseVal, TrueVal, CC, CCR, Reg0, InFlag };
- return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 7);
-}
-
/// Target-specific DAG combining for ISD::XOR.
/// Target-independent combining lowers SELECT_CC nodes of the form
/// select_cc setg[ge] X, 0, X, -X
@@ -2753,6 +2641,38 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
dl, MVT::i32, MVT::i32, Ops, 5);
}
}
+ case ARMISD::UMLAL:{
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32)};
+ return CurDAG->getMachineNode(ARM::t2UMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+ }else{
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+ ARM::UMLAL : ARM::UMLALv5,
+ dl, MVT::i32, MVT::i32, Ops, 7);
+ }
+ }
+ case ARMISD::SMLAL:{
+ if (Subtarget->isThumb()) {
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32)};
+ return CurDAG->getMachineNode(ARM::t2SMLAL, dl, MVT::i32, MVT::i32, Ops, 6);
+ }else{
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), getAL(CurDAG),
+ CurDAG->getRegister(0, MVT::i32),
+ CurDAG->getRegister(0, MVT::i32) };
+ return CurDAG->getMachineNode(Subtarget->hasV6Ops() ?
+ ARM::SMLAL : ARM::SMLALv5,
+ dl, MVT::i32, MVT::i32, Ops, 7);
+ }
+ }
case ISD::LOAD: {
SDNode *ResNode = 0;
if (Subtarget->isThumb() && Subtarget->hasThumb2())
@@ -2805,10 +2725,6 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::CMOV:
return SelectCMOVOp(N);
- case ARMISD::CAND:
- case ARMISD::COR:
- case ARMISD::CXOR:
- return SelectConditionalOp(N);
case ARMISD::VZIP: {
unsigned Opc = 0;
EVT VT = N->getValueType(0);
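Note: the selector gains handling for the new ARMISD::UMLAL/SMLAL nodes (64-bit multiply-accumulate) and widens the former Cortex-A9-only heuristics to "A9-like" cores and Swift. As a reference for what the new nodes compute, a small standalone sketch (plain C++, not selection code):

    #include <cstdint>

    // What ARMISD::UMLAL models: (RdHi:RdLo) += Rn * Rm, unsigned 64-bit.
    uint64_t umlal(uint32_t RdLo, uint32_t RdHi, uint32_t Rn, uint32_t Rm) {
      uint64_t Acc = (uint64_t(RdHi) << 32) | RdLo;
      return Acc + uint64_t(Rn) * uint64_t(Rm);
    }
    // SMLAL is the signed analogue: int32_t operands, int64_t accumulator.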
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 190ca076dae5..ff99b04078e8 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -122,6 +122,7 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::SELECT_CC, VT, Expand);
+ setOperationAction(ISD::VSELECT, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
if (VT.isInteger()) {
setOperationAction(ISD::SHL, VT, Custom);
@@ -514,6 +515,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::FLOG10, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
setOperationAction(ISD::FEXP2, MVT::v4f32, Expand);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Expand);
// Neon does not support some operations on v1i64 and v2i64 types.
setOperationAction(ISD::MUL, MVT::v1i64, Expand);
@@ -566,6 +568,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
}
}
+ // ARM and Thumb2 support UMLAL/SMLAL.
+ if (!Subtarget->isThumb1Only())
+ setTargetDAGCombine(ISD::ADDC);
+
+
computeRegisterProperties();
// ARM does not have f32 extending load.
@@ -629,9 +636,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
if (!Subtarget->hasV6Ops())
setOperationAction(ISD::BSWAP, MVT::i32, Expand);
- // These are expanded into libcalls.
- if (!Subtarget->hasDivide() || !Subtarget->isThumb2()) {
- // v7M has a hardware divider
+ if (!(Subtarget->hasDivide() && Subtarget->isThumb2()) &&
+ !(Subtarget->hasDivideInARMMode() && !Subtarget->isThumb())) {
+ // These are expanded into libcalls if the CPU doesn't have a HW divider.
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
}
@@ -791,12 +798,9 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::MUL);
-
- if (Subtarget->hasV6T2Ops() || Subtarget->hasNEON()) {
- setTargetDAGCombine(ISD::AND);
- setTargetDAGCombine(ISD::OR);
- setTargetDAGCombine(ISD::XOR);
- }
+ setTargetDAGCombine(ISD::AND);
+ setTargetDAGCombine(ISD::OR);
+ setTargetDAGCombine(ISD::XOR);
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
@@ -821,7 +825,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
benefitFromCodePlacementOpt = true;
// Prefer likely predicted branches to selects on out-of-order cores.
- predictableSelectIsExpensive = Subtarget->isCortexA9();
+ predictableSelectIsExpensive = Subtarget->isLikeA9();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
@@ -898,9 +902,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
case ARMISD::CMOV: return "ARMISD::CMOV";
- case ARMISD::CAND: return "ARMISD::CAND";
- case ARMISD::COR: return "ARMISD::COR";
- case ARMISD::CXOR: return "ARMISD::CXOR";
case ARMISD::RBIT: return "ARMISD::RBIT";
@@ -984,6 +985,8 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::UMLAL: return "ARMISD::UMLAL";
+ case ARMISD::SMLAL: return "ARMISD::SMLAL";
case ARMISD::BUILD_VECTOR: return "ARMISD::BUILD_VECTOR";
case ARMISD::FMAX: return "ARMISD::FMAX";
case ARMISD::FMIN: return "ARMISD::FMIN";
@@ -1591,19 +1594,19 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// FIXME: handle tail calls differently.
unsigned CallOpc;
+ bool HasMinSizeAttr = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::MinSize);
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- else if (doesNotRet && isDirect && !isARMFunc &&
- Subtarget->hasRAS() && !Subtarget->isThumb1Only())
- // "mov lr, pc; b _foo" to avoid confusing the RSP
- CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
} else {
- if (!isDirect && !Subtarget->hasV5TOps()) {
+ if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- } else if (doesNotRet && isDirect && Subtarget->hasRAS())
+ else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+ // Emit regular call when code size is the priority
+ !HasMinSizeAttr)
// "mov lr, pc; b _foo" to avoid confusing the RSP
CallOpc = ARMISD::CALL_NOLINK;
else
@@ -1653,22 +1656,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// and then confiscate the rest of the parameter registers to insure
/// this.
void
-ARMTargetLowering::HandleByVal(CCState *State, unsigned &size) const {
+ARMTargetLowering::HandleByVal(
+ CCState *State, unsigned &size, unsigned Align) const {
unsigned reg = State->AllocateReg(GPRArgRegs, 4);
assert((State->getCallOrPrologue() == Prologue ||
State->getCallOrPrologue() == Call) &&
"unhandled ParmContext");
if ((!State->isFirstByValRegValid()) &&
(ARM::R0 <= reg) && (reg <= ARM::R3)) {
- State->setFirstByValReg(reg);
- // At a call site, a byval parameter that is split between
- // registers and memory needs its size truncated here. In a
- // function prologue, such byval parameters are reassembled in
- // memory, and are not truncated.
- if (State->getCallOrPrologue() == Call) {
- unsigned excess = 4 * (ARM::R4 - reg);
- assert(size >= excess && "expected larger existing stack allocation");
- size -= excess;
+ if (Subtarget->isAAPCS_ABI() && Align > 4) {
+ unsigned AlignInRegs = Align / 4;
+ unsigned Waste = (ARM::R4 - reg) % AlignInRegs;
+ for (unsigned i = 0; i < Waste; ++i)
+ reg = State->AllocateReg(GPRArgRegs, 4);
+ }
+ if (reg != 0) {
+ State->setFirstByValReg(reg);
+ // At a call site, a byval parameter that is split between
+ // registers and memory needs its size truncated here. In a
+ // function prologue, such byval parameters are reassembled in
+ // memory, and are not truncated.
+ if (State->getCallOrPrologue() == Call) {
+ unsigned excess = 4 * (ARM::R4 - reg);
+ assert(size >= excess && "expected larger existing stack allocation");
+ size -= excess;
+ }
}
}
// Confiscate any remaining parameter registers to preclude their
@@ -1801,6 +1813,14 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
}
}
+ // If Caller's vararg or byval argument has been split between registers and
+ // stack, do not perform tail call, since part of the argument is in caller's
+ // local frame.
+ const ARMFunctionInfo *AFI_Caller = DAG.getMachineFunction().
+ getInfo<ARMFunctionInfo>();
+ if (AFI_Caller->getVarArgsRegSaveSize())
+ return false;
+
// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
@@ -2532,7 +2552,10 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF,
void
ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
DebugLoc dl, SDValue &Chain,
- unsigned ArgOffset) const {
+ const Value *OrigArg,
+ unsigned OffsetFromOrigArg,
+ unsigned ArgOffset,
+ bool ForceMutable) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -2559,7 +2582,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
getPointerTy());
SmallVector<SDValue, 4> MemOps;
- for (; firstRegToSaveIndex < 4; ++firstRegToSaveIndex) {
+ for (unsigned i = 0; firstRegToSaveIndex < 4; ++firstRegToSaveIndex, ++i) {
const TargetRegisterClass *RC;
if (AFI->isThumb1OnlyFunction())
RC = &ARM::tGPRRegClass;
@@ -2570,7 +2593,7 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
SDValue Store =
DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(AFI->getVarArgsFrameIndex()),
+ MachinePointerInfo(OrigArg, OffsetFromOrigArg + 4*i),
false, false, 0);
MemOps.push_back(Store);
FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
@@ -2581,7 +2604,8 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
&MemOps[0], MemOps.size());
} else
// This will point to the next argument passed via stack.
- AFI->setVarArgsFrameIndex(MFI->CreateFixedObject(4, ArgOffset, true));
+ AFI->setVarArgsFrameIndex(
+ MFI->CreateFixedObject(4, ArgOffset, !ForceMutable));
}
SDValue
@@ -2604,14 +2628,16 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
CCInfo.AnalyzeFormalArguments(Ins,
CCAssignFnForNode(CallConv, /* Return*/ false,
isVarArg));
-
+
SmallVector<SDValue, 16> ArgValues;
int lastInsIndex = -1;
-
SDValue ArgValue;
+ Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+ unsigned CurArgIdx = 0;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
-
+ std::advance(CurOrigArg, Ins[VA.getValNo()].OrigArgIndex - CurArgIdx);
+ CurArgIdx = Ins[VA.getValNo()].OrigArgIndex;
// Arguments stored in registers.
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
@@ -2705,14 +2731,20 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// Since they could be overwritten by lowering of arguments in case of
// a tail call.
if (Flags.isByVal()) {
- unsigned VARegSize, VARegSaveSize;
- computeRegArea(CCInfo, MF, VARegSize, VARegSaveSize);
- VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0);
- unsigned Bytes = Flags.getByValSize() - VARegSize;
- if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
- int FI = MFI->CreateFixedObject(Bytes,
- VA.getLocMemOffset(), false);
- InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (!AFI->getVarArgsFrameIndex()) {
+ VarArgStyleRegisters(CCInfo, DAG,
+ dl, Chain, CurOrigArg,
+ Ins[VA.getValNo()].PartOffset,
+ VA.getLocMemOffset(),
+ true /*force mutable frames*/);
+ int VAFrameIndex = AFI->getVarArgsFrameIndex();
+ InVals.push_back(DAG.getFrameIndex(VAFrameIndex, getPointerTy()));
+ } else {
+ int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+ VA.getLocMemOffset(), false);
+ InVals.push_back(DAG.getFrameIndex(FI, getPointerTy()));
+ }
} else {
int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
VA.getLocMemOffset(), true);
@@ -2730,7 +2762,8 @@ ARMTargetLowering::LowerFormalArguments(SDValue Chain,
// varargs
if (isVarArg)
- VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getNextStackOffset());
+ VarArgStyleRegisters(CCInfo, DAG, dl, Chain, 0, 0,
+ CCInfo.getNextStackOffset());
return Chain;
}
@@ -3890,6 +3923,36 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Check whether a VEXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // Assume that the first shuffle index is not UNDEF. Fail if it is.
+ if (M[0] < 0)
+ return false;
+
+ Imm = M[0];
+
+ // If this is a VEXT shuffle, the immediate value is the index of the first
+ // element. The other shuffle indices must be the successive elements after
+ // the first one.
+ unsigned ExpectedElt = Imm;
+ for (unsigned i = 1; i < NumElts; ++i) {
+ // Increment the expected index. If it wraps around, just follow it
+ // back to index zero and keep going.
+ ++ExpectedElt;
+ if (ExpectedElt == NumElts)
+ ExpectedElt = 0;
+
+ if (M[i] < 0) continue; // ignore UNDEF indices
+ if (ExpectedElt != static_cast<unsigned>(M[i]))
+ return false;
+ }
+
+ return true;
+}
+
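A worked example of the mask shape accepted above (values assumed, not from the patch):

    // For a 4-lane vector, the mask {1, 2, 3, 0} rotates the elements by one
    // position, so the check succeeds with Imm == 1 and the shuffle can be
    // selected as a VEXT of the source against itself. A mask such as
    // {1, 3, 2, 0} breaks the successive-element requirement and is rejected;
    // UNDEF lanes (index < 0) are simply skipped.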
static bool isVEXTMask(ArrayRef<int> M, EVT VT,
bool &ReverseVEXT, unsigned &Imm) {
@@ -4157,10 +4220,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
}
// Scan through the operands to see if only one value is used.
+ //
+ // As an optimisation, even if more than one value is used, it may be more
+ // profitable to splat with one value and then change some lanes.
+ //
+ // Heuristically we decide to do this if the vector has a "dominant" value,
+ // defined as splatted to more than half of the lanes.
unsigned NumElts = VT.getVectorNumElements();
bool isOnlyLowElement = true;
bool usesOnlyOneValue = true;
+ bool hasDominantValue = false;
bool isConstant = true;
+
+ // Map of the number of times a particular SDValue appears in the
+ // element list.
+ DenseMap<SDValue, unsigned> ValueCounts;
SDValue Value;
for (unsigned i = 0; i < NumElts; ++i) {
SDValue V = Op.getOperand(i);
@@ -4171,13 +4245,21 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
isConstant = false;
- if (!Value.getNode())
+ ValueCounts.insert(std::make_pair(V, 0));
+ unsigned &Count = ValueCounts[V];
+
+ // Is this value dominant? (takes up more than half of the lanes)
+ if (++Count > (NumElts / 2)) {
+ hasDominantValue = true;
Value = V;
- else if (V != Value)
- usesOnlyOneValue = false;
+ }
}
+ if (ValueCounts.size() != 1)
+ usesOnlyOneValue = false;
+ if (!Value.getNode() && ValueCounts.size() > 0)
+ Value = ValueCounts.begin()->first;
- if (!Value.getNode())
+ if (ValueCounts.size() == 0)
return DAG.getUNDEF(VT);
if (isOnlyLowElement)
@@ -4187,9 +4269,51 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Use VDUP for non-constant splats. For f32 constant splats, reduce to
// i32 and try again.
- if (usesOnlyOneValue && EltSize <= 32) {
- if (!isConstant)
- return DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+ if (hasDominantValue && EltSize <= 32) {
+ if (!isConstant) {
+ SDValue N;
+
+ // If we are VDUPing a value that comes directly from a vector, that will
+ // cause an unnecessary move to and from a GPR, when we could instead
+ // just use VDUPLANE.
+ if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ // We need to create a new undef vector to use for the VDUPLANE if the
+ // size of the vector from which we get the value is different from the
+ // size of the vector that we need to create. We will insert the element
+ // such that the register coalescer will remove unnecessary copies.
+ if (VT != Value->getOperand(0).getValueType()) {
+ ConstantSDNode *constIndex;
+ constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1));
+ assert(constIndex && "The index is not a constant!");
+ unsigned index = constIndex->getAPIntValue().getLimitedValue() %
+ VT.getVectorNumElements();
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
+ Value, DAG.getConstant(index, MVT::i32)),
+ DAG.getConstant(index, MVT::i32));
+ } else {
+ N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+ Value->getOperand(0), Value->getOperand(1));
+ }
+ }
+ else
+ N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
+
+ if (!usesOnlyOneValue) {
+ // The dominant value was splatted as 'N', but we now have to insert
+ // all differing elements.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (Op.getOperand(I) == Value)
+ continue;
+ SmallVector<SDValue, 3> Ops;
+ Ops.push_back(N);
+ Ops.push_back(Op.getOperand(I));
+ Ops.push_back(DAG.getConstant(I, MVT::i32));
+ N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+ }
+ }
+ return N;
+ }
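Sketch of what the dominant-value path above produces (node names abbreviated; illustration only):

    // build_vector <a, a, a, b>          ; 'a' fills 3 of 4 lanes => dominant
    //   N = VDUP a                       ; splat the dominant value
    //   N = INSERT_VECTOR_ELT N, b, 3    ; patch up the one differing lane
    // rather than assembling all four lanes individually.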
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
for (unsigned i = 0; i < NumElts; ++i)
@@ -4201,9 +4325,11 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (Val.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, Val);
}
- SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
- if (Val.getNode())
- return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ if (usesOnlyOneValue) {
+ SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
+ if (isConstant && Val.getNode())
+ return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
+ }
}
// If all elements are constants and the case above didn't get hit, fall back
@@ -4586,6 +4712,12 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
if (isVREVMask(ShuffleMask, VT, 16))
return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+ if (V2->getOpcode() == ISD::UNDEF &&
+ isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
+ return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
+ DAG.getConstant(Imm, MVT::i32));
+ }
+
// Check for Neon shuffles that modify both input vectors in place.
// If both results are used, i.e., if there are two shuffles with the same
// source operands and with masks corresponding to both results of one of
@@ -5421,7 +5553,7 @@ ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
(const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned scratch2 = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
@@ -5532,7 +5664,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
const TargetRegisterClass *TRC = isThumb2 ?
- (const TargetRegisterClass*)&ARM::tGPRRegClass :
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
(const TargetRegisterClass*)&ARM::GPRRegClass;
unsigned scratch = MRI.createVirtualRegister(TRC);
unsigned scratch2 = MRI.createVirtualRegister(TRC);
@@ -5546,7 +5678,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
// ldrex dest, ptr
// (sign extend dest, if required)
// cmp dest, incr
- // cmov.cond scratch2, dest, incr
+ // cmov.cond scratch2, incr, dest
// strex scratch, scratch2, ptr
// cmp scratch, #0
// bne- loopMBB
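For reference (the semantics of the ISD node, not something introduced by this patch), the loop sketched above must implement, for example for ATOMIC_LOAD_MIN:

    // do {
    //   old   = *ptr;
    //   store = (old < incr) ? old : incr;   // min(old, incr) is written back
    // } while (!compare_and_swap(ptr, old, store));
    // The operand order of the conditional move decides whether 'oldval' or
    // 'incr' lands in scratch2, which is what the swap below corrects.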
@@ -5569,7 +5701,7 @@ ARMTargetLowering::EmitAtomicBinaryMinMax(MachineInstr *MI,
AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
.addReg(oldval).addReg(incr));
BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2MOVCCr : ARM::MOVCCr), scratch2)
- .addReg(oldval).addReg(incr).addImm(Cond).addReg(ARM::CPSR);
+ .addReg(incr).addReg(oldval).addImm(Cond).addReg(ARM::CPSR);
MIB = BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
if (strOpc == ARM::t2STREX)
@@ -5939,12 +6071,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
MachineMemOperand::MOLoad |
MachineMemOperand::MOVolatile, 4, 4);
- if (AFI->isThumb1OnlyFunction())
- BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup));
- else if (!Subtarget->hasVFP2())
- BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp));
- else
- BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+ const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+ const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ MIB.addRegMask(RI.getNoPreservedMask());
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
@@ -6016,9 +6151,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getTargetData()->getTypeAllocSize(C->getType());
+ Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
@@ -6105,9 +6240,9 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getTargetData()->getTypeAllocSize(C->getType());
+ Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
unsigned VReg1 = MRI->createVirtualRegister(TRC);
@@ -6154,18 +6289,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
}
// Add the jump table entries as successors to the MBB.
- MachineBasicBlock *PrevMBB = 0;
+ SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
for (std::vector<MachineBasicBlock*>::iterator
I = LPadList.begin(), E = LPadList.end(); I != E; ++I) {
MachineBasicBlock *CurMBB = *I;
- if (PrevMBB != CurMBB)
+ if (SeenMBBs.insert(CurMBB))
DispContBB->addSuccessor(CurMBB);
- PrevMBB = CurMBB;
}
// N.B. the order the invoke BBs are processed in doesn't matter here.
- const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
- const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
SmallVector<MachineBasicBlock*, 64> MBBLPads;
for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
@@ -6279,7 +6411,8 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
UnitSize = 2;
} else {
// Check whether we can use NEON instructions.
- if (!MF->getFunction()->hasFnAttr(Attribute::NoImplicitFloat) &&
+ if (!MF->getFunction()->getFnAttributes().
+ hasAttribute(Attributes::NoImplicitFloat) &&
Subtarget->hasNEON()) {
if ((Align % 16 == 0) && SizeVal >= 16) {
ldrOpc = ARM::VLD1q32wb_fixed;
@@ -6364,7 +6497,8 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
} else {
AddDefaultPred(BuildMI(*BB, MI, dl,
TII->get(ldrOpc),scratch)
- .addReg(srcOut, RegState::Define).addReg(srcIn).addImm(1));
+ .addReg(srcOut, RegState::Define).addReg(srcIn)
+ .addReg(0).addImm(1));
AddDefaultPred(BuildMI(*BB, MI, dl, TII->get(strOpc), destOut)
.addReg(scratch).addReg(destIn)
@@ -6427,9 +6561,9 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const {
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = getTargetData()->getPrefTypeAlignment(Int32Ty);
+ unsigned Align = getDataLayout()->getPrefTypeAlignment(Int32Ty);
if (Align == 0)
- Align = getTargetData()->getTypeAllocSize(C->getType());
+ Align = getDataLayout()->getTypeAllocSize(C->getType());
unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::LDRcp))
@@ -6981,73 +7115,131 @@ static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
return AllOnes ? C->isAllOnesValue() : C->isNullValue();
}
+// Return true if N is conditionally 0 or all ones.
+// Detects these expressions where cc is an i1 value:
+//
+// (select cc 0, y) [AllOnes=0]
+// (select cc y, 0) [AllOnes=0]
+// (zext cc) [AllOnes=0]
+// (sext cc) [AllOnes=0/1]
+// (select cc -1, y) [AllOnes=1]
+// (select cc y, -1) [AllOnes=1]
+//
+// Invert is set when N is the null/all ones constant when CC is false.
+// OtherOp is set to the alternative value of N.
+static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
+ SDValue &CC, bool &Invert,
+ SDValue &OtherOp,
+ SelectionDAG &DAG) {
+ switch (N->getOpcode()) {
+ default: return false;
+ case ISD::SELECT: {
+ CC = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ if (isZeroOrAllOnes(N1, AllOnes)) {
+ Invert = false;
+ OtherOp = N2;
+ return true;
+ }
+ if (isZeroOrAllOnes(N2, AllOnes)) {
+ Invert = true;
+ OtherOp = N1;
+ return true;
+ }
+ return false;
+ }
+ case ISD::ZERO_EXTEND:
+ // (zext cc) can never be the all ones value.
+ if (AllOnes)
+ return false;
+ // Fall through.
+ case ISD::SIGN_EXTEND: {
+ EVT VT = N->getValueType(0);
+ CC = N->getOperand(0);
+ if (CC.getValueType() != MVT::i1)
+ return false;
+ Invert = !AllOnes;
+ if (AllOnes)
+ // When looking for an AllOnes constant, N is an sext, and the 'other'
+ // value is 0.
+ OtherOp = DAG.getConstant(0, VT);
+ else if (N->getOpcode() == ISD::ZERO_EXTEND)
+ // When looking for a 0 constant, N can be zext or sext.
+ OtherOp = DAG.getConstant(1, VT);
+ else
+ OtherOp = DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), VT);
+ return true;
+ }
+ }
+}
+
// Combine a constant select operand into its use:
//
-// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
-// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
+// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
+// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
//
// The transform is rejected if the select doesn't have a constant operand that
-// is null.
+// is null, or all ones when AllOnes is set.
+//
+// Also recognize sext/zext from i1:
+//
+// (add (zext cc), x) -> (select cc (add x, 1), x)
+// (add (sext cc), x) -> (select cc (add x, -1), x)
+//
+// These transformations eventually create predicated instructions.
//
// @param N The node to transform.
// @param Slct The N operand that is a select.
// @param OtherOp The other N operand (x above).
// @param DCI Context.
+// @param AllOnes Require the select constant to be all ones instead of null.
// @returns The new node, or SDValue() on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ bool AllOnes = false) {
SelectionDAG &DAG = DCI.DAG;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = N->getValueType(0);
- unsigned Opc = N->getOpcode();
- bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
- SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
- SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2);
- ISD::CondCode CC = ISD::SETCC_INVALID;
-
- if (isSlctCC) {
- CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get();
- } else {
- SDValue CCOp = Slct.getOperand(0);
- if (CCOp.getOpcode() == ISD::SETCC)
- CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get();
- }
-
- bool DoXform = false;
- bool InvCC = false;
- assert ((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) &&
- "Bad input!");
+ SDValue NonConstantVal;
+ SDValue CCOp;
+ bool SwapSelectOps;
+ if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
+ NonConstantVal, DAG))
+ return SDValue();
- if (isZeroOrAllOnes(LHS, false)) {
- DoXform = true;
- } else if (CC != ISD::SETCC_INVALID && isZeroOrAllOnes(RHS, false)) {
- std::swap(LHS, RHS);
- SDValue Op0 = Slct.getOperand(0);
- EVT OpVT = isSlctCC ? Op0.getValueType() : Op0.getOperand(0).getValueType();
- bool isInt = OpVT.isInteger();
- CC = ISD::getSetCCInverse(CC, isInt);
+ // Slct is now known to be the desired identity constant when CC is true.
+ SDValue TrueVal = OtherOp;
+ SDValue FalseVal = DAG.getNode(N->getOpcode(), N->getDebugLoc(), VT,
+ OtherOp, NonConstantVal);
+ // Unless SwapSelectOps says CC should be false.
+ if (SwapSelectOps)
+ std::swap(TrueVal, FalseVal);
- if (!TLI.isCondCodeLegal(CC, OpVT))
- return SDValue(); // Inverse operator isn't legal.
+ return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
+ CCOp, TrueVal, FalseVal);
+}
- DoXform = true;
- InvCC = true;
+// Attempt combineSelectAndUse on each operand of a commutative operator N.
+static
+SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ if (N0.getNode()->hasOneUse()) {
+ SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes);
+ if (Result.getNode())
+ return Result;
}
-
- if (!DoXform)
- return SDValue();
-
- SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS);
- if (isSlctCC)
- return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result,
- Slct.getOperand(0), Slct.getOperand(1), CC);
- SDValue CCOp = Slct.getOperand(0);
- if (InvCC)
- CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(),
- CCOp.getOperand(0), CCOp.getOperand(1), CC);
- return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
- CCOp, OtherOp, Result);
+ if (N1.getNode()->hasOneUse()) {
+ SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes);
+ if (Result.getNode())
+ return Result;
+ }
+ return SDValue();
}
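A standalone C++ sketch of the rewrite performed above, with plain values standing in for SDNodes (function names are illustrative only):

    // (add (select cc, 0, c), x)  -->  (select cc, x, (add x, c))
    int foldedBefore(bool cc, int c, int x) { return (cc ? 0 : c) + x; }
    int foldedAfter (bool cc, int c, int x) { return cc ? x : (x + c); }
    // Both compute the same result; the rewritten form maps onto a predicated
    // add on ARM instead of materializing the select.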
// AddCombineToVPADDL- For pair-wise add on neon, use the vpaddl instruction
@@ -7139,6 +7331,154 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), VT, tmp);
}
+static SDValue findMUL_LOHI(SDValue V) {
+ if (V->getOpcode() == ISD::UMUL_LOHI ||
+ V->getOpcode() == ISD::SMUL_LOHI)
+ return V;
+ return SDValue();
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+
+ if (Subtarget->isThumb1Only()) return SDValue();
+
+ // Only perform the checks after legalization, when the pattern is available.
+ if (DCI.isBeforeLegalize()) return SDValue();
+
+ // Look for multiply add opportunities.
+ // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
+ // each add node consumes a value from ISD::UMUL_LOHI and there is
+ // a glue link from the first add to the second add.
+ // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
+ // a S/UMLAL instruction.
+ // loAdd UMUL_LOHI
+ // \ / :lo \ :hi
+ // \ / \ [no multiline comment]
+ // ADDC | hiAdd
+ // \ :glue / /
+ // \ / /
+ // ADDE
+ //
+ assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+ SDValue AddcOp0 = AddcNode->getOperand(0);
+ SDValue AddcOp1 = AddcNode->getOperand(1);
+
+ // Check if the two operands are from the same mul_lohi node.
+ if (AddcOp0.getNode() == AddcOp1.getNode())
+ return SDValue();
+
+ assert(AddcNode->getNumValues() == 2 &&
+ AddcNode->getValueType(0) == MVT::i32 &&
+ AddcNode->getValueType(1) == MVT::Glue &&
+ "Expect ADDC with two result values: i32, glue");
+
+ // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+ if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
+ AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
+ AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
+ AddcOp1->getOpcode() != ISD::SMUL_LOHI)
+ return SDValue();
+
+ // Look for the glued ADDE.
+ SDNode* AddeNode = AddcNode->getGluedUser();
+ if (AddeNode == NULL)
+ return SDValue();
+
+ // Make sure it is really an ADDE.
+ if (AddeNode->getOpcode() != ISD::ADDE)
+ return SDValue();
+
+ assert(AddeNode->getNumOperands() == 3 &&
+ AddeNode->getOperand(2).getValueType() == MVT::Glue &&
+ "ADDE node has the wrong inputs");
+
+ // Check for the triangle shape.
+ SDValue AddeOp0 = AddeNode->getOperand(0);
+ SDValue AddeOp1 = AddeNode->getOperand(1);
+
+ // Make sure that the ADDE operands are not coming from the same node.
+ if (AddeOp0.getNode() == AddeOp1.getNode())
+ return SDValue();
+
+ // Find the MUL_LOHI node walking up ADDE's operands.
+ bool IsLeftOperandMUL = false;
+ SDValue MULOp = findMUL_LOHI(AddeOp0);
+ if (MULOp == SDValue())
+ MULOp = findMUL_LOHI(AddeOp1);
+ else
+ IsLeftOperandMUL = true;
+ if (MULOp == SDValue())
+ return SDValue();
+
+ // Figure out the right opcode.
+ unsigned Opc = MULOp->getOpcode();
+ unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
+
+ // Figure out the high and low input values to the MLAL node.
+ SDValue* HiMul = &MULOp;
+ SDValue* HiAdd = NULL;
+ SDValue* LoMul = NULL;
+ SDValue* LowAdd = NULL;
+
+ if (IsLeftOperandMUL)
+ HiAdd = &AddeOp1;
+ else
+ HiAdd = &AddeOp0;
+
+
+ if (AddcOp0->getOpcode() == Opc) {
+ LoMul = &AddcOp0;
+ LowAdd = &AddcOp1;
+ }
+ if (AddcOp1->getOpcode() == Opc) {
+ LoMul = &AddcOp1;
+ LowAdd = &AddcOp0;
+ }
+
+ if (LoMul == NULL)
+ return SDValue();
+
+ if (LoMul->getNode() != HiMul->getNode())
+ return SDValue();
+
+ // Create the merged node.
+ SelectionDAG &DAG = DCI.DAG;
+
+ // Build operand list.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(LoMul->getOperand(0));
+ Ops.push_back(LoMul->getOperand(1));
+ Ops.push_back(*LowAdd);
+ Ops.push_back(*HiAdd);
+
+ SDValue MLALNode = DAG.getNode(FinalOpc, AddcNode->getDebugLoc(),
+ DAG.getVTList(MVT::i32, MVT::i32),
+ &Ops[0], Ops.size());
+
+ // Replace the ADD nodes' uses with the MLAL node's values.
+ SDValue HiMLALResult(MLALNode.getNode(), 1);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+ SDValue LoMLALResult(MLALNode.getNode(), 0);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+
+ // Return original node to notify the driver to stop replacing.
+ SDValue resNode(AddcNode, 0);
+ return resNode;
+}
+
+/// PerformADDCCombine - Target-specific dag combine transform from
+/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL.
+static SDValue PerformADDCCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+
+ return AddCombineTo64bitMLAL(N, DCI, Subtarget);
+
+}
+
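For context, this is the C-level shape whose legalized DAG contains the UMUL_LOHI + ADDC/ADDE triangle matched above (a sketch, not taken from the patch):

    // A 64-bit accumulate of a widening 32x32 multiply: after i64 arithmetic is
    // split into 32-bit halves, the multiply becomes UMUL_LOHI and the
    // accumulate becomes an ADDC/ADDE pair, which the combine fuses into UMLAL.
    unsigned long long umlal(unsigned a, unsigned b, unsigned long long acc) {
      return acc + (unsigned long long)a * b;
    }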
/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
@@ -7153,7 +7493,7 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return Result;
// fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
- if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
+ if (N0.getNode()->hasOneUse()) {
SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
if (Result.getNode()) return Result;
}
@@ -7185,7 +7525,7 @@ static SDValue PerformSUBCombine(SDNode *N,
SDValue N1 = N->getOperand(1);
// fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
- if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) {
+ if (N1.getNode()->hasOneUse()) {
SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
if (Result.getNode()) return Result;
}
@@ -7313,41 +7653,6 @@ static SDValue PerformMULCombine(SDNode *N,
return SDValue();
}
-static bool isCMOVWithZeroOrAllOnesLHS(SDValue N, bool AllOnes) {
- return N.getOpcode() == ARMISD::CMOV && N.getNode()->hasOneUse() &&
- isZeroOrAllOnes(N.getOperand(0), AllOnes);
-}
-
-/// formConditionalOp - Combine an operation with a conditional move operand
-/// to form a conditional op. e.g. (or x, (cmov 0, y, cond)) => (or.cond x, y)
-/// (and x, (cmov -1, y, cond)) => (and.cond, x, y)
-static SDValue formConditionalOp(SDNode *N, SelectionDAG &DAG,
- bool Commutable) {
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
-
- bool isAND = N->getOpcode() == ISD::AND;
- bool isCand = isCMOVWithZeroOrAllOnesLHS(N1, isAND);
- if (!isCand && Commutable) {
- isCand = isCMOVWithZeroOrAllOnesLHS(N0, isAND);
- if (isCand)
- std::swap(N0, N1);
- }
- if (!isCand)
- return SDValue();
-
- unsigned Opc = 0;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected node");
- case ISD::AND: Opc = ARMISD::CAND; break;
- case ISD::OR: Opc = ARMISD::COR; break;
- case ISD::XOR: Opc = ARMISD::CXOR; break;
- }
- return DAG.getNode(Opc, N->getDebugLoc(), N->getValueType(0), N0,
- N1.getOperand(1), N1.getOperand(2), N1.getOperand(3),
- N1.getOperand(4));
-}
-
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -7382,10 +7687,10 @@ static SDValue PerformANDCombine(SDNode *N,
}
if (!Subtarget->isThumb1Only()) {
- // (and x, (cmov -1, y, cond)) => (and.cond x, y)
- SDValue CAND = formConditionalOp(N, DAG, true);
- if (CAND.getNode())
- return CAND;
+ // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
+ SDValue Result = combineSelectAndUseCommutative(N, true, DCI);
+ if (Result.getNode())
+ return Result;
}
return SDValue();
@@ -7425,13 +7730,12 @@ static SDValue PerformORCombine(SDNode *N,
}
if (!Subtarget->isThumb1Only()) {
- // (or x, (cmov 0, y, cond)) => (or.cond x, y)
- SDValue COR = formConditionalOp(N, DAG, true);
- if (COR.getNode())
- return COR;
+ // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+ SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
+ if (Result.getNode())
+ return Result;
}
-
// The code below optimizes (or (and X, Y), Z).
// The AND operand needs to have a single user to make these optimizations
// profitable.
@@ -7593,10 +7897,10 @@ static SDValue PerformXORCombine(SDNode *N,
return SDValue();
if (!Subtarget->isThumb1Only()) {
- // (xor x, (cmov 0, y, cond)) => (xor.cond x, y)
- SDValue CXOR = formConditionalOp(N, DAG, true);
- if (CXOR.getNode())
- return CXOR;
+ // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
+ SDValue Result = combineSelectAndUseCommutative(N, false, DCI);
+ if (Result.getNode())
+ return Result;
}
return SDValue();
@@ -8746,6 +9050,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::ADDC: return PerformADDCCombine(N, DCI, Subtarget);
case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
case ISD::SUB: return PerformSUBCombine(N, DCI);
case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
@@ -8807,8 +9112,8 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
}
bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
- if (!Subtarget->allowsUnalignedMem())
- return false;
+ // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
+ bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
switch (VT.getSimpleVT().SimpleTy) {
default:
@@ -8816,10 +9121,14 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
case MVT::i8:
case MVT::i16:
case MVT::i32:
- return true;
+ // Unaligned access can use (for example) LDRB, LDRH or LDR.
+ return AllowsUnaligned;
case MVT::f64:
- return Subtarget->hasNEON();
- // FIXME: VLD1 etc with standard alignment is legal.
+ case MVT::v2f64:
+ // For any little-endian target with NEON, we can support unaligned ld/st
+ // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+ // A big-endian target may also explicitly support unaligned accesses.
+ return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
}
}
@@ -8838,7 +9147,7 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
// See if we can use NEON instructions for this...
if (IsZeroVal &&
- !F->hasFnAttr(Attribute::NoImplicitFloat) &&
+ !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat) &&
Subtarget->hasNEON()) {
if (memOpAlign(SrcAlign, DstAlign, 16) && Size >= 16) {
return MVT::v4i32;
@@ -9632,7 +9941,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4lane: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getTargetData()->getTypeAllocSize(I.getType()) / 8;
+ uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
@@ -9657,7 +9966,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Type *ArgTy = I.getArgOperand(ArgI)->getType();
if (!ArgTy->isVectorTy())
break;
- NumElts += getTargetData()->getTypeAllocSize(ArgTy) / 8;
+ NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
}
Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
Info.ptrVal = I.getArgOperand(0);
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 51d120507482..4eb3b2cb5150 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -63,9 +63,6 @@ namespace llvm {
FMSTAT, // ARM fmstat instruction.
CMOV, // ARM conditional move instructions.
- CAND, // ARM conditional and instructions.
- COR, // ARM conditional or instructions.
- CXOR, // ARM conditional xor instructions.
BCC_i64,
@@ -176,6 +173,9 @@ namespace llvm {
VMULLs, // ...signed
VMULLu, // ...unsigned
+ UMLAL, // 64bit Unsigned Accumulate Multiply
+ SMLAL, // 64bit Signed Accumulate Multiply
+
// Operands of the standard BUILD_VECTOR node are not legalized, which
// is fine if BUILD_VECTORs are always lowered to shuffles or other
// operations, but for ARM some BUILD_VECTORs are legal as-is and their
@@ -260,6 +260,11 @@ namespace llvm {
virtual const char *getTargetNodeName(unsigned Opcode) const;
+ virtual bool isSelectSupported(SelectSupportKind Kind) const {
+ // ARM does not support scalar condition selects on vectors.
+ return (Kind != ScalarCondVectorVal);
+ }
+
/// getSetCCResultType - Return the value type to use for ISD::SETCC.
virtual EVT getSetCCResultType(EVT VT) const;
@@ -461,7 +466,11 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
void VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
- DebugLoc dl, SDValue &Chain, unsigned ArgOffset)
+ DebugLoc dl, SDValue &Chain,
+ const Value *OrigArg,
+ unsigned OffsetFromOrigArg,
+ unsigned ArgOffset,
+ bool ForceMutable = false)
const;
void computeRegArea(CCState &CCInfo, MachineFunction &MF,
@@ -472,7 +481,7 @@ namespace llvm {
SmallVectorImpl<SDValue> &InVals) const;
/// HandleByVal - Target-specific cleanup for ByVal support.
- virtual void HandleByVal(CCState *, unsigned &) const;
+ virtual void HandleByVal(CCState *, unsigned &, unsigned) const;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index c8966fb97a4c..67a6820932fc 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -846,6 +846,23 @@ class AMiscA1I<bits<8> opcod, bits<4> opc7_4, dag oops, dag iops,
let Inst{3-0} = Rm;
}
+// Division instructions.
+class ADivA1I<bits<3> opcod, dag oops, dag iops,
+ InstrItinClass itin, string opc, string asm, list<dag> pattern>
+ : I<oops, iops, AddrModeNone, 4, IndexModeNone, ArithMiscFrm, itin,
+ opc, asm, "", pattern> {
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+ let Inst{27-23} = 0b01110;
+ let Inst{22-20} = opcod;
+ let Inst{19-16} = Rd;
+ let Inst{15-12} = 0b1111;
+ let Inst{11-8} = Rm;
+ let Inst{7-4} = 0b0001;
+ let Inst{3-0} = Rn;
+}
+
// PKH instructions
def PKHLSLAsmOperand : ImmAsmOperand {
let Name = "PKHLSLImm";
@@ -893,6 +910,10 @@ class ARMV5TPat<dag pattern, dag result> : Pat<pattern, result> {
class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsARM, HasV5TE];
}
+// ARMV5MOPat - Same as ARMV5TEPat with UseMulOps.
+class ARMV5MOPat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [IsARM, HasV5TE, UseMulOps];
+}
class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsARM, HasV6];
}
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 31b0c41f08f5..a0b6f249a286 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -13,13 +13,17 @@
#include "ARMInstrInfo.h"
#include "ARM.h"
+#include "ARMConstantPoolValue.h"
#include "ARMMachineFunctionInfo.h"
+#include "ARMTargetMachine.h"
#include "MCTargetDesc/ARMAddressingModes.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
using namespace llvm;
@@ -84,3 +88,61 @@ unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
return 0;
}
+
+namespace {
+ /// ARMCGBR - Create Global Base Reg pass. This initializes the PIC
+ /// global base register for ARM ELF.
+ struct ARMCGBR : public MachineFunctionPass {
+ static char ID;
+ ARMCGBR() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF) {
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+ if (AFI->getGlobalBaseReg() == 0)
+ return false;
+
+ const ARMTargetMachine *TM =
+ static_cast<const ARMTargetMachine *>(&MF.getTarget());
+ if (TM->getRelocationModel() != Reloc::PIC_)
+ return false;
+
+ LLVMContext* Context = &MF.getFunction()->getContext();
+ GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
+ GlobalValue::ExternalLinkage, 0,
+ "_GLOBAL_OFFSET_TABLE_");
+ unsigned Id = AFI->createPICLabelUId();
+ ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id);
+ unsigned Align = TM->getDataLayout()->getPrefTypeAlignment(GV->getType());
+ unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
+
+ MachineBasicBlock &FirstMBB = MF.front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+ unsigned GlobalBaseReg = AFI->getGlobalBaseReg();
+ unsigned Opc = TM->getSubtarget<ARMSubtarget>().isThumb2() ?
+ ARM::t2LDRpci : ARM::LDRcp;
+ const TargetInstrInfo &TII = *TM->getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(FirstMBB, MBBI, DL,
+ TII.get(Opc), GlobalBaseReg)
+ .addConstantPoolIndex(Idx);
+ if (Opc == ARM::LDRcp)
+ MIB.addImm(0);
+ AddDefaultPred(MIB);
+
+ return true;
+ }
+
+ virtual const char *getPassName() const {
+ return "ARM PIC Global Base Reg Initialization";
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char ARMCGBR::ID = 0;
+FunctionPass*
+llvm::createARMGlobalBaseRegPass() { return new ARMCGBR(); }
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 992aba5803f6..df2e55ed5c0e 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -83,6 +83,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
SDTCisInt<0>,
SDTCisVT<1, i32>,
SDTCisVT<4, i32>]>;
+
+def SDT_ARM64bitmlal : SDTypeProfile<2,4, [ SDTCisVT<0, i32>, SDTCisVT<1, i32>,
+ SDTCisVT<2, i32>, SDTCisVT<3, i32>,
+ SDTCisVT<4, i32>, SDTCisVT<5, i32> ] >;
+def ARMUmlal : SDNode<"ARMISD::UMLAL", SDT_ARM64bitmlal>;
+def ARMSmlal : SDNode<"ARMISD::SMLAL", SDT_ARM64bitmlal>;
+
// Node definitions.
def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>;
def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>;
@@ -90,9 +97,10 @@ def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>;
def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>;
def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
- [SDNPHasChain, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
def ARMcallseq_end : SDNode<"ISD::CALLSEQ_END", SDT_ARMCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPOptInGlue, SDNPOutGlue]>;
def ARMcopystructbyval : SDNode<"ARMISD::COPY_STRUCT_BYVAL" ,
SDT_ARMStructByVal,
[SDNPHasChain, SDNPInGlue, SDNPOutGlue,
@@ -148,14 +156,16 @@ def ARMsube : SDNode<"ARMISD::SUBE", SDTBinaryArithWithFlagsInOut>;
def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP",
- SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>;
+ SDT_ARMEH_SJLJ_Setjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP",
- SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>;
+ SDT_ARMEH_SJLJ_Longjmp,
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMMemBarrierMCR : SDNode<"ARMISD::MEMBARRIER_MCR", SDT_ARMMEMBARRIER,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPSideEffect]>;
def ARMPreload : SDNode<"ARMISD::PRELOAD", SDT_ARMPREFETCH,
[SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
@@ -197,6 +207,8 @@ def HasFP16 : Predicate<"Subtarget->hasFP16()">,
AssemblerPredicate<"FeatureFP16","half-float">;
def HasDivide : Predicate<"Subtarget->hasDivide()">,
AssemblerPredicate<"FeatureHWDiv", "divide">;
+def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
+ AssemblerPredicate<"FeatureHWDivARM">;
def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
AssemblerPredicate<"FeatureT2XtPk",
"pack/extract">;
@@ -232,6 +244,7 @@ def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
def UseMovt : Predicate<"Subtarget->useMovt()">;
def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
+def UseMulOps : Predicate<"Subtarget->useMulOps()">;
// Prefer fused MAC for fp mul + add over fp VMLA / VMLS if they are available.
// But only select them if more precision in FP computation is allowed.
@@ -242,6 +255,20 @@ def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
def DontUseFusedMAC : Predicate<"!Subtarget->hasVFP4() || "
"Subtarget->isTargetDarwin()">;
+// VGETLNi32 is microcoded on Swift - prefer VMOV.
+def HasFastVGETLNi32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVGETLNi32 : Predicate<"Subtarget->isSwift()">;
+
+// VDUP.32 is microcoded on Swift - prefer VMOV.
+def HasFastVDUP32 : Predicate<"!Subtarget->isSwift()">;
+def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
+
+// Cortex-A9 prefers VMOVSR to VMOVDRR even when using NEON for scalar FP, as
+// this allows more effective execution domain optimization. See
+// setExecutionDomain().
+def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
+
def IsLE : Predicate<"TLI.isLittleEndian()">;
def IsBE : Predicate<"TLI.isBigEndian()">;
@@ -256,15 +283,13 @@ class RegConstraint<string C> {
// ARM specific transformation functions and pattern fragments.
//
-// imm_neg_XFORM - Return a imm value packed into the format described for
-// imm_neg defs below.
+// imm_neg_XFORM - Return the negation of an i32 immediate value.
def imm_neg_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
}]>;
-// so_imm_not_XFORM - Return a so_imm value packed into the format described for
-// so_imm_not def below.
-def so_imm_not_XFORM : SDNodeXForm<imm, [{
+// imm_not_XFORM - Return the complement of an i32 immediate value.
+def imm_not_XFORM : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
}]>;
@@ -275,7 +300,7 @@ def imm16_31 : ImmLeaf<i32, [{
def so_imm_neg_asmoperand : AsmOperandClass { let Name = "ARMSOImmNeg"; }
def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
- int64_t Value = -(int)N->getZExtValue();
+ unsigned Value = -(unsigned)N->getZExtValue();
return Value && ARM_AM::getSOImmVal(Value) != -1;
}], imm_neg_XFORM> {
let ParserMatchClass = so_imm_neg_asmoperand;
@@ -287,7 +312,7 @@ def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
def so_imm_not_asmoperand : AsmOperandClass { let Name = "ARMSOImmNot"; }
def so_imm_not : Operand<i32>, PatLeaf<(imm), [{
return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1;
- }], so_imm_not_XFORM> {
+ }], imm_not_XFORM> {
let ParserMatchClass = so_imm_not_asmoperand;
}
@@ -1791,12 +1816,15 @@ def ADR : AI1<{0,?,?,0}, (outs GPR:$Rd), (ins adrlabel:$label),
let Inst{15-12} = Rd;
let Inst{11-0} = label{11-0};
}
+
+let hasSideEffects = 1 in {
def LEApcrel : ARMPseudoInst<(outs GPR:$Rd), (ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>;
def LEApcrelJT : ARMPseudoInst<(outs GPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
4, IIC_iALUi, []>;
+}
//===----------------------------------------------------------------------===//
// Control Flow Instructions.
@@ -3079,15 +3107,19 @@ def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
(SUBSri GPR:$src, so_imm_neg:$imm)>;
def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
- (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[IsARM, HasV6T2]>;
def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
- (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[IsARM, HasV6T2]>;
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
// for part of the negation.
def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR),
(SBCri GPR:$src, so_imm_not:$imm)>;
+def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR),
+ (SBCrr GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>;
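The identity behind matching the bitwise complement here can be checked directly (a worked note, not part of the patch):

    // adde(a, ~imm, carry) = a + ~imm + carry
    //                      = a + (-imm - 1) + carry     // since ~imm == -imm - 1
    //                      = a - imm - (1 - carry)      // exactly ARM SBC semantics
    // so adding the complemented immediate with carry-in equals subtracting the
    // immediate itself with borrow.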
// Note: These are implemented in C++ code, because they have to generate
// ADD/SUBrs instructions, which use a complex pattern that a xform function
@@ -3399,6 +3431,18 @@ class AsMul1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
let Inst{11-8} = Rm;
let Inst{3-0} = Rn;
}
+class AsMla1I64<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : AsMul1I<opcod, oops, iops, itin, opc, asm, pattern> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rm;
+ bits<4> Rn;
+ let Inst{19-16} = RdHi;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = Rm;
+ let Inst{3-0} = Rn;
+}
// FIXME: The v5 pseudos are only necessary for the additional Constraint
// property. Remove them when it's possible to add those properties
@@ -3419,13 +3463,13 @@ def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
4, IIC_iMUL32,
[(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
(MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
- Requires<[IsARM, NoV6]>;
+ Requires<[IsARM, NoV6, UseMulOps]>;
}
def MLA : AsMul1I32<0b0000001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mul GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6]> {
+ Requires<[IsARM, HasV6, UseMulOps]> {
bits<4> Ra;
let Inst{15-12} = Ra;
}
@@ -3441,7 +3485,7 @@ def MLAv5: ARMPseudoExpand<(outs GPR:$Rd),
def MLS : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
- Requires<[IsARM, HasV6T2]> {
+ Requires<[IsARM, HasV6T2, UseMulOps]> {
bits<4> Rd;
bits<4> Rm;
bits<4> Rn;
@@ -3481,14 +3525,14 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
}
// Multiply + accumulate
-def SMLAL : AsMul1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
-def UMLAL : AsMul1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
"umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
- Requires<[IsARM, HasV6]>;
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm), IIC_iMAC64,
@@ -3504,17 +3548,22 @@ def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
let Inst{3-0} = Rn;
}
-let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
+let Constraints = "$RLo = $RdLo,$RHi = $RdHi" in {
def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
- (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+ pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>;
def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
- (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
+ (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
4, IIC_iMAC64, [],
- (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
+ (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
+ pred:$p, cc_out:$s)>,
Requires<[IsARM, NoV6]>;
+}
+
+let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
def UMAALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
(ins GPR:$Rn, GPR:$Rm, pred:$p),
4, IIC_iMAC64, [],
@@ -3542,7 +3591,7 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6, UseMulOps]>;
def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
@@ -3552,7 +3601,7 @@ def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
- Requires<[IsARM, HasV6]>;
+ Requires<[IsARM, HasV6, UseMulOps]>;
def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
(ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
@@ -3606,7 +3655,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd, (add GPR:$Ra,
(opnode (sext_inreg GPRnopc:$Rn, i16),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3614,7 +3663,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (opnode (sext_inreg GPRnopc:$Rn, i16),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3622,7 +3671,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
(sext_inreg GPRnopc:$Rm, i16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3630,7 +3679,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (opnode (sra GPRnopc:$Rn, (i32 16)),
(sra GPRnopc:$Rm, (i32 16)))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3638,7 +3687,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
(sext_inreg GPRnopc:$Rm, i16)), (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
(ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -3646,7 +3695,7 @@ multiclass AI_smla<string opc, PatFrag opnode> {
[(set GPRnopc:$Rd,
(add GPR:$Ra, (sra (opnode GPRnopc:$Rn,
(sra GPRnopc:$Rm, (i32 16))), (i32 16))))]>,
- Requires<[IsARM, HasV5TE]>;
+ Requires<[IsARM, HasV5TE, UseMulOps]>;
}
}
@@ -3749,6 +3798,19 @@ defm SMUA : AI_sdml<0, "smua">;
defm SMUS : AI_sdml<1, "smus">;
//===----------------------------------------------------------------------===//
+// Division Instructions (ARMv7-A with virtualization extension)
+//
+def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+ "sdiv", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasDivideInARM]>;
+
+def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
+ "udiv", "\t$Rd, $Rn, $Rm",
+ [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
+ Requires<[IsARM, HasDivideInARM]>;
+
+//===----------------------------------------------------------------------===//
// Misc. Arithmetic Instructions.
//
@@ -3986,48 +4048,6 @@ def MVNCCi : ARMPseudoInst<(outs GPR:$Rd),
[/*(set GPR:$Rd, (ARMcmov GPR:$false, so_imm_not:$imm, imm:$cc, CCR:$ccr))*/]>,
RegConstraint<"$false = $Rd">;
-// Conditional instructions
-multiclass AsI1_bincc_irs<Instruction iri, Instruction irr, Instruction irsi,
- Instruction irsr,
- InstrItinClass iii, InstrItinClass iir,
- InstrItinClass iis> {
- def ri : ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rfalse, GPR:$Rn, so_imm:$imm,
- pred:$p, cc_out:$s),
- 4, iii, [],
- (iri GPR:$Rd, GPR:$Rn, so_imm:$imm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def rr : ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rfalse, GPR:$Rn, GPR:$Rm,
- pred:$p, cc_out:$s),
- 4, iir, [],
- (irr GPR:$Rd, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def rsi : ARMPseudoExpand<(outs GPR:$Rd),
- (ins GPR:$Rfalse, GPR:$Rn, so_reg_imm:$shift,
- pred:$p, cc_out:$s),
- 4, iis, [],
- (irsi GPR:$Rd, GPR:$Rn, so_reg_imm:$shift, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def rsr : ARMPseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, so_reg_reg:$shift,
- pred:$p, cc_out:$s),
- 4, iis, [],
- (irsr GPR:$Rd, GPR:$Rn, so_reg_reg:$shift, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
-}
-
-defm ANDCC : AsI1_bincc_irs<ANDri, ANDrr, ANDrsi, ANDrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm ORRCC : AsI1_bincc_irs<ORRri, ORRrr, ORRrsi, ORRrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm EORCC : AsI1_bincc_irs<EORri, EORrr, EORrsi, EORrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm ADDCC : AsI1_bincc_irs<ADDri, ADDrr, ADDrsi, ADDrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-defm SUBCC : AsI1_bincc_irs<SUBri, SUBrr, SUBrsi, SUBrsr,
- IIC_iBITi, IIC_iBITr, IIC_iBITsr>;
-
} // neverHasSideEffects
@@ -4723,21 +4743,13 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
Requires<[IsARM, IsIOS]>;
}
-// eh.sjlj.dispatchsetup pseudo-instructions.
-// These pseudos are used for both ARM and Thumb2. Any differences are
-// handled when the pseudo is expanded (which happens before any passes
-// that need the instruction size).
-let Defs =
- [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
- Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
- isBarrier = 1 in
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
-let Defs =
- [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
- isBarrier = 1 in
-def Int_eh_sjlj_dispatchsetup_nofp : PseudoInst<(outs), (ins), NoItinerary, []>;
-
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
@@ -4841,32 +4853,32 @@ def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
(SMULWB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, sext_16_node:$b)),
(SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
(sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
(SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)),
(sra (shl GPR:$b, (i32 16)), (i32 16)))),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
(SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
(i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
-def : ARMV5TEPat<(add GPR:$acc,
+def : ARMV5MOPat<(add GPR:$acc,
(sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
(SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 048d340df006..3cf213cbffee 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -398,6 +398,20 @@ def VecListFourQWordIndexed : Operand<i32> {
let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
}
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
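These fragments gate instruction selection on the alignment recorded on the load/store SDNode. As a rough sketch of the same mechanism (a hypothetical fragment for illustration only, not part of this patch), a quad-word-aligned load would be written the same way:

def qword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
  // Hypothetical: match only loads whose recorded alignment is at least 16 bytes.
  return cast<LoadSDNode>(N)->getAlignment() >= 16;
}]>;

A Pat built on such a fragment fires only when the predicate holds, which is exactly how the dword/word variants are consumed by the v2f64 patterns added further down.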
def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() == 2;
}]>;
@@ -1980,7 +1994,7 @@ def VST1LNd8 : VST1LN<0b0000, {?,?,?,0}, "8", v8i8, truncstorei8,
def VST1LNd16 : VST1LN<0b0100, {?,?,0,?}, "16", v4i16, truncstorei16,
NEONvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
- let Inst{4} = Rn{5};
+ let Inst{4} = Rn{4};
}
def VST1LNd32 : VST1LN<0b1000, {?,0,?,?}, "32", v2i32, store, extractelt,
@@ -2023,7 +2037,7 @@ def VST1LNd8_UPD : VST1LNWB<0b0000, {?,?,?,0}, "8", v8i8, post_truncsti8,
def VST1LNd16_UPD : VST1LNWB<0b0100, {?,?,0,?}, "16", v4i16, post_truncsti16,
NEONvgetlaneu, addrmode6> {
let Inst{7-6} = lane{1-0};
- let Inst{4} = Rn{5};
+ let Inst{4} = Rn{4};
}
def VST1LNd32_UPD : VST1LNWB<0b1000, {?,0,?,?}, "32", v2i32, post_store,
extractelt, addrmode6oneL32> {
@@ -2273,6 +2287,25 @@ def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
(VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+ (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+ (VLD1q32 addrmode6:$addr)>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q32 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+ (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+ (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
//===----------------------------------------------------------------------===//
// NEON pattern fragments
//===----------------------------------------------------------------------===//
@@ -4455,10 +4488,36 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
"vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
[(set DPR:$Vd,
(v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+ (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+ (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+ (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 DPR:$src1),
+ (v2f32 DPR:$Vn), (v2f32 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 DPR:$src1),
+ (v1i64 DPR:$Vn), (v1i64 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+
+def : Pat<(v1i64 (or (and DPR:$Vn, DPR:$Vd),
+ (and DPR:$Vm, (vnotd DPR:$Vd)))),
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
(ins QPR:$src1, QPR:$Vn, QPR:$Vm),
@@ -4467,9 +4526,35 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
[(set QPR:$Vd,
(v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+ (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+ (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+ (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 QPR:$src1),
+ (v4f32 QPR:$Vn), (v4f32 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 QPR:$src1),
+ (v2i64 QPR:$Vn), (v2i64 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i64 (or (and QPR:$Vn, QPR:$Vd),
+ (and QPR:$Vm, (vnotq QPR:$Vd)))),
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
@@ -4983,7 +5068,8 @@ def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
(outs GPR:$R), (ins DPR:$V, VectorIndex32:$lane),
IIC_VMOVSI, "vmov", "32", "$R, $V$lane",
[(set GPR:$R, (extractelt (v2i32 DPR:$V),
- imm:$lane))]> {
+ imm:$lane))]>,
+ Requires<[HasNEON, HasFastVGETLNi32]> {
let Inst{21} = lane{0};
}
// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
@@ -5006,7 +5092,16 @@ def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
(VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
(DSubReg_i32_reg imm:$lane))),
- (SubReg_i32_lane imm:$lane))>;
+ (SubReg_i32_lane imm:$lane))>,
+ Requires<[HasNEON, HasFastVGETLNi32]>;
+def : Pat<(extractelt (v2i32 DPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG DPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+ Requires<[HasNEON, HasSlowVGETLNi32]>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+ (COPY_TO_REGCLASS
+ (i32 (EXTRACT_SUBREG QPR:$src, (SSubReg_f32_reg imm:$lane))), GPR)>,
+ Requires<[HasNEON, HasSlowVGETLNi32]>;
def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
(EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1),DPR_VFP2)),
(SSubReg_f32_reg imm:$src2))>;
@@ -5117,14 +5212,23 @@ class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
def VDUP8d : VDUPD<0b11101100, 0b00, "8", v8i8>;
def VDUP16d : VDUPD<0b11101000, 0b01, "16", v4i16>;
-def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>;
+def VDUP32d : VDUPD<0b11101000, 0b00, "32", v2i32>,
+ Requires<[HasNEON, HasFastVDUP32]>;
def VDUP8q : VDUPQ<0b11101110, 0b00, "8", v16i8>;
def VDUP16q : VDUPQ<0b11101010, 0b01, "16", v8i16>;
def VDUP32q : VDUPQ<0b11101010, 0b00, "32", v4i32>;
-def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>;
+// NEONvdup patterns for uarchs with fast VDUP.32.
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32d GPR:$R)>,
+ Requires<[HasNEON,HasFastVDUP32]>;
def : Pat<(v4f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VDUP32q GPR:$R)>;
+// NEONvdup patterns for uarchs with slow VDUP.32 - use VMOVDRR instead.
+def : Pat<(v2i32 (NEONvdup (i32 GPR:$R))), (VMOVDRR GPR:$R, GPR:$R)>,
+ Requires<[HasNEON,HasSlowVDUP32]>;
+def : Pat<(v2f32 (NEONvdup (f32 (bitconvert GPR:$R)))), (VMOVDRR GPR:$R, GPR:$R)>,
+ Requires<[HasNEON,HasSlowVDUP32]>;
+
// VDUP : Vector Duplicate Lane (from scalar to all elements)
class VDUPLND<bits<4> op19_16, string OpcodeStr, string Dt,
@@ -5561,6 +5665,11 @@ def : N2VSPat<arm_ftoui, VCVTf2ud>;
def : N2VSPat<arm_sitof, VCVTs2fd>;
def : N2VSPat<arm_uitof, VCVTu2fd>;
+// Prefer VMOVDRR for i32 -> f32 bitcasts; it can write all DPR registers.

+def : Pat<(f32 (bitconvert GPR:$a)),
+ (EXTRACT_SUBREG (VMOVDRR GPR:$a, GPR:$a), ssub_0)>,
+ Requires<[HasNEON, DontUseVMOVSR]>;
+
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 554f6d9f94e1..ae7a5c00bd74 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -223,6 +223,7 @@ def t_addrmode_sp : Operand<i32>,
def t_addrmode_pc : Operand<i32> {
let EncoderMethod = "getAddrModePCOpValue";
let DecoderMethod = "DecodeThumbAddrModePC";
+ let PrintMethod = "printThumbLdrLabelOperand";
}
//===----------------------------------------------------------------------===//
@@ -1200,6 +1201,7 @@ let neverHasSideEffects = 1, isReMaterializable = 1 in
def tLEApcrel : tPseudoInst<(outs tGPR:$Rd), (ins i32imm:$label, pred:$p),
2, IIC_iALUi, []>;
+let hasSideEffects = 1 in
def tLEApcrelJT : tPseudoInst<(outs tGPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
2, IIC_iALUi, []>;
@@ -1245,10 +1247,6 @@ def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
[(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
Requires<[IsThumb, IsIOS]>;
-let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
- isBarrier = 1 in
-def tInt_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
-
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 8ecf0091d8b6..002d64a2d039 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -159,7 +159,7 @@ def t2addrmode_imm12 : Operand<i32>,
// t2ldrlabel := imm12
def t2ldrlabel : Operand<i32> {
let EncoderMethod = "getAddrModeImm12OpValue";
- let PrintMethod = "printT2LdrLabelOperand";
+ let PrintMethod = "printThumbLdrLabelOperand";
}
def t2ldr_pcrel_imm12_asmoperand : AsmOperandClass {let Name = "MemPCRelImm12";}
@@ -523,6 +523,23 @@ class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
let Inst{7-4} = opc7_4;
let Inst{3-0} = Rm;
}
+class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4,
+ dag oops, dag iops, InstrItinClass itin,
+ string opc, string asm, list<dag> pattern>
+ : T2I<oops, iops, itin, opc, asm, pattern> {
+ bits<4> RdLo;
+ bits<4> RdHi;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{31-23} = 0b111110111;
+ let Inst{22-20} = opc22_20;
+ let Inst{19-16} = Rn;
+ let Inst{15-12} = RdLo;
+ let Inst{11-8} = RdHi;
+ let Inst{7-4} = opc7_4;
+ let Inst{3-0} = Rm;
+}
/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
@@ -757,33 +774,6 @@ multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
let Inst{24} = 1;
let Inst{23-21} = op23_21;
}
-
- // Predicated versions.
- def CCri : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_imm:$imm,
- pred:$p, cc_out:$s), 4, IIC_iALUi, [],
- (!cast<Instruction>(NAME#ri) GPRnopc:$Rd,
- GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def CCri12 : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPR:$Rn, imm0_4095:$imm,
- pred:$p),
- 4, IIC_iALUi, [],
- (!cast<Instruction>(NAME#ri12) GPRnopc:$Rd,
- GPR:$Rn, imm0_4095:$imm, pred:$p)>,
- RegConstraint<"$Rfalse = $Rd">;
- def CCrr : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, rGPR:$Rm,
- pred:$p, cc_out:$s), 4, IIC_iALUr, [],
- (!cast<Instruction>(NAME#rr) GPRnopc:$Rd,
- GPRnopc:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- def CCrs : t2PseudoExpand<(outs GPRnopc:$Rd),
- (ins GPRnopc:$Rfalse, GPRnopc:$Rn, t2_so_reg:$Rm,
- pred:$p, cc_out:$s), 4, IIC_iALUsi, [],
- (!cast<Instruction>(NAME#rs) GPRnopc:$Rd,
- GPRnopc:$Rn, t2_so_reg:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
}
/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
@@ -1200,6 +1190,7 @@ def t2ADR : T2PCOneRegImm<(outs rGPR:$Rd),
let neverHasSideEffects = 1, isReMaterializable = 1 in
def t2LEApcrel : t2PseudoInst<(outs rGPR:$Rd), (ins i32imm:$label, pred:$p),
4, IIC_iALUi, []>;
+let hasSideEffects = 1 in
def t2LEApcrelJT : t2PseudoInst<(outs rGPR:$Rd),
(ins i32imm:$label, nohash_imm:$id, pred:$p),
4, IIC_iALUi,
@@ -1962,7 +1953,7 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR),
def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR),
(t2SBCri rGPR:$src, t2_so_imm_not:$imm)>;
def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR),
- (t2SBCrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (t2SBCrr rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>;
// Select Bytes -- for disassembly only
@@ -2405,7 +2396,8 @@ def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
def t2MLA: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"mla", "\t$Rd, $Rn, $Rm, $Ra",
- [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]> {
+ [(set rGPR:$Rd, (add (mul rGPR:$Rn, rGPR:$Rm), rGPR:$Ra))]>,
+ Requires<[IsThumb2, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2415,7 +2407,8 @@ def t2MLA: T2FourReg<
def t2MLS: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"mls", "\t$Rd, $Rn, $Rm, $Ra",
- [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]> {
+ [(set rGPR:$Rd, (sub rGPR:$Ra, (mul rGPR:$Rn, rGPR:$Rm)))]>,
+ Requires<[IsThumb2, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b000;
@@ -2437,15 +2430,17 @@ def t2UMULL : T2MulLong<0b010, 0b0000,
} // isCommutable
// Multiply + accumulate
-def t2SMLAL : T2MulLong<0b100, 0b0000,
+def t2SMLAL : T2MlaLong<0b100, 0b0000,
(outs rGPR:$RdLo, rGPR:$RdHi),
- (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
- "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+ "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
-def t2UMLAL : T2MulLong<0b110, 0b0000,
+def t2UMLAL : T2MlaLong<0b110, 0b0000,
(outs rGPR:$RdLo, rGPR:$RdHi),
- (ins rGPR:$Rn, rGPR:$Rm), IIC_iMAC64,
- "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>;
+ (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
+ "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+ RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">;
def t2UMAAL : T2MulLong<0b110, 0b0110,
(outs rGPR:$RdLo, rGPR:$RdHi),
@@ -2482,7 +2477,7 @@ def t2SMMLA : T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmla", "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add (mulhs rGPR:$Rm, rGPR:$Rn), rGPR:$Ra))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b101;
@@ -2503,7 +2498,7 @@ def t2SMMLS: T2FourReg<
(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
"smmls", "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (sub rGPR:$Ra, (mulhs rGPR:$Rn, rGPR:$Rm)))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b110;
@@ -2608,7 +2603,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
[(set rGPR:$Rd, (add rGPR:$Ra,
(opnode (sext_inreg rGPR:$Rn, i16),
(sext_inreg rGPR:$Rm, i16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2621,7 +2616,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sext_inreg rGPR:$Rn, i16),
(sra rGPR:$Rm, (i32 16)))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2634,7 +2629,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
(sext_inreg rGPR:$Rm, i16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2647,7 +2642,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (opnode (sra rGPR:$Rn, (i32 16)),
(sra rGPR:$Rm, (i32 16)))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b001;
@@ -2660,7 +2655,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
(sext_inreg rGPR:$Rm, i16)), (i32 16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2673,7 +2668,7 @@ multiclass T2I_smla<string opc, PatFrag opnode> {
!strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
[(set rGPR:$Rd, (add rGPR:$Ra, (sra (opnode rGPR:$Rn,
(sra rGPR:$Rm, (i32 16))), (i32 16))))]>,
- Requires<[IsThumb2, HasThumb2DSP]> {
+ Requires<[IsThumb2, HasThumb2DSP, UseMulOps]> {
let Inst{31-27} = 0b11111;
let Inst{26-23} = 0b0110;
let Inst{22-20} = 0b011;
@@ -2767,7 +2762,7 @@ def t2SMLSLDX : T2FourReg_mac<1, 0b101, 0b1101, (outs rGPR:$Ra,rGPR:$Rd),
// Division Instructions.
// Signed and unsigned division on v7-M
//
-def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
+def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"sdiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
Requires<[HasDivide, IsThumb2]> {
@@ -2778,7 +2773,7 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
let Inst{7-4} = 0b1111;
}
-def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iALUi,
+def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
"udiv", "\t$Rd, $Rn, $Rm",
[(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
Requires<[HasDivide, IsThumb2]> {
@@ -3049,37 +3044,6 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
RegConstraint<"$false = $Rd">;
} // isCodeGenOnly = 1
-multiclass T2I_bincc_irs<Instruction iri, Instruction irr, Instruction irs,
- InstrItinClass iii, InstrItinClass iir, InstrItinClass iis> {
- // shifted imm
- def ri : t2PseudoExpand<(outs rGPR:$Rd),
- (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_imm:$imm,
- pred:$p, cc_out:$s),
- 4, iii, [],
- (iri rGPR:$Rd, rGPR:$Rn, t2_so_imm:$imm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- // register
- def rr : t2PseudoExpand<(outs rGPR:$Rd),
- (ins rGPR:$Rfalse, rGPR:$Rn, rGPR:$Rm,
- pred:$p, cc_out:$s),
- 4, iir, [],
- (irr rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
- // shifted register
- def rs : t2PseudoExpand<(outs rGPR:$Rd),
- (ins rGPR:$Rfalse, rGPR:$Rn, t2_so_reg:$ShiftedRm,
- pred:$p, cc_out:$s),
- 4, iis, [],
- (irs rGPR:$Rd, rGPR:$Rn, t2_so_reg:$ShiftedRm, pred:$p, cc_out:$s)>,
- RegConstraint<"$Rfalse = $Rd">;
-} // T2I_bincc_irs
-
-defm t2ANDCC : T2I_bincc_irs<t2ANDri, t2ANDrr, t2ANDrs,
- IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
-defm t2ORRCC : T2I_bincc_irs<t2ORRri, t2ORRrr, t2ORRrs,
- IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
-defm t2EORCC : T2I_bincc_irs<t2EORri, t2EORrr, t2EORrs,
- IIC_iBITi, IIC_iBITr, IIC_iBITsi>;
} // neverHasSideEffects
//===----------------------------------------------------------------------===//
@@ -3281,11 +3245,11 @@ def t2B : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
let Inst{15-14} = 0b10;
let Inst{12} = 1;
- bits<20> target;
+ bits<24> target;
let Inst{26} = target{19};
let Inst{11} = target{18};
let Inst{13} = target{17};
- let Inst{21-16} = target{16-11};
+ let Inst{25-16} = target{20-11};
let Inst{10-0} = target{10-0};
let DecoderMethod = "DecodeT2BInstruction";
}
@@ -3367,20 +3331,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
Requires<[IsThumb2, IsIOS]>;
}
-let isCall = 1, Defs = [LR], Uses = [SP] in {
- // mov lr, pc; b if callee is marked noreturn to avoid confusing the
- // return stack predictor.
- def t2BMOVPCB_CALL : tPseudoInst<(outs),
- (ins t_bltarget:$func),
- 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
- Requires<[IsThumb]>;
-}
-
-// Direct calls
-def : T2Pat<(ARMcall_nolink texternalsym:$func),
- (t2BMOVPCB_CALL texternalsym:$func)>,
- Requires<[IsThumb]>;
-
// IT block
let Defs = [ITSTATE] in
def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index eb7eaa6c9708..b5a896c69985 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -450,11 +450,11 @@ def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>;
-def : ARMPat<(f32_to_f16 SPR:$a),
- (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+def : Pat<(f32_to_f16 SPR:$a),
+ (i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
-def : ARMPat<(f16_to_f32 GPR:$a),
- (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+def : Pat<(f16_to_f32 GPR:$a),
+ (VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
@@ -523,10 +523,12 @@ def VMOVRS : AVConv2I<0b11100001, 0b1010,
let D = VFPNeonDomain;
}
+// Bitcast i32 -> f32. NEON prefers to use VMOVDRR.
def VMOVSR : AVConv4I<0b11100000, 0b1010,
(outs SPR:$Sn), (ins GPR:$Rt),
IIC_fpMOVIS, "vmov", "\t$Sn, $Rt",
- [(set SPR:$Sn, (bitconvert GPR:$Rt))]> {
+ [(set SPR:$Sn, (bitconvert GPR:$Rt))]>,
+ Requires<[HasVFP2, UseVMOVSR]> {
// Instruction operands.
bits<5> Sn;
bits<4> Rt;
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index 3f99cce14669..254d8f6b7c7a 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -168,7 +168,7 @@ void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
intptr_t LazyPtr = getIndirectSymAddr(Fn);
if (!LazyPtr) {
// In PIC mode, the function stub is loading a lazy-ptr.
- LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
+ LazyPtr= (intptr_t)emitGlobalValueIndirectSym((const GlobalValue*)F, Fn, JCE);
DEBUG(if (F)
errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
<< "] for GV '" << F->getName() << "'\n";
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 897ceb624bf5..0185289f3bd8 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -27,7 +27,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
@@ -1448,7 +1448,7 @@ namespace {
static char ID;
ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
- const TargetData *TD;
+ const DataLayout *TD;
const TargetInstrInfo *TII;
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
@@ -1478,7 +1478,7 @@ namespace {
}
bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
- TD = Fn.getTarget().getTargetData();
+ TD = Fn.getTarget().getDataLayout();
TII = Fn.getTarget().getInstrInfo();
TRI = Fn.getTarget().getRegisterInfo();
STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
index f1c8fc84816e..c0ac04b6003c 100644
--- a/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -108,6 +108,11 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// pass.
DenseMap<unsigned, unsigned> CPEClones;
+ /// GlobalBaseReg - keeps track of the virtual register initialized for
+ /// use as the global base register. This is used for PIC in some PIC
+ /// relocation models.
+ unsigned GlobalBaseReg;
+
public:
ARMFunctionInfo() :
isThumb(false),
@@ -119,7 +124,7 @@ public:
GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
NumAlignedDPRCS2Regs(0),
JumpTableUId(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false) {}
+ VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
explicit ARMFunctionInfo(MachineFunction &MF) :
isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
@@ -130,7 +135,7 @@ public:
GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
JumpTableUId(0), PICLabelUId(0),
- VarArgsFrameIndex(0), HasITBlocks(false) {}
+ VarArgsFrameIndex(0), HasITBlocks(false), GlobalBaseReg(0) {}
bool isThumbFunction() const { return isThumb; }
bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
@@ -249,6 +254,9 @@ public:
bool hasITBlocks() const { return HasITBlocks; }
void setHasITBlocks(bool h) { HasITBlocks = h; }
+ unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
void recordCPEClone(unsigned CPIdx, unsigned CPCloneIdx) {
if (!CPEClones.insert(std::make_pair(CPCloneIdx, CPIdx)).second)
assert(0 && "Duplicate entries!");
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 6f974fd17d8c..b0f576bc2b6f 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -49,6 +49,9 @@ def ssub_0 : SubRegIndex;
def ssub_1 : SubRegIndex;
def ssub_2 : SubRegIndex<[dsub_1, ssub_0]>;
def ssub_3 : SubRegIndex<[dsub_1, ssub_1]>;
+
+def gsub_0 : SubRegIndex;
+def gsub_1 : SubRegIndex;
// Let TableGen synthesize the remaining 12 ssub_* indices.
// We don't need to name them.
}
@@ -247,11 +250,16 @@ def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> {
}
// Scalar single precision floating point register class..
-def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)>;
+// FIXME: Allocation order changed to s0, s2, s4, ... as a quick hack to
+// avoid partial-write dependencies on D registers (S registers are
+// renamed as portions of D registers).
+def SPR : RegisterClass<"ARM", [f32], 32, (add (decimate
+ (sequence "S%u", 0, 31), 2),
+ (sequence "S%u", 0, 31))>;
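A worked reading of the new allocation order (derived from the definition above, assuming duplicates keep their first position): (decimate (sequence "S%u", 0, 31), 2) selects every second register, S0, S2, ..., S30; appending the full sequence and dropping repeats gives S0, S2, ..., S30, S1, S3, ..., S31, so the even-numbered S registers, which are the low halves of D0-D15, are handed out first.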
// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
-def SPR_8 : RegisterClass<"ARM", [f32], 32, (trunc SPR, 16)>;
+def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)>;
// Scalar double precision floating point / generic 64-bit vector register
// class.
@@ -308,6 +316,17 @@ def DPair : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
let AltOrderSelect = [{ return 1; }];
}
+// Pseudo-registers representing even-odd pairs of GPRs from R1 to R13/SP.
+// These are needed by instructions (e.g. ldrexd/strexd) requiring even-odd GPRs.
+def Tuples2R : RegisterTuples<[gsub_0, gsub_1],
+ [(add R0, R2, R4, R6, R8, R10, R12),
+ (add R1, R3, R5, R7, R9, R11, SP)]>;
+
+// Register class representing a pair of even-odd GPRs.
+def GPRPair : RegisterClass<"ARM", [untyped], 64, (add Tuples2R)> {
+ let Size = 64; // 2 x 32 bits, we have no predefined type of that size.
+}
+
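The two operand lists of the tuple are paired element by element, so GPRPair covers the even/odd pairs (R0,R1), (R2,R3), (R4,R5), (R6,R7), (R8,R9), (R10,R11) and (R12,SP), each half reachable through the gsub_0 and gsub_1 indices defined above.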
// Pseudo-registers representing 3 consecutive D registers.
def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
[(shl DPR, 0),
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 81d2fa37c2d1..02196d06bfd3 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -55,6 +55,7 @@ def IIC_iMUL32 : InstrItinClass;
def IIC_iMAC32 : InstrItinClass;
def IIC_iMUL64 : InstrItinClass;
def IIC_iMAC64 : InstrItinClass;
+def IIC_iDIV : InstrItinClass;
def IIC_iLoad_i : InstrItinClass;
def IIC_iLoad_r : InstrItinClass;
def IIC_iLoad_si : InstrItinClass;
@@ -261,3 +262,4 @@ def IIC_VTBX4 : InstrItinClass;
include "ARMScheduleV6.td"
include "ARMScheduleA8.td"
include "ARMScheduleA9.td"
+include "ARMScheduleSwift.td"
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 7bc590f94756..404634fee989 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1876,8 +1876,9 @@ def CortexA9Itineraries : ProcessorItineraries<
]>;
// ===---------------------------------------------------------------------===//
-// This following definitions describe the simple machine model which
-// will replace itineraries.
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
def CortexA9Model : SchedMachineModel {
@@ -1891,5 +1892,595 @@ def CortexA9Model : SchedMachineModel {
let Itineraries = CortexA9Itineraries;
}
-// TODO: Add Cortex-A9 processor and scheduler resources.
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+def A9UnitALU : ProcResource<2>;
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
+def A9UnitAGU : ProcResource<1>;
+def A9UnitLS : ProcResource<1>;
+def A9UnitFP : ProcResource<1> { let Buffered = 0; }
+def A9UnitB : ProcResource<1>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer shifted-by register
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteA : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate.
+def A9WriteAsi : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+// ALU with operand shifted by register.
+def A9WriteAsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
+
+// Multiplication
+def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
+def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
+ let NumMicroOps = 0; }
+def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
+def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
+ let NumMicroOps = 0; }
+
+// Floating-point
+// Only one FP or AGU instruction may issue per cycle. We model this
+// by having FP instructions consume the AGU resource.
+def A9WriteF : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
+def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
+def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
+def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
+
+// NEON has an odd mix of latencies. Simply name the write types by latency.
+def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
+def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
+def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
+def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+
+// Reserve A9UnitFP for 2 consecutive cycles.
+def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+}
+def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 7;
+ let ResourceCycles = [2];
+}
+def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+// Branches don't have a def operand but still consume resources.
+def A9WriteB : SchedWriteRes<[A9UnitB]>;
+
+// Address generation.
+def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
+
+// Load Integer.
+def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+// Load the upper 32-bits using the same micro-op.
+def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
+ let NumMicroOps = 0; }
+// Offset shifted by register.
+def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+// Load (and zero extend) a byte.
+def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
+
+// Load or Store Float, aligned.
+def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
+
+// Store Integer.
+def A9WriteS : SchedWriteRes<[A9UnitLS]>;
+
+//===----------------------------------------------------------------------===//
+// Define resources dynamically for load multiple variants.
+
+// Define helpers for extra latency without consuming resources.
+def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+foreach NumCycles = 2-8 in {
+def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
+} // foreach NumCycles
+
+// Define TII for use in SchedVariant Predicates.
+def : PredicateProlog<[{
+ const ARMBaseInstrInfo *TII =
+ static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
+// Define address generation sequences and predicates for 8 flavors of LDMs.
+foreach NumAddr = 1-8 in {
+
+// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
+// latency for instructions that generate multiple loads or stores.
+def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
+
+// Define a predicate to select the LDM based on number of memory addresses.
+def A9LMAdr#NumAddr#Pred :
+ SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+
+} // foreach NumAddr
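To make the # paste and string concatenation concrete, the NumAddr = 4 iteration of this loop is equivalent to writing out (expansion sketch only, nothing new is defined here):

def A9WriteAdr4 : WriteSequence<[A9WriteAdr], 4>;
def A9LMAdr4Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 4">;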
+
+// Fall-back for unknown LDMs.
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">;
+
+// LDM/VLDM/VLDn address generation latency & resources.
+// Dynamically select the A9WriteAdrN sequence using a predicate.
+def A9WriteLMAdr : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
+ // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
+ SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
+
+// Define LDM Resources.
+// These take no issue resource, so they can be combined with other
+// writes like WriteB.
+// A9WriteLMLo takes a single LS resource and 2 cycles.
+def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
+ let NumMicroOps = 0; }
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
+ let NumMicroOps = 0; }
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+foreach NumAddr = 1-8 in {
+def A9WriteL#NumAddr : WriteSequence<
+ [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+def A9WriteL#NumAddr#Hi : WriteSequence<
+ [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// LDM: Load multiple into 32-bit integer registers.
+
+// A9WriteLM variants expand into a pair of writes for each 64-bit
+// value loaded. When the number of registers is odd, the last
+// A9WriteLnHi is naturally ignored because the instruction has no
+// following def operands. These variants take no issue resource, so
+// they may need to be part of a WriteSequence that includes A9WriteIssue.
+def A9WriteLM : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteL1, A9WriteL1Hi]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi,
+ A9WriteL7, A9WriteL7Hi]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi,
+ A9WriteL7, A9WriteL7Hi,
+ A9WriteL8, A9WriteL8Hi]>,
+ // For unknown LDMs, define the maximum number of writes, but only
+ // make the first two consume resources.
+ SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3Hi, A9WriteL3Hi,
+ A9WriteL4Hi, A9WriteL4Hi,
+ A9WriteL5Hi, A9WriteL5Hi,
+ A9WriteL6Hi, A9WriteL6Hi,
+ A9WriteL7Hi, A9WriteL7Hi,
+ A9WriteL8Hi, A9WriteL8Hi]>]> {
+ let Variadic = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources
+// so it can be used in WriteSequences for single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence<
+ [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence<
+ [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA are dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+ // For unknown VLDM/VSTM PreRA, assume 2xS registers.
+ SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes a LS and FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteLMfp#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+ [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+ [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
+// pair of writes for each 64-bit data loaded. When the number of
+// registers is odd, the last WriteLMfpnHi is naturally ignored because
+// the instruction has no following def operands.
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteLMfp1, A9WriteLMfp1Hi]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi,
+ A9WriteLMfp6, A9WriteLMfp6Hi]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi,
+ A9WriteLMfp6, A9WriteLMfp6Hi,
+ A9WriteLMfp7, A9WriteLMfp7Hi]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi,
+ A9WriteLMfp6, A9WriteLMfp6Hi,
+ A9WriteLMfp7, A9WriteLMfp7Hi,
+ A9WriteLMfp8, A9WriteLMfp8Hi]>,
+ // For unknown LDMs, define the maximum number of writes, but only
+ // make the first two consume resources.
+ SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3Hi, A9WriteLMfp3Hi,
+ A9WriteLMfp4Hi, A9WriteLMfp4Hi,
+ A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+ A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+ A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+ A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+ let Variadic = 1;
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions.
+def A9PreRA : SchedPredicate<
+ "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate<
+ "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+ SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
+ SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, A9WriteA]>;
+def A9WriteI2ld : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadA : SchedReadAdvance<1,
+ [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+ A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+ A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+ A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+ A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues; they decrease producer latency by N-1.
+def A9Read2 : SchedReadAdvance<1>;
+def A9Read3 : SchedReadAdvance<2>;
+def A9Read4 : SchedReadAdvance<3>;
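As a worked example of the arithmetic (illustrative, based only on the definitions above): a value produced with A9WriteM has latency 4, so a use tagged A9Read3 (advance 2) sees an effective latency of 4 - 2 = 2 cycles, and a use tagged A9ReadA shaves one cycle off any of the listed load writes.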
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manuals,
+// mostly in order.
+let SchedModel = CortexA9Model in {
+
+def :ItinRW<[A9WriteI], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+ IIC_iMVNi,IIC_iMVNsi,
+ IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[A9WriteI,A9ReadA],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
+
+def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>;
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[A9WriteA], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[A9WriteA, A9ReadA], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
+def :ItinRW<[A9WriteA, A9ReadA, A9ReadA],[IIC_iALUr,IIC_iCMPr]>;
+def :ItinRW<[A9WriteAsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[A9WriteAsi, A9ReadA], [IIC_iALUsi]>;
+def :ItinRW<[A9WriteAsi, ReadDefault, A9ReadA], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteAsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteAsr, A9ReadA], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteHi ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+ IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
+ IIC_fpUNA32, IIC_fpUNA64,
+ IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
+ IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+ IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes the AGU regardless of address writeback, but its
+// latency is only relevant for users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+ IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
+ IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
+ IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+ IIC_iStore_iu, IIC_iStore_ru,
+ IIC_iStore_d_i, IIC_iStore_d_r,
+ IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+ IIC_iStore_bh_i, IIC_iStore_bh_r,
+ IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteLM will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical. For stores, only the address writeback
+// has a def operand so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+ IIC_iStore_m,
+ IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
+def :ItinRW<[A9WriteL, A9WriteAdr, A9WriteA], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+ IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+ IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+ IIC_VLD1x4, IIC_VLD1x4u,
+ IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+ IIC_VLD2, IIC_VLD2u,
+ IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+ IIC_VLD2x2, IIC_VLD2x2u,
+ IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+ IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+ IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+ IIC_VST1x2, IIC_VST1x2u,
+ IIC_VST1ln, IIC_VST1lnu,
+ IIC_VST2, IIC_VST2u,
+ IIC_VST2x2, IIC_VST2x2u,
+ IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+ IIC_VST1x4, IIC_VST1x4u,
+ IIC_VST3, IIC_VST3u,
+ IIC_VST3ln, IIC_VST3lnu,
+ IIC_VST4, IIC_VST4u,
+ IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
+// VQNEG/VQABS
+def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
+// VABS
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
+// VPADD/VPADDL are mapped later under IIC_SHLi.
+// ...
+// VCLSQ/VCLZQ/VCNTQ, takes two cycles.
+def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
+// VMOVimm/VMVNimm/VORRimm/VBICimm
+def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
+def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
+
+// NEON integer multiply
+//
+// Note: these don't quite match the timing docs, but they do match
+// the original A9 itinerary.
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
+def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
+def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
+def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
+def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
+
+// NEON integer shift
+// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
+def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
+def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
+
+// NEON permute
+def :ItinRW<[A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
+ [IIC_VPERMQ3, IIC_VEXTQ]>;
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
+ [IIC_VTBX4]>;
+// NEON floating-point
+def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
+def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
+def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
+def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
+def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
+def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
+def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
+} // SchedModel = CortexA9Model
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
new file mode 100644
index 000000000000..e9bc3e0f3955
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -0,0 +1,1085 @@
+//=- ARMScheduleSwift.td - Swift Scheduling Definitions -*- tablegen -*----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Swift processor.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+def SW_DIS0 : FuncUnit;
+def SW_DIS1 : FuncUnit;
+def SW_DIS2 : FuncUnit;
+
+def SW_ALU0 : FuncUnit;
+def SW_ALU1 : FuncUnit;
+def SW_LS : FuncUnit;
+def SW_IDIV : FuncUnit;
+def SW_FDIV : FuncUnit;
+
+// FIXME: Need bypasses.
+// FIXME: Model the multiple stages of IIC_iMOVix2, IIC_iMOVix2addpc, and
+// IIC_iMOVix2ld better.
+// FIXME: Model the special immediate shifts that are not microcoded.
+// FIXME: Do we need to model the fact that uses of r15 in a micro-op force it
+// to issue on pipe 1?
+// FIXME: Model the pipelined behavior of CMP / TST instructions.
+// FIXME: Better model the microcode stages of multiply instructions, especially
+// conditional variants.
+// FIXME: Add preload instruction when it is documented.
+// FIXME: Model non-pipelined nature of FP div / sqrt unit.
+
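A note on reading the entries that follow (general itinerary semantics as assumed here, not something this patch introduces): each InstrStage<N, [Units], K> reserves one of the listed functional units for N cycles, the optional trailing K being the cycle offset at which the next stage may begin (0 means the stages overlap), and the final integer list gives the cycle for each operand, defs first. So a two-stage entry ending in [2, 1, 1] describes an instruction whose result is ready at cycle 2 and whose two source operands are read at cycle 1.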
+def SwiftItineraries : ProcessorItineraries<
+ [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
+ //
+ // Move instructions, unconditional
+ InstrItinData<IIC_iMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMOVix2 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2]>,
+ InstrItinData<IIC_iMOVix2addpc,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [3]>,
+ InstrItinData<IIC_iMOVix2ld,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_LS]>],
+ [5]>,
+ //
+ // MVN instructions
+ InstrItinData<IIC_iMVNi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iMVNsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ //
+ // No operand cycles
+ InstrItinData<IIC_iALUx , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
+ //
+ // Binary Instructions that produce a result
+ InstrItinData<IIC_iALUi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iALUr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iALUsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iALUsir,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iALUsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1]>,
+ //
+ // Bitwise Instructions that produce a result
+ InstrItinData<IIC_iBITi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iBITr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iBITsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iBITsr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1]>,
+ //
+ // Unary Instructions that produce a result
+
+ // CLZ, RBIT, etc.
+ InstrItinData<IIC_iUNAr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+
+ // BFC, BFI, UBFX, SBFX
+ InstrItinData<IIC_iUNAsi, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1]>,
+
+ //
+ // Zero and sign extension instructions
+ InstrItinData<IIC_iEXTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iEXTAr, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iEXTAsr,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1, 1]>,
+ //
+ // Compare instructions
+ InstrItinData<IIC_iCMPi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iCMPr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMPsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMPsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ //
+ // Test instructions
+ InstrItinData<IIC_iTSTi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iTSTr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iTSTsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iTSTsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<2, [SW_ALU0, SW_ALU1]>],
+ [1, 1, 1]>,
+ //
+ // Move instructions, conditional
+ // FIXME: Correctly model the extra input dep on the destination.
+ InstrItinData<IIC_iCMOVi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ InstrItinData<IIC_iCMOVr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMOVsi , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1, 1]>,
+ InstrItinData<IIC_iCMOVsr , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_iCMOVix2, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2]>,
+
+ // Integer multiply pipeline
+ //
+ InstrItinData<IIC_iMUL16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iMAC16 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ InstrItinData<IIC_iMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1, 1]>,
+ InstrItinData<IIC_iMUL64 , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_ALU0], 3>,
+ InstrStage<1, [SW_ALU0]>],
+ [5, 5, 1, 1]>,
+ InstrItinData<IIC_iMAC64 , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [5, 6, 1, 1]>,
+ //
+ // Integer divide
+ InstrItinData<IIC_iDIV , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0], 0>,
+ InstrStage<14, [SW_IDIV]>],
+ [14, 1, 1]>,
+
+ // Integer load pipeline
+ // FIXME: The timings are rough approximations.
+ //
+ // Immediate offset
+ InstrItinData<IIC_iLoad_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ InstrItinData<IIC_iLoad_bh_i, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ InstrItinData<IIC_iLoad_d_i , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 4, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iLoad_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_r, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_r , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [3, 4, 1, 1]>,
+ //
+ // Scaled register offset
+ InstrItinData<IIC_iLoad_si , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [5, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [5, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iLoad_iu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iLoad_ru , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0], 1>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iLoad_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [3, 4, 1, 1]>,
+ //
+ // Scaled register offset with update
+ InstrItinData<IIC_iLoad_siu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [5, 3, 1, 1]>,
+ InstrItinData<IIC_iLoad_bh_siu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [5, 3, 1, 1]>,
+ //
+ // Load multiple, def is the 5th operand.
+ // FIXME: This assumes 3 to 4 registers.
+ InstrItinData<IIC_iLoad_m , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+
+ //
+ // Load multiple + update, defs are the 1st and 5th operands.
+ InstrItinData<IIC_iLoad_mu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Load multiple plus branch
+ InstrItinData<IIC_iLoad_mBr, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Pop, def is the 3rd operand.
+ InstrItinData<IIC_iPop , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 3], [], -1>, // dynamic uops
+ //
+ // Pop + branch, def is the 3rd operand.
+ InstrItinData<IIC_iPop_Br, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 3], [], -1>, // dynamic uops
+
+ //
+ // iLoadi + iALUr for t2LDRpci_pic.
+ InstrItinData<IIC_iLoadiALU, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [4, 1]>,
+
+ // Integer store pipeline
+ ///
+ // Immediate offset
+ InstrItinData<IIC_iStore_i , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ InstrItinData<IIC_iStore_bh_i,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ InstrItinData<IIC_iStore_d_i, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ //
+ // Register offset
+ InstrItinData<IIC_iStore_r , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_r,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_d_r, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ //
+ // Scaled register offset
+ InstrItinData<IIC_iStore_si , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_si,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ //
+ // Immediate offset with update
+ InstrItinData<IIC_iStore_iu , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_iu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1]>,
+ //
+ // Register offset with update
+ InstrItinData<IIC_iStore_ru , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_ru,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_d_ru, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 1]>,
+ //
+ // Scaled register offset with update
+ InstrItinData<IIC_iStore_siu, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+ [3, 1, 1, 1]>,
+ InstrItinData<IIC_iStore_bh_siu, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
+ InstrStage<1, [SW_LS], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
+ [3, 1, 1, 1]>,
+ //
+ // Store multiple
+ InstrItinData<IIC_iStore_m , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [], [], -1>, // dynamic uops
+ //
+ // Store multiple + update
+ InstrItinData<IIC_iStore_mu, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS], 1>,
+ InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
+ InstrStage<1, [SW_LS]>],
+ [2], [], -1>, // dynamic uops
+
+ //
+ // Preload
+ InstrItinData<IIC_Preload, [InstrStage<1, [SW_DIS0], 0>], [1, 1]>,
+
+ // Branch
+ //
+ // no delay slots, so the latency of a branch is unimportant
+ InstrItinData<IIC_Br , [InstrStage<1, [SW_DIS0], 0>]>,
+
+ // FP Special Register to Integer Register File Move
+ InstrItinData<IIC_fpSTAT , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [1]>,
+ //
+ // Single-precision FP Unary
+ //
+ // Most floating-point moves get issued on ALU0.
+ InstrItinData<IIC_fpUNA32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Double-precision FP Unary
+ InstrItinData<IIC_fpUNA64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+
+ //
+ // Single-precision FP Compare
+ InstrItinData<IIC_fpCMP32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Compare
+ InstrItinData<IIC_fpCMP64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [1, 1]>,
+ //
+ // Single to Double FP Convert
+ InstrItinData<IIC_fpCVTSD , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Double to Single FP Convert
+ InstrItinData<IIC_fpCVTDS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+
+ //
+ // Single to Half FP Convert
+ InstrItinData<IIC_fpCVTSH , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU1], 4>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1]>,
+ //
+ // Half to Single FP Convert
+ InstrItinData<IIC_fpCVTHS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+
+ //
+ // Single-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Double-Precision FP to Integer Convert
+ InstrItinData<IIC_fpCVTDI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Integer to Single-Precision FP Convert
+ InstrItinData<IIC_fpCVTIS , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Integer to Double-Precision FP Convert
+ InstrItinData<IIC_fpCVTID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1]>,
+ //
+ // Single-precision FP ALU
+ InstrItinData<IIC_fpALU32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-precision FP ALU
+ InstrItinData<IIC_fpALU64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Single-precision FP Multiply
+ InstrItinData<IIC_fpMUL32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Double-precision FP Multiply
+ InstrItinData<IIC_fpMUL64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1, 1]>,
+ //
+ // Single-precision FP MAC
+ InstrItinData<IIC_fpMAC32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-precision FP MAC
+ InstrItinData<IIC_fpMAC64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [12, 1, 1]>,
+ //
+ // Single-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-precision Fused FP MAC
+ InstrItinData<IIC_fpFMAC64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [12, 1, 1]>,
+ //
+ // Single-precision FP DIV
+ InstrItinData<IIC_fpDIV32 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<15, [SW_FDIV]>],
+ [17, 1, 1]>,
+ //
+ // Double-precision FP DIV
+ InstrItinData<IIC_fpDIV64 , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<30, [SW_FDIV]>],
+ [32, 1, 1]>,
+ //
+ // Single-precision FP SQRT
+ InstrItinData<IIC_fpSQRT32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<15, [SW_FDIV]>],
+ [17, 1]>,
+ //
+ // Double-precision FP SQRT
+ InstrItinData<IIC_fpSQRT64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 0>,
+ InstrStage<30, [SW_FDIV]>],
+ [32, 1, 1]>,
+
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_fpMOVIS, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0]>],
+ [6, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_fpMOVID, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_fpMOVSI, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_fpMOVDI, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_LS]>],
+ [3, 4, 1]>,
+ //
+ // Single-precision FP Load
+ InstrItinData<IIC_fpLoad32, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1]>,
+ //
+ // Double-precision FP Load
+ InstrItinData<IIC_fpLoad64, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1]>,
+ //
+ // FP Load Multiple
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpLoad_m, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1, 4], [], -1>, // dynamic uops
+ //
+ // FP Load Multiple + update
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpLoad_mu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1, 4], [], -1>, // dynamic uops
+ //
+ // Single-precision FP Store
+ InstrItinData<IIC_fpStore32,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ //
+ // Double-precision FP Store
+ InstrItinData<IIC_fpStore64,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1]>,
+ //
+ // FP Store Multiple
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpStore_m,[InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [1, 1, 1], [], -1>, // dynamic uops
+ //
+ // FP Store Multiple + update
+ // FIXME: Assumes a single Q register.
+ InstrItinData<IIC_fpStore_mu,[InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0, SW_ALU1]>],
+ [2, 1, 1, 1], [], -1>, // dynamic uops
+ // NEON
+ //
+ // Double-register Integer Unary
+ InstrItinData<IIC_VUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Quad-register Integer Unary
+ InstrItinData<IIC_VUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Double-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Quad-register Integer Q-Unary
+ InstrItinData<IIC_VQUNAiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1]>,
+ //
+ // Double-register Integer Binary
+ InstrItinData<IIC_VBINiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Binary
+ InstrItinData<IIC_VBINiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Integer Subtract
+ InstrItinData<IIC_VSUBiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Subtract
+ InstrItinData<IIC_VSUBiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Integer Shift
+ InstrItinData<IIC_VSHLiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Shift
+ InstrItinData<IIC_VSHLiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Shift (4 cycle)
+ InstrItinData<IIC_VSHLi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Binary (4 cycle)
+ InstrItinData<IIC_VBINi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Subtract (4 cycle)
+ InstrItinData<IIC_VSUBi4Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+
+ //
+ // Double-register Integer Count
+ InstrItinData<IIC_VCNTiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register Integer Count
+ InstrItinData<IIC_VCNTiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1, 1]>,
+ //
+ // Double-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1, 1]>,
+ //
+ // Quad-register Absolute Difference and Accumulate
+ InstrItinData<IIC_VABAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1, 1]>,
+ //
+ // Double-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Pair Add Long
+ InstrItinData<IIC_VPALiQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+
+ //
+ // Double-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Multiply (.8, .16)
+ InstrItinData<IIC_VMULi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+
+ //
+ // Double-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register Integer Multiply (.32)
+ InstrItinData<IIC_VMULi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+ //
+ // Double-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32D, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.8, .16)
+ InstrItinData<IIC_VMACi16Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+ //
+ // Quad-register Integer Multiply-Accumulate (.32)
+ InstrItinData<IIC_VMACi32Q, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1, 1]>,
+
+ //
+ // Move
+ InstrItinData<IIC_VMOV, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Move Immediate
+ InstrItinData<IIC_VMOVImm, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2]>,
+ //
+ // Double-register Permute Move
+ InstrItinData<IIC_VMOVD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1]>,
+ //
+ // Quad-register Permute Move
+ InstrItinData<IIC_VMOVQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1]>,
+ //
+ // Integer to Single-precision Move
+ InstrItinData<IIC_VMOVIS , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0]>],
+ [6, 1]>,
+ //
+ // Integer to Double-precision Move
+ InstrItinData<IIC_VMOVID , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [4, 1, 1]>,
+ //
+ // Single-precision to Integer Move
+ InstrItinData<IIC_VMOVSI , [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_LS]>],
+ [3, 1]>,
+ //
+ // Double-precision to Integer Move
+ InstrItinData<IIC_VMOVDI , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 3>,
+ InstrStage<1, [SW_LS]>],
+ [3, 4, 1]>,
+ //
+ // Integer to Lane Move
+ // FIXME: I think this is correct, but it is not clear from the tuning guide.
+ InstrItinData<IIC_VMOVISL , [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_LS], 4>,
+ InstrStage<1, [SW_ALU0]>],
+ [6, 1]>,
+
+ //
+ // Vector narrow move
+ InstrItinData<IIC_VMOVN, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1]>,
+ //
+ // Double-register FP Unary
+ // FIXME: VRECPE / VRSQRTE have a longer latency than VABS, which is used here,
+ // and they issue on a different pipeline.
+ InstrItinData<IIC_VUNAD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Quad-register FP Unary
+ // FIXME: VRECPE / VRSQRTE have a longer latency than VABS, which is used here,
+ // and they issue on a different pipeline.
+ InstrItinData<IIC_VUNAQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [2, 1]>,
+ //
+ // Double-register FP Binary
+ // FIXME: We're using this itin for many instructions.
+ InstrItinData<IIC_VBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+
+ //
+ // VPADD, etc.
+ InstrItinData<IIC_VPBIND, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Double-register FP VMUL
+ InstrItinData<IIC_VFMULD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register FP Binary
+ InstrItinData<IIC_VBINQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU0]>],
+ [4, 1, 1]>,
+ //
+ // Quad-register FP VMUL
+ InstrItinData<IIC_VFMULQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 1]>,
+ //
+ // Double-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Quad-register FP Multiply-Accumulate
+ InstrItinData<IIC_VMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-register Fused FP Multiply-Accumulate
+ InstrItinData<IIC_VFMACD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Quad-register Fused FP Multiply-Accumulate
+ InstrItinData<IIC_VFMACQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-register Reciprocal Step
+ InstrItinData<IIC_VRECSD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Quad-register Reciprocal Step
+ InstrItinData<IIC_VRECSQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 1]>,
+ //
+ // Double-register Permute
+ // FIXME: The latencies are unclear from the documentation.
+ InstrItinData<IIC_VPERMD, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [3, 4, 3, 4]>,
+ //
+ // Quad-register Permute
+ // FIXME: The latencies are unclear from the documentation.
+ InstrItinData<IIC_VPERMQ, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [3, 4, 3, 4]>,
+ //
+ // Quad-register Permute (3 cycle issue on A9)
+ InstrItinData<IIC_VPERMQ3, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [3, 4, 3, 4]>,
+
+ //
+ // Double-register VEXT
+ InstrItinData<IIC_VEXTD, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ //
+ // Quad-register VEXT
+ InstrItinData<IIC_VEXTQ, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ //
+ // VTB
+ InstrItinData<IIC_VTB1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VTB2, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 3, 3]>,
+ InstrItinData<IIC_VTB3, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1, 3, 5, 5]>,
+ InstrItinData<IIC_VTB4, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 3, 5, 7, 7]>,
+ //
+ // VTBX
+ InstrItinData<IIC_VTBX1, [InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1]>],
+ [2, 1, 1]>,
+ InstrItinData<IIC_VTBX2, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [4, 1, 3, 3]>,
+ InstrItinData<IIC_VTBX3, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [6, 1, 3, 5, 5]>,
+ InstrItinData<IIC_VTBX4, [InstrStage<1, [SW_DIS0], 0>,
+ InstrStage<1, [SW_DIS1], 0>,
+ InstrStage<1, [SW_DIS2], 0>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1], 2>,
+ InstrStage<1, [SW_ALU1]>],
+ [8, 1, 3, 5, 7, 7]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simple machine model that will
+// replace itineraries.
+
+// Swift machine model for scheduling and other instruction cost heuristics.
+def SwiftModel : SchedMachineModel {
+ let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
+ let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+ let LoadLatency = 3;
+
+ let Itineraries = SwiftItineraries;
+}
+
+// TODO: Add Swift processor and scheduler resources.
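For orientation: a SchedMachineModel such as SwiftModel is normally attached to a CPU through a ProcessorModel record in the target's top-level .td file. The sketch below is illustrative only and is not part of this diff; the exact subtarget feature list is a placeholder.

// Hypothetical hook-up in ARM.td (feature list is illustrative).
def : ProcessorModel<"swift", SwiftModel,
                     [HasV7Ops, FeatureNEON, FeatureDB,
                      FeatureDSPThumb2, FeatureHasRAS]>;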
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 31d5d38d84f3..b33b3c915a6e 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -155,7 +155,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
TargetLowering::ArgListEntry Entry;
// First argument: data pointer
- Type *IntPtrTy = TLI.getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*DAG.getContext());
Entry.Node = Dst;
Entry.Ty = IntPtrTy;
Args.push_back(Entry);
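The getTargetData() to getDataLayout() rename above recurs throughout this patch. A minimal sketch of the resulting call pattern, assuming only the renamed accessor shown in this hunk (the helper function itself is hypothetical):

#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"

// Hypothetical helper: obtain the pointer-sized integer type through
// DataLayout, the same way the rewritten memset lowering above does.
static llvm::Type *getPtrSizedIntTy(const llvm::TargetLowering &TLI,
                                    llvm::LLVMContext &Ctx) {
  return TLI.getDataLayout()->getIntPtrType(Ctx);
}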
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 4762854c12dd..bcc9db4ae3e3 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -13,8 +13,9 @@
#include "ARMSubtarget.h"
#include "ARMBaseRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
#include "llvm/GlobalValue.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#define GET_SUBTARGETINFO_TARGET_DESC
@@ -31,6 +32,10 @@ static cl::opt<bool>
DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden);
static cl::opt<bool>
+UseFusedMulOps("arm-use-mulops",
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
StrictAlign("arm-strict-align", cl::Hidden,
cl::desc("Disallow all unaligned memory accesses"));
@@ -49,6 +54,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
, HasVFPv4(false)
, HasNEON(false)
, UseNEONForSinglePrecisionFP(false)
+ , UseMulOps(UseFusedMulOps)
, SlowFPVMLx(false)
, HasVMLxForwarding(false)
, SlowFPBrcc(false)
@@ -63,6 +69,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU,
, HasFP16(false)
, HasD16(false)
, HasHardwareDivide(false)
+ , HasHardwareDivideInARM(false)
, HasT2ExtractPack(false)
, HasDataBarrier(false)
, Pref32BitThumb(false)
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index b3940613000c..8e6b6506022d 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -30,7 +30,7 @@ class StringRef;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA8, CortexA9
+ Others, CortexA8, CortexA9, CortexA15, Swift
};
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
@@ -57,6 +57,10 @@ protected:
/// determine if NEON should actually be used.
bool UseNEONForSinglePrecisionFP;
+ /// UseMulOps - True if non-microcoded fused integer multiply-add and
+ /// multiply-subtract instructions should be used.
+ bool UseMulOps;
+
/// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
/// whether the FP VML[AS] instructions are slow (if so, don't use them).
bool SlowFPVMLx;
@@ -107,6 +111,9 @@ protected:
/// HasHardwareDivide - True if subtarget supports [su]div
bool HasHardwareDivide;
+ /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
+ bool HasHardwareDivideInARM;
+
/// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
/// instructions.
bool HasT2ExtractPack;
@@ -199,7 +206,10 @@ protected:
bool isCortexA8() const { return ARMProcFamily == CortexA8; }
bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+ bool isCortexA15() const { return ARMProcFamily == CortexA15; }
+ bool isSwift() const { return ARMProcFamily == Swift; }
bool isCortexM3() const { return CPUString == "cortex-m3"; }
+ bool isLikeA9() const { return isCortexA9() || isCortexA15(); }
bool hasARMOps() const { return !NoARM; }
@@ -211,8 +221,10 @@ protected:
return hasNEON() && UseNEONForSinglePrecisionFP; }
bool hasDivide() const { return HasHardwareDivide; }
+ bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
bool hasT2ExtractPack() const { return HasT2ExtractPack; }
bool hasDataBarrier() const { return HasDataBarrier; }
+ bool useMulOps() const { return UseMulOps; }
bool useFPVMLx() const { return !SlowFPVMLx; }
bool hasVMLxForwarding() const { return HasVMLxForwarding; }
bool isFPBrccSlow() const { return SlowFPBrcc; }
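The new predicates are what the rest of the backend keys on. A small sketch of typical call sites, with hypothetical helper functions; only the accessor names come from this patch:

#include "ARMSubtarget.h"

// Hypothetical: use hardware division only when the current instruction set
// mode actually provides it.
static bool useHardwareDivide(const llvm::ARMSubtarget &ST, bool InARMMode) {
  return InARMMode ? ST.hasDivideInARMMode() : ST.hasDivide();
}

// Mirrors the pass-config change later in this patch: MLx expansion now runs
// for any A9-like core (Cortex-A9 or Cortex-A15), not just Cortex-A9.
static bool wantMLxExpansion(const llvm::ARMSubtarget &ST) {
  return ST.isLikeA9();
}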
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 171c9adfa40f..b486d4fe2ef9 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -60,7 +60,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL)
: ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
InstrInfo(Subtarget),
- DataLayout(Subtarget.isAPCS_ABI() ?
+ DL(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:64-i64:32:64-"
"v128:32:128-v64:32:64-n32-S32") :
Subtarget.isAAPCS_ABI() ?
@@ -68,10 +68,10 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT,
"v128:64:128-v64:64:64-n32-S64") :
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"v128:64:128-v64:64:64-n32-S32")),
- ELFWriterInfo(*this),
TLInfo(*this),
TSInfo(*this),
- FrameLowering(Subtarget) {
+ FrameLowering(Subtarget),
+ STTI(&TLInfo), VTTI(&TLInfo) {
if (!Subtarget.hasARMOps())
report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not "
"support ARM mode execution!");
@@ -88,7 +88,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
InstrInfo(Subtarget.hasThumb2()
? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
: ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
- DataLayout(Subtarget.isAPCS_ABI() ?
+ DL(Subtarget.isAPCS_ABI() ?
std::string("e-p:32:32-f64:32:64-i64:32:64-"
"i16:16:32-i8:8:32-i1:8:32-"
"v128:32:128-v64:32:64-a:0:32-n32-S32") :
@@ -99,12 +99,12 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT,
std::string("e-p:32:32-f64:64:64-i64:64:64-"
"i16:16:32-i8:8:32-i1:8:32-"
"v128:64:128-v64:64:64-a:0:32-n32-S32")),
- ELFWriterInfo(*this),
TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget.hasThumb2()
? new ARMFrameLowering(Subtarget)
- : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) {
+ : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)),
+ STTI(&TLInfo), VTTI(&TLInfo) {
}
namespace {
@@ -143,6 +143,11 @@ bool ARMPassConfig::addPreISel() {
bool ARMPassConfig::addInstSelector() {
addPass(createARMISelDag(getARMTargetMachine(), getOptLevel()));
+
+ const ARMSubtarget *Subtarget = &getARMSubtarget();
+ if (Subtarget->isTargetELF() && !Subtarget->isThumb1Only() &&
+ TM->Options.EnableFastISel)
+ addPass(createARMGlobalBaseRegPass());
return false;
}
@@ -150,7 +155,7 @@ bool ARMPassConfig::addPreRegAlloc() {
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
addPass(createARMLoadStoreOptimizationPass(true));
- if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
+ if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9())
addPass(createMLxExpansionPass());
return true;
}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index abcdb24c0c69..ebdd5b4d64c9 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -15,7 +15,6 @@
#define ARMTARGETMACHINE_H
#include "ARMInstrInfo.h"
-#include "ARMELFWriterInfo.h"
#include "ARMFrameLowering.h"
#include "ARMJITInfo.h"
#include "ARMSubtarget.h"
@@ -25,7 +24,8 @@
#include "Thumb1FrameLowering.h"
#include "Thumb2InstrInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetTransformImpl.h"
+#include "llvm/DataLayout.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/ADT/OwningPtr.h"
@@ -62,11 +62,12 @@ public:
class ARMTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
ARMInstrInfo InstrInfo;
- const TargetData DataLayout; // Calculates type size & alignment
- ARMELFWriterInfo ELFWriterInfo;
+ const DataLayout DL; // Calculates type size & alignment
ARMTargetLowering TLInfo;
ARMSelectionDAGInfo TSInfo;
ARMFrameLowering FrameLowering;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
ARMTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
@@ -88,12 +89,14 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
virtual const ARMFrameLowering *getFrameLowering() const {
return &FrameLowering;
}
-
- virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
- virtual const TargetData *getTargetData() const { return &DataLayout; }
- virtual const ARMELFWriterInfo *getELFWriterInfo() const {
- return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
}
+ virtual const ARMInstrInfo *getInstrInfo() const { return &InstrInfo; }
+ virtual const DataLayout *getDataLayout() const { return &DL; }
};
/// ThumbTargetMachine - Thumb target machine.
@@ -104,12 +107,13 @@ class ThumbTargetMachine : public ARMBaseTargetMachine {
virtual void anchor();
// Either Thumb1InstrInfo or Thumb2InstrInfo.
OwningPtr<ARMBaseInstrInfo> InstrInfo;
- const TargetData DataLayout; // Calculates type size & alignment
- ARMELFWriterInfo ELFWriterInfo;
+ const DataLayout DL; // Calculates type size & alignment
ARMTargetLowering TLInfo;
ARMSelectionDAGInfo TSInfo;
// Either Thumb1FrameLowering or ARMFrameLowering.
OwningPtr<ARMFrameLowering> FrameLowering;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
ThumbTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS,
@@ -138,10 +142,13 @@ public:
virtual const ARMFrameLowering *getFrameLowering() const {
return FrameLowering.get();
}
- virtual const TargetData *getTargetData() const { return &DataLayout; }
- virtual const ARMELFWriterInfo *getELFWriterInfo() const {
- return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
}
+ virtual const DataLayout *getDataLayout() const { return &DL; }
};
} // end namespace llvm
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 3a5957b24107..c61e3bd99d77 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -181,49 +181,44 @@ class ARMAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseVectorLane(VectorLaneTy &LaneKind, unsigned &Index);
// Asm Match Converter Methods
- bool cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+ void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ void cvtLdWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+ void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ void cvtStWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ void cvtLdExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ void cvtLdExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+ void cvtStExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+ void cvtStExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtStrdPre(MCInst &Inst, unsigned Opcode,
- const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+ void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
+ void cvtLdWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+ void cvtThumbMultiply(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
+ void cvtVLDwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
+ void cvtVLDwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
+ void cvtVSTwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
- bool cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
+ void cvtVSTwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &);
-
bool validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
bool processInstruction(MCInst &Inst,
@@ -258,15 +253,17 @@ public:
// Implementation of the MCTargetAsmParser interface:
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
- bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
bool ParseDirective(AsmToken DirectiveID);
unsigned checkTargetMatchPredicate(MCInst &Inst);
- bool MatchAndEmitInstruction(SMLoc IDLoc,
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out);
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
};
} // end anonymous namespace
@@ -486,7 +483,8 @@ public:
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const { return EndLoc; }
-
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
ARMCC::CondCodes getCondCode() const {
@@ -862,7 +860,7 @@ public:
bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
bool isToken() const { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
- bool isMemory() const { return Kind == k_Memory; }
+ bool isMem() const { return Kind == k_Memory; }
bool isShifterImm() const { return Kind == k_ShifterImmediate; }
bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
@@ -873,14 +871,14 @@ public:
return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
}
bool isMemNoOffset(bool alignOK = false) const {
- if (!isMemory())
+ if (!isMem())
return false;
// No offset of any kind.
return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 &&
(alignOK || Memory.Alignment == 0);
}
bool isMemPCRelImm12() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base register must be PC.
if (Memory.BaseRegNum != ARM::PC)
@@ -894,7 +892,7 @@ public:
return isMemNoOffset(true);
}
bool isAddrMode2() const {
- if (!isMemory() || Memory.Alignment != 0) return false;
+ if (!isMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return true;
// Immediate offset in range [-4095, 4095].
@@ -916,7 +914,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.Alignment != 0) return false;
+ if (!isMem() || Memory.Alignment != 0) return false;
// No shifts are legal for AM3.
if (Memory.ShiftType != ARM_AM::no_shift) return false;
// Check for register offset.
@@ -946,7 +944,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.Alignment != 0) return false;
+ if (!isMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return false;
// Immediate offset in range [-1020, 1020] and a multiple of 4.
@@ -956,25 +954,25 @@ public:
Val == INT32_MIN;
}
bool isMemTBB() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
return false;
return true;
}
bool isMemTBH() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
Memory.Alignment != 0 )
return false;
return true;
}
bool isMemRegOffset() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+ if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
return false;
return true;
}
bool isT2MemRegOffset() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.Alignment != 0)
return false;
// Only lsl #{0, 1, 2, 3} allowed.
@@ -987,14 +985,14 @@ public:
bool isMemThumbRR() const {
// Thumb reg+reg addressing is simple. Just two registers, a base and
// an offset. No shifts, negations or any other complicating factors.
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
return false;
return isARMLowRegister(Memory.BaseRegNum) &&
(!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
}
bool isMemThumbRIs4() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 124].
@@ -1003,7 +1001,7 @@ public:
return Val >= 0 && Val <= 124 && (Val % 4) == 0;
}
bool isMemThumbRIs2() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 62].
@@ -1012,7 +1010,7 @@ public:
return Val >= 0 && Val <= 62 && (Val % 2) == 0;
}
bool isMemThumbRIs1() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 31].
@@ -1021,7 +1019,7 @@ public:
return Val >= 0 && Val <= 31;
}
bool isMemThumbSPI() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 1020].
@@ -1035,7 +1033,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [-1020, 1020].
if (!Memory.OffsetImm) return true;
@@ -1044,7 +1042,7 @@ public:
return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN;
}
bool isMemImm0_1020s4Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [0, 1020].
if (!Memory.OffsetImm) return true;
@@ -1052,7 +1050,7 @@ public:
return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
}
bool isMemImm8Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1062,7 +1060,7 @@ public:
return (Val == INT32_MIN) || (Val > -256 && Val < 256);
}
bool isMemPosImm8Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 255].
if (!Memory.OffsetImm) return true;
@@ -1070,7 +1068,7 @@ public:
return Val >= 0 && Val < 256;
}
bool isMemNegImm8Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1080,7 +1078,7 @@ public:
return (Val == INT32_MIN) || (Val > -256 && Val < 0);
}
bool isMemUImm12Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 4095].
if (!Memory.OffsetImm) return true;
@@ -1094,7 +1092,7 @@ public:
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [-4095, 4095].
if (!Memory.OffsetImm) return true;
@@ -3376,7 +3374,8 @@ ARMAsmParser::OperandMatchResultTy ARMAsmParser::
parseMSRMaskOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
SMLoc S = Parser.getTok().getLoc();
const AsmToken &Tok = Parser.getTok();
- assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+ if (!Tok.is(AsmToken::Identifier))
+ return MatchOperand_NoMatch;
StringRef Mask = Tok.getString();
if (isMClass()) {
@@ -3880,8 +3879,8 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// cvtT2LdrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtT2LdrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt, Rt2
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3892,14 +3891,13 @@ cvtT2LdrdPre(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtT2StrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtT2StrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateReg(0));
@@ -3910,14 +3908,13 @@ cvtT2StrdPre(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3926,28 +3923,26 @@ cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3956,14 +3951,13 @@ cvtLdWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -3972,57 +3966,53 @@ cvtLdWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrModeImm12(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrMode2(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode2(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4034,14 +4024,13 @@ cvtLdExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4053,14 +4042,13 @@ cvtLdExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStExtTWriteBackImm(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4072,14 +4060,13 @@ cvtStExtTWriteBackImm(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStExtTWriteBackReg(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4091,14 +4078,13 @@ cvtStExtTWriteBackReg(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Rt, Rt2
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
@@ -4109,14 +4095,13 @@ cvtLdrdPre(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtStrdPre - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtStrdPre(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtStrdPre(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4127,40 +4112,27 @@ cvtStrdPre(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtLdWriteBackRegAddrMode3(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtLdWriteBackRegAddrMode3(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-/// cvtThumbMultiple- Convert parsed operands to MCInst.
+/// cvtThumbMultiply - Convert parsed operands to MCInst.
/// Needed here because the Asm Gen Matcher can't handle properly tied operands
/// when they refer multiple MIOperands inside a single one.
-bool ARMAsmParser::
-cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtThumbMultiply(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
- // The second source operand must be the same register as the destination
- // operand.
- if (Operands.size() == 6 &&
- (((ARMOperand*)Operands[3])->getReg() !=
- ((ARMOperand*)Operands[5])->getReg()) &&
- (((ARMOperand*)Operands[3])->getReg() !=
- ((ARMOperand*)Operands[4])->getReg())) {
- Error(Operands[3]->getStartLoc(),
- "destination register must match source register");
- return false;
- }
((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
((ARMOperand*)Operands[1])->addCCOutOperands(Inst, 1);
// If we have a three-operand form, make sure to set Rn to be the operand
@@ -4173,12 +4145,10 @@ cvtThumbMultiply(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[RegOp])->addRegOperands(Inst, 1);
Inst.addOperand(Inst.getOperand(0));
((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
-
- return true;
}
-bool ARMAsmParser::
-cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVLDwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Vd
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
@@ -4188,11 +4158,10 @@ cvtVLDwbFixed(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-bool ARMAsmParser::
-cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVLDwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Vd
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
@@ -4204,11 +4173,10 @@ cvtVLDwbRegister(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-bool ARMAsmParser::
-cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVSTwbFixed(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4218,11 +4186,10 @@ cvtVSTwbFixed(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
-bool ARMAsmParser::
-cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
+void ARMAsmParser::
+cvtVSTwbRegister(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Create a writeback register dummy placeholder.
Inst.addOperand(MCOperand::CreateImm(0));
@@ -4234,7 +4201,6 @@ cvtVSTwbRegister(MCInst &Inst, unsigned Opcode,
((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
// pred
((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
- return true;
}
/// Parse an ARM memory expression, return false if successful else return true
@@ -4471,6 +4437,12 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
return Error(Loc, "immediate shift value out of range");
+ // If <ShiftTy> #0, turn it into a no_shift.
+ if (Imm == 0)
+ St = ARM_AM::lsl;
+ // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+ if (Imm == 32)
+ Imm = 0;
Amount = Imm;
}
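With this normalization, a shift written as "<opc> #0" is parsed as a plain unshifted register offset, and "lsr #32"/"asr #32" are recorded with an immediate of 0, the same 0-means-32 convention that translateShiftImm assumes when these operands are printed back.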
@@ -4648,7 +4620,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return true;
const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal,
- getContext());
+ getContext());
E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
return false;
@@ -4983,7 +4955,8 @@ static bool doesIgnoreDataTypeSuffix(StringRef Mnemonic, StringRef DT) {
static void applyMnemonicAliases(StringRef &Mnemonic, unsigned Features);
/// Parse an arm instruction mnemonic followed by its operands.
-bool ARMAsmParser::ParseInstruction(StringRef Name, SMLoc NameLoc,
+bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// Apply mnemonic aliases before doing anything else, as the destination
// mnemonic may include suffixes and we want to handle them normally.
@@ -5377,6 +5350,25 @@ validateInstruction(MCInst &Inst,
"in register list");
break;
}
+ case ARM::tMUL: {
+ // The second source operand must be the same register as the destination
+ // operand.
+ //
+ // In this case, we must directly check the parsed operands because the
+ // cvtThumbMultiply() function is written in such a way that it guarantees
+ // this first statement is always true for the new Inst. Essentially, the
+ // destination is unconditionally copied into the second source operand
+ // without checking to see if it matches what we actually parsed.
+ if (Operands.size() == 6 &&
+ (((ARMOperand*)Operands[3])->getReg() !=
+ ((ARMOperand*)Operands[5])->getReg()) &&
+ (((ARMOperand*)Operands[3])->getReg() !=
+ ((ARMOperand*)Operands[4])->getReg())) {
+ return Error(Operands[3]->getStartLoc(),
+ "destination register must match source register");
+ }
+ break;
+ }
// Like for ldm/stm, push and pop have hi-reg handling version in Thumb2,
// so only issue a diagnostic for thumb1. The instructions will be
// switched to the t2 encodings in processInstruction() if necessary.
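In practice this preserves the old diagnostic for the 16-bit multiply: the destination of tMUL must match one of the source registers, so e.g. "muls r0, r1, r0" or "muls r0, r0, r1" is accepted while "muls r0, r1, r2" is rejected with the error above. Doing the check here rather than in cvtThumbMultiply() keeps the converter a pure operand-shuffling routine that cannot fail.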
@@ -5678,6 +5670,20 @@ bool ARMAsmParser::
processInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
switch (Inst.getOpcode()) {
+ // Alias for alternate form of 'ADR Rd, #imm' instruction.
+ case ARM::ADDri: {
+ if (Inst.getOperand(1).getReg() != ARM::PC ||
+ Inst.getOperand(5).getReg() != 0)
+ return false;
+ MCInst TmpInst;
+ TmpInst.setOpcode(ARM::ADR);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(2));
+ TmpInst.addOperand(Inst.getOperand(3));
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
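This matches the ARM ARM's add-immediate encoding of ADR (ADD Rd, PC, #const): a non-flag-setting "add rX, pc, #imm" with a PC base is rewritten into the equivalent ADR form so later processing and printing treat it as ADR, while anything else falls through to the normal ADDri handling.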
// Aliases for alternate PC+imm syntax of LDR instructions.
case ARM::t2LDRpcrel:
Inst.setOpcode(ARM::t2LDRpci);
@@ -7471,13 +7477,14 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
static const char *getSubtargetFeatureName(unsigned Val);
bool ARMAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc,
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out) {
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
MCInst Inst;
- unsigned ErrorInfo;
unsigned MatchResult;
- MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo);
+ MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm);
switch (MatchResult) {
default: break;
case Match_Success:
@@ -7540,9 +7547,6 @@ MatchAndEmitInstruction(SMLoc IDLoc,
case Match_MnemonicFail:
return Error(IDLoc, "invalid instruction",
((ARMOperand*)Operands[0])->getLocRange());
- case Match_ConversionFail:
- // The converter function will have already emitted a diagnostic.
- return true;
case Match_RequiresNotITBlock:
return Error(IDLoc, "flag setting instruction only valid outside IT block");
case Match_RequiresITBlock:
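With the cvt*() converters now returning void, operand conversion can no longer fail, so the generated matcher never returns Match_ConversionFail for this target and its case is dropped.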
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index ac916ccaed63..377bd9243c2e 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -22,7 +22,6 @@ add_llvm_target(ARMCodeGen
ARMCodeEmitter.cpp
ARMConstantIslandPass.cpp
ARMConstantPoolValue.cpp
- ARMELFWriterInfo.cpp
ARMExpandPseudoInsts.cpp
ARMFastISel.cpp
ARMFrameLowering.cpp
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index c90751d0b962..f00142de50dc 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -525,8 +525,9 @@ static bool tryAddingSymbolicOperand(uint64_t Address, int32_t Value,
else
ReferenceType = LLVMDisassembler_ReferenceType_InOut_None;
const char *ReferenceName;
- const char *Name = SymbolLookUp(DisInfo, Value, &ReferenceType, Address,
- &ReferenceName);
+ uint64_t SymbolValue = 0x00000000ffffffffULL & Value;
+ const char *Name = SymbolLookUp(DisInfo, SymbolValue, &ReferenceType,
+ Address, &ReferenceName);
if (Name) {
SymbolicOp.AddSymbol.Name = Name;
SymbolicOp.AddSymbol.Present = true;
@@ -1523,6 +1524,8 @@ DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn,
return MCDisassembler::Fail;
}
unsigned amt = fieldFromInstruction(Insn, 7, 5);
+ if (Opc == ARM_AM::ror && amt == 0)
+ Opc = ARM_AM::rrx;
unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
Inst.addOperand(MCOperand::CreateImm(imm));
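Both here and in DecodeSORegMemOperand below, a rotate amount of 0 is re-tagged as rrx before the addressing-mode opcode is packed, since in these encodings "ror #0" is how the architecture expresses rotate-right-with-extend; the printer side later asserts that it never sees ror #0.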
@@ -1564,6 +1567,9 @@ static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val,
break;
}
+ if (ShOp == ARM_AM::ror && imm == 0)
+ ShOp = ARM_AM::rrx;
+
if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
return MCDisassembler::Fail;
if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
@@ -2089,16 +2095,28 @@ static DecodeStatus DecodeAddrMode7Operand(MCInst &Inst, unsigned Val,
static DecodeStatus
DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
- DecodeStatus S = MCDisassembler::Success;
- unsigned imm = (fieldFromInstruction(Insn, 0, 11) << 0) |
- (fieldFromInstruction(Insn, 11, 1) << 18) |
- (fieldFromInstruction(Insn, 13, 1) << 17) |
- (fieldFromInstruction(Insn, 16, 6) << 11) |
- (fieldFromInstruction(Insn, 26, 1) << 19);
- if (!tryAddingSymbolicOperand(Address, Address + SignExtend32<20>(imm<<1) + 4,
+ DecodeStatus Status = MCDisassembler::Success;
+
+ // Note the J1 and J2 values are from the encoded instruction. So here
+ // change them to I1 and I2 values, as documented:
+ // I1 = NOT(J1 EOR S);
+ // I2 = NOT(J2 EOR S);
+ // and build the imm32 with one trailing zero as documented:
+ // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32);
+ unsigned S = fieldFromInstruction(Insn, 26, 1);
+ unsigned J1 = fieldFromInstruction(Insn, 13, 1);
+ unsigned J2 = fieldFromInstruction(Insn, 11, 1);
+ unsigned I1 = !(J1 ^ S);
+ unsigned I2 = !(J2 ^ S);
+ unsigned imm10 = fieldFromInstruction(Insn, 16, 10);
+ unsigned imm11 = fieldFromInstruction(Insn, 0, 11);
+ unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
+ int imm32 = SignExtend32<25>(tmp << 1);
+ if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
true, 4, Inst, Decoder))
- Inst.addOperand(MCOperand::CreateImm(SignExtend32<20>(imm << 1)));
- return S;
+ Inst.addOperand(MCOperand::CreateImm(imm32));
+
+ return Status;
}
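The old computation fed the raw J1/J2 bits and only six bits of imm10 into SignExtend32<20> of the shifted value, so branch targets whose J bits differ from S decoded incorrectly. As a worked check of the new arithmetic, here is a standalone sketch (not code from this patch; signExtend32 and decodeT2BranchImm are made-up names, with signExtend32 mirroring the contract of LLVM's SignExtend32<N>):

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low N bits of x to 32 bits (same contract as SignExtend32<N>).
    template <unsigned N> static int32_t signExtend32(uint32_t x) {
      return int32_t(x << (32 - N)) >> (32 - N);
    }

    // imm32 = SignExtend(S:I1:I2:imm10:imm11:'0', 32), with I1/I2 derived from J1/J2.
    static int32_t decodeT2BranchImm(unsigned S, unsigned J1, unsigned J2,
                                     unsigned imm10, unsigned imm11) {
      unsigned I1 = !(J1 ^ S);
      unsigned I2 = !(J2 ^ S);
      unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
      return signExtend32<25>(tmp << 1); // 25 significant bits once the '0' is appended
    }

    int main() {
      // S=1 with J1=J2=1 gives I1=I2=1: a small negative offset.
      assert(decodeT2BranchImm(1, 1, 1, 0x3ff, 0x7fe) == -4);
      // S=0 with J1=J2=1 gives I1=I2=0: a small positive offset.
      assert(decodeT2BranchImm(0, 1, 1, 0x000, 0x001) == 2);
      return 0;
    }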
static DecodeStatus
@@ -2701,6 +2719,8 @@ static DecodeStatus DecodeVLD1DupInstruction(MCInst &Inst, unsigned Insn,
unsigned align = fieldFromInstruction(Insn, 4, 1);
unsigned size = fieldFromInstruction(Insn, 6, 2);
+ if (size == 0 && align == 1)
+ return MCDisassembler::Fail;
align *= (1 << size);
switch (Inst.getOpcode()) {
@@ -2831,6 +2851,8 @@ static DecodeStatus DecodeVLD4DupInstruction(MCInst &Inst, unsigned Insn,
unsigned align = fieldFromInstruction(Insn, 4, 1);
if (size == 0x3) {
+ if (align == 0)
+ return MCDisassembler::Fail;
size = 4;
align = 16;
} else {
@@ -3170,7 +3192,7 @@ static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
int imm = Val & 0xFF;
if (!(Val & 0x100)) imm *= -1;
- Inst.addOperand(MCOperand::CreateImm(imm << 2));
+ Inst.addOperand(MCOperand::CreateImm(imm * 4));
}
return MCDisassembler::Success;
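One reason to prefer imm * 4 over imm << 2 here: imm may already have been negated just above, and left-shifting a negative signed integer is undefined behaviour in pre-C++20 C++, while the multiplication is well defined. A standalone illustration (not part of the patch):

    #include <cstdio>

    int main() {
      int imm = 0x7f;
      imm *= -1;                    // add bit (bit 8) clear in the encoding: negative offset
      // imm << 2 would left-shift a negative int (undefined behaviour pre-C++20);
      // imm * 4 computes the scaled offset portably.
      std::printf("%d\n", imm * 4); // prints -508
      return 0;
    }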
@@ -3710,8 +3732,16 @@ static DecodeStatus DecodeVLD1LN(MCInst &Inst, unsigned Insn,
if (fieldFromInstruction(Insn, 6, 1))
return MCDisassembler::Fail; // UNDEFINED
index = fieldFromInstruction(Insn, 7, 1);
- if (fieldFromInstruction(Insn, 4, 2) != 0)
- align = 4;
+
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0 :
+ align = 0; break;
+ case 3:
+ align = 4; break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ break;
}
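For the 32-bit element case the two index_align bits now decode strictly: 0 means no alignment and 3 means 4-byte alignment, while the values 1 and 2, which the ARM ARM marks UNDEFINED for this form, make the instruction fail to decode instead of being silently treated as aligned. VST1LN gets the same fix, and VLD4LN/VST4LN below likewise start rejecting their reserved index_align value.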
if (!Check(S, DecodeDPRRegisterClass(Inst, Rd, Address, Decoder)))
@@ -3769,8 +3799,16 @@ static DecodeStatus DecodeVST1LN(MCInst &Inst, unsigned Insn,
if (fieldFromInstruction(Insn, 6, 1))
return MCDisassembler::Fail; // UNDEFINED
index = fieldFromInstruction(Insn, 7, 1);
- if (fieldFromInstruction(Insn, 4, 2) != 0)
- align = 4;
+
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0:
+ align = 0; break;
+ case 3:
+ align = 4; break;
+ default:
+ return MCDisassembler::Fail;
+ }
+ break;
}
if (Rm != 0xF) { // Writeback
@@ -4090,8 +4128,15 @@ static DecodeStatus DecodeVLD4LN(MCInst &Inst, unsigned Insn,
inc = 2;
break;
case 2:
- if (fieldFromInstruction(Insn, 4, 2))
- align = 4 << fieldFromInstruction(Insn, 4, 2);
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0:
+ align = 0; break;
+ case 3:
+ return MCDisassembler::Fail;
+ default:
+ align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+ }
+
index = fieldFromInstruction(Insn, 7, 1);
if (fieldFromInstruction(Insn, 6, 1))
inc = 2;
@@ -4164,8 +4209,15 @@ static DecodeStatus DecodeVST4LN(MCInst &Inst, unsigned Insn,
inc = 2;
break;
case 2:
- if (fieldFromInstruction(Insn, 4, 2))
- align = 4 << fieldFromInstruction(Insn, 4, 2);
+ switch (fieldFromInstruction(Insn, 4, 2)) {
+ case 0:
+ align = 0; break;
+ case 3:
+ return MCDisassembler::Fail;
+ default:
+ align = 4 << fieldFromInstruction(Insn, 4, 2); break;
+ }
+
index = fieldFromInstruction(Insn, 7, 1);
if (fieldFromInstruction(Insn, 6, 1))
inc = 2;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 8b9109ec9868..dcc41d93f5ce 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -29,11 +29,33 @@ using namespace llvm;
///
/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
static unsigned translateShiftImm(unsigned imm) {
+ // lsr #32 and asr #32 exist, but should be encoded as a 0.
+ assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
if (imm == 0)
return 32;
return imm;
}
+/// Prints the shift value with an immediate value.
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+ unsigned ShImm, bool UseMarkup) {
+ if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
+ return;
+ O << ", ";
+
+ assert (!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+ O << getShiftOpcStr(ShOpc);
+
+ if (ShOpc != ARM_AM::rrx) {
+ O << " ";
+ if (UseMarkup)
+ O << "<imm:";
+ O << "#" << translateShiftImm(ShImm);
+ if (UseMarkup)
+ O << ">";
+ }
+}
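For reference, the helper prints nothing for no_shift or lsl #0, ", rrx" for rotate-with-extend, and otherwise the shift opcode plus immediate, with an encoded amount of 0 rendered as 32 via translateShiftImm; with markup enabled the immediate is wrapped, e.g. ", lsl <imm:#2>" or ", asr <imm:#32>".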
ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
const MCInstrInfo &MII,
@@ -45,7 +67,9 @@ ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
}
void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << getRegisterName(RegNo);
+ OS << markup("<reg:")
+ << getRegisterName(RegNo)
+ << markup(">");
}
void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
@@ -85,10 +109,13 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
printSBitModifierOperand(MI, 6, O);
printPredicateOperand(MI, 4, O);
- O << '\t' << getRegisterName(Dst.getReg())
- << ", " << getRegisterName(MO1.getReg());
+ O << '\t';
+ printRegName(O, Dst.getReg());
+ O << ", ";
+ printRegName(O, MO1.getReg());
- O << ", " << getRegisterName(MO2.getReg());
+ O << ", ";
+ printRegName(O, MO2.getReg());
assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
printAnnotation(O, Annot);
return;
@@ -104,15 +131,20 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
printSBitModifierOperand(MI, 5, O);
printPredicateOperand(MI, 3, O);
- O << '\t' << getRegisterName(Dst.getReg())
- << ", " << getRegisterName(MO1.getReg());
+ O << '\t';
+ printRegName(O, Dst.getReg());
+ O << ", ";
+ printRegName(O, MO1.getReg());
if (ARM_AM::getSORegShOp(MO2.getImm()) == ARM_AM::rrx) {
printAnnotation(O, Annot);
return;
}
- O << ", #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+ O << ", "
+ << markup("<imm:")
+ << "#" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()))
+ << markup(">");
printAnnotation(O, Annot);
return;
}
@@ -136,7 +168,9 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
MI->getOperand(3).getImm() == -4) {
O << '\t' << "push";
printPredicateOperand(MI, 4, O);
- O << "\t{" << getRegisterName(MI->getOperand(1).getReg()) << "}";
+ O << "\t{";
+ printRegName(O, MI->getOperand(1).getReg());
+ O << "}";
printAnnotation(O, Annot);
return;
}
@@ -159,7 +193,9 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
MI->getOperand(4).getImm() == 4) {
O << '\t' << "pop";
printPredicateOperand(MI, 5, O);
- O << "\t{" << getRegisterName(MI->getOperand(0).getReg()) << "}";
+ O << "\t{";
+ printRegName(O, MI->getOperand(0).getReg());
+ O << "}";
printAnnotation(O, Annot);
return;
}
@@ -198,7 +234,8 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
O << "\tldm";
printPredicateOperand(MI, 1, O);
- O << '\t' << getRegisterName(BaseReg);
+ O << '\t';
+ printRegName(O, BaseReg);
if (Writeback) O << "!";
O << ", ";
printRegisterList(MI, 3, O);
@@ -224,9 +261,11 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
unsigned Reg = Op.getReg();
- O << getRegisterName(Reg);
+ printRegName(O, Reg);
} else if (Op.isImm()) {
- O << '#' << Op.getImm();
+ O << markup("<imm:")
+ << '#' << Op.getImm()
+ << markup(">");
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
// If a symbolic branch target was added as a constant expression then print
@@ -244,13 +283,16 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
}
}
-void ARMInstPrinter::printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
+void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(OpNum);
if (MO1.isExpr())
O << *MO1.getExpr();
- else if (MO1.isImm())
- O << "[pc, #" << MO1.getImm() << "]";
+ else if (MO1.isImm()) {
+ O << markup("<mem:") << "[pc, "
+ << markup("<imm:") << "#" << MO1.getImm()
+ << markup(">]>", "]");
+ }
else
llvm_unreachable("Unknown LDR label operand?");
}
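The markup() calls threaded through this file drive the optional marked-up disassembly output: markup(s) expands to s when markup is requested and to nothing otherwise, and markup(a, b) falls back to b, so this operand prints as plain "[pc, #8]" by default and as "<mem:[pc, <imm:#8>]>" when markup is on, with registers elsewhere wrapped as "<reg:r0>".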
@@ -266,7 +308,7 @@ void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
const MCOperand &MO2 = MI->getOperand(OpNum+1);
const MCOperand &MO3 = MI->getOperand(OpNum+2);
- O << getRegisterName(MO1.getReg());
+ printRegName(O, MO1.getReg());
// Print the shift opc.
ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO3.getImm());
@@ -274,7 +316,8 @@ void ARMInstPrinter::printSORegRegOperand(const MCInst *MI, unsigned OpNum,
if (ShOpc == ARM_AM::rrx)
return;
- O << ' ' << getRegisterName(MO2.getReg());
+ O << ' ';
+ printRegName(O, MO2.getReg());
assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
}
@@ -283,14 +326,11 @@ void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
const MCOperand &MO1 = MI->getOperand(OpNum);
const MCOperand &MO2 = MI->getOperand(OpNum+1);
- O << getRegisterName(MO1.getReg());
+ printRegName(O, MO1.getReg());
// Print the shift opc.
- ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
- O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
- if (ShOpc == ARM_AM::rrx)
- return;
- O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+ printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+ ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
}
@@ -304,67 +344,51 @@ void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
const MCOperand &MO2 = MI->getOperand(Op+1);
const MCOperand &MO3 = MI->getOperand(Op+2);
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
if (!MO2.getReg()) {
- if (ARM_AM::getAM2Offset(MO3.getImm())) // Don't print +0.
- O << ", #"
+ if (ARM_AM::getAM2Offset(MO3.getImm())) { // Don't print +0.
+ O << ", "
+ << markup("<imm:")
+ << "#"
<< ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
- << ARM_AM::getAM2Offset(MO3.getImm());
- O << "]";
- return;
- }
-
- O << ", "
- << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
- << getRegisterName(MO2.getReg());
-
- if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
- O << ", "
- << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
- << " #" << ShImm;
- O << "]";
-}
-
-void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &MO1 = MI->getOperand(Op);
- const MCOperand &MO2 = MI->getOperand(Op+1);
- const MCOperand &MO3 = MI->getOperand(Op+2);
-
- O << "[" << getRegisterName(MO1.getReg()) << "], ";
-
- if (!MO2.getReg()) {
- unsigned ImmOffs = ARM_AM::getAM2Offset(MO3.getImm());
- O << '#'
- << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
- << ImmOffs;
+ << ARM_AM::getAM2Offset(MO3.getImm())
+ << markup(">");
+ }
+ O << "]" << markup(">");
return;
}
- O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
- << getRegisterName(MO2.getReg());
+ O << ", ";
+ O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()));
+ printRegName(O, MO2.getReg());
- if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
- O << ", "
- << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
- << " #" << ShImm;
+ printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()),
+ ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup);
+ O << "]" << markup(">");
}
void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(Op);
const MCOperand &MO2 = MI->getOperand(Op+1);
- O << "[" << getRegisterName(MO1.getReg()) << ", "
- << getRegisterName(MO2.getReg()) << "]";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ O << ", ";
+ printRegName(O, MO2.getReg());
+ O << "]" << markup(">");
}
void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(Op);
const MCOperand &MO2 = MI->getOperand(Op+1);
- O << "[" << getRegisterName(MO1.getReg()) << ", "
- << getRegisterName(MO2.getReg()) << ", lsl #1]";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ O << ", ";
+ printRegName(O, MO2.getReg());
+ O << ", lsl " << markup("<imm:") << "#1" << markup(">") << "]" << markup(">");
}
void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
@@ -376,13 +400,13 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
return;
}
+#ifndef NDEBUG
const MCOperand &MO3 = MI->getOperand(Op+2);
unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm());
+ assert(IdxMode != ARMII::IndexModePost &&
+ "Should be pre or offset index op");
+#endif
- if (IdxMode == ARMII::IndexModePost) {
- printAM2PostIndexOp(MI, Op, O);
- return;
- }
printAM2PreOrOffsetIndexOp(MI, Op, O);
}
@@ -394,19 +418,18 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
if (!MO1.getReg()) {
unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
- O << '#'
- << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
- << ImmOffs;
+ O << markup("<imm:")
+ << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
+ << ImmOffs
+ << markup(">");
return;
}
- O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
- << getRegisterName(MO1.getReg());
+ O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()));
+ printRegName(O, MO1.getReg());
- if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
- O << ", "
- << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
- << " #" << ShImm;
+ printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()),
+ ARM_AM::getAM2Offset(MO2.getImm()), UseMarkup);
}
//===--------------------------------------------------------------------===//
@@ -419,18 +442,22 @@ void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op,
const MCOperand &MO2 = MI->getOperand(Op+1);
const MCOperand &MO3 = MI->getOperand(Op+2);
- O << "[" << getRegisterName(MO1.getReg()) << "], ";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ O << "], " << markup(">");
if (MO2.getReg()) {
- O << (char)ARM_AM::getAM3Op(MO3.getImm())
- << getRegisterName(MO2.getReg());
+ O << (char)ARM_AM::getAM3Op(MO3.getImm());
+ printRegName(O, MO2.getReg());
return;
}
unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
- O << '#'
+ O << markup("<imm:")
+ << '#'
<< ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
- << ImmOffs;
+ << ImmOffs
+ << markup(">");
}
void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
@@ -439,23 +466,29 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
const MCOperand &MO2 = MI->getOperand(Op+1);
const MCOperand &MO3 = MI->getOperand(Op+2);
- O << '[' << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << '[';
+ printRegName(O, MO1.getReg());
if (MO2.getReg()) {
- O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()))
- << getRegisterName(MO2.getReg()) << ']';
+ O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm()));
+ printRegName(O, MO2.getReg());
+ O << ']' << markup(">");
return;
}
- //If the op is sub we have to print the immediate even if it is 0
+ //If the op is sub we have to print the immediate even if it is 0
unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
-
- if (ImmOffs || (op == ARM_AM::sub))
- O << ", #"
+
+ if (ImmOffs || (op == ARM_AM::sub)) {
+ O << ", "
+ << markup("<imm:")
+ << "#"
<< ARM_AM::getAddrOpcStr(op)
- << ImmOffs;
- O << ']';
+ << ImmOffs
+ << markup(">");
+ }
+ O << ']' << markup(">");
}
void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op,
@@ -483,15 +516,15 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
const MCOperand &MO2 = MI->getOperand(OpNum+1);
if (MO1.getReg()) {
- O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()))
- << getRegisterName(MO1.getReg());
+ O << getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()));
+ printRegName(O, MO1.getReg());
return;
}
unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
- O << '#'
- << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm()))
- << ImmOffs;
+ O << markup("<imm:")
+ << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs
+ << markup(">");
}
void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI,
@@ -499,7 +532,9 @@ void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI,
raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
unsigned Imm = MO.getImm();
- O << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff);
+ O << markup("<imm:")
+ << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff)
+ << markup(">");
}
void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
@@ -507,7 +542,8 @@ void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum,
const MCOperand &MO1 = MI->getOperand(OpNum);
const MCOperand &MO2 = MI->getOperand(OpNum+1);
- O << (MO2.getImm() ? "" : "-") << getRegisterName(MO1.getReg());
+ O << (MO2.getImm() ? "" : "-");
+ printRegName(O, MO1.getReg());
}
void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI,
@@ -515,7 +551,9 @@ void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI,
raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
unsigned Imm = MO.getImm();
- O << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2);
+ O << markup("<imm:")
+ << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2)
+ << markup(">");
}
@@ -536,16 +574,20 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
return;
}
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm());
unsigned Op = ARM_AM::getAM5Op(MO2.getImm());
if (ImmOffs || Op == ARM_AM::sub) {
- O << ", #"
+ O << ", "
+ << markup("<imm:")
+ << "#"
<< ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm()))
- << ImmOffs * 4;
+ << ImmOffs * 4
+ << markup(">");
}
- O << "]";
+ O << "]" << markup(">");
}
void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
@@ -553,18 +595,21 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum,
const MCOperand &MO1 = MI->getOperand(OpNum);
const MCOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
if (MO2.getImm()) {
// FIXME: Both darwin as and GNU as violate ARM docs here.
O << ", :" << (MO2.getImm() << 3);
}
- O << "]";
+ O << "]" << markup(">");
}
void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(OpNum);
- O << "[" << getRegisterName(MO1.getReg()) << "]";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ O << "]" << markup(">");
}
void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
@@ -573,8 +618,10 @@ void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI,
const MCOperand &MO = MI->getOperand(OpNum);
if (MO.getReg() == 0)
O << "!";
- else
- O << ", " << getRegisterName(MO.getReg());
+ else {
+ O << ", ";
+ printRegName(O, MO.getReg());
+ }
}
void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
@@ -585,7 +632,9 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI,
int32_t lsb = CountTrailingZeros_32(v);
int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
- O << '#' << lsb << ", #" << width;
+ O << markup("<imm:") << '#' << lsb << markup(">")
+ << ", "
+ << markup("<imm:") << '#' << width << markup(">");
}
void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum,
@@ -599,10 +648,18 @@ void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum,
unsigned ShiftOp = MI->getOperand(OpNum).getImm();
bool isASR = (ShiftOp & (1 << 5)) != 0;
unsigned Amt = ShiftOp & 0x1f;
- if (isASR)
- O << ", asr #" << (Amt == 0 ? 32 : Amt);
- else if (Amt)
- O << ", lsl #" << Amt;
+ if (isASR) {
+ O << ", asr "
+ << markup("<imm:")
+ << "#" << (Amt == 0 ? 32 : Amt)
+ << markup(">");
+ }
+ else if (Amt) {
+ O << ", lsl "
+ << markup("<imm:")
+ << "#" << Amt
+ << markup(">");
+ }
}
void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
@@ -611,7 +668,7 @@ void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum,
if (Imm == 0)
return;
assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!");
- O << ", lsl #" << Imm;
+ O << ", lsl " << markup("<imm:") << "#" << Imm << markup(">");
}
void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
@@ -621,7 +678,7 @@ void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum,
if (Imm == 0)
Imm = 32;
assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!");
- O << ", asr #" << Imm;
+ O << ", asr " << markup("<imm:") << "#" << Imm << markup(">");
}
void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
@@ -629,7 +686,7 @@ void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum,
O << "{";
for (unsigned i = OpNum, e = MI->getNumOperands(); i != e; ++i) {
if (i != OpNum) O << ", ";
- O << getRegisterName(MI->getOperand(i).getReg());
+ printRegName(O, MI->getOperand(i).getReg());
}
O << "}";
}
@@ -803,23 +860,29 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
int32_t OffImm = (int32_t)MO.getImm();
+ O << markup("<imm:");
if (OffImm == INT32_MIN)
O << "#-0";
else if (OffImm < 0)
O << "#-" << -OffImm;
else
O << "#" << OffImm;
+ O << markup(">");
}
void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
- O << "#" << MI->getOperand(OpNum).getImm() * 4;
+ O << markup("<imm:")
+ << "#" << MI->getOperand(OpNum).getImm() * 4
+ << markup(">");
}
void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNum).getImm();
- O << "#" << (Imm == 0 ? 32 : Imm);
+ O << markup("<imm:")
+ << "#" << (Imm == 0 ? 32 : Imm)
+ << markup(">");
}
void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum,
@@ -849,10 +912,13 @@ void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op,
return;
}
- O << "[" << getRegisterName(MO1.getReg());
- if (unsigned RegNum = MO2.getReg())
- O << ", " << getRegisterName(RegNum);
- O << "]";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ if (unsigned RegNum = MO2.getReg()) {
+ O << ", ";
+ printRegName(O, RegNum);
+ }
+ O << "]" << markup(">");
}
void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
@@ -867,10 +933,15 @@ void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI,
return;
}
- O << "[" << getRegisterName(MO1.getReg());
- if (unsigned ImmOffs = MO2.getImm())
- O << ", #" << ImmOffs * Scale;
- O << "]";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ if (unsigned ImmOffs = MO2.getImm()) {
+ O << ", "
+ << markup("<imm:")
+ << "#" << ImmOffs * Scale
+ << markup(">");
+ }
+ O << "]" << markup(">");
}
void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI,
@@ -906,14 +977,12 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
const MCOperand &MO2 = MI->getOperand(OpNum+1);
unsigned Reg = MO1.getReg();
- O << getRegisterName(Reg);
+ printRegName(O, Reg);
// Print the shift opc.
assert(MO2.isImm() && "Not a valid t2_so_reg value!");
- ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
- O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
- if (ShOpc != ARM_AM::rrx)
- O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+ printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+ ARM_AM::getSORegOffset(MO2.getImm()), UseMarkup);
}
void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
@@ -926,18 +995,27 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
return;
}
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
int32_t OffImm = (int32_t)MO2.getImm();
bool isSub = OffImm < 0;
// Special value for #-0. All others are normal.
if (OffImm == INT32_MIN)
OffImm = 0;
- if (isSub)
- O << ", #-" << -OffImm;
- else if (OffImm > 0)
- O << ", #" << OffImm;
- O << "]";
+ if (isSub) {
+ O << ", "
+ << markup("<imm:")
+ << "#-" << -OffImm
+ << markup(">");
+ }
+ else if (OffImm > 0) {
+ O << ", "
+ << markup("<imm:")
+ << "#" << OffImm
+ << markup(">");
+ }
+ O << "]" << markup(">");
}
void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
@@ -946,17 +1024,24 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
const MCOperand &MO1 = MI->getOperand(OpNum);
const MCOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
int32_t OffImm = (int32_t)MO2.getImm();
// Don't print +0.
+ if (OffImm != 0)
+ O << ", ";
+ if (OffImm != 0 && UseMarkup)
+ O << "<imm:";
if (OffImm == INT32_MIN)
- O << ", #-0";
+ O << "#-0";
else if (OffImm < 0)
- O << ", #-" << -OffImm;
+ O << "#-" << -OffImm;
else if (OffImm > 0)
- O << ", #" << OffImm;
- O << "]";
+ O << "#" << OffImm;
+ if (OffImm != 0 && UseMarkup)
+ O << ">";
+ O << "]" << markup(">");
}
void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
@@ -970,20 +1055,27 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
return;
}
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
int32_t OffImm = (int32_t)MO2.getImm();
assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
// Don't print +0.
+ if (OffImm != 0)
+ O << ", ";
+ if (OffImm != 0 && UseMarkup)
+ O << "<imm:";
if (OffImm == INT32_MIN)
- O << ", #-0";
+ O << "#-0";
else if (OffImm < 0)
- O << ", #-" << -OffImm;
+ O << "#-" << -OffImm;
else if (OffImm > 0)
- O << ", #" << OffImm;
- O << "]";
+ O << "#" << OffImm;
+ if (OffImm != 0 && UseMarkup)
+ O << ">";
+ O << "]" << markup(">");
}
void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI,
@@ -992,10 +1084,15 @@ void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI,
const MCOperand &MO1 = MI->getOperand(OpNum);
const MCOperand &MO2 = MI->getOperand(OpNum+1);
- O << "[" << getRegisterName(MO1.getReg());
- if (MO2.getImm())
- O << ", #" << MO2.getImm() * 4;
- O << "]";
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
+ if (MO2.getImm()) {
+ O << ", "
+ << markup("<imm:")
+ << "#" << MO2.getImm() * 4
+ << markup(">");
+ }
+ O << "]" << markup(">");
}
void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
@@ -1003,11 +1100,12 @@ void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(OpNum);
int32_t OffImm = (int32_t)MO1.getImm();
- // Don't print +0.
+ O << ", " << markup("<imm:");
if (OffImm < 0)
- O << ", #-" << -OffImm;
+ O << "#-" << -OffImm;
else
- O << ", #" << OffImm;
+ O << "#" << OffImm;
+ O << markup(">");
}
void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
@@ -1019,12 +1117,18 @@ void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
// Don't print +0.
+ if (OffImm != 0)
+ O << ", ";
+ if (OffImm != 0 && UseMarkup)
+ O << "<imm:";
if (OffImm == INT32_MIN)
- O << ", #-0";
+ O << "#-0";
else if (OffImm < 0)
- O << ", #-" << -OffImm;
+ O << "#-" << -OffImm;
else if (OffImm > 0)
- O << ", #" << OffImm;
+ O << "#" << OffImm;
+ if (OffImm != 0 && UseMarkup)
+ O << ">";
}
void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
@@ -1034,23 +1138,30 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
const MCOperand &MO2 = MI->getOperand(OpNum+1);
const MCOperand &MO3 = MI->getOperand(OpNum+2);
- O << "[" << getRegisterName(MO1.getReg());
+ O << markup("<mem:") << "[";
+ printRegName(O, MO1.getReg());
assert(MO2.getReg() && "Invalid so_reg load / store address!");
- O << ", " << getRegisterName(MO2.getReg());
+ O << ", ";
+ printRegName(O, MO2.getReg());
unsigned ShAmt = MO3.getImm();
if (ShAmt) {
assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
- O << ", lsl #" << ShAmt;
+ O << ", lsl "
+ << markup("<imm:")
+ << "#" << ShAmt
+ << markup(">");
}
- O << "]";
+ O << "]" << markup(">");
}
void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
const MCOperand &MO = MI->getOperand(OpNum);
- O << '#' << ARM_AM::getFPImmFloat(MO.getImm());
+ O << markup("<imm:")
+ << '#' << ARM_AM::getFPImmFloat(MO.getImm())
+ << markup(">");
}
void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
@@ -1058,14 +1169,18 @@ void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum,
unsigned EncodedImm = MI->getOperand(OpNum).getImm();
unsigned EltBits;
uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits);
- O << "#0x";
+ O << markup("<imm:")
+ << "#0x";
O.write_hex(Val);
+ O << markup(">");
}
void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
unsigned Imm = MI->getOperand(OpNum).getImm();
- O << "#" << Imm + 1;
+ O << markup("<imm:")
+ << "#" << Imm + 1
+ << markup(">");
}
void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
@@ -1073,23 +1188,30 @@ void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum,
unsigned Imm = MI->getOperand(OpNum).getImm();
if (Imm == 0)
return;
- O << ", ror #";
+ O << ", ror "
+ << markup("<imm:")
+ << "#";
switch (Imm) {
default: assert (0 && "illegal ror immediate!");
case 1: O << "8"; break;
case 2: O << "16"; break;
case 3: O << "24"; break;
}
+ O << markup(">");
}
void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
- O << "#" << 16 - MI->getOperand(OpNum).getImm();
+ O << markup("<imm:")
+ << "#" << 16 - MI->getOperand(OpNum).getImm()
+ << markup(">");
}
void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
- O << "#" << 32 - MI->getOperand(OpNum).getImm();
+ O << markup("<imm:")
+ << "#" << 32 - MI->getOperand(OpNum).getImm()
+ << markup(">");
}
void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
@@ -1099,7 +1221,9 @@ void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "}";
}
void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
@@ -1107,7 +1231,11 @@ void ARMInstPrinter::printVectorListTwo(const MCInst *MI, unsigned OpNum,
unsigned Reg = MI->getOperand(OpNum).getReg();
unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
- O << "{" << getRegisterName(Reg0) << ", " << getRegisterName(Reg1) << "}";
+ O << "{";
+ printRegName(O, Reg0);
+ O << ", ";
+ printRegName(O, Reg1);
+ O << "}";
}
void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI,
@@ -1116,7 +1244,11 @@ void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI,
unsigned Reg = MI->getOperand(OpNum).getReg();
unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
- O << "{" << getRegisterName(Reg0) << ", " << getRegisterName(Reg1) << "}";
+ O << "{";
+ printRegName(O, Reg0);
+ O << ", ";
+ printRegName(O, Reg1);
+ O << "}";
}
void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
@@ -1124,9 +1256,13 @@ void ARMInstPrinter::printVectorListThree(const MCInst *MI, unsigned OpNum,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << "}";
}
void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
@@ -1134,16 +1270,23 @@ void ARMInstPrinter::printVectorListFour(const MCInst *MI, unsigned OpNum,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 3) << "}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+ O << "}";
}
void ARMInstPrinter::printVectorListOneAllLanes(const MCInst *MI,
unsigned OpNum,
raw_ostream &O) {
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[]}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "[]}";
}
void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
@@ -1152,7 +1295,11 @@ void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
unsigned Reg = MI->getOperand(OpNum).getReg();
unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_1);
- O << "{" << getRegisterName(Reg0) << "[], " << getRegisterName(Reg1) << "[]}";
+ O << "{";
+ printRegName(O, Reg0);
+ O << "[], ";
+ printRegName(O, Reg1);
+ O << "[]}";
}
void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
@@ -1161,9 +1308,13 @@ void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[]}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << "[]}";
}
void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI,
@@ -1172,10 +1323,15 @@ void ARMInstPrinter::printVectorListFourAllLanes(const MCInst *MI,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 3) << "[]}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 1);
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 3);
+ O << "[]}";
}
void ARMInstPrinter::printVectorListTwoSpacedAllLanes(const MCInst *MI,
@@ -1184,7 +1340,11 @@ void ARMInstPrinter::printVectorListTwoSpacedAllLanes(const MCInst *MI,
unsigned Reg = MI->getOperand(OpNum).getReg();
unsigned Reg0 = MRI.getSubReg(Reg, ARM::dsub_0);
unsigned Reg1 = MRI.getSubReg(Reg, ARM::dsub_2);
- O << "{" << getRegisterName(Reg0) << "[], " << getRegisterName(Reg1) << "[]}";
+ O << "{";
+ printRegName(O, Reg0);
+ O << "[], ";
+ printRegName(O, Reg1);
+ O << "[]}";
}
void ARMInstPrinter::printVectorListThreeSpacedAllLanes(const MCInst *MI,
@@ -1193,9 +1353,13 @@ void ARMInstPrinter::printVectorListThreeSpacedAllLanes(const MCInst *MI,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 4) << "[]}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+ O << "[]}";
}
void ARMInstPrinter::printVectorListFourSpacedAllLanes(const MCInst *MI,
@@ -1204,10 +1368,15 @@ void ARMInstPrinter::printVectorListFourSpacedAllLanes(const MCInst *MI,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 4) << "[], "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 6) << "[]}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+ O << "[], ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+ O << "[]}";
}
void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
@@ -1216,9 +1385,13 @@ void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 4) << "}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+ O << "}";
}
void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI,
@@ -1227,8 +1400,13 @@ void ARMInstPrinter::printVectorListFourSpaced(const MCInst *MI,
// Normally, it's not safe to use register enum values directly with
// addition to get the next register, but for VFP registers, the
// sort order is guaranteed because they're all of the form D<n>.
- O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 4) << ", "
- << getRegisterName(MI->getOperand(OpNum).getReg() + 6) << "}";
+ O << "{";
+ printRegName(O, MI->getOperand(OpNum).getReg());
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 2);
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 4);
+ O << ", ";
+ printRegName(O, MI->getOperand(OpNum).getReg() + 6);
+ O << "}";
}
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 73d7bfd28502..b7bab5fdcd8e 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -126,7 +126,8 @@ public:
void printRotImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printPCLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
- void printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
void printFBits16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printFBits32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index ac6ce642dfa9..1ba6ab039f20 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -593,7 +593,9 @@ public:
const object::mach::CPUSubtypeARM Subtype;
DarwinARMAsmBackend(const Target &T, const StringRef TT,
object::mach::CPUSubtypeARM st)
- : ARMAsmBackend(T, TT), Subtype(st) { }
+ : ARMAsmBackend(T, TT), Subtype(st) {
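+    // Mach-O object files can describe data-in-code regions, so advertise
+    // support for emitting them.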
+ HasDataInCodeSupport = true;
+ }
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createARMMachObjectWriter(OS, /*Is64Bit=*/false,
@@ -674,7 +676,7 @@ void DarwinARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
} // end anonymous namespace
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin()) {
@@ -687,6 +689,15 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT) {
else if (TheTriple.getArchName() == "armv6" ||
TheTriple.getArchName() == "thumbv6")
return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V6);
+ else if (TheTriple.getArchName() == "armv7f" ||
+ TheTriple.getArchName() == "thumbv7f")
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7F);
+ else if (TheTriple.getArchName() == "armv7k" ||
+ TheTriple.getArchName() == "thumbv7k")
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7K);
+ else if (TheTriple.getArchName() == "armv7s" ||
+ TheTriple.getArchName() == "thumbv7s")
+ return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7S);
return new DarwinARMAsmBackend(T, TT, object::mach::CSARM_V7);
}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 7d6acbc5cfda..99e4f713f690 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -194,6 +194,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_uncondbranch:
Type = ELF::R_ARM_JUMP24;
break;
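+    // Thumb-2 conditional and unconditional branches both map to the Thumb
+    // jump relocation here.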
+ case ARM::fixup_t2_condbranch:
+ case ARM::fixup_t2_uncondbranch:
+ Type = ELF::R_ARM_THM_JUMP24;
+ break;
case ARM::fixup_arm_movt_hi16:
case ARM::fixup_arm_movt_hi16_pcrel:
Type = ELF::R_ARM_MOVT_PREL;
@@ -242,6 +246,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_TARGET1:
Type = ELF::R_ARM_TARGET1;
break;
+ case MCSymbolRefExpr::VK_ARM_TARGET2:
+ Type = ELF::R_ARM_TARGET2;
+ break;
}
break;
case ARM::fixup_arm_ldst_pcrel_12:
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index d32805e522a3..c1aab9c72cca 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -50,7 +50,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
Code32Directive = ".code\t32";
WeakRefDirective = "\t.weak\t";
- LCOMMDirectiveType = LCOMM::NoAlignment;
HasLEB128 = true;
SupportsDebugInformation = true;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 94f1082b5f6d..d0e127a8f335 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -35,8 +35,8 @@ STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
namespace {
class ARMMCCodeEmitter : public MCCodeEmitter {
- ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
+ ARMMCCodeEmitter(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
const MCSubtargetInfo &STI;
const MCContext &CTX;
@@ -783,7 +783,7 @@ getT2Imm8s4OpValue(const MCInst &MI, unsigned OpIdx,
// Immediate is always encoded as positive. The 'U' bit controls add vs sub.
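+  // Negate in unsigned arithmetic; negating INT32_MIN as a signed value
+  // would overflow.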
if (Imm8 < 0)
- Imm8 = -Imm8;
+ Imm8 = -(uint32_t)Imm8;
// Scaled by 4.
Imm8 /= 4;
@@ -934,6 +934,10 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
unsigned SBits = getShiftOp(ShOp);
+ // While "lsr #32" and "asr #32" exist, they are encoded with a 0 in the shift
+ // amount. However, it would be an easy mistake to make so check here.
+ assert((ShImm & ~0x1f) == 0 && "Out of range shift amount");
+
// {16-13} = Rn
// {12} = isAdd
// {11-0} = shifter
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index a727e087d291..b404e6c6e014 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -28,7 +28,7 @@ private:
explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
: Kind(_Kind), Expr(_Expr) {}
-
+
public:
/// @name Construction
/// @{
@@ -67,9 +67,6 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Target;
}
-
- static bool classof(const ARMMCExpr *) { return true; }
-
};
} // end namespace llvm
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 5df84c8b103a..00ffc94ac7d1 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -71,6 +71,14 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
else
// Use CPU to figure out the exact features.
ARMArchFeature = "+v7";
+ } else if (Len >= Idx+2 && TT[Idx+1] == 's') {
+ if (NoCPU)
+ // v7s: FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureT2XtPk
+ // Swift
+ ARMArchFeature = "+v7,+swift,+neon,+db,+t2dsp,+t2xtpk";
+ else
+ // Use CPU to figure out the exact features.
+ ARMArchFeature = "+v7";
} else {
// v7 CPUs have lots of different feature sets. If no CPU is specified,
// then assume v7a (e.g. cortex-a8) feature set. Otherwise, return
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 510302d4b3d0..a89981e4f060 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -46,7 +46,7 @@ MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU);
/// createARMELFObjectWriter - Construct an ELF object writer.
MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index a51e0fa3fbc5..2154c931769a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -41,6 +41,12 @@ class ARMMachObjectWriter : public MCMachObjectTargetWriter {
const MCFixup &Fixup, MCValue Target,
uint64_t &FixedValue);
+ bool requiresExternRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCFragment &Fragment,
+ unsigned RelocType, const MCSymbolData *SD,
+ uint64_t FixedValue);
+
public:
ARMMachObjectWriter(bool Is64Bit, uint32_t CPUType,
uint32_t CPUSubtype)
@@ -305,6 +311,46 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
Writer->addRelocation(Fragment->getParent(), MRE);
}
+bool ARMMachObjectWriter::requiresExternRelocation(MachObjectWriter *Writer,
+ const MCAssembler &Asm,
+ const MCFragment &Fragment,
+ unsigned RelocType,
+ const MCSymbolData *SD,
+ uint64_t FixedValue) {
+ // Most cases can be identified purely from the symbol.
+ if (Writer->doesSymbolRequireExternRelocation(SD))
+ return true;
+ int64_t Value = (int64_t)FixedValue; // The displacement is signed.
+ int64_t Range;
+ switch (RelocType) {
+ default:
+ return false;
+ case macho::RIT_ARM_Branch24Bit:
+ // PC pre-adjustment of 8 for these instructions.
+ Value -= 8;
+ // ARM BL/BLX has a 25-bit offset.
+ Range = 0x1ffffff;
+ break;
+ case macho::RIT_ARM_ThumbBranch22Bit:
+ // PC pre-adjustment of 4 for these instructions.
+ Value -= 4;
+ // Thumb BL/BLX has a 24-bit offset.
+ Range = 0xffffff;
+ }
+ // BL/BLX also use external relocations when an internal relocation
+ // would result in the target being out of range. This gives the linker
+ // enough information to generate a branch island.
+ const MCSectionData &SymSD = Asm.getSectionData(
+ SD->getSymbol().getSection());
+ Value += Writer->getSectionAddress(&SymSD);
+ Value -= Writer->getSectionAddress(Fragment.getParent());
+ // If the resultant value would be out of range for an internal relocation,
+ // use an external instead.
+ if (Value > Range || Value < -(Range + 1))
+ return true;
+ return false;
+}
+
void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
const MCAssembler &Asm,
const MCAsmLayout &Layout,
@@ -373,7 +419,8 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
}
// Check whether we need an external or internal relocation.
- if (Writer->doesSymbolRequireExternRelocation(SD)) {
+ if (requiresExternRelocation(Writer, Asm, *Fragment, RelocType, SD,
+ FixedValue)) {
IsExtern = 1;
Index = SD->getIndex();
@@ -410,7 +457,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
if (Type == macho::RIT_ARM_Half) {
// The other-half value only gets populated for the movt and movw
// relocation entries.
- uint32_t Value = 0;;
+ uint32_t Value = 0;
switch ((unsigned)Fixup.getKind()) {
default: break;
case ARM::fixup_arm_movw_lo16:
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index ad60e3282e68..70643bcda3ac 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -51,7 +51,8 @@ namespace {
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- bool isA9;
+ bool isLikeA9;
+ bool isSwift;
unsigned MIIdx;
MachineInstr* LastMIs[4];
SmallPtrSet<MachineInstr*, 4> IgnoreStall;
@@ -60,6 +61,7 @@ namespace {
void pushStack(MachineInstr *MI);
MachineInstr *getAccDefMI(MachineInstr *MI) const;
unsigned getDefReg(MachineInstr *MI) const;
+ bool hasLoopHazard(MachineInstr *MI) const;
bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
bool FindMLxHazard(MachineInstr *MI);
void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
@@ -135,6 +137,50 @@ unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
return Reg;
}
+/// hasLoopHazard - Check whether an MLx instruction is chained to itself across
+/// a single-MBB loop.
+bool MLxExpansion::hasLoopHazard(MachineInstr *MI) const {
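+  // Operand 1 is the accumulator input of the MLx; walk its definition chain
+  // (through PHIs, copies and insert_subregs) to see if it loops back to MI.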
+ unsigned Reg = MI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ while (true) {
+outer_continue:
+ if (DefMI->getParent() != MBB)
+ break;
+
+ if (DefMI->isPHI()) {
+ for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) {
+ if (DefMI->getOperand(i + 1).getMBB() == MBB) {
+ unsigned SrcReg = DefMI->getOperand(i).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ DefMI = MRI->getVRegDef(SrcReg);
+ goto outer_continue;
+ }
+ }
+ }
+ } else if (DefMI->isCopyLike()) {
+ Reg = DefMI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ } else if (DefMI->isInsertSubreg()) {
+ Reg = DefMI->getOperand(2).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DefMI = MRI->getVRegDef(Reg);
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ return DefMI == MI;
+}
+
bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
// FIXME: Detect integer instructions properly.
const MCInstrDesc &MCID = MI->getDesc();
@@ -149,6 +195,19 @@ bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
return false;
}
+static bool isFpMulInstruction(unsigned Opcode) {
+ switch (Opcode) {
+ case ARM::VMULS:
+ case ARM::VMULfd:
+ case ARM::VMULfq:
+ case ARM::VMULD:
+ case ARM::VMULslfd:
+ case ARM::VMULslfq:
+ return true;
+ default:
+ return false;
+ }
+}
bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
if (NumExpand >= ExpandLimit)
@@ -171,6 +230,12 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
return true;
}
+ // On Swift, we mostly care about hazards from multiplication instructions
+ // writing the accumulator and the pipelining of loop iterations by out-of-
+ // order execution.
+ if (isSwift)
+ return isFpMulInstruction(DefMI->getOpcode()) || hasLoopHazard(MI);
+
if (IgnoreStall.count(MI))
return false;
@@ -179,8 +244,8 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
// preserves the in-order retirement of the instructions.
// Look at the next few instructions, if *most* of them can cause hazards,
// then the scheduler can't *fix* this, we'd better break up the VMLA.
- unsigned Limit1 = isA9 ? 1 : 4;
- unsigned Limit2 = isA9 ? 1 : 4;
+ unsigned Limit1 = isLikeA9 ? 1 : 4;
+ unsigned Limit2 = isLikeA9 ? 1 : 4;
for (unsigned i = 1; i <= 4; ++i) {
int Idx = ((int)MIIdx - i + 4) % 4;
MachineInstr *NextMI = LastMIs[Idx];
@@ -316,7 +381,8 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
TRI = Fn.getTarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
- isA9 = STI->isCortexA9();
+ isLikeA9 = STI->isLikeA9() || STI->isSwift();
+ isSwift = STI->isSwift();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 5913a9c4ccdd..a85acaaa1494 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -1,8 +1,6 @@
add_llvm_library(LLVMTarget
Mangler.cpp
Target.cpp
- TargetData.cpp
- TargetELFWriterInfo.cpp
TargetInstrInfo.cpp
TargetIntrinsicInfo.cpp
TargetJITInfo.cpp
@@ -12,6 +10,7 @@ add_llvm_library(LLVMTarget
TargetMachineC.cpp
TargetRegisterInfo.cpp
TargetSubtargetInfo.cpp
+ TargetTransformImpl.cpp
)
foreach(t ${LLVM_TARGETS_TO_BUILD})
diff --git a/lib/Target/CellSPU/SPUAsmPrinter.cpp b/lib/Target/CellSPU/SPUAsmPrinter.cpp
index 03d5a9ae0c4c..3396e8b1ef39 100644
--- a/lib/Target/CellSPU/SPUAsmPrinter.cpp
+++ b/lib/Target/CellSPU/SPUAsmPrinter.cpp
@@ -130,8 +130,7 @@ namespace {
void
printS10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O)
{
- short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
- >> 16);
+ short value = MI->getOperand(OpNo).getImm();
assert((value >= -(1 << 9) && value <= (1 << 9) - 1)
&& "Invalid s10 argument");
O << value;
@@ -140,8 +139,7 @@ namespace {
void
printU10ImmOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O)
{
- short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
- >> 16);
+ short value = MI->getOperand(OpNo).getImm();
assert((value <= (1 << 10) - 1) && "Invalid u10 argument");
O << value;
}
diff --git a/lib/Target/CellSPU/SPUFrameLowering.cpp b/lib/Target/CellSPU/SPUFrameLowering.cpp
index fac806e1b0ea..f01199515a11 100644
--- a/lib/Target/CellSPU/SPUFrameLowering.cpp
+++ b/lib/Target/CellSPU/SPUFrameLowering.cpp
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index c27caeae7d45..5d5061054b08 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -67,8 +67,8 @@ namespace {
//! ConstantSDNode predicate for signed 16-bit values
/*!
- \arg CN The constant SelectionDAG node holding the value
- \arg Imm The returned 16-bit value, if returning true
+ \param CN The constant SelectionDAG node holding the value
+ \param Imm The returned 16-bit value, if returning true
This predicate tests the value in \a CN to see whether it can be
represented as a 16-bit, sign-extended quantity. Returns true if
@@ -83,12 +83,10 @@ namespace {
return true;
} else if (vt == MVT::i32) {
int32_t i_val = (int32_t) CN->getZExtValue();
- short s_val = (short) i_val;
- return i_val == s_val;
+ return i_val == SignExtend32<16>(i_val);
} else {
int64_t i_val = (int64_t) CN->getZExtValue();
- short s_val = (short) i_val;
- return i_val == s_val;
+ return i_val == SignExtend64<16>(i_val);
}
}
@@ -99,9 +97,10 @@ namespace {
EVT vt = FPN->getValueType(0);
if (vt == MVT::f32) {
int val = FloatToBits(FPN->getValueAPF().convertToFloat());
- int sval = (int) ((val << 16) >> 16);
- Imm = (short) val;
- return val == sval;
+ if (val == SignExtend32<16>(val)) {
+ Imm = (short) val;
+ return true;
+ }
}
return false;
@@ -306,10 +305,10 @@ namespace {
}
/*!
- \arg Op The ISD instruction operand
- \arg N The address to be tested
- \arg Base The base address
- \arg Index The base address index
+ \param Op The ISD instruction operand
+ \param N The address to be tested
+ \param Base The base address
+ \param Index The base address index
*/
bool
SPUDAGToDAGISel::SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base,
@@ -376,10 +375,10 @@ SPUDAGToDAGISel::SelectDForm2Addr(SDNode *Op, SDValue N, SDValue &Disp,
}
/*!
- \arg Op The ISD instruction (ignored)
- \arg N The address to be tested
- \arg Base Base address register/pointer
- \arg Index Base address index
+ \param Op The ISD instruction (ignored)
+ \param N The address to be tested
+ \param Base Base address register/pointer
+ \param Index Base address index
Examine the input address by a base register plus a signed 10-bit
displacement, [r+I10] (D-form address).
@@ -542,10 +541,10 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base,
}
/*!
- \arg Op The ISD instruction operand
- \arg N The address operand
- \arg Base The base pointer operand
- \arg Index The offset/index operand
+ \param Op The ISD instruction operand
+ \param N The address operand
+ \param Base The base pointer operand
+ \param Index The offset/index operand
If the address \a N can be expressed as an A-form or D-form address, returns
false. Otherwise, creates two operands, Base and Index that will become the
@@ -570,7 +569,7 @@ SPUDAGToDAGISel::SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base,
Utility function to use with COPY_TO_REGCLASS instructions. Returns a SDValue
to be used as the last parameter of a
CurDAG->getMachineNode(COPY_TO_REGCLASS,..., ) function call
- \arg VT the value type for which we want a register class
+ \param VT the value type for which we want a register class
*/
SDValue SPUDAGToDAGISel::getRC( MVT VT ) {
switch( VT.SimpleTy ) {
diff --git a/lib/Target/CellSPU/SPUSubtarget.h b/lib/Target/CellSPU/SPUSubtarget.h
index 7c4aa1430217..27d28b22dd04 100644
--- a/lib/Target/CellSPU/SPUSubtarget.h
+++ b/lib/Target/CellSPU/SPUSubtarget.h
@@ -80,9 +80,9 @@ namespace llvm {
return UseLargeMem;
}
- /// getTargetDataString - Return the pointer size and type alignment
+ /// getDataLayoutString - Return the pointer size and type alignment
/// properties of this subtarget.
- const char *getTargetDataString() const {
+ const char *getDataLayoutString() const {
return "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128"
"-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:64:128-v128:128:128"
"-s:128:128-n32:64";
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
index 54764f133cc1..918316572a2e 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.cpp
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -38,12 +38,13 @@ SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS),
- DataLayout(Subtarget.getTargetDataString()),
+ DL(Subtarget.getDataLayoutString()),
InstrInfo(*this),
FrameLowering(Subtarget),
TLInfo(*this),
TSInfo(*this),
- InstrItins(Subtarget.getInstrItineraryData()) {
+ InstrItins(Subtarget.getInstrItineraryData()),
+ STTI(&TLInfo), VTTI(&TLInfo) {
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
index 3e5d38c919c1..7f53ea6fbeb2 100644
--- a/lib/Target/CellSPU/SPUTargetMachine.h
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -20,7 +20,8 @@
#include "SPUSelectionDAGInfo.h"
#include "SPUFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetTransformImpl.h"
+#include "llvm/DataLayout.h"
namespace llvm {
@@ -28,12 +29,14 @@ namespace llvm {
///
class SPUTargetMachine : public LLVMTargetMachine {
SPUSubtarget Subtarget;
- const TargetData DataLayout;
+ const DataLayout DL;
SPUInstrInfo InstrInfo;
SPUFrameLowering FrameLowering;
SPUTargetLowering TLInfo;
SPUSelectionDAGInfo TSInfo;
InstrItineraryData InstrItins;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
SPUTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -70,13 +73,19 @@ public:
return &InstrInfo.getRegisterInfo();
}
- virtual const TargetData *getTargetData() const {
- return &DataLayout;
+ virtual const DataLayout *getDataLayout() const {
+ return &DL;
}
virtual const InstrItineraryData *getInstrItineraryData() const {
return &InstrItins;
}
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index c8e757becc72..5c909903f94b 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -285,14 +285,14 @@ void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
Out << "GlobalValue::LinkerPrivateLinkage"; break;
case GlobalValue::LinkerPrivateWeakLinkage:
Out << "GlobalValue::LinkerPrivateWeakLinkage"; break;
- case GlobalValue::LinkerPrivateWeakDefAutoLinkage:
- Out << "GlobalValue::LinkerPrivateWeakDefAutoLinkage"; break;
case GlobalValue::AvailableExternallyLinkage:
Out << "GlobalValue::AvailableExternallyLinkage "; break;
case GlobalValue::LinkOnceAnyLinkage:
Out << "GlobalValue::LinkOnceAnyLinkage "; break;
case GlobalValue::LinkOnceODRLinkage:
Out << "GlobalValue::LinkOnceODRLinkage "; break;
+ case GlobalValue::LinkOnceODRAutoHideLinkage:
+ Out << "GlobalValue::LinkOnceODRAutoHideLinkage"; break;
case GlobalValue::WeakAnyLinkage:
Out << "GlobalValue::WeakAnyLinkage"; break;
case GlobalValue::WeakODRLinkage:
@@ -474,13 +474,15 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
Out << "AttributeWithIndex PAWI;"; nl(Out);
for (unsigned i = 0; i < PAL.getNumSlots(); ++i) {
unsigned index = PAL.getSlot(i).Index;
- Attributes attrs = PAL.getSlot(i).Attrs;
- Out << "PAWI.Index = " << index << "U; PAWI.Attrs = Attribute::None ";
-#define HANDLE_ATTR(X) \
- if (attrs & Attribute::X) \
- Out << " | Attribute::" #X; \
- attrs &= ~Attribute::X;
-
+ AttrBuilder attrs(PAL.getSlot(i).Attrs);
+ Out << "PAWI.Index = " << index << "U;\n";
+ Out << " {\n AttrBuilder B;\n";
+
+#define HANDLE_ATTR(X) \
+ if (attrs.hasAttribute(Attributes::X)) \
+ Out << " B.addAttribute(Attributes::" #X ");\n"; \
+ attrs.removeAttribute(Attributes::X);
+
HANDLE_ATTR(SExt);
HANDLE_ATTR(ZExt);
HANDLE_ATTR(NoReturn);
@@ -505,19 +507,18 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
HANDLE_ATTR(ReturnsTwice);
HANDLE_ATTR(UWTable);
HANDLE_ATTR(NonLazyBind);
+ HANDLE_ATTR(MinSize);
#undef HANDLE_ATTR
- if (attrs & Attribute::StackAlignment)
- Out << " | Attribute::constructStackAlignmentFromInt("
- << Attribute::getStackAlignmentFromAttrs(attrs)
- << ")";
- attrs &= ~Attribute::StackAlignment;
- assert(attrs == 0 && "Unhandled attribute!");
- Out << ";";
+ if (attrs.hasAttribute(Attributes::StackAlignment))
+ Out << " B.addStackAlignmentAttr(" << attrs.getStackAlignment() << ")\n";
+ attrs.removeAttribute(Attributes::StackAlignment);
+ assert(!attrs.hasAttributes() && "Unhandled attribute!");
+ Out << " PAWI.Attrs = Attributes::get(mod->getContext(), B);\n }";
nl(Out);
Out << "Attrs.push_back(PAWI);";
nl(Out);
}
- Out << name << "_PAL = AttrListPtr::get(Attrs);";
+ Out << name << "_PAL = AttrListPtr::get(mod->getContext(), Attrs);";
nl(Out);
out(); nl(Out);
Out << '}'; nl(Out);
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index 9cbe7981a905..30d765d6c9ce 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -15,7 +15,7 @@
#define CPPTARGETMACHINE_H
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
namespace llvm {
@@ -35,7 +35,7 @@ struct CPPTargetMachine : public TargetMachine {
AnalysisID StartAfter,
AnalysisID StopAfter);
- virtual const TargetData *getTargetData() const { return 0; }
+ virtual const DataLayout *getDataLayout() const { return 0; }
};
extern Target TheCppBackendTarget;
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 1f2d8accbb09..306084bb8c52 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -16,6 +16,7 @@ add_llvm_target(HexagonCodeGen
HexagonExpandPredSpillCode.cpp
HexagonFrameLowering.cpp
HexagonHardwareLoops.cpp
+ HexagonMachineScheduler.cpp
HexagonMCInstLower.cpp
HexagonInstrInfo.cpp
HexagonISelDAGToDAG.cpp
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 5fa4740f2af3..c15bce608f5e 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -46,7 +46,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Target/TargetInstrInfo.h"
diff --git a/lib/Target/Hexagon/HexagonCallingConvLower.cpp b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
index ba8e679ebdae..73f9d9acab26 100644
--- a/lib/Target/Hexagon/HexagonCallingConvLower.cpp
+++ b/lib/Target/Hexagon/HexagonCallingConvLower.cpp
@@ -16,7 +16,7 @@
#include "HexagonCallingConvLower.h"
#include "Hexagon.h"
#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 703a128ee024..1c891f14d8fe 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -1350,6 +1350,8 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
} else {
setOperationAction(ISD::BR_JT, MVT::Other, Expand);
}
+ // Increase jump tables cutover to 5, was 4.
+ setMinimumJumpTableEntries(5);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index e472d490e0a0..a64c7a18164f 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -56,6 +56,16 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
bits<1> isPredicated = 0;
let TSFlags{6} = isPredicated;
+ // Dot new value store instructions.
+ bits<1> isNVStore = 0;
+ let TSFlags{8} = isNVStore;
+
+ // Fields used for relation models.
+ string BaseOpcode = "";
+ string CextOpcode = "";
+ string PredSense = "";
+ string PNewValue = "";
+ string InputType = ""; // Input is "imm" or "reg" type.
// *** The code above must match HexagonBaseInfo.h ***
}
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index c8f933dcf4bd..84354403084d 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/MathExtras.h"
#define GET_INSTRINFO_CTOR
+#define GET_INSTRMAP_INFO
#include "HexagonGenInstrInfo.inc"
#include "HexagonGenDFAPacketizer.inc"
@@ -1915,6 +1916,15 @@ unsigned HexagonInstrInfo::getInvertedPredicatedOpcode(const int Opc) const {
int HexagonInstrInfo::
getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
+ enum Hexagon::PredSense inPredSense;
+ inPredSense = invertPredicate ? Hexagon::PredSense_false :
+ Hexagon::PredSense_true;
+ int CondOpcode = Hexagon::getPredOpcode(Opc, inPredSense);
+ if (CondOpcode >= 0) // Valid Conditional opcode/instruction
+ return CondOpcode;
+
+ // This switch case will be removed once all the instructions have been
+ // modified to use relation maps.
switch(Opc) {
case Hexagon::TFR:
return !invertPredicate ? Hexagon::TFR_cPt :
@@ -1934,24 +1944,6 @@ getMatchingCondBranchOpcode(int Opc, bool invertPredicate) const {
case Hexagon::JMP_EQriPt_nv_V4:
return !invertPredicate ? Hexagon::JMP_EQriPt_nv_V4 :
Hexagon::JMP_EQriNotPt_nv_V4;
- case Hexagon::ADD_ri:
- return !invertPredicate ? Hexagon::ADD_ri_cPt :
- Hexagon::ADD_ri_cNotPt;
- case Hexagon::ADD_rr:
- return !invertPredicate ? Hexagon::ADD_rr_cPt :
- Hexagon::ADD_rr_cNotPt;
- case Hexagon::XOR_rr:
- return !invertPredicate ? Hexagon::XOR_rr_cPt :
- Hexagon::XOR_rr_cNotPt;
- case Hexagon::AND_rr:
- return !invertPredicate ? Hexagon::AND_rr_cPt :
- Hexagon::AND_rr_cNotPt;
- case Hexagon::OR_rr:
- return !invertPredicate ? Hexagon::OR_rr_cPt :
- Hexagon::OR_rr_cNotPt;
- case Hexagon::SUB_rr:
- return !invertPredicate ? Hexagon::SUB_rr_cPt :
- Hexagon::SUB_rr_cNotPt;
case Hexagon::COMBINE_rr:
return !invertPredicate ? Hexagon::COMBINE_rr_cPt :
Hexagon::COMBINE_rr_cNotPt;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index c0c0df6004cd..1d4a7060adf0 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -15,6 +15,18 @@ include "HexagonInstrFormats.td"
include "HexagonImmediates.td"
//===----------------------------------------------------------------------===//
+// Classes used for relation maps.
+//===----------------------------------------------------------------------===//
+// PredRel - Filter class used to relate non-predicated instructions with their
+// predicated forms.
+class PredRel;
+// PredNewRel - Filter class used to relate predicated instructions with their
+// predicate-new forms.
+class PredNewRel: PredRel;
+// ImmRegRel - Filter class used to relate instructions having reg-reg form
+// with their reg-imm counterparts.
+class ImmRegRel;
+//===----------------------------------------------------------------------===//
// Hexagon Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
def HasV2T : Predicate<"Subtarget.hasV2TOps()">;
@@ -148,37 +160,91 @@ multiclass CMP32_ri_s8<string OpcStr, PatFrag OpNode> {
}
//===----------------------------------------------------------------------===//
-// ALU32/ALU +
+// ALU32/ALU (Instructions with register-register form)
//===----------------------------------------------------------------------===//
-// Add.
-let isCommutable = 1, isPredicable = 1 in
-def ADD_rr : ALU32_rr<(outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = add($src1, $src2)",
- [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
- (i32 IntRegs:$src2)))]>;
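+// ALU32_Pbase - Generates a single predicated form of a register-register
+// ALU32 instruction: "if ($pred)" or "if (!$pred)", optionally with ".new".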
+multiclass ALU32_Pbase<string mnemonic, bit isNot,
+ bit isPredNew> {
-let isPredicable = 1 in
-def ADD_ri : ALU32_ri<(outs IntRegs:$dst),
- (ins IntRegs:$src1, s16Imm:$src2),
- "$dst = add($src1, #$src2)",
- [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$src1),
- s16ImmPred:$src2))]>;
+ let PNewValue = #!if(isPredNew, "new", "") in
+ def #NAME# : ALU32_rr<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, IntRegs:$src2, IntRegs: $src3),
+ !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+ ") $dst = ")#mnemonic#"($src2, $src3)",
+ []>;
+}
-// Logical operations.
-let isPredicable = 1 in
-def XOR_rr : ALU32_rr<(outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = xor($src1, $src2)",
- [(set (i32 IntRegs:$dst), (xor (i32 IntRegs:$src1),
- (i32 IntRegs:$src2)))]>;
+multiclass ALU32_Pred<string mnemonic, bit PredNot> {
+ let PredSense = #!if(PredNot, "false", "true") in {
+ defm _c#NAME# : ALU32_Pbase<mnemonic, PredNot, 0>;
+ // Predicate new
+ defm _cdn#NAME# : ALU32_Pbase<mnemonic, PredNot, 1>;
+ }
+}
-let isCommutable = 1, isPredicable = 1 in
-def AND_rr : ALU32_rr<(outs IntRegs:$dst),
+let InputType = "reg" in
+multiclass ALU32_base<string mnemonic, string CextOp, SDNode OpNode> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_rr in {
+ let isPredicable = 1 in
+ def #NAME# : ALU32_rr<(outs IntRegs:$dst),
(ins IntRegs:$src1, IntRegs:$src2),
- "$dst = and($src1, $src2)",
- [(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
- (i32 IntRegs:$src2)))]>;
+ "$dst = "#mnemonic#"($src1, $src2)",
+ [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+ (i32 IntRegs:$src2)))]>;
+
+ let neverHasSideEffects = 1, isPredicated = 1 in {
+ defm Pt : ALU32_Pred<mnemonic, 0>;
+ defm NotPt : ALU32_Pred<mnemonic, 1>;
+ }
+ }
+}
+
+let isCommutable = 1 in {
+ defm ADD_rr : ALU32_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
+ defm AND_rr : ALU32_base<"and", "AND", and>, ImmRegRel, PredNewRel;
+ defm XOR_rr : ALU32_base<"xor", "XOR", xor>, ImmRegRel, PredNewRel;
+ defm OR_rr : ALU32_base<"or", "OR", or>, ImmRegRel, PredNewRel;
+}
+
+defm SUB_rr : ALU32_base<"sub", "SUB", sub>, ImmRegRel, PredNewRel;
+
+//===----------------------------------------------------------------------===//
+// ALU32/ALU (ADD with register-immediate form)
+//===----------------------------------------------------------------------===//
+multiclass ALU32ri_Pbase<string mnemonic, bit isNot, bit isPredNew> {
+ let PNewValue = #!if(isPredNew, "new", "") in
+ def #NAME# : ALU32_ri<(outs IntRegs:$dst),
+ (ins PredRegs:$src1, IntRegs:$src2, s8Imm: $src3),
+ !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew,".new) $dst = ",
+ ") $dst = ")#mnemonic#"($src2, #$src3)",
+ []>;
+}
+
+multiclass ALU32ri_Pred<string mnemonic, bit PredNot> {
+ let PredSense = #!if(PredNot, "false", "true") in {
+ defm _c#NAME# : ALU32ri_Pbase<mnemonic, PredNot, 0>;
+ // Predicate new
+ defm _cdn#NAME# : ALU32ri_Pbase<mnemonic, PredNot, 1>;
+ }
+}
+
+let InputType = "imm" in
+multiclass ALU32ri_base<string mnemonic, string CextOp, SDNode OpNode> {
+ let CextOpcode = CextOp, BaseOpcode = CextOp#_ri in {
+ let isPredicable = 1 in
+ def #NAME# : ALU32_ri<(outs IntRegs:$dst),
+ (ins IntRegs:$src1, s16Imm:$src2),
+ "$dst = "#mnemonic#"($src1, #$src2)",
+ [(set (i32 IntRegs:$dst), (OpNode (i32 IntRegs:$src1),
+ (s16ImmPred:$src2)))]>;
+
+ let neverHasSideEffects = 1, isPredicated = 1 in {
+ defm Pt : ALU32ri_Pred<mnemonic, 0>;
+ defm NotPt : ALU32ri_Pred<mnemonic, 1>;
+ }
+ }
+}
+
+defm ADD_ri : ALU32ri_base<"add", "ADD", add>, ImmRegRel, PredNewRel;
def OR_ri : ALU32_ri<(outs IntRegs:$dst),
(ins IntRegs:$src1, s10Imm:$src2),
@@ -197,13 +263,6 @@ def AND_ri : ALU32_ri<(outs IntRegs:$dst),
[(set (i32 IntRegs:$dst), (and (i32 IntRegs:$src1),
s10ImmPred:$src2))]>;
-let isCommutable = 1, isPredicable = 1 in
-def OR_rr : ALU32_rr<(outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = or($src1, $src2)",
- [(set (i32 IntRegs:$dst), (or (i32 IntRegs:$src1),
- (i32 IntRegs:$src2)))]>;
-
// Negate.
def NEG : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
"$dst = neg($src1)",
@@ -214,14 +273,6 @@ def NOP : ALU32_rr<(outs), (ins),
"nop",
[]>;
-// Subtract.
-let isPredicable = 1 in
-def SUB_rr : ALU32_rr<(outs IntRegs:$dst),
- (ins IntRegs:$src1, IntRegs:$src2),
- "$dst = sub($src1, $src2)",
- [(set (i32 IntRegs:$dst), (sub (i32 IntRegs:$src1),
- (i32 IntRegs:$src2)))]>;
-
// Rd32=sub(#s10,Rs32)
def SUB_ri : ALU32_ri<(outs IntRegs:$dst),
(ins s10Imm:$src1, IntRegs:$src2),
@@ -348,56 +399,6 @@ def ZXTH : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$src1),
// ALU32/PRED +
//===----------------------------------------------------------------------===//
-// Conditional add.
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_ri_cPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
- "if ($src1) $dst = add($src2, #$src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_ri_cNotPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
- "if (!$src1) $dst = add($src2, #$src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_ri_cdnPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
- "if ($src1.new) $dst = add($src2, #$src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_ri_cdnNotPt : ALU32_ri<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, s8Imm:$src3),
- "if (!$src1.new) $dst = add($src2, #$src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_rr_cPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1) $dst = add($src2, $src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1) $dst = add($src2, $src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1.new) $dst = add($src2, $src3)",
- []>;
-
-let neverHasSideEffects = 1, isPredicated = 1 in
-def ADD_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1.new) $dst = add($src2, $src3)",
- []>;
-
-
// Conditional combine.
let neverHasSideEffects = 1, isPredicated = 1 in
@@ -424,108 +425,6 @@ def COMBINE_rr_cdnNotPt : ALU32_rr<(outs DoubleRegs:$dst),
"if (!$src1.new) $dst = combine($src2, $src3)",
[]>;
-// Conditional logical operations.
-
-let isPredicated = 1 in
-def XOR_rr_cPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1) $dst = xor($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def XOR_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1) $dst = xor($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def XOR_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1.new) $dst = xor($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def XOR_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1.new) $dst = xor($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def AND_rr_cPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1) $dst = and($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def AND_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1) $dst = and($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def AND_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1.new) $dst = and($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def AND_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1.new) $dst = and($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def OR_rr_cPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1) $dst = or($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def OR_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1) $dst = or($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def OR_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1.new) $dst = or($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def OR_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1.new) $dst = or($src2, $src3)",
- []>;
-
-
-// Conditional subtract.
-
-let isPredicated = 1 in
-def SUB_rr_cPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1) $dst = sub($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def SUB_rr_cNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1) $dst = sub($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def SUB_rr_cdnPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if ($src1.new) $dst = sub($src2, $src3)",
- []>;
-
-let isPredicated = 1 in
-def SUB_rr_cdnNotPt : ALU32_rr<(outs IntRegs:$dst),
- (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- "if (!$src1.new) $dst = sub($src2, $src3)",
- []>;
-
-
// Conditional transfer.
let neverHasSideEffects = 1, isPredicated = 1 in
def TFR_cPt : ALU32_rr<(outs IntRegs:$dst), (ins PredRegs:$src1, IntRegs:$src2),
@@ -3546,4 +3445,31 @@ include "HexagonInstrInfoV5.td"
// V5 Instructions -
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate non-predicated instructions with their
+// predicated formats - true and false.
+//
+
+def getPredOpcode : InstrMapping {
+ let FilterClass = "PredRel";
+ // Instructions with the same BaseOpcode and isNVStore values form a row.
+ let RowFields = ["BaseOpcode", "isNVStore", "PNewValue"];
+ // Instructions with the same predicate sense form a column.
+ let ColFields = ["PredSense"];
+ // The key column is the unpredicated instructions.
+ let KeyCol = [""];
+ // Value columns are PredSense=true and PredSense=false
+ let ValueCols = [["true"], ["false"]];
+}
+//===----------------------------------------------------------------------===//
+// Generate mapping table to relate predicated instructions with their .new
+// format.
+//
+def getPredNewOpcode : InstrMapping {
+ let FilterClass = "PredNewRel";
+ let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+ let ColFields = ["PNewValue"];
+ let KeyCol = [""];
+ let ValueCols = [["new"]];
+}
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
new file mode 100644
index 000000000000..0e9ef4838d8a
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -0,0 +1,681 @@
+//===- HexagonMachineScheduler.cpp - MI Scheduler for Hexagon -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "misched"
+
+#include "HexagonMachineScheduler.h"
+
+#include <queue>
+
+using namespace llvm;
+
+/// Platform-specific modifications to the DAG.
+void VLIWMachineScheduler::postprocessDAG() {
+ SUnit* LastSequentialCall = NULL;
+ // Currently we only catch the situation when compare gets scheduled
+ // before preceding call.
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+ // Remember the call.
+ if (SUnits[su].getInstr()->isCall())
+ LastSequentialCall = &(SUnits[su]);
+ // Look for a compare that defines a predicate.
+ else if (SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+ SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
+ }
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+/// It is _not_ precise (stateful); it is more like
+/// another heuristic. Many corner cases are figured
+/// out empirically.
+bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
+ if (!SU || !SU->getInstr())
+ return false;
+
+ // First see if the pipeline could receive this instruction
+ // in the current cycle.
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ if (!ResourcesModel->canReserveResources(SU->getInstr()))
+ return false;
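+  // The pseudo-instructions below never reserve DFA resources, so the
+  // pipeline check above is skipped for them.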
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ break;
+ }
+
+ // Now make sure this SU has no dependencies on instructions that are
+ // already in the packet.
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ if (Packet[i]->Succs.size() == 0)
+ continue;
+ for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+ E = Packet[i]->Succs.end(); I != E; ++I) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order dependencies.
+ if (I->isCtrl())
+ continue;
+
+ if (I->getSUnit() == SU)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Keep track of available resources.
+bool VLIWResourceModel::reserveResources(SUnit *SU) {
+ bool startNewCycle = false;
+ // Artificially reset state.
+ if (!SU) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ TotalPackets++;
+ return false;
+ }
+ // If this SU does not fit in the packet,
+ // start a new one.
+ if (!isResourceAvailable(SU)) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ switch (SU->getInstr()->getOpcode()) {
+ default:
+ ResourcesModel->reserveResources(SU->getInstr());
+ break;
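+  // Labels and pseudo-instructions occupy no packet slot, so there is
+  // nothing to reserve for them.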
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::PROLOG_LABEL:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::COPY:
+ case TargetOpcode::INLINEASM:
+ break;
+ }
+ Packet.push_back(SU);
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Packet[" << TotalPackets << "]:\n");
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i) {
+ DEBUG(dbgs() << "\t[" << i << "] SU(");
+ DEBUG(dbgs() << Packet[i]->NodeNum << ")\t");
+ DEBUG(Packet[i]->getInstr()->dump());
+ }
+#endif
+
+ // If the packet is now full, reset the state so that we start fresh in the
+ // next cycle.
+ if (Packet.size() >= SchedModel->getIssueWidth()) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ TotalPackets++;
+ startNewCycle = true;
+ }
+
+ return startNewCycle;
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+void VLIWMachineScheduler::schedule() {
+ DEBUG(dbgs()
+ << "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
+ << " " << BB->getName()
+ << " in_func " << BB->getParent()->getFunction()->getName()
+ << " at loop depth " << MLI.getLoopDepth(BB)
+ << " \n");
+
+ buildDAGWithRegPressure();
+
+ // Postprocess the DAG to add platform specific artificial dependencies.
+ postprocessDAG();
+
+ // To view Height/Depth correctly, they should be accessed at least once.
+ DEBUG(unsigned maxH = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ if (SUnits[su].getHeight() > maxH)
+ maxH = SUnits[su].getHeight();
+ dbgs() << "Max Height " << maxH << "\n";);
+ DEBUG(unsigned maxD = 0;
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ if (SUnits[su].getDepth() > maxD)
+ maxD = SUnits[su].getDepth();
+ dbgs() << "Max Depth " << maxD << "\n";);
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+
+ initQueues();
+
+ bool IsTopNode = false;
+ while (SUnit *SU = SchedImpl->pickNode(IsTopNode)) {
+ if (!checkSchedLimit())
+ break;
+
+ scheduleMI(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+}
+
+void ConvergingVLIWScheduler::initialize(ScheduleDAGMI *dag) {
+ DAG = static_cast<VLIWMachineScheduler*>(dag);
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+ Top.init(DAG, SchedModel);
+ Bot.init(DAG, SchedModel);
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = DAG->getSchedModel()->getInstrItineraries();
+ const TargetMachine &TM = DAG->MF.getTarget();
+ Top.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+ Bot.HazardRec = TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
+
+ Top.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+ Bot.ResourceModel = new VLIWResourceModel(TM, DAG->getSchedModel());
+
+ assert((!llvm::ForceTopDown || !llvm::ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+}
+
+void ConvergingVLIWScheduler::releaseTopNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ for (SUnit::succ_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ unsigned PredReadyCycle = I->getSUnit()->TopReadyCycle;
+ unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+ Top.MaxMinLatency = std::max(MinLatency, Top.MaxMinLatency);
+#endif
+ if (SU->TopReadyCycle < PredReadyCycle + MinLatency)
+ SU->TopReadyCycle = PredReadyCycle + MinLatency;
+ }
+ Top.releaseNode(SU, SU->TopReadyCycle);
+}
+
+void ConvergingVLIWScheduler::releaseBottomNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ unsigned SuccReadyCycle = I->getSUnit()->BotReadyCycle;
+ unsigned MinLatency = I->getMinLatency();
+#ifndef NDEBUG
+ Bot.MaxMinLatency = std::max(MinLatency, Bot.MaxMinLatency);
+#endif
+ if (SU->BotReadyCycle < SuccReadyCycle + MinLatency)
+ SU->BotReadyCycle = SuccReadyCycle + MinLatency;
+ }
+ Bot.releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool ConvergingVLIWScheduler::SchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled())
+ return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
+
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if (IssueCount + uops > SchedModel->getIssueWidth())
+ return true;
+
+ return false;
+}
+
+void ConvergingVLIWScheduler::SchedBoundary::releaseNode(SUnit *SU,
+ unsigned ReadyCycle) {
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+  if (ReadyCycle > CurrCycle || checkHazard(SU))
+    Pending.push(SU);
+ else
+ Available.push(SU);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void ConvergingVLIWScheduler::SchedBoundary::bumpCycle() {
+ unsigned Width = SchedModel->getIssueWidth();
+ IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+
+ assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ } else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true;
+
+ DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+ << CurrCycle << '\n');
+}
+
+/// Move the boundary of scheduled code by one SUnit.
+void ConvergingVLIWScheduler::SchedBoundary::bumpNode(SUnit *SU) {
+ bool startNewCycle = false;
+
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ }
+
+ // Update DFA model.
+ startNewCycle = ResourceModel->reserveResources(SU);
+
+ // Check the instruction group dispatch limit.
+ // TODO: Check if this SU must end a dispatch group.
+ IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+ if (startNewCycle) {
+ DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ bumpCycle();
+ }
+ else
+ DEBUG(dbgs() << "*** IssueCount " << IssueCount
+ << " at cycle " << CurrCycle << '\n');
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void ConvergingVLIWScheduler::SchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin()+i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (ReadyCycle > CurrCycle)
+ continue;
+
+ if (checkHazard(SU))
+ continue;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin()+i);
+ --i; --e;
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void ConvergingVLIWScheduler::SchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// advance the cycle until at least one node is ready. If multiple instructions
+/// are ready, return NULL.
+SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ for (unsigned i = 0; Available.empty(); ++i) {
+ assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
+ "permanent hazard"); (void)i;
+ ResourceModel->reserveResources(0);
+ bumpCycle();
+ releasePending();
+ }
+ if (Available.size() == 1)
+ return *Available.begin();
+ return NULL;
+}
+
+#ifndef NDEBUG
+void ConvergingVLIWScheduler::traceCandidate(const char *Label,
+ const ReadyQueue &Q,
+ SUnit *SU, PressureElement P) {
+ dbgs() << Label << " " << Q.getName() << " ";
+ if (P.isValid())
+ dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+ << " ";
+ else
+ dbgs() << " ";
+ SU->dump(DAG);
+}
+#endif
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = 0;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit &Pred = *I->getSUnit();
+ if (!Pred.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ return 0;
+ OnlyAvailablePred = &Pred;
+ }
+ }
+ return OnlyAvailablePred;
+}
+
+/// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
+/// of SU, return it, otherwise return null.
+static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
+ SUnit *OnlyAvailableSucc = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ SUnit &Succ = *I->getSUnit();
+ if (!Succ.isScheduled) {
+ // We found an available, but not scheduled, successor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
+ return 0;
+ OnlyAvailableSucc = &Succ;
+ }
+ }
+ return OnlyAvailableSucc;
+}
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 100;
+static const unsigned PriorityThree = 50;
+static const unsigned PriorityFour = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned FactorOne = 2;
+
+/// Single point to compute overall scheduling cost.
+/// TODO: More heuristics will be used soon.
+int ConvergingVLIWScheduler::SchedulingCost(ReadyQueue &Q, SUnit *SU,
+ SchedCandidate &Candidate,
+ RegPressureDelta &Delta,
+ bool verbose) {
+ // Initial trivial priority.
+ int ResCount = 1;
+
+ // Do not waste time on a node that is already scheduled.
+ if (!SU || SU->isScheduled)
+ return ResCount;
+
+ // Forced priority is high.
+ if (SU->isScheduleHigh)
+ ResCount += PriorityOne;
+
+ // Critical path first.
+ if (Q.getID() == TopQID) {
+ ResCount += (SU->getHeight() * ScaleTwo);
+
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (Top.ResourceModel->isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+ } else {
+ ResCount += (SU->getDepth() * ScaleTwo);
+
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (Bot.ResourceModel->isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+ }
+
+ unsigned NumNodesBlocking = 0;
+ if (Q.getID() == TopQID) {
+ // How many SUs does this node block from scheduling?
+ // Count the successors for which this node is
+ // the sole unscheduled predecessor.
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+ } else {
+ // Count the predecessors for which this node is
+ // the sole unscheduled successor.
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (getSingleUnscheduledSucc(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+ }
+ ResCount += (NumNodesBlocking * ScaleTwo);
+
+ // Factor in reg pressure as a heuristic.
+ ResCount -= (Delta.Excess.UnitIncrease*PriorityThree);
+ ResCount -= (Delta.CriticalMax.UnitIncrease*PriorityThree);
+
+ DEBUG(if (verbose) dbgs() << " Total(" << ResCount << ")");
+
+ return ResCount;
+}
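As a rough illustration of how these weights interact (a worked example, not part of the patch): take a top-queue node with height 3, resources available in the current packet, two successors for which it is the sole unscheduled predecessor, and no register-pressure increase. The cost then accumulates as:

  ResCount  = 1 + 3 * ScaleTwo      // 1 + 30  = 31
  ResCount <<= FactorOne            // 31 << 2 = 124
  ResCount += 2 * ScaleTwo          // 124 + 20 = 144

so resource availability (the shift) dominates the raw height and blocking terms in this case.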
+
+/// Pick the best candidate from the top queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+ConvergingVLIWScheduler::CandResult ConvergingVLIWScheduler::
+pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate) {
+ DEBUG(Q.dump());
+
+ // getMaxPressureDelta temporarily modifies the tracker.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+ // Candidate.SU remains NULL if no candidate in this queue beats the existing candidate.
+ CandResult FoundCandidate = NoCand;
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ RegPressureDelta RPDelta;
+ TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+
+ int CurrentCost = SchedulingCost(Q, *I, Candidate, RPDelta, false);
+
+ // Initialize the candidate if needed.
+ if (!Candidate.SU) {
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = NodeOrder;
+ continue;
+ }
+
+ // Best cost.
+ if (CurrentCost > Candidate.SCost) {
+ DEBUG(traceCandidate("CCAND", Q, *I));
+ Candidate.SU = *I;
+ Candidate.RPDelta = RPDelta;
+ Candidate.SCost = CurrentCost;
+ FoundCandidate = BestCost;
+ continue;
+ }
+
+ // Fall through to original instruction order.
+ // Only consider node order if Candidate was chosen from this Q.
+ if (FoundCandidate == NoCand)
+ continue;
+ }
+ return FoundCandidate;
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *ConvergingVLIWScheduler::pickNodeBidrectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ return SU;
+ }
+ SchedCandidate BotCand;
+ // Prefer bottom scheduling when heuristics are silent.
+ CandResult BotResult = pickNodeFromQueue(Bot.Available,
+ DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+
+ // If either Q has a single candidate that provides the least increase in
+ // Excess pressure, we can immediately schedule from that Q.
+ //
+ // RegionCriticalPSets summarizes the pressure within the scheduled region and
+ // affects picking from either Q. If scheduling in one direction must
+ // increase pressure for one of the excess PSets, then schedule in that
+ // direction first to provide more freedom in the other direction.
+ if (BotResult == SingleExcess || BotResult == SingleCritical) {
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ // Check if the top Q has a better candidate.
+ SchedCandidate TopCand;
+ CandResult TopResult = pickNodeFromQueue(Top.Available,
+ DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+
+ if (TopResult == SingleExcess || TopResult == SingleCritical) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // If either Q has a single candidate that minimizes pressure above the
+ // original region's pressure pick it.
+ if (BotResult == SingleMax) {
+ IsTopNode = false;
+ return BotCand.SU;
+ }
+ if (TopResult == SingleMax) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ if (TopCand.SCost > BotCand.SCost) {
+ IsTopNode = true;
+ return TopCand.SU;
+ }
+ // Otherwise prefer the bottom candidate in node order.
+ IsTopNode = false;
+ return BotCand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return NULL;
+ }
+ SUnit *SU;
+ if (llvm::ForceTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate TopCand;
+ CandResult TopResult =
+ pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
+ assert(TopResult != NoCand && "failed to find the first candidate");
+ (void)TopResult;
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (llvm::ForceBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ SchedCandidate BotCand;
+ CandResult BotResult =
+ pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
+ assert(BotResult != NoCand && "failed to find the first candidate");
+ (void)BotResult;
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidrectional(IsTopNode);
+ }
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ DEBUG(dbgs() << "*** " << (IsTopNode ? "Top" : "Bottom")
+ << " Scheduling Instruction in cycle "
+ << (IsTopNode ? Top.CurrCycle : Bot.CurrCycle) << '\n';
+ SU->dump(DAG));
+ return SU;
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, VLIWMachineScheduler needs
+/// to update its state based on the current cycle before MachineSchedStrategy
+/// does.
+void ConvergingVLIWScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ if (IsTopNode) {
+ SU->TopReadyCycle = Top.CurrCycle;
+ Top.bumpNode(SU);
+ } else {
+ SU->BotReadyCycle = Bot.CurrCycle;
+ Bot.bumpNode(SU);
+ }
+}
+
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
new file mode 100644
index 000000000000..fe0242a0f74e
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -0,0 +1,244 @@
+//===-- HexagonMachineScheduler.h - Custom Hexagon MI scheduler. ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Custom Hexagon MI scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef HEXAGONMACHINESCHEDULER_H
+#define HEXAGONMACHINESCHEDULER_H
+
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/PriorityQueue.h"
+
+using namespace llvm;
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// ConvergingVLIWScheduler - Implementation of the standard
+// MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+class VLIWResourceModel {
+ /// ResourcesModel - Represents VLIW state.
+ /// Not limited to VLIW targets per se, but assumes
+ /// definition of a DFA by the target.
+ DFAPacketizer *ResourcesModel;
+
+ const TargetSchedModel *SchedModel;
+
+ /// Local packet/bundle model. Purely
+ /// internal to the MI scheduler at this time.
+ std::vector<SUnit*> Packet;
+
+ /// Total packets created.
+ unsigned TotalPackets;
+
+public:
+ VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
+ SchedModel(SM), TotalPackets(0) {
+ ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, NULL);
+
+ // This hard requirement could be relaxed,
+ // but for now do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+ Packet.resize(SchedModel->getIssueWidth());
+ Packet.clear();
+ ResourcesModel->clearResources();
+ }
+
+ ~VLIWResourceModel() {
+ delete ResourcesModel;
+ }
+
+ void resetPacketState() {
+ Packet.clear();
+ }
+
+ void resetDFA() {
+ ResourcesModel->clearResources();
+ }
+
+ void reset() {
+ Packet.clear();
+ ResourcesModel->clearResources();
+ }
+
+ bool isResourceAvailable(SUnit *SU);
+ bool reserveResources(SUnit *SU);
+ unsigned getTotalPackets() const { return TotalPackets; }
+};
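A minimal sketch of the call pattern this class is written for (illustrative only, not part of the patch; pickNextNode() and advanceCycle() are placeholders, and TM/SchedModel are assumed to be in scope):

  VLIWResourceModel RM(TM, SchedModel);
  while (SUnit *SU = pickNextNode()) {
    // reserveResources() returns true when SU forces a packet boundary,
    // mirroring how bumpNode() uses it to decide whether to bump the cycle.
    if (RM.reserveResources(SU))
      advanceCycle();
  }
  unsigned NumPackets = RM.getTotalPackets();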
+
+/// Extend the standard ScheduleDAGMI to provide more context and override the
+/// top-level schedule() driver.
+class VLIWMachineScheduler : public ScheduleDAGMI {
+public:
+ VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
+ ScheduleDAGMI(C, S) {}
+
+ /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
+ /// time to do some work.
+ virtual void schedule();
+ /// Perform platform specific DAG postprocessing.
+ void postprocessDAG();
+};
+
+/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
+/// to balance the schedule.
+class ConvergingVLIWScheduler : public MachineSchedStrategy {
+
+ /// Store the state used by ConvergingVLIWScheduler heuristics, required
+ /// for the lifetime of one invocation of pickNode().
+ struct SchedCandidate {
+ // The best SUnit candidate.
+ SUnit *SU;
+
+ // Register pressure values for the best candidate.
+ RegPressureDelta RPDelta;
+
+ // Best scheduling cost.
+ int SCost;
+
+ SchedCandidate(): SU(NULL), SCost(0) {}
+ };
+ /// Represent the type of SchedCandidate found within a single queue.
+ enum CandResult {
+ NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure,
+ BestCost};
+
+ /// Each scheduling boundary is associated with ready queues. It tracks the
+ /// current cycle in whichever direction it has moved, and maintains the state
+ /// of "hazards" and other interlocks at the current cycle.
+ struct SchedBoundary {
+ VLIWMachineScheduler *DAG;
+ const TargetSchedModel *SchedModel;
+
+ ReadyQueue Available;
+ ReadyQueue Pending;
+ bool CheckPending;
+
+ ScheduleHazardRecognizer *HazardRec;
+ VLIWResourceModel *ResourceModel;
+
+ unsigned CurrCycle;
+ unsigned IssueCount;
+
+ /// MinReadyCycle - Cycle of the soonest available instruction.
+ unsigned MinReadyCycle;
+
+ // Remember the greatest min operand latency.
+ unsigned MaxMinLatency;
+
+ /// Pending queues extend the ready queues with the same ID and the
+ /// PendingFlag set.
+ SchedBoundary(unsigned ID, const Twine &Name):
+ DAG(0), SchedModel(0), Available(ID, Name+".A"),
+ Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
+ CheckPending(false), HazardRec(0), ResourceModel(0),
+ CurrCycle(0), IssueCount(0),
+ MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+
+ ~SchedBoundary() {
+ delete ResourceModel;
+ delete HazardRec;
+ }
+
+ void init(VLIWMachineScheduler *dag, const TargetSchedModel *smodel) {
+ DAG = dag;
+ SchedModel = smodel;
+ }
+
+ bool isTop() const {
+ return Available.getID() == ConvergingVLIWScheduler::TopQID;
+ }
+
+ bool checkHazard(SUnit *SU);
+
+ void releaseNode(SUnit *SU, unsigned ReadyCycle);
+
+ void bumpCycle();
+
+ void bumpNode(SUnit *SU);
+
+ void releasePending();
+
+ void removeReady(SUnit *SU);
+
+ SUnit *pickOnlyChoice();
+ };
+
+ VLIWMachineScheduler *DAG;
+ const TargetSchedModel *SchedModel;
+ const TargetRegisterInfo *TRI;
+
+ // State of the top and bottom scheduled instruction boundaries.
+ SchedBoundary Top;
+ SchedBoundary Bot;
+
+public:
+ /// SUnit::NodeQueueId: 0 (none), 1 (top), 2 (bot), 3 (both)
+ enum {
+ TopQID = 1,
+ BotQID = 2,
+ LogMaxQID = 2
+ };
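As a concrete reading of this scheme: Top.Available gets ID TopQID = 1 and Bot.Available gets ID BotQID = 2, while the matching pending queues are constructed (in SchedBoundary above) with ID << LogMaxQID, i.e. 1 << 2 = 4 and 2 << 2 = 8, keeping the pending-queue IDs disjoint from the ready-queue IDs.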
+
+ ConvergingVLIWScheduler():
+ DAG(0), SchedModel(0), TRI(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+
+ virtual void initialize(ScheduleDAGMI *dag);
+
+ virtual SUnit *pickNode(bool &IsTopNode);
+
+ virtual void schedNode(SUnit *SU, bool IsTopNode);
+
+ virtual void releaseTopNode(SUnit *SU);
+
+ virtual void releaseBottomNode(SUnit *SU);
+
+ unsigned ReportPackets() {
+ return Top.ResourceModel->getTotalPackets() +
+ Bot.ResourceModel->getTotalPackets();
+ }
+
+protected:
+ SUnit *pickNodeBidrectional(bool &IsTopNode);
+
+ int SchedulingCost(ReadyQueue &Q,
+ SUnit *SU, SchedCandidate &Candidate,
+ RegPressureDelta &Delta, bool verbose);
+
+ CandResult pickNodeFromQueue(ReadyQueue &Q,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
+#ifndef NDEBUG
+ void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
+ PressureElement P = PressureElement());
+#endif
+};
+
+} // namespace
+
+
+#endif
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 7ece4085ecbd..1e91c3948550 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -337,7 +337,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
DEBUG(dbgs() << "********** Hexagon New Value Jump **********\n"
<< "********** Function: "
- << MF.getFunction()->getName() << "\n");
+ << MF.getName() << "\n");
#if 0
// for now disable this, if we move NewValueJump before register
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 55cbc094a2ad..a295015de561 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -109,6 +109,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
DenseMap<unsigned, unsigned> PeepholeMap;
+ DenseMap<unsigned, std::pair<unsigned, unsigned> > PeepholeDoubleRegsMap;
if (DisableHexagonPeephole) return false;
@@ -117,6 +118,7 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
MBBb != MBBe; ++MBBb) {
MachineBasicBlock* MBB = MBBb;
PeepholeMap.clear();
+ PeepholeDoubleRegsMap.clear();
// Traverse the basic block.
for (MachineBasicBlock::iterator MII = MBB->begin(); MII != MBB->end();
@@ -140,6 +142,24 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // Look for this sequence below
+ // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
+ // %vregIntReg = COPY %vregDoubleReg1:subreg_loreg.
+ // and convert into
+ // %vregIntReg = COPY %vregDoubleReg0:subreg_hireg.
+ if (MI->getOpcode() == Hexagon::LSRd_ri) {
+ assert(MI->getNumOperands() == 3);
+ MachineOperand &Dst = MI->getOperand(0);
+ MachineOperand &Src1 = MI->getOperand(1);
+ MachineOperand &Src2 = MI->getOperand(2);
+ if (Src2.getImm() != 32)
+ continue;
+ unsigned DstReg = Dst.getReg();
+ unsigned SrcReg = Src1.getReg();
+ PeepholeDoubleRegsMap[DstReg] =
+ std::make_pair(SrcReg, 1/*Hexagon::subreg_hireg*/);
+ }
+
// Look for P=NOT(P).
if (!DisablePNotP &&
(MI->getOpcode() == Hexagon::NOT_p)) {
@@ -178,6 +198,21 @@ bool HexagonPeephole::runOnMachineFunction(MachineFunction &MF) {
// Change the 1st operand.
MI->RemoveOperand(1);
MI->addOperand(MachineOperand::CreateReg(PeepholeSrc, false));
+ } else {
+ DenseMap<unsigned, std::pair<unsigned, unsigned> >::iterator DI =
+ PeepholeDoubleRegsMap.find(SrcReg);
+ if (DI != PeepholeDoubleRegsMap.end()) {
+ std::pair<unsigned,unsigned> PeepholeSrc = DI->second;
+ MI->RemoveOperand(1);
+ MI->addOperand(MachineOperand::CreateReg(PeepholeSrc.first,
+ false /*isDef*/,
+ false /*isImp*/,
+ false /*isKill*/,
+ false /*isDead*/,
+ false /*isUndef*/,
+ false /*isEarlyClobber*/,
+ PeepholeSrc.second));
+ }
}
}
}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 2c23674a3319..37424860564f 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -310,6 +310,58 @@ void HexagonRegisterInfo::getInitialFrameState(std::vector<MachineMove>
Moves.push_back(MachineMove(0, Dst, Src));
}
+// Get the weight in units of pressure for this register class.
+const RegClassWeight &
+HexagonRegisterInfo::getRegClassWeight(const TargetRegisterClass *RC) const {
+ // Each TargetRegisterClass has a per-register weight and a weight
+ // limit which must be less than the limits of its pressure sets.
+ static const RegClassWeight RCWeightTable[] = {
+ {1, 32}, // IntRegs
+ {1, 8}, // CRRegs
+ {1, 4}, // PredRegs
+ {2, 16}, // DoubleRegs
+ {0, 0} };
+ return RCWeightTable[RC->getID()];
+}
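For example, reading the table above (and assuming the class IDs follow the order annotated there), IntRegs have a per-register weight of 1 with a limit of 32, while DoubleRegs have a per-register weight of 2, i.e. each double register counts as two units of pressure, with a limit of 16.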
+
+/// Get the number of dimensions of register pressure.
+unsigned HexagonRegisterInfo::getNumRegPressureSets() const {
+ return 4;
+}
+
+/// Get the name of this register unit pressure set.
+const char *HexagonRegisterInfo::getRegPressureSetName(unsigned Idx) const {
+ static const char *const RegPressureSetName[] = {
+ "IntRegsRegSet",
+ "CRRegsRegSet",
+ "PredRegsRegSet",
+ "DoubleRegsRegSet"
+ };
+ assert((Idx < 4) && "Index out of bounds");
+ return RegPressureSetName[Idx];
+}
+
+/// Get the register unit pressure limit for this dimension.
+/// This limit must be adjusted dynamically for reserved registers.
+unsigned HexagonRegisterInfo::getRegPressureSetLimit(unsigned Idx) const {
+ static const int RegPressureLimit [] = { 16, 4, 2, 8 };
+ assert((Idx < 4) && "Index out of bounds");
+ return RegPressureLimit[Idx];
+}
+
+const int*
+HexagonRegisterInfo::getRegClassPressureSets(const TargetRegisterClass *RC)
+ const {
+ static const int RCSetsTable[] = {
+ 0, -1, // IntRegs
+ 1, -1, // CRRegs
+ 2, -1, // PredRegs
+ 0, -1, // DoubleRegs
+ -1 };
+ static const unsigned RCSetStartTable[] = { 0, 2, 4, 6, 0 };
+ unsigned SetListStart = RCSetStartTable[RC->getID()];
+ return &RCSetsTable[SetListStart];
+}
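A sketch of how this -1-terminated list would be consumed (illustrative, not part of the patch). Assuming the class IDs follow the order annotated above, a class like DoubleRegs starts at index 6, so the walk visits pressure set 0 (IntRegsRegSet) and stops at the sentinel:

  // TRI, RC and Sets are assumed to be in scope here.
  for (const int *PSet = TRI->getRegClassPressureSets(RC); *PSet != -1; ++PSet)
    Sets.push_back(*PSet);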
unsigned HexagonRegisterInfo::getEHExceptionRegister() const {
llvm_unreachable("What is the exception register");
}
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 85355ae7beb5..8820d13e0122 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -87,6 +87,11 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
// Exception handling queries.
unsigned getEHExceptionRegister() const;
unsigned getEHHandlerRegister() const;
+ const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+ unsigned getNumRegPressureSets() const;
+ const char *getRegPressureSetName(unsigned Idx) const;
+ unsigned getRegPressureSetLimit(unsigned Idx) const;
+ const int* getRegClassPressureSets(const TargetRegisterClass *RC) const;
};
} // end namespace llvm
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index 2468f0b86f88..4d93dd18d4e0 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -50,7 +50,7 @@ bool HexagonRemoveExtendArgs::runOnFunction(Function &F) {
unsigned Idx = 1;
for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); AI != AE;
++AI, ++Idx) {
- if (F.paramHasAttr(Idx, Attribute::SExt)) {
+ if (F.getParamAttributes(Idx).hasAttribute(Attributes::SExt)) {
Argument* Arg = AI;
if (!isa<PointerType>(Arg->getType())) {
for (Instruction::use_iterator UI = Arg->use_begin();
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index d1076b8e4412..b5ff69a701cd 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -47,6 +47,7 @@ def HexagonModel : SchedMachineModel {
// Max issue per cycle == bundle width.
let IssueWidth = 4;
let Itineraries = HexagonItineraries;
+ let LoadLatency = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 9b41126ca6fd..5668ae81e82e 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -58,6 +58,7 @@ def HexagonModelV4 : SchedMachineModel {
// Max issue per cycle == bundle width.
let IssueWidth = 4;
let Itineraries = HexagonItinerariesV4;
+ let LoadLatency = 1;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 5d087db1bdb7..4bacb8fa670d 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -40,28 +40,27 @@ EnableIEEERndNear(
HexagonSubtarget::HexagonSubtarget(StringRef TT, StringRef CPU, StringRef FS):
HexagonGenSubtargetInfo(TT, CPU, FS),
- HexagonArchVersion(V2),
CPUString(CPU.str()) {
- ParseSubtargetFeatures(CPU, FS);
- switch(HexagonArchVersion) {
- case HexagonSubtarget::V2:
- break;
- case HexagonSubtarget::V3:
- EnableV3 = true;
- break;
- case HexagonSubtarget::V4:
- break;
- case HexagonSubtarget::V5:
- break;
- default:
- // If the programmer has not specified a Hexagon version, default
- // to -mv4.
+ // If the programmer has not specified a Hexagon version, default to -mv4.
+ if (CPUString.empty())
CPUString = "hexagonv4";
- HexagonArchVersion = HexagonSubtarget::V4;
- break;
+
+ if (CPUString == "hexagonv2") {
+ HexagonArchVersion = V2;
+ } else if (CPUString == "hexagonv3") {
+ EnableV3 = true;
+ HexagonArchVersion = V3;
+ } else if (CPUString == "hexagonv4") {
+ HexagonArchVersion = V4;
+ } else if (CPUString == "hexagonv5") {
+ HexagonArchVersion = V5;
+ } else {
+ llvm_unreachable("Unrecognized Hexagon processor version");
}
+ ParseSubtargetFeatures(CPUString, FS);
+
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUString);
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index a7b291ff2a26..30866e9eeba8 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -14,6 +14,7 @@
#include "HexagonTargetMachine.h"
#include "Hexagon.h"
#include "HexagonISelLowering.h"
+#include "HexagonMachineScheduler.h"
#include "llvm/Module.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/PassManager.h"
@@ -29,6 +30,11 @@ opt<bool> DisableHardwareLoops(
"disable-hexagon-hwloops", cl::Hidden,
cl::desc("Disable Hardware Loops for Hexagon target"));
+static cl::
+opt<bool> DisableHexagonMISched("disable-hexagon-misched",
+ cl::Hidden, cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable Hexagon MI Scheduling"));
+
/// HexagonTargetMachineModule - Note that this is used on hosts that
/// cannot link in a library unless there are references into the
/// library. In particular, it seems that it is not possible to get
@@ -42,6 +48,13 @@ extern "C" void LLVMInitializeHexagonTarget() {
RegisterTargetMachine<HexagonTargetMachine> X(TheHexagonTarget);
}
+static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
+ return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler());
+}
+
+static MachineSchedRegistry
+SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler",
+ createVLIWMachineSched);
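With this registry entry, the VLIW scheduler should also be selectable by name through the generic machine-scheduler registry, in addition to being installed as the default in HexagonPassConfig below; roughly along the lines of the following llc invocation (hypothetical, flag spelling not verified here):

  llc -march=hexagon -enable-misched -misched=hexagon test.ll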
/// HexagonTargetMachine ctor - Create an ILP32 architecture model.
///
@@ -55,13 +68,14 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
CodeModel::Model CM,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- DataLayout("e-p:32:32:32-"
+ DL("e-p:32:32:32-"
"i64:64:64-i32:32:32-i16:16:16-i1:32:32-"
"f64:64:64-f32:32:32-a0:0-n32") ,
Subtarget(TT, CPU, FS), InstrInfo(Subtarget), TLInfo(*this),
TSInfo(*this),
FrameLowering(Subtarget),
- InstrItins(&Subtarget.getInstrItineraryData()) {
+ InstrItins(&Subtarget.getInstrItineraryData()),
+ STTI(&TLInfo), VTTI(&TLInfo) {
setMCUseCFI(false);
}
@@ -74,7 +88,7 @@ bool HexagonTargetMachine::addPassesForOptimizations(PassManagerBase &PM) {
PM.add(createDeadCodeEliminationPass());
PM.add(createConstantPropagationPass());
PM.add(createLoopUnrollPass());
- PM.add(createLoopStrengthReducePass(getTargetLowering()));
+ PM.add(createLoopStrengthReducePass());
return true;
}
@@ -83,7 +97,13 @@ namespace {
class HexagonPassConfig : public TargetPassConfig {
public:
HexagonPassConfig(HexagonTargetMachine *TM, PassManagerBase &PM)
- : TargetPassConfig(TM, PM) {}
+ : TargetPassConfig(TM, PM) {
+ // Enable MI scheduler.
+ if (!DisableHexagonMISched) {
+ enablePass(&MachineSchedulerID);
+ MachineSchedRegistry::setDefault(createVLIWMachineSched);
+ }
+ }
HexagonTargetMachine &getHexagonTargetMachine() const {
return getTM<HexagonTargetMachine>();
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index 0336965d11f1..7a4215c119a9 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -20,20 +20,23 @@
#include "HexagonSelectionDAGInfo.h"
#include "HexagonFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
class Module;
class HexagonTargetMachine : public LLVMTargetMachine {
- const TargetData DataLayout; // Calculates type size & alignment.
+ const DataLayout DL; // Calculates type size & alignment.
HexagonSubtarget Subtarget;
HexagonInstrInfo InstrInfo;
HexagonTargetLowering TLInfo;
HexagonSelectionDAGInfo TSInfo;
HexagonFrameLowering FrameLowering;
const InstrItineraryData* InstrItins;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
HexagonTargetMachine(const Target &T, StringRef TT,StringRef CPU,
@@ -68,7 +71,15 @@ public:
return &TSInfo;
}
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
+
+ virtual const DataLayout *getDataLayout() const { return &DL; }
static unsigned getModuleMatchQuality(const Module &M);
// Pass Pipeline Configuration.
diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 32cc70958638..f4d7761ac358 100644
--- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -16,7 +16,7 @@
#include "HexagonTargetMachine.h"
#include "llvm/Function.h"
#include "llvm/GlobalVariable.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/DerivedTypes.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/ELF.h"
@@ -73,7 +73,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
if (Kind.isBSS() || Kind.isDataNoRel() || Kind.isCommon()) {
Type *Ty = GV->getType()->getElementType();
- return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
+ return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
}
return false;
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index a03ed03365ba..3d5f685028ea 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -3474,8 +3474,8 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
// 1. Two loads unless they are volatile.
// 2. Two stores in V4 unless they are volatile.
else if ((DepType == SDep::Order) &&
- !I->hasVolatileMemoryRef() &&
- !J->hasVolatileMemoryRef()) {
+ !I->hasOrderedMemoryRef() &&
+ !J->hasOrderedMemoryRef()) {
if (QRI->Subtarget.hasV4TOps() &&
// hexagonv4 allows dual store.
MCIDI.mayStore() && MCIDJ.mayStore()) {
diff --git a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
index 9305c2702fa4..c607b5d35649 100644
--- a/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
+++ b/lib/Target/Hexagon/HexagonVarargsCallingConvention.h
@@ -75,9 +75,9 @@ static bool CC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
unsigned Alignment =
- State.getTarget().getTargetData()->getABITypeAlignment(ArgTy);
+ State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
unsigned Size =
- State.getTarget().getTargetData()->getTypeSizeInBits(ArgTy) / 8;
+ State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
// If it's passed by value, then we need the size of the aggregate not of
// the pointer.
@@ -130,9 +130,9 @@ static bool RetCC_Hexagon32_VarArgs(unsigned ValNo, EVT ValVT,
const Type* ArgTy = LocVT.getTypeForEVT(State.getContext());
unsigned Alignment =
- State.getTarget().getTargetData()->getABITypeAlignment(ArgTy);
+ State.getTarget().getDataLayout()->getABITypeAlignment(ArgTy);
unsigned Size =
- State.getTarget().getTargetData()->getTypeSizeInBits(ArgTy) / 8;
+ State.getTarget().getDataLayout()->getTypeSizeInBits(ArgTy) / 8;
unsigned Offset3 = State.AllocateStack(Size, Alignment);
State.addLoc(CCValAssign::getMem(ValNo, ValVT.getSimpleVT(), Offset3,
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index d6e6c36af5de..86f75d1c2d7a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -24,7 +24,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Target &T, StringRef TT) {
HasLEB128 = true;
PrivateGlobalPrefix = ".L";
- LCOMMDirectiveType = LCOMM::ByteAlignment;
+ LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
InlineAsmStart = "# InlineAsm Start";
InlineAsmEnd = "# InlineAsm End";
ZeroDirective = "\t.space\t";
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index 38fb0e87fdb4..f7809caeb32f 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -44,9 +44,10 @@ class MBlazeAsmParser : public MCTargetAsmParser {
bool ParseDirectiveWord(unsigned Size, SMLoc L);
- bool MatchAndEmitInstruction(SMLoc IDLoc,
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out);
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
/// @name Auto-generated Match Functions
/// {
@@ -56,12 +57,12 @@ class MBlazeAsmParser : public MCTargetAsmParser {
/// }
-
public:
MBlazeAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser)
: MCTargetAsmParser(), Parser(_Parser) {}
- virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
virtual bool ParseDirective(AsmToken DirectiveID);
@@ -313,14 +314,13 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
//
bool MBlazeAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc,
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out) {
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
MCInst Inst;
- SMLoc ErrorLoc;
- unsigned ErrorInfo;
-
- switch (MatchInstructionImpl(Operands, Inst, ErrorInfo)) {
+ switch (MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm)) {
default: break;
case Match_Success:
Out.EmitInstruction(Inst);
@@ -329,10 +329,8 @@ MatchAndEmitInstruction(SMLoc IDLoc,
return Error(IDLoc, "instruction use requires an option to be enabled");
case Match_MnemonicFail:
return Error(IDLoc, "unrecognized instruction mnemonic");
- case Match_ConversionFail:
- return Error(IDLoc, "unable to convert operands to instruction");
- case Match_InvalidOperand:
- ErrorLoc = IDLoc;
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0U) {
if (ErrorInfo >= Operands.size())
return Error(IDLoc, "too few operands for instruction");
@@ -343,6 +341,7 @@ MatchAndEmitInstruction(SMLoc IDLoc,
return Error(ErrorLoc, "invalid operand for instruction");
}
+ }
llvm_unreachable("Implement any new match types added!");
}
@@ -479,7 +478,7 @@ ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
/// Parse an mblaze instruction mnemonic followed by its operands.
bool MBlazeAsmParser::
-ParseInstruction(StringRef Name, SMLoc NameLoc,
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
// The first operands is the token for the instruction name
size_t dotLoc = Name.find('.');
diff --git a/lib/Target/MBlaze/CMakeLists.txt b/lib/Target/MBlaze/CMakeLists.txt
index 6c3e8b644703..0bf93d71dab8 100644
--- a/lib/Target/MBlaze/CMakeLists.txt
+++ b/lib/Target/MBlaze/CMakeLists.txt
@@ -27,7 +27,6 @@ add_llvm_target(MBlazeCodeGen
MBlazeSelectionDAGInfo.cpp
MBlazeAsmPrinter.cpp
MBlazeMCInstLower.cpp
- MBlazeELFWriterInfo.cpp
)
add_dependencies(LLVMMBlazeCodeGen intrinsics_gen)
diff --git a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
index e9f340f2f6d2..b679a318c3e0 100644
--- a/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
+++ b/lib/Target/MBlaze/MBlazeAsmPrinter.cpp
@@ -34,7 +34,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
diff --git a/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp b/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp
deleted file mode 100644
index e3c7236d1141..000000000000
--- a/lib/Target/MBlaze/MBlazeELFWriterInfo.cpp
+++ /dev/null
@@ -1,107 +0,0 @@
-//===-- MBlazeELFWriterInfo.cpp - ELF Writer Info for the MBlaze backend --===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the MBlaze backend.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MBlazeELFWriterInfo.h"
-#include "MBlazeRelocations.h"
-#include "llvm/Function.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// Implementation of the MBlazeELFWriterInfo class
-//===----------------------------------------------------------------------===//
-
-MBlazeELFWriterInfo::MBlazeELFWriterInfo(TargetMachine &TM)
- : TargetELFWriterInfo(TM.getTargetData()->getPointerSizeInBits() == 64,
- TM.getTargetData()->isLittleEndian()) {
-}
-
-MBlazeELFWriterInfo::~MBlazeELFWriterInfo() {}
-
-unsigned MBlazeELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
- switch (MachineRelTy) {
- case MBlaze::reloc_pcrel_word:
- return ELF::R_MICROBLAZE_64_PCREL;
- case MBlaze::reloc_absolute_word:
- return ELF::R_MICROBLAZE_NONE;
- default:
- llvm_unreachable("unknown mblaze machine relocation type");
- }
-}
-
-long int MBlazeELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier) const {
- switch (RelTy) {
- case ELF::R_MICROBLAZE_32_PCREL:
- return Modifier - 4;
- case ELF::R_MICROBLAZE_32:
- return Modifier;
- default:
- llvm_unreachable("unknown mblaze relocation type");
- }
-}
-
-unsigned MBlazeELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
- // FIXME: Most of these sizes are guesses based on the name
- switch (RelTy) {
- case ELF::R_MICROBLAZE_32:
- case ELF::R_MICROBLAZE_32_PCREL:
- case ELF::R_MICROBLAZE_32_PCREL_LO:
- case ELF::R_MICROBLAZE_32_LO:
- case ELF::R_MICROBLAZE_SRO32:
- case ELF::R_MICROBLAZE_SRW32:
- case ELF::R_MICROBLAZE_32_SYM_OP_SYM:
- case ELF::R_MICROBLAZE_GOTOFF_32:
- return 32;
-
- case ELF::R_MICROBLAZE_64_PCREL:
- case ELF::R_MICROBLAZE_64:
- case ELF::R_MICROBLAZE_GOTPC_64:
- case ELF::R_MICROBLAZE_GOT_64:
- case ELF::R_MICROBLAZE_PLT_64:
- case ELF::R_MICROBLAZE_GOTOFF_64:
- return 64;
- }
-
- return 0;
-}
-
-bool MBlazeELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
- // FIXME: Most of these are guesses based on the name
- switch (RelTy) {
- case ELF::R_MICROBLAZE_32_PCREL:
- case ELF::R_MICROBLAZE_64_PCREL:
- case ELF::R_MICROBLAZE_32_PCREL_LO:
- case ELF::R_MICROBLAZE_GOTPC_64:
- return true;
- }
-
- return false;
-}
-
-unsigned MBlazeELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
- return MBlaze::reloc_absolute_word;
-}
-
-long int MBlazeELFWriterInfo::computeRelocation(unsigned SymOffset,
- unsigned RelOffset,
- unsigned RelTy) const {
- assert((RelTy == ELF::R_MICROBLAZE_32_PCREL ||
- RelTy == ELF::R_MICROBLAZE_64_PCREL) &&
- "computeRelocation unknown for this relocation type");
- return SymOffset - (RelOffset + 4);
-}
diff --git a/lib/Target/MBlaze/MBlazeELFWriterInfo.h b/lib/Target/MBlaze/MBlazeELFWriterInfo.h
deleted file mode 100644
index a314eb76ea46..000000000000
--- a/lib/Target/MBlaze/MBlazeELFWriterInfo.h
+++ /dev/null
@@ -1,59 +0,0 @@
-//===-- MBlazeELFWriterInfo.h - ELF Writer Info for MBlaze ------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the MBlaze backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MBLAZE_ELF_WRITER_INFO_H
-#define MBLAZE_ELF_WRITER_INFO_H
-
-#include "llvm/Target/TargetELFWriterInfo.h"
-
-namespace llvm {
- class TargetMachine;
-
- class MBlazeELFWriterInfo : public TargetELFWriterInfo {
- public:
- MBlazeELFWriterInfo(TargetMachine &TM);
- virtual ~MBlazeELFWriterInfo();
-
- /// getRelocationType - Returns the target specific ELF Relocation type.
- /// 'MachineRelTy' contains the object code independent relocation type
- virtual unsigned getRelocationType(unsigned MachineRelTy) const;
-
- /// hasRelocationAddend - True if the target uses an addend in the
- /// ELF relocation entry.
- virtual bool hasRelocationAddend() const { return false; }
-
- /// getDefaultAddendForRelTy - Gets the default addend value for a
- /// relocation entry based on the target ELF relocation type.
- virtual long int getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier = 0) const;
-
- /// getRelTySize - Returns the size of relocatable field in bits
- virtual unsigned getRelocationTySize(unsigned RelTy) const;
-
- /// isPCRelativeRel - True if the relocation type is pc relative
- virtual bool isPCRelativeRel(unsigned RelTy) const;
-
- /// getJumpTableRelocationTy - Returns the machine relocation type used
- /// to reference a jumptable.
- virtual unsigned getAbsoluteLabelMachineRelTy() const;
-
- /// computeRelocation - Some relocatable fields could be relocated
- /// directly, avoiding the relocation symbol emission, compute the
- /// final relocation value for this symbol.
- virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
- unsigned RelTy) const;
- };
-
-} // end llvm namespace
-
-#endif // MBLAZE_ELF_WRITER_INFO_H
diff --git a/lib/Target/MBlaze/MBlazeFrameLowering.cpp b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
index d2f14a5c53b7..9e467bf337e0 100644
--- a/lib/Target/MBlaze/MBlazeFrameLowering.cpp
+++ b/lib/Target/MBlaze/MBlazeFrameLowering.cpp
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
diff --git a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
index 91aaf940e626..1c2e3b26613e 100644
--- a/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeIntrinsicInfo.cpp
@@ -83,7 +83,7 @@ bool MBlazeIntrinsicInfo::isOverloaded(unsigned IntrID) const {
#undef GET_INTRINSIC_OVERLOAD_TABLE
}
-/// This defines the "getAttributes(ID id)" method.
+/// This defines the "getAttributes(LLVMContext &C, ID id)" method.
#define GET_INTRINSIC_ATTRIBUTES
#include "MBlazeGenIntrinsics.inc"
#undef GET_INTRINSIC_ATTRIBUTES
@@ -104,7 +104,8 @@ Function *MBlazeIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
Type **Tys,
unsigned numTy) const {
assert(!isOverloaded(IntrID) && "MBlaze intrinsics are not overloaded");
- AttrListPtr AList = getAttributes((mblazeIntrinsic::ID) IntrID);
+ AttrListPtr AList = getAttributes(M->getContext(),
+ (mblazeIntrinsic::ID) IntrID);
return cast<Function>(M->getOrInsertFunction(getName(IntrID),
getType(M->getContext(), IntrID),
AList));
diff --git a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
index 46f5207a90ba..daa76e887fca 100644
--- a/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
+++ b/lib/Target/MBlaze/MBlazeRegisterInfo.cpp
@@ -140,7 +140,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned oi = i == 2 ? 1 : 2;
- DEBUG(dbgs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ DEBUG(dbgs() << "\nFunction : " << MF.getName() << "\n";
dbgs() << "<--------->\n" << MI);
int FrameIndex = MI.getOperand(i).getIndex();
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
index 5f82f142032a..f180652f1127 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp
@@ -38,11 +38,12 @@ MBlazeTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS),
- DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"),
+ DL("E-p:32:32:32-i8:8:8-i16:16:16"),
InstrInfo(*this),
FrameLowering(Subtarget),
- TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this),
- InstrItins(Subtarget.getInstrItineraryData()) {
+ TLInfo(*this), TSInfo(*this),
+ InstrItins(Subtarget.getInstrItineraryData()),
+ STTI(&TLInfo), VTTI(&TLInfo) {
}
namespace {
diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h
index 1647a2169210..a8df4e63e3ee 100644
--- a/lib/Target/MBlaze/MBlazeTargetMachine.h
+++ b/lib/Target/MBlaze/MBlazeTargetMachine.h
@@ -20,25 +20,26 @@
#include "MBlazeSelectionDAGInfo.h"
#include "MBlazeIntrinsicInfo.h"
#include "MBlazeFrameLowering.h"
-#include "MBlazeELFWriterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
class formatted_raw_ostream;
class MBlazeTargetMachine : public LLVMTargetMachine {
MBlazeSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
MBlazeInstrInfo InstrInfo;
MBlazeFrameLowering FrameLowering;
MBlazeTargetLowering TLInfo;
MBlazeSelectionDAGInfo TSInfo;
MBlazeIntrinsicInfo IntrinsicInfo;
- MBlazeELFWriterInfo ELFWriterInfo;
InstrItineraryData InstrItins;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
MBlazeTargetMachine(const Target &T, StringRef TT,
@@ -59,8 +60,8 @@ namespace llvm {
virtual const MBlazeSubtarget *getSubtargetImpl() const
{ return &Subtarget; }
- virtual const TargetData *getTargetData() const
- { return &DataLayout;}
+ virtual const DataLayout *getDataLayout() const
+ { return &DL;}
virtual const MBlazeRegisterInfo *getRegisterInfo() const
{ return &InstrInfo.getRegisterInfo(); }
@@ -74,9 +75,10 @@ namespace llvm {
const TargetIntrinsicInfo *getIntrinsicInfo() const
{ return &IntrinsicInfo; }
- virtual const MBlazeELFWriterInfo *getELFWriterInfo() const {
- return &ELFWriterInfo;
- }
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const
+ { return &STTI; }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const
+ { return &VTTI; }
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
diff --git a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
index f66ea302d9fe..899c74ee8ed7 100644
--- a/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
+++ b/lib/Target/MBlaze/MBlazeTargetObjectFile.cpp
@@ -13,7 +13,7 @@
#include "llvm/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
@@ -70,7 +70,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
return false;
Type *Ty = GV->getType()->getElementType();
- return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
+ return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
}
const MCSection *MBlazeTargetObjectFile::
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
index f383fecdc25a..44feeb49e7f1 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
@@ -156,7 +156,8 @@ void ELFMBlazeAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
} // end anonymous namespace
-MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin())
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
index bfd11a070ba5..2b71d9d3c844 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
@@ -29,8 +29,8 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
class MBlazeMCCodeEmitter : public MCCodeEmitter {
- MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
+ MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
public:
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
index 7cc96c62c830..7bc7d8f20724 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
@@ -35,7 +35,8 @@ MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU);
MCObjectWriter *createMBlazeELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
} // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index 61d7f2bf4766..2e170f17bf9d 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -221,3 +221,17 @@ MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
+
+void
+MSP430FrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
+ const {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+ // Create a frame entry for the FPW register that must be saved.
+ if (TFI->hasFP(MF)) {
+ int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
+ (void)FrameIdx;
+ assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+ "Slot for FPW register must be last in order to be found!");
+ }
+}
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index b636827da7b0..cb02545852b5 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -46,6 +46,7 @@ public:
bool hasFP(const MachineFunction &MF) const;
bool hasReservedCallFrame(const MachineFunction &MF) const;
+ void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
};
} // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 5430d433b650..5efc6a36b894 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -274,8 +274,8 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
else if (AM.JT != -1)
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
else if (AM.BlockAddr)
- Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
- true, 0/*AM.SymbolFlags*/);
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, 0,
+ 0/*AM.SymbolFlags*/);
else
Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i16);
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index f8b7e149f0db..fc677aec38ef 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -61,7 +61,7 @@ MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
TargetLowering(tm, new TargetLoweringObjectFileELF()),
Subtarget(*tm.getSubtargetImpl()) {
- TD = getTargetData();
+ TD = getDataLayout();
// Set up the register classes.
addRegisterClass(MVT::i8, &MSP430::GR8RegClass);
@@ -655,7 +655,7 @@ SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), /*isTarget=*/true);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index d8ad02fca403..991304c23de3 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -169,7 +169,7 @@ namespace llvm {
SelectionDAG &DAG) const;
const MSP430Subtarget &Subtarget;
- const TargetData *TD;
+ const DataLayout *TD;
};
} // namespace llvm
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index aed46a2ec595..9ae238f66f57 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -220,20 +220,6 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MI.getOperand(i+1).ChangeToImmediate(Offset);
}
-void
-MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
- const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
- // Create a frame entry for the FPW register that must be saved.
- if (TFI->hasFP(MF)) {
- int FrameIdx = MF.getFrameInfo()->CreateFixedObject(2, -4, true);
- (void)FrameIdx;
- assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
- "Slot for FPW register must be last in order to be found!");
- }
-}
-
unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 9ee0a03f6310..64a43bcafbb4 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -49,8 +49,6 @@ public:
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, RegScavenger *RS = NULL) const;
- void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
-
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const;
};
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 817001d6ad7c..13e37b373533 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -33,10 +33,10 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS),
- // FIXME: Check TargetData string.
- DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
+ // FIXME: Check DataLayout string.
+ DL("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"),
InstrInfo(*this), TLInfo(*this), TSInfo(*this),
- FrameLowering(Subtarget) { }
+ FrameLowering(Subtarget), STTI(&TLInfo), VTTI(&TLInfo) { }
namespace {
/// MSP430 Code Generator Pass Configuration Options.
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index f54146b3e338..186172ede428 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -21,9 +21,10 @@
#include "MSP430SelectionDAGInfo.h"
#include "MSP430RegisterInfo.h"
#include "MSP430Subtarget.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
@@ -31,11 +32,13 @@ namespace llvm {
///
class MSP430TargetMachine : public LLVMTargetMachine {
MSP430Subtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
MSP430InstrInfo InstrInfo;
MSP430TargetLowering TLInfo;
MSP430SelectionDAGInfo TSInfo;
MSP430FrameLowering FrameLowering;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
MSP430TargetMachine(const Target &T, StringRef TT,
@@ -47,7 +50,7 @@ public:
return &FrameLowering;
}
virtual const MSP430InstrInfo *getInstrInfo() const { return &InstrInfo; }
- virtual const TargetData *getTargetData() const { return &DataLayout;}
+ virtual const DataLayout *getDataLayout() const { return &DL;}
virtual const MSP430Subtarget *getSubtargetImpl() const { return &Subtarget; }
virtual const TargetRegisterInfo *getRegisterInfo() const {
@@ -61,7 +64,12 @@ public:
virtual const MSP430SelectionDAGInfo* getSelectionDAGInfo() const {
return &TSInfo;
}
-
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
}; // MSP430TargetMachine.
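
With TargetData renamed to DataLayout, clients of the target machine go through the new getDataLayout() accessor. A minimal sketch of such a query (TM is a hypothetical, already-constructed MSP430TargetMachine; this is illustration, not part of the patch):

  const DataLayout *DL = TM.getDataLayout();
  // Pointer width comes from the layout string above ("e-p:16:16:16-..."),
  // i.e. 16 bits on MSP430.
  unsigned PtrBits = DL->getPointerSizeInBits();
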
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index 786a0c5ed187..539a1f723bdd 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -14,7 +14,7 @@
#include "llvm/Target/Mangler.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/Support/raw_ostream.h"
@@ -44,7 +44,7 @@ static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) {
OutName.push_back('_');
}
-/// NameNeedsEscaping - Return true if the identifier \arg Str needs quotes
+/// NameNeedsEscaping - Return true if the identifier \p Str needs quotes
/// for this assembler.
static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo &MAI) {
assert(!Str.empty() && "Cannot create an empty MCSymbol");
@@ -157,7 +157,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
/// a suffix on their name indicating the number of words of arguments they
/// take.
static void AddFastCallStdCallSuffix(SmallVectorImpl<char> &OutName,
- const Function *F, const TargetData &TD) {
+ const Function *F, const DataLayout &TD) {
// Calculate arguments size total.
unsigned ArgWords = 0;
for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
@@ -183,8 +183,7 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
ManglerPrefixTy PrefixTy = Mangler::Default;
if (GV->hasPrivateLinkage() || isImplicitlyPrivate)
PrefixTy = Mangler::Private;
- else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage() ||
- GV->hasLinkerPrivateWeakDefAutoLinkage())
+ else if (GV->hasLinkerPrivateLinkage() || GV->hasLinkerPrivateWeakLinkage())
PrefixTy = Mangler::LinkerPrivate;
// If this global has a name, handle it simply.
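
AddFastCallStdCallSuffix, whose signature is updated above to take a DataLayout, appends the total argument size as an "@N" suffix. A standalone sketch of that rule, assuming the conventional 4-byte (DWORD) rounding used for x86 stdcall/fastcall; the rounding constant is an assumption and is not shown in this hunk:

  // Illustrative helper, not part of the patch.
  unsigned suffixArgWords(const unsigned *ArgSizesInBytes, unsigned NumArgs) {
    unsigned ArgWords = 0;
    for (unsigned i = 0; i != NumArgs; ++i)
      ArgWords += ((ArgSizesInBytes[i] + 3) / 4) * 4; // round each arg up to a DWORD (assumed)
    return ArgWords; // emitted as "@<ArgWords>", e.g. _f@8 for two i32 arguments
  }
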
diff --git a/lib/Target/Mips/AsmParser/CMakeLists.txt b/lib/Target/Mips/AsmParser/CMakeLists.txt
index 6c7343bbe513..28f521910901 100644
--- a/lib/Target/Mips/AsmParser/CMakeLists.txt
+++ b/lib/Target/Mips/AsmParser/CMakeLists.txt
@@ -1,3 +1,4 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMMipsAsmParser
MipsAsmParser.cpp
)
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 58b559025757..67b524883cf8 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -8,53 +8,1316 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsRegisterInfo.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
namespace {
+class MipsAssemblerOptions {
+public:
+ MipsAssemblerOptions():
+ aTReg(1), reorder(true), macro(true) {
+ }
+
+ unsigned getATRegNum() {return aTReg;}
+ bool setATReg(unsigned Reg);
+
+ bool isReorder() {return reorder;}
+ void setReorder() {reorder = true;}
+ void setNoreorder() {reorder = false;}
+
+ bool isMacro() {return macro;}
+ void setMacro() {macro = true;}
+ void setNomacro() {macro = false;}
+
+private:
+ unsigned aTReg;
+ bool reorder;
+ bool macro;
+};
+}
+
+namespace {
class MipsAsmParser : public MCTargetAsmParser {
- bool MatchAndEmitInstruction(SMLoc IDLoc,
+
+ enum FpFormatTy {
+ FP_FORMAT_NONE = -1,
+ FP_FORMAT_S,
+ FP_FORMAT_D,
+ FP_FORMAT_L,
+ FP_FORMAT_W
+ } FpFormat;
+
+ MCSubtargetInfo &STI;
+ MCAsmParser &Parser;
+ MipsAssemblerOptions Options;
+
+
+#define GET_ASSEMBLER_HEADER
+#include "MipsGenAsmMatcher.inc"
+
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out);
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
- bool ParseInstruction(StringRef Name, SMLoc NameLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+ bool parseMathOperation(StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands);
bool ParseDirective(AsmToken DirectiveID);
+ MipsAsmParser::OperandMatchResultTy
+ parseMemOperand(SmallVectorImpl<MCParsedAsmOperand*>&);
+
+ bool ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &,
+ StringRef Mnemonic);
+
+ int tryParseRegister(StringRef Mnemonic);
+
+ bool tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic);
+
+ bool needsExpansion(MCInst &Inst);
+
+ void expandInstruction(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ void expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ void expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ void expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions);
+ bool reportParseError(StringRef ErrorMsg);
+
+ bool parseMemOffset(const MCExpr *&Res);
+ bool parseRelocOperand(const MCExpr *&Res);
+
+ bool parseDirectiveSet();
+
+ bool parseSetAtDirective();
+ bool parseSetNoAtDirective();
+ bool parseSetMacroDirective();
+ bool parseSetNoMacroDirective();
+ bool parseSetReorderDirective();
+ bool parseSetNoReorderDirective();
+
+ MCSymbolRefExpr::VariantKind getVariantKind(StringRef Symbol);
+
+ bool isMips64() const {
+ return (STI.getFeatureBits() & Mips::FeatureMips64) != 0;
+ }
+
+ bool isFP64() const {
+ return (STI.getFeatureBits() & Mips::FeatureFP64Bit) != 0;
+ }
+
+ int matchRegisterName(StringRef Symbol);
+
+ int matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic);
+
+ void setFpFormat(FpFormatTy Format) {
+ FpFormat = Format;
+ }
+
+ void setDefaultFpFormat();
+
+ void setFpFormat(StringRef Format);
+
+ FpFormatTy getFpFormat() {return FpFormat;}
+
+ bool requestsDoubleOperand(StringRef Mnemonic);
+
+ unsigned getReg(int RC,int RegNo);
+
+ unsigned getATReg();
public:
MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : MCTargetAsmParser() {
+ : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
+ }
+
+ MCAsmParser &getParser() const { return Parser; }
+ MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+};
+}
+
+namespace {
+
+/// MipsOperand - Instances of this class represent a parsed Mips machine
+/// instruction.
+class MipsOperand : public MCParsedAsmOperand {
+
+ enum KindTy {
+ k_CondCode,
+ k_CoprocNum,
+ k_Immediate,
+ k_Memory,
+ k_PostIndexRegister,
+ k_Register,
+ k_Token
+ } Kind;
+
+ MipsOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ union {
+ struct {
+ const char *Data;
+ unsigned Length;
+ } Tok;
+
+ struct {
+ unsigned RegNum;
+ } Reg;
+
+ struct {
+ const MCExpr *Val;
+ } Imm;
+
+ struct {
+ unsigned Base;
+ const MCExpr *Off;
+ } Mem;
+ };
+
+ SMLoc StartLoc, EndLoc;
+
+public:
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::CreateReg(getReg()));
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const{
+ // Add as immediate when possible. Null MCExpr = 0.
+ if (Expr == 0)
+ Inst.addOperand(MCOperand::CreateImm(0));
+ else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::CreateExpr(Expr));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCExpr *Expr = getImm();
+ addExpr(Inst,Expr);
+ }
+
+ void addMemOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::CreateReg(getMemBase()));
+
+ const MCExpr *Expr = getMemOff();
+ addExpr(Inst,Expr);
+ }
+
+ bool isReg() const { return Kind == k_Register; }
+ bool isImm() const { return Kind == k_Immediate; }
+ bool isToken() const { return Kind == k_Token; }
+ bool isMem() const { return Kind == k_Memory; }
+
+ StringRef getToken() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const {
+ assert((Kind == k_Register) && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert((Kind == k_Immediate) && "Invalid access!");
+ return Imm.Val;
+ }
+
+ unsigned getMemBase() const {
+ assert((Kind == k_Memory) && "Invalid access!");
+ return Mem.Base;
+ }
+
+ const MCExpr *getMemOff() const {
+ assert((Kind == k_Memory) && "Invalid access!");
+ return Mem.Off;
+ }
+
+ static MipsOperand *CreateToken(StringRef Str, SMLoc S) {
+ MipsOperand *Op = new MipsOperand(k_Token);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
}
+ static MipsOperand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_Register);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static MipsOperand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static MipsOperand *CreateMem(unsigned Base, const MCExpr *Off,
+ SMLoc S, SMLoc E) {
+ MipsOperand *Op = new MipsOperand(k_Memory);
+ Op->Mem.Base = Base;
+ Op->Mem.Off = Off;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const { return EndLoc; }
+
+ virtual void print(raw_ostream &OS) const {
+ llvm_unreachable("unimplemented!");
+ }
};
}
+bool MipsAsmParser::needsExpansion(MCInst &Inst) {
+
+ switch(Inst.getOpcode()) {
+ case Mips::LoadImm32Reg:
+ case Mips::LoadAddr32Imm:
+ case Mips::LoadAddr32Reg:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions){
+ switch(Inst.getOpcode()) {
+ case Mips::LoadImm32Reg:
+ return expandLoadImm(Inst, IDLoc, Instructions);
+ case Mips::LoadAddr32Imm:
+ return expandLoadAddressImm(Inst,IDLoc,Instructions);
+ case Mips::LoadAddr32Reg:
+ return expandLoadAddressReg(Inst,IDLoc,Instructions);
+ }
+}
+
+void MipsAsmParser::expandLoadImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions){
+ MCInst tmpInst;
+ const MCOperand &ImmOp = Inst.getOperand(1);
+ assert(ImmOp.isImm() && "expected immediate operand kind");
+ const MCOperand &RegOp = Inst.getOperand(0);
+ assert(RegOp.isReg() && "expected register operand kind");
+
+ int ImmValue = ImmOp.getImm();
+ tmpInst.setLoc(IDLoc);
+ if ( 0 <= ImmValue && ImmValue <= 65535) {
+ // for 0 <= j <= 65535.
+ // li d,j => ori d,$zero,j
+ tmpInst.setOpcode(isMips64() ? Mips::ORi64 : Mips::ORi);
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(
+ MCOperand::CreateReg(isMips64() ? Mips::ZERO_64 : Mips::ZERO));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+ Instructions.push_back(tmpInst);
+ } else if ( ImmValue < 0 && ImmValue >= -32768) {
+ // for -32768 <= j < 0.
+ // li d,j => addiu d,$zero,j
+ tmpInst.setOpcode(Mips::ADDiu); //TODO:no ADDiu64 in td files?
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(
+ MCOperand::CreateReg(isMips64() ? Mips::ZERO_64 : Mips::ZERO));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+ Instructions.push_back(tmpInst);
+ } else {
+ // for any other value of j that is representable as a 32-bit integer.
+ // li d,j => lui d,hi16(j)
+ // ori d,d,lo16(j)
+ tmpInst.setOpcode(isMips64() ? Mips::LUi64 : Mips::LUi);
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+ Instructions.push_back(tmpInst);
+ tmpInst.clear();
+ tmpInst.setOpcode(isMips64() ? Mips::ORi64 : Mips::ORi);
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+ tmpInst.setLoc(IDLoc);
+ Instructions.push_back(tmpInst);
+ }
+}
+
+void MipsAsmParser::expandLoadAddressReg(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions){
+ MCInst tmpInst;
+ const MCOperand &ImmOp = Inst.getOperand(2);
+ assert(ImmOp.isImm() && "expected immediate operand kind");
+ const MCOperand &SrcRegOp = Inst.getOperand(1);
+ assert(SrcRegOp.isReg() && "expected register operand kind");
+ const MCOperand &DstRegOp = Inst.getOperand(0);
+ assert(DstRegOp.isReg() && "expected register operand kind");
+ int ImmValue = ImmOp.getImm();
+ if ( -32768 <= ImmValue && ImmValue <= 65535) {
+ //for -32768 <= j <= 65535.
+ //la d,j(s) => addiu d,s,j
+ tmpInst.setOpcode(Mips::ADDiu); //TODO:no ADDiu64 in td files?
+ tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+ Instructions.push_back(tmpInst);
+ } else {
+ //for any other value of j that is representable as a 32-bit integer.
+ //la d,j(s) => lui d,hi16(j)
+ // ori d,d,lo16(j)
+ // addu d,d,s
+ tmpInst.setOpcode(isMips64()?Mips::LUi64:Mips::LUi);
+ tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+ Instructions.push_back(tmpInst);
+ tmpInst.clear();
+ tmpInst.setOpcode(isMips64()?Mips::ORi64:Mips::ORi);
+ tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+ Instructions.push_back(tmpInst);
+ tmpInst.clear();
+ tmpInst.setOpcode(Mips::ADDu);
+ tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateReg(DstRegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateReg(SrcRegOp.getReg()));
+ Instructions.push_back(tmpInst);
+ }
+}
+
+void MipsAsmParser::expandLoadAddressImm(MCInst &Inst, SMLoc IDLoc,
+ SmallVectorImpl<MCInst> &Instructions){
+ MCInst tmpInst;
+ const MCOperand &ImmOp = Inst.getOperand(1);
+ assert(ImmOp.isImm() && "expected immediate operand kind");
+ const MCOperand &RegOp = Inst.getOperand(0);
+ assert(RegOp.isReg() && "expected register operand kind");
+ int ImmValue = ImmOp.getImm();
+ if ( -32768 <= ImmValue && ImmValue <= 65535) {
+ //for -32768 <= j <= 65535.
+ //la d,j => addiu d,$zero,j
+ tmpInst.setOpcode(Mips::ADDiu);
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(
+ MCOperand::CreateReg(isMips64()?Mips::ZERO_64:Mips::ZERO));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue));
+ Instructions.push_back(tmpInst);
+ } else {
+ //for any other value of j that is representable as a 32-bit integer.
+ //la d,j => lui d,hi16(j)
+ // ori d,d,lo16(j)
+ tmpInst.setOpcode(isMips64()?Mips::LUi64:Mips::LUi);
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm((ImmValue & 0xffff0000) >> 16));
+ Instructions.push_back(tmpInst);
+ tmpInst.clear();
+ tmpInst.setOpcode(isMips64()?Mips::ORi64:Mips::ORi);
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateReg(RegOp.getReg()));
+ tmpInst.addOperand(MCOperand::CreateImm(ImmValue & 0xffff));
+ Instructions.push_back(tmpInst);
+ }
+}
+
bool MipsAsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc,
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out) {
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ unsigned MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
+ MatchingInlineAsm);
+
+ switch (MatchResult) {
+ default: break;
+ case Match_Success: {
+ if (needsExpansion(Inst)) {
+ SmallVector<MCInst, 4> Instructions;
+ expandInstruction(Inst, IDLoc, Instructions);
+ for(unsigned i =0; i < Instructions.size(); i++){
+ Out.EmitInstruction(Instructions[i]);
+ }
+ } else {
+ Inst.setLoc(IDLoc);
+ Out.EmitInstruction(Inst);
+ }
+ return false;
+ }
+ case Match_MissingFeature:
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ return true;
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((MipsOperand*)Operands[ErrorInfo])->getStartLoc();
+ if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction");
+ }
+ return true;
+}
+
+int MipsAsmParser::matchRegisterName(StringRef Name) {
+
+ int CC;
+ if (!isMips64())
+ CC = StringSwitch<unsigned>(Name)
+ .Case("zero", Mips::ZERO)
+ .Case("a0", Mips::A0)
+ .Case("a1", Mips::A1)
+ .Case("a2", Mips::A2)
+ .Case("a3", Mips::A3)
+ .Case("v0", Mips::V0)
+ .Case("v1", Mips::V1)
+ .Case("s0", Mips::S0)
+ .Case("s1", Mips::S1)
+ .Case("s2", Mips::S2)
+ .Case("s3", Mips::S3)
+ .Case("s4", Mips::S4)
+ .Case("s5", Mips::S5)
+ .Case("s6", Mips::S6)
+ .Case("s7", Mips::S7)
+ .Case("k0", Mips::K0)
+ .Case("k1", Mips::K1)
+ .Case("sp", Mips::SP)
+ .Case("fp", Mips::FP)
+ .Case("gp", Mips::GP)
+ .Case("ra", Mips::RA)
+ .Case("t0", Mips::T0)
+ .Case("t1", Mips::T1)
+ .Case("t2", Mips::T2)
+ .Case("t3", Mips::T3)
+ .Case("t4", Mips::T4)
+ .Case("t5", Mips::T5)
+ .Case("t6", Mips::T6)
+ .Case("t7", Mips::T7)
+ .Case("t8", Mips::T8)
+ .Case("t9", Mips::T9)
+ .Case("at", Mips::AT)
+ .Case("fcc0", Mips::FCC0)
+ .Default(-1);
+ else
+ CC = StringSwitch<unsigned>(Name)
+ .Case("zero", Mips::ZERO_64)
+ .Case("at", Mips::AT_64)
+ .Case("v0", Mips::V0_64)
+ .Case("v1", Mips::V1_64)
+ .Case("a0", Mips::A0_64)
+ .Case("a1", Mips::A1_64)
+ .Case("a2", Mips::A2_64)
+ .Case("a3", Mips::A3_64)
+ .Case("a4", Mips::T0_64)
+ .Case("a5", Mips::T1_64)
+ .Case("a6", Mips::T2_64)
+ .Case("a7", Mips::T3_64)
+ .Case("t4", Mips::T4_64)
+ .Case("t5", Mips::T5_64)
+ .Case("t6", Mips::T6_64)
+ .Case("t7", Mips::T7_64)
+ .Case("s0", Mips::S0_64)
+ .Case("s1", Mips::S1_64)
+ .Case("s2", Mips::S2_64)
+ .Case("s3", Mips::S3_64)
+ .Case("s4", Mips::S4_64)
+ .Case("s5", Mips::S5_64)
+ .Case("s6", Mips::S6_64)
+ .Case("s7", Mips::S7_64)
+ .Case("t8", Mips::T8_64)
+ .Case("t9", Mips::T9_64)
+ .Case("kt0", Mips::K0_64)
+ .Case("kt1", Mips::K1_64)
+ .Case("gp", Mips::GP_64)
+ .Case("sp", Mips::SP_64)
+ .Case("fp", Mips::FP_64)
+ .Case("s8", Mips::FP_64)
+ .Case("ra", Mips::RA_64)
+ .Default(-1);
+
+ if (CC != -1)
+ return CC;
+
+ if (Name[0] == 'f') {
+ StringRef NumString = Name.substr(1);
+ unsigned IntVal;
+ if( NumString.getAsInteger(10, IntVal))
+ return -1; // not integer
+ if (IntVal > 31)
+ return -1;
+
+ FpFormatTy Format = getFpFormat();
+
+ if (Format == FP_FORMAT_S || Format == FP_FORMAT_W)
+ return getReg(Mips::FGR32RegClassID, IntVal);
+ if (Format == FP_FORMAT_D) {
+ if(isFP64()) {
+ return getReg(Mips::FGR64RegClassID, IntVal);
+ }
+ // only even numbers available as register pairs
+ if (( IntVal > 31) || (IntVal%2 != 0))
+ return -1;
+ return getReg(Mips::AFGR64RegClassID, IntVal/2);
+ }
+ }
+
+ return -1;
+}
+void MipsAsmParser::setDefaultFpFormat() {
+
+ if (isMips64() || isFP64())
+ FpFormat = FP_FORMAT_D;
+ else
+ FpFormat = FP_FORMAT_S;
+}
+
+bool MipsAsmParser::requestsDoubleOperand(StringRef Mnemonic){
+
+ bool IsDouble = StringSwitch<bool>(Mnemonic.lower())
+ .Case("ldxc1", true)
+ .Case("ldc1", true)
+ .Case("sdxc1", true)
+ .Case("sdc1", true)
+ .Default(false);
+
+ return IsDouble;
+}
+void MipsAsmParser::setFpFormat(StringRef Format) {
+
+ FpFormat = StringSwitch<FpFormatTy>(Format.lower())
+ .Case(".s", FP_FORMAT_S)
+ .Case(".d", FP_FORMAT_D)
+ .Case(".l", FP_FORMAT_L)
+ .Case(".w", FP_FORMAT_W)
+ .Default(FP_FORMAT_NONE);
+}
+
+bool MipsAssemblerOptions::setATReg(unsigned Reg) {
+ if (Reg > 31)
+ return false;
+
+ aTReg = Reg;
return true;
}
+unsigned MipsAsmParser::getATReg() {
+ unsigned Reg = Options.getATRegNum();
+ if (isMips64())
+ return getReg(Mips::CPU64RegsRegClassID,Reg);
+
+ return getReg(Mips::CPURegsRegClassID,Reg);
+}
+
+unsigned MipsAsmParser::getReg(int RC,int RegNo) {
+ return *(getContext().getRegisterInfo().getRegClass(RC).begin() + RegNo);
+}
+
+int MipsAsmParser::matchRegisterByNumber(unsigned RegNum, StringRef Mnemonic) {
+
+ if (Mnemonic.lower() == "rdhwr") {
+ // at the moment only hwreg29 is supported
+ if (RegNum != 29)
+ return -1;
+ return Mips::HWR29;
+ }
+
+ if (RegNum > 31)
+ return -1;
+
+ // MIPS64 registers are numbered 1 after the 32-bit equivalents
+ return getReg(Mips::CPURegsRegClassID, RegNum) + isMips64();
+}
+
+int MipsAsmParser::tryParseRegister(StringRef Mnemonic) {
+ const AsmToken &Tok = Parser.getTok();
+ int RegNum = -1;
+
+ if (Tok.is(AsmToken::Identifier)) {
+ std::string lowerCase = Tok.getString().lower();
+ RegNum = matchRegisterName(lowerCase);
+ } else if (Tok.is(AsmToken::Integer))
+ RegNum = matchRegisterByNumber(static_cast<unsigned>(Tok.getIntVal()),
+ Mnemonic.lower());
+ else
+ return RegNum; //error
+  // 64-bit div operations require Mips::ZERO instead of Mips::ZERO_64
+ if (isMips64() && RegNum == Mips::ZERO_64) {
+ if (Mnemonic.find("ddiv") != StringRef::npos)
+ RegNum = Mips::ZERO;
+ }
+ return RegNum;
+}
+
bool MipsAsmParser::
-ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ tryParseRegisterOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+ StringRef Mnemonic){
+
+ SMLoc S = Parser.getTok().getLoc();
+ int RegNo = -1;
+
+ // FIXME: we should make a more generic method for CCR
+ if ((Mnemonic == "cfc1" || Mnemonic == "ctc1")
+ && Operands.size() == 2 && Parser.getTok().is(AsmToken::Integer)){
+ RegNo = Parser.getTok().getIntVal(); // get the int value
+ // at the moment only fcc0 is supported
+ if (RegNo == 0)
+ RegNo = Mips::FCC0;
+ } else
+ RegNo = tryParseRegister(Mnemonic);
+ if (RegNo == -1)
+ return true;
+
+ Operands.push_back(MipsOperand::CreateReg(RegNo, S,
+ Parser.getTok().getLoc()));
+ Parser.Lex(); // Eat register token.
+ return false;
+}
+
+bool MipsAsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*>&Operands,
+ StringRef Mnemonic) {
+ // Check if the current operand has a custom associated parser, if so, try to
+ // custom parse the operand, or fallback to the general approach.
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ if (ResTy == MatchOperand_Success)
+ return false;
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_ParseFail)
+ return true;
+
+ switch (getLexer().getKind()) {
+ default:
+ Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return true;
+ case AsmToken::Dollar: {
+ // parse register
+ SMLoc S = Parser.getTok().getLoc();
+ Parser.Lex(); // Eat dollar token.
+ // parse register operand
+ if (!tryParseRegisterOperand(Operands, Mnemonic)) {
+ if (getLexer().is(AsmToken::LParen)) {
+ // check if it is indexed addressing operand
+ Operands.push_back(MipsOperand::CreateToken("(", S));
+ Parser.Lex(); // eat parenthesis
+ if (getLexer().isNot(AsmToken::Dollar))
+ return true;
+
+ Parser.Lex(); // eat dollar
+ if (tryParseRegisterOperand(Operands, Mnemonic))
+ return true;
+
+ if (!getLexer().is(AsmToken::RParen))
+ return true;
+
+ S = Parser.getTok().getLoc();
+ Operands.push_back(MipsOperand::CreateToken(")", S));
+ Parser.Lex();
+ }
+ return false;
+ }
+ // maybe it is a symbol reference
+ StringRef Identifier;
+ if (Parser.ParseIdentifier(Identifier))
+ return true;
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ MCSymbol *Sym = getContext().GetOrCreateSymbol("$" + Identifier);
+
+ // Otherwise create a symbol ref.
+ const MCExpr *Res = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None,
+ getContext());
+
+ Operands.push_back(MipsOperand::CreateImm(Res, S, E));
+ return false;
+ }
+ case AsmToken::Identifier:
+ case AsmToken::LParen:
+ case AsmToken::Minus:
+ case AsmToken::Plus:
+ case AsmToken::Integer:
+ case AsmToken::String: {
+ // quoted label names
+ const MCExpr *IdVal;
+ SMLoc S = Parser.getTok().getLoc();
+ if (getParser().ParseExpression(IdVal))
+ return true;
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ return false;
+ }
+ case AsmToken::Percent: {
+ // it is a symbol reference or constant expression
+ const MCExpr *IdVal;
+ SMLoc S = Parser.getTok().getLoc(); // start location of the operand
+ if (parseRelocOperand(IdVal))
+ return true;
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ return false;
+ } // case AsmToken::Percent
+ } // switch(getLexer().getKind())
+ return true;
+}
+
+bool MipsAsmParser::parseRelocOperand(const MCExpr *&Res) {
+
+ Parser.Lex(); // eat % token
+ const AsmToken &Tok = Parser.getTok(); // get next token, operation
+ if (Tok.isNot(AsmToken::Identifier))
+ return true;
+
+ std::string Str = Tok.getIdentifier().str();
+
+ Parser.Lex(); // eat identifier
+ // now make expression from the rest of the operand
+ const MCExpr *IdVal;
+ SMLoc EndLoc;
+
+ if (getLexer().getKind() == AsmToken::LParen) {
+ while (1) {
+ Parser.Lex(); // eat '(' token
+ if (getLexer().getKind() == AsmToken::Percent) {
+ Parser.Lex(); // eat % token
+ const AsmToken &nextTok = Parser.getTok();
+ if (nextTok.isNot(AsmToken::Identifier))
+ return true;
+ Str += "(%";
+ Str += nextTok.getIdentifier();
+ Parser.Lex(); // eat identifier
+ if (getLexer().getKind() != AsmToken::LParen)
+ return true;
+ } else
+ break;
+ }
+ if (getParser().ParseParenExpression(IdVal,EndLoc))
+ return true;
+
+ while (getLexer().getKind() == AsmToken::RParen)
+ Parser.Lex(); // eat ')' token
+
+ } else
+ return true; // parenthesis must follow reloc operand
+
+ // Check the type of the expression
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(IdVal)) {
+ // it's a constant, evaluate lo or hi value
+ int Val = MCE->getValue();
+ if (Str == "lo") {
+ Val = Val & 0xffff;
+ } else if (Str == "hi") {
+ Val = (Val & 0xffff0000) >> 16;
+ }
+ Res = MCConstantExpr::Create(Val, getContext());
+ return false;
+ }
+
+ if (const MCSymbolRefExpr *MSRE = dyn_cast<MCSymbolRefExpr>(IdVal)) {
+ // it's a symbol, create symbolic expression from symbol
+ StringRef Symbol = MSRE->getSymbol().getName();
+ MCSymbolRefExpr::VariantKind VK = getVariantKind(Str);
+ Res = MCSymbolRefExpr::Create(Symbol,VK,getContext());
+ return false;
+ }
return true;
}
+bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+
+ StartLoc = Parser.getTok().getLoc();
+ RegNo = tryParseRegister("");
+ EndLoc = Parser.getTok().getLoc();
+ return (RegNo == (unsigned)-1);
+}
+
+bool MipsAsmParser::parseMemOffset(const MCExpr *&Res) {
+
+ SMLoc S;
+
+ switch(getLexer().getKind()) {
+ default:
+ return true;
+ case AsmToken::Integer:
+ case AsmToken::Minus:
+ case AsmToken::Plus:
+ return (getParser().ParseExpression(Res));
+ case AsmToken::Percent:
+ return parseRelocOperand(Res);
+ case AsmToken::LParen:
+    return false; // no explicit offset before '('; it defaults to 0
+ }
+ return true;
+}
+
+MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
+ SmallVectorImpl<MCParsedAsmOperand*>&Operands) {
+
+ const MCExpr *IdVal = 0;
+ SMLoc S;
+ // first operand is the offset
+ S = Parser.getTok().getLoc();
+
+ if (parseMemOffset(IdVal))
+ return MatchOperand_ParseFail;
+
+ const AsmToken &Tok = Parser.getTok(); // get next token
+ if (Tok.isNot(AsmToken::LParen)) {
+ MipsOperand *Mnemonic = static_cast<MipsOperand*>(Operands[0]);
+ if (Mnemonic->getToken() == "la") {
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer()-1);
+ Operands.push_back(MipsOperand::CreateImm(IdVal, S, E));
+ return MatchOperand_Success;
+ }
+ Error(Parser.getTok().getLoc(), "'(' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ Parser.Lex(); // Eat '(' token.
+
+ const AsmToken &Tok1 = Parser.getTok(); // get next token
+ if (Tok1.is(AsmToken::Dollar)) {
+ Parser.Lex(); // Eat '$' token.
+ if (tryParseRegisterOperand(Operands,"")) {
+ Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return MatchOperand_ParseFail;
+ }
+
+ } else {
+ Error(Parser.getTok().getLoc(), "unexpected token in operand");
+ return MatchOperand_ParseFail;
+ }
+
+ const AsmToken &Tok2 = Parser.getTok(); // get next token
+ if (Tok2.isNot(AsmToken::RParen)) {
+ Error(Parser.getTok().getLoc(), "')' expected");
+ return MatchOperand_ParseFail;
+ }
+
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ Parser.Lex(); // Eat ')' token.
+
+ if (IdVal == 0)
+ IdVal = MCConstantExpr::Create(0, getContext());
+
+ // now replace register operand with the mem operand
+ MipsOperand* op = static_cast<MipsOperand*>(Operands.back());
+ int RegNo = op->getReg();
+ // remove register from operands
+ Operands.pop_back();
+ // and add memory operand
+ Operands.push_back(MipsOperand::CreateMem(RegNo, IdVal, S, E));
+ delete op;
+ return MatchOperand_Success;
+}
+
+MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
+
+ MCSymbolRefExpr::VariantKind VK
+ = StringSwitch<MCSymbolRefExpr::VariantKind>(Symbol)
+ .Case("hi", MCSymbolRefExpr::VK_Mips_ABS_HI)
+ .Case("lo", MCSymbolRefExpr::VK_Mips_ABS_LO)
+ .Case("gp_rel", MCSymbolRefExpr::VK_Mips_GPREL)
+ .Case("call16", MCSymbolRefExpr::VK_Mips_GOT_CALL)
+ .Case("got", MCSymbolRefExpr::VK_Mips_GOT)
+ .Case("tlsgd", MCSymbolRefExpr::VK_Mips_TLSGD)
+ .Case("tlsldm", MCSymbolRefExpr::VK_Mips_TLSLDM)
+ .Case("dtprel_hi", MCSymbolRefExpr::VK_Mips_DTPREL_HI)
+ .Case("dtprel_lo", MCSymbolRefExpr::VK_Mips_DTPREL_LO)
+ .Case("gottprel", MCSymbolRefExpr::VK_Mips_GOTTPREL)
+ .Case("tprel_hi", MCSymbolRefExpr::VK_Mips_TPREL_HI)
+ .Case("tprel_lo", MCSymbolRefExpr::VK_Mips_TPREL_LO)
+ .Case("got_disp", MCSymbolRefExpr::VK_Mips_GOT_DISP)
+ .Case("got_page", MCSymbolRefExpr::VK_Mips_GOT_PAGE)
+ .Case("got_ofst", MCSymbolRefExpr::VK_Mips_GOT_OFST)
+ .Case("hi(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_HI)
+ .Case("lo(%neg(%gp_rel", MCSymbolRefExpr::VK_Mips_GPOFF_LO)
+ .Default(MCSymbolRefExpr::VK_None);
+
+ return VK;
+}
+
+static int ConvertCcString(StringRef CondString) {
+ int CC = StringSwitch<unsigned>(CondString)
+ .Case(".f", 0)
+ .Case(".un", 1)
+ .Case(".eq", 2)
+ .Case(".ueq", 3)
+ .Case(".olt", 4)
+ .Case(".ult", 5)
+ .Case(".ole", 6)
+ .Case(".ule", 7)
+ .Case(".sf", 8)
+ .Case(".ngle", 9)
+ .Case(".seq", 10)
+ .Case(".ngl", 11)
+ .Case(".lt", 12)
+ .Case(".nge", 13)
+ .Case(".le", 14)
+ .Case(".ngt", 15)
+ .Default(-1);
+
+ return CC;
+}
+
bool MipsAsmParser::
-ParseInstruction(StringRef Name, SMLoc NameLoc,
+parseMathOperation(StringRef Name, SMLoc NameLoc,
+ SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // split the format
+ size_t Start = Name.find('.'), Next = Name.rfind('.');
+ StringRef Format1 = Name.slice(Start, Next);
+ // and add the first format to the operands
+ Operands.push_back(MipsOperand::CreateToken(Format1, NameLoc));
+ // now for the second format
+ StringRef Format2 = Name.slice(Next, StringRef::npos);
+ Operands.push_back(MipsOperand::CreateToken(Format2, NameLoc));
+
+ // set the format for the first register
+ setFpFormat(Format1);
+
+ // Read the remaining operands.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ if (getLexer().isNot(AsmToken::Comma)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+
+ }
+ Parser.Lex(); // Eat the comma.
+
+    // Set the format for the second register.
+ setFpFormat(Format2);
+
+ // Parse and remember the operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+bool MipsAsmParser::
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ // floating point instructions: should register be treated as double?
+ if (requestsDoubleOperand(Name)) {
+ setFpFormat(FP_FORMAT_D);
+ Operands.push_back(MipsOperand::CreateToken(Name, NameLoc));
+ }
+ else {
+ setDefaultFpFormat();
+ // Create the leading tokens for the mnemonic, split by '.' characters.
+ size_t Start = 0, Next = Name.find('.');
+ StringRef Mnemonic = Name.slice(Start, Next);
+
+ Operands.push_back(MipsOperand::CreateToken(Mnemonic, NameLoc));
+
+ if (Next != StringRef::npos) {
+ // there is a format token in mnemonic
+ // StringRef Rest = Name.slice(Next, StringRef::npos);
+ size_t Dot = Name.find('.', Next+1);
+ StringRef Format = Name.slice(Next, Dot);
+ if (Dot == StringRef::npos) //only one '.' in a string, it's a format
+ Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+ else {
+ if (Name.startswith("c.")){
+        // Floating point compare: add '.' and an immediate representing the cc.
+ Operands.push_back(MipsOperand::CreateToken(".", NameLoc));
+ int Cc = ConvertCcString(Format);
+ if (Cc == -1) {
+ return Error(NameLoc, "Invalid conditional code");
+ }
+ SMLoc E = SMLoc::getFromPointer(
+ Parser.getTok().getLoc().getPointer() -1 );
+ Operands.push_back(MipsOperand::CreateImm(
+ MCConstantExpr::Create(Cc, getContext()), NameLoc, E));
+ } else {
+ // trunc, ceil, floor ...
+ return parseMathOperation(Name, NameLoc, Operands);
+ }
+
+ // the rest is a format
+ Format = Name.slice(Dot, StringRef::npos);
+ Operands.push_back(MipsOperand::CreateToken(Format, NameLoc));
+ }
+
+ setFpFormat(Format);
+ }
+ }
+
+ // Read the remaining operands.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ while (getLexer().is(AsmToken::Comma) ) {
+ Parser.Lex(); // Eat the comma.
+
+ // Parse and remember the operand.
+ if (ParseOperand(Operands, Name)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+ }
+ }
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, "unexpected token in argument list");
+ }
+
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+bool MipsAsmParser::reportParseError(StringRef ErrorMsg) {
+ SMLoc Loc = getLexer().getLoc();
+ Parser.EatToEndOfStatement();
+ return Error(Loc, ErrorMsg);
+}
+
+bool MipsAsmParser::parseSetNoAtDirective() {
+ // line should look like:
+ // .set noat
+ // set at reg to 0
+ Options.setATReg(0);
+ // eat noat
+ Parser.Lex();
+ // if this is not the end of the statement, report error
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+bool MipsAsmParser::parseSetAtDirective() {
+ // line can be
+ // .set at - defaults to $1
+ // or .set at=$reg
+ getParser().Lex();
+ if (getLexer().is(AsmToken::EndOfStatement)) {
+ Options.setATReg(1);
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+ } else if (getLexer().is(AsmToken::Equal)) {
+ getParser().Lex(); //eat '='
+ if (getLexer().isNot(AsmToken::Dollar)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Parser.Lex(); // eat '$'
+ if (getLexer().isNot(AsmToken::Integer)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ const AsmToken &Reg = Parser.getTok();
+ if (!Options.setATReg(Reg.getIntVal())) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ getParser().Lex(); //eat reg
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+ } else {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+}
+
+bool MipsAsmParser::parseSetReorderDirective() {
+ Parser.Lex();
+ // if this is not the end of the statement, report error
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Options.setReorder();
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoReorderDirective() {
+ Parser.Lex();
+ // if this is not the end of the statement, report error
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Options.setNoreorder();
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+bool MipsAsmParser::parseSetMacroDirective() {
+ Parser.Lex();
+ // if this is not the end of the statement, report error
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token in statement");
+ return false;
+ }
+ Options.setMacro();
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+
+bool MipsAsmParser::parseSetNoMacroDirective() {
+ Parser.Lex();
+ // if this is not the end of the statement, report error
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    reportParseError("unexpected token in statement");
+ return false;
+ }
+ if (Options.isReorder()) {
+ reportParseError("`noreorder' must be set before `nomacro'");
+ return false;
+ }
+ Options.setNomacro();
+ Parser.Lex(); // Consume the EndOfStatement
+ return false;
+}
+bool MipsAsmParser::parseDirectiveSet() {
+
+ // get next token
+ const AsmToken &Tok = Parser.getTok();
+
+ if (Tok.getString() == "noat") {
+ return parseSetNoAtDirective();
+ } else if (Tok.getString() == "at") {
+ return parseSetAtDirective();
+ } else if (Tok.getString() == "reorder") {
+ return parseSetReorderDirective();
+ } else if (Tok.getString() == "noreorder") {
+ return parseSetNoReorderDirective();
+ } else if (Tok.getString() == "macro") {
+ return parseSetMacroDirective();
+ } else if (Tok.getString() == "nomacro") {
+ return parseSetNoMacroDirective();
+ } else if (Tok.getString() == "nomips16") {
+ // ignore this directive for now
+ Parser.EatToEndOfStatement();
+ return false;
+ } else if (Tok.getString() == "nomicromips") {
+ // ignore this directive for now
+ Parser.EatToEndOfStatement();
+ return false;
+ }
return true;
}
-bool MipsAsmParser::
-ParseDirective(AsmToken DirectiveID) {
+bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
+
+ if (DirectiveID.getString() == ".ent") {
+ // ignore this directive for now
+ Parser.Lex();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".end") {
+ // ignore this directive for now
+ Parser.Lex();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".frame") {
+ // ignore this directive for now
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".set") {
+ return parseDirectiveSet();
+ }
+
+ if (DirectiveID.getString() == ".fmask") {
+ // ignore this directive for now
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".mask") {
+ // ignore this directive for now
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
+ if (DirectiveID.getString() == ".gpword") {
+ // ignore this directive for now
+ Parser.EatToEndOfStatement();
+ return false;
+ }
+
return true;
}
@@ -64,3 +1327,7 @@ extern "C" void LLVMInitializeMipsAsmParser() {
RegisterMCAsmParser<MipsAsmParser> A(TheMips64Target);
RegisterMCAsmParser<MipsAsmParser> B(TheMips64elTarget);
}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "MipsGenAsmMatcher.inc"
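
The li/la expansions above select an instruction sequence purely from the range of the immediate. The rule, distilled into a standalone sketch (the helper and enum names are illustrative, not part of the patch):

  enum LiExpansion { ORI_ZERO, ADDIU_ZERO, LUI_ORI };

  LiExpansion classifyLoadImm(int Imm) {
    if (Imm >= 0 && Imm <= 65535)
      return ORI_ZERO;   // li d, j  =>  ori d, $zero, j
    if (Imm >= -32768 && Imm < 0)
      return ADDIU_ZERO; // li d, j  =>  addiu d, $zero, j
    return LUI_ORI;      // li d, j  =>  lui d, hi16(j); ori d, d, lo16(j)
  }
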
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index aab8a011d4d0..ef56e752b2e4 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -10,6 +10,8 @@ tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel)
tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv)
tablegen(LLVM MipsGenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM MipsGenEDInfo.inc -gen-enhanced-disassembly-info)
+tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher)
+tablegen(LLVM MipsGenMCPseudoLowering.inc -gen-pseudo-lowering)
add_public_tablegen_target(MipsCommonTableGen)
add_llvm_target(MipsCodeGen
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index aa5747209b79..82dbcc5bcf7d 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -108,6 +108,11 @@ static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -138,6 +143,11 @@ static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeACRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
+
static DecodeStatus DecodeBranchTarget(MCInst &Inst,
unsigned Offset,
uint64_t Address,
@@ -346,6 +356,13 @@ static DecodeStatus DecodeCPURegsRegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeDSPRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ return DecodeCPURegsRegisterClass(Inst, RegNo, Address, Decoder);
+}
+
static DecodeStatus DecodeFGR64RegisterClass(MCInst &Inst,
unsigned RegNo,
uint64_t Address,
@@ -463,6 +480,18 @@ static DecodeStatus DecodeHWRegs64RegisterClass(MCInst &Inst,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeACRegsRegisterClass(MCInst &Inst,
+ unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= 4)
+ return MCDisassembler::Fail;
+
+ unsigned Reg = getReg(Decoder, Mips::ACRegsRegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Reg));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeBranchTarget(MCInst &Inst,
unsigned Offset,
uint64_t Address,
diff --git a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
index fa231507a2ef..be5d7e42532a 100644
--- a/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/Mips/MCTargetDesc/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_library(LLVMMipsDesc
MipsAsmBackend.cpp
+ MipsDirectObjLower.cpp
MipsMCAsmInfo.cpp
MipsMCCodeEmitter.cpp
MipsMCTargetDesc.cpp
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 18961fdd785e..9a35bb6bd707 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -92,7 +92,7 @@ public:
MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
}
- /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
+ /// ApplyFixup - Apply the \p Value for given \p Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
@@ -217,7 +217,7 @@ public:
///
/// \param Inst - The instruction to relax, which may be the same
/// as the output.
- /// \parm Res [output] - On return, the relaxed instruction.
+ /// \param [out] Res On return, the relaxed instruction.
void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
}
@@ -244,22 +244,26 @@ public:
} // namespace
// MCAsmBackend
-MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/true, /*Is64Bit*/false);
}
-MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/false);
}
-MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/true, /*Is64Bit*/true);
}
-MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/true);
}
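
The Mips asm-backend factories now take a CPU string in addition to the target triple. A minimal usage sketch under the new signature (the triple and CPU values, and the looked-up Target T, are assumptions for illustration):

  // T is assumed to come from TargetRegistry::lookupTarget(...).
  MCAsmBackend *MAB =
      createMipsAsmBackendEL32(*T, "mipsel-unknown-linux-gnu", "mips32");
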
diff --git a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
index 234455e0c7f0..233214b461f0 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h
@@ -122,14 +122,16 @@ inline static unsigned getMipsRegisterNumbering(unsigned RegEnum)
{
switch (RegEnum) {
case Mips::ZERO: case Mips::ZERO_64: case Mips::F0: case Mips::D0_64:
- case Mips::D0:
+ case Mips::D0: case Mips::FCC0: case Mips::AC0:
return 0;
case Mips::AT: case Mips::AT_64: case Mips::F1: case Mips::D1_64:
+ case Mips::AC1:
return 1;
case Mips::V0: case Mips::V0_64: case Mips::F2: case Mips::D2_64:
- case Mips::D1:
+ case Mips::D1: case Mips::AC2:
return 2;
case Mips::V1: case Mips::V1_64: case Mips::F3: case Mips::D3_64:
+ case Mips::AC3:
return 3;
case Mips::A0: case Mips::A0_64: case Mips::F4: case Mips::D4_64:
case Mips::D2:
diff --git a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
new file mode 100644
index 000000000000..15c4282030db
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.cpp
@@ -0,0 +1,81 @@
+//===-- MipsDirectObjLower.cpp - Mips LLVM direct object lowering -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower Mips MCInst records that are normally
+// left to the assembler to lower such as large shifts.
+//
+//===----------------------------------------------------------------------===//
+#include "MipsInstrInfo.h"
+#include "MCTargetDesc/MipsDirectObjLower.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+// If the D<shift> instruction has a shift amount that is greater
+// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
+void Mips::LowerLargeShift(MCInst& Inst) {
+
+ assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
+ assert(Inst.getOperand(2).isImm());
+
+ int64_t Shift = Inst.getOperand(2).getImm();
+ if (Shift <= 31)
+ return; // Do nothing
+ Shift -= 32;
+
+  // Shift amount minus 32; the D<shift>32 opcode variants encode the upper range.
+ Inst.getOperand(2).setImm(Shift);
+
+ switch (Inst.getOpcode()) {
+ default:
+    // The calling code is out of sync with this switch; only D<shift> opcodes are expected here.
+ llvm_unreachable("Unexpected shift instruction");
+ case Mips::DSLL:
+ Inst.setOpcode(Mips::DSLL32);
+ return;
+ case Mips::DSRL:
+ Inst.setOpcode(Mips::DSRL32);
+ return;
+ case Mips::DSRA:
+ Inst.setOpcode(Mips::DSRA32);
+ return;
+ }
+}
+
+// Pick a DEXT or DINS instruction variant based on the pos and size operands
+void Mips::LowerDextDins(MCInst& InstIn) {
+ int Opcode = InstIn.getOpcode();
+
+ if (Opcode == Mips::DEXT)
+ assert(InstIn.getNumOperands() == 4 &&
+ "Invalid no. of machine operands for DEXT!");
+ else // Only DEXT and DINS are possible
+ assert(InstIn.getNumOperands() == 5 &&
+ "Invalid no. of machine operands for DINS!");
+
+ assert(InstIn.getOperand(2).isImm());
+ int64_t pos = InstIn.getOperand(2).getImm();
+ assert(InstIn.getOperand(3).isImm());
+ int64_t size = InstIn.getOperand(3).getImm();
+
+ if (size <= 32) {
+ if (pos < 32) // DEXT/DINS, do nothing
+ return;
+ // DEXTU/DINSU
+ InstIn.getOperand(2).setImm(pos - 32);
+ InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTU : Mips::DINSU);
+ return;
+ }
+ // DEXTM/DINSM
+ assert(pos < 32 && "DEXT/DINS cannot have both size and pos > 32");
+ InstIn.getOperand(3).setImm(size - 32);
+ InstIn.setOpcode((Opcode == Mips::DEXT) ? Mips::DEXTM : Mips::DINSM);
+ return;
+}
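
LowerDextDins picks among the three encodings from the pos and size operands. The decision, distilled into a standalone sketch (names are illustrative; DINS follows the same pattern with DINSU/DINSM):

  enum DextVariant { VariantDEXT, VariantDEXTU, VariantDEXTM };

  DextVariant pickDextVariant(int64_t Pos, int64_t Size) {
    if (Size <= 32)
      return (Pos < 32) ? VariantDEXT   // operands left unchanged
                        : VariantDEXTU; // pos is rewritten to pos - 32
    // Size > 32: pos must stay below 32 (asserted above).
    return VariantDEXTM;                // size is rewritten to size - 32
  }
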
diff --git a/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
new file mode 100644
index 000000000000..8813cc9ac7a4
--- /dev/null
+++ b/lib/Target/Mips/MCTargetDesc/MipsDirectObjLower.h
@@ -0,0 +1,28 @@
+//===-- MipsDirectObjLower.h - Mips LLVM direct object lowering *- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSDIRECTOBJLOWER_H
+#define MIPSDIRECTOBJLOWER_H
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+ class MCInst;
+ class MCStreamer;
+
+ namespace Mips {
+  /// MipsDirectObjLower - This namespace is used to lower MCInsts in cases
+  /// where the assembler usually finishes the lowering,
+  /// such as large shifts.
+ void LowerLargeShift(MCInst &Inst);
+ void LowerDextDins(MCInst &Inst);
+ }
+}
+
+#endif
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 8e84b3f99f42..5d240fe84703 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -34,7 +34,8 @@ namespace {
class MipsELFObjectWriter : public MCELFObjectTargetWriter {
public:
- MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI, bool _isN64);
+ MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
+ bool _isN64, bool IsLittleEndian);
virtual ~MipsELFObjectWriter();
@@ -53,9 +54,9 @@ namespace {
}
MipsELFObjectWriter::MipsELFObjectWriter(bool _is64Bit, uint8_t OSABI,
- bool _isN64)
+ bool _isN64, bool IsLittleEndian)
: MCELFObjectTargetWriter(_is64Bit, OSABI, ELF::EM_MIPS,
- /*HasRelocationAddend*/ false,
+ /*HasRelocationAddend*/ (_isN64) ? true : false,
/*IsN64*/ _isN64) {}
MipsELFObjectWriter::~MipsELFObjectWriter() {}
@@ -274,6 +275,7 @@ MCObjectWriter *llvm::createMipsELFObjectWriter(raw_ostream &OS,
bool IsLittleEndian,
bool Is64Bit) {
MCELFObjectTargetWriter *MOTW = new MipsELFObjectWriter(Is64Bit, OSABI,
- (Is64Bit) ? true : false);
+ (Is64Bit) ? true : false,
+ IsLittleEndian);
return createELFObjectWriter(MOTW, OS, IsLittleEndian);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 8dab62d51813..7fbdae02f411 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -13,6 +13,7 @@
//
#define DEBUG_TYPE "mccodeemitter"
#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsDirectObjLower.h"
#include "MCTargetDesc/MipsFixupKinds.h"
#include "MCTargetDesc/MipsMCTargetDesc.h"
#include "llvm/ADT/APFloat.h"
@@ -29,17 +30,14 @@ using namespace llvm;
namespace {
class MipsMCCodeEmitter : public MCCodeEmitter {
- MipsMCCodeEmitter(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
- const MCSubtargetInfo &STI;
- MCContext &Ctx;
bool IsLittleEndian;
public:
- MipsMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
- MCContext &ctx, bool IsLittle) :
- MCII(mcii), STI(sti) , Ctx(ctx), IsLittleEndian(IsLittle) {}
+ MipsMCCodeEmitter(const MCInstrInfo &mcii, bool IsLittle) :
+ MCII(mcii), IsLittleEndian(IsLittle) {}
~MipsMCCodeEmitter() {}
@@ -95,7 +93,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx)
{
- return new MipsMCCodeEmitter(MCII, STI, Ctx, false);
+ return new MipsMCCodeEmitter(MCII, false);
}
MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
@@ -103,7 +101,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx)
{
- return new MipsMCCodeEmitter(MCII, STI, Ctx, true);
+ return new MipsMCCodeEmitter(MCII, true);
}
/// EncodeInstruction - Emit the instruction.
@@ -112,16 +110,35 @@ void MipsMCCodeEmitter::
EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const
{
- uint32_t Binary = getBinaryCodeForInstr(MI, Fixups);
+
+ // Non-pseudo instructions that get changed for direct object emission
+ // based only on operand values.
+ // If this list of instructions gets much longer, we will move
+ // the check into a function call. Until then, this is more efficient.
+ MCInst TmpInst = MI;
+ switch (MI.getOpcode()) {
+ // If the shift amount is >= 32, the instruction needs to be lowered further
+ case Mips::DSLL:
+ case Mips::DSRL:
+ case Mips::DSRA:
+ Mips::LowerLargeShift(TmpInst);
+ break;
+ // Double extract instruction is chosen by pos and size operands
+ case Mips::DEXT:
+ case Mips::DINS:
+ Mips::LowerDextDins(TmpInst);
+ }
+
+ uint32_t Binary = getBinaryCodeForInstr(TmpInst, Fixups);
// Check for unimplemented opcodes.
- // Unfortunately in MIPS both NOT and SLL will come in with Binary == 0
+ // Unfortunately in MIPS both NOP and SLL will come in with Binary == 0
// so we have to special check for them.
- unsigned Opcode = MI.getOpcode();
+ unsigned Opcode = TmpInst.getOpcode();
if ((Opcode != Mips::NOP) && (Opcode != Mips::SLL) && !Binary)
llvm_unreachable("unimplemented opcode in EncodeInstruction()");
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ const MCInstrDesc &Desc = MCII.get(TmpInst.getOpcode());
uint64_t TSFlags = Desc.TSFlags;
// Pseudo instructions don't get encoded and shouldn't be here
@@ -129,8 +146,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
if ((TSFlags & MipsII::FormMask) == MipsII::Pseudo)
llvm_unreachable("Pseudo opcode found in EncodeInstruction()");
- // For now all instructions are 4 bytes
- int Size = 4; // FIXME: Have Desc.getSize() return the correct value!
+ // Get byte count of instruction
+ unsigned Size = Desc.getSize();
+ if (!Size)
+ llvm_unreachable("Desc.getSize() returns 0");
EmitInstruction(Binary, Size, OS);
}
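Replacing the hard-coded 4-byte size with Desc.getSize() matters once encodings of more than one width are in play, and the emitter then writes Size bytes of Binary in the byte order selected by IsLittleEndian. A minimal sketch of that serialization, assuming it mirrors (but is not) the EmitInstruction helper used above:

    #include <cstdint>
    #include <ostream>

    // Sketch only: write the low `Size` bytes of `Binary` in the requested order.
    static void emitEncodedInst(uint32_t Binary, unsigned Size, bool IsLittleEndian,
                                std::ostream &OS) {
      for (unsigned i = 0; i < Size; ++i) {
        unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
        OS.put(static_cast<char>((Binary >> Shift) & 0xff));
      }
    }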
@@ -143,7 +162,11 @@ getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(OpNo);
- assert(MO.isExpr() && "getBranchTargetOpValue expects only expressions");
+
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm()) return MO.getImm();
+ assert(MO.isExpr() &&
+ "getBranchTargetOpValue expects only expressions or immediates");
const MCExpr *Expr = MO.getExpr();
Fixups.push_back(MCFixup::Create(0, Expr,
@@ -159,7 +182,10 @@ getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(OpNo);
- assert(MO.isExpr() && "getJumpTargetOpValue expects only expressions");
+ // If the destination is an immediate, we have nothing to do.
+ if (MO.isImm()) return MO.getImm();
+ assert(MO.isExpr() &&
+ "getJumpTargetOpValue expects only expressions or an immediate");
const MCExpr *Expr = MO.getExpr();
Fixups.push_back(MCFixup::Create(0, Expr,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index bfcc2a2e4ae0..71954a4bd862 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -42,10 +42,14 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT);
-MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT);
-MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT);
-MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT);
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT,
+ StringRef CPU);
MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
uint8_t OSABI,
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
index 596f07145a27..bd8c5173454e 100644
--- a/lib/Target/Mips/Makefile
+++ b/lib/Target/Mips/Makefile
@@ -16,7 +16,9 @@ BUILT_SOURCES = MipsGenRegisterInfo.inc MipsGenInstrInfo.inc \
MipsGenAsmWriter.inc MipsGenCodeEmitter.inc \
MipsGenDAGISel.inc MipsGenCallingConv.inc \
MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc \
- MipsGenEDInfo.inc MipsGenDisassemblerTables.inc
+ MipsGenEDInfo.inc MipsGenDisassemblerTables.inc \
+ MipsGenMCPseudoLowering.inc MipsGenAsmMatcher.inc
+
DIRS = InstPrinter Disassembler AsmParser TargetInfo MCTargetDesc
include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 90f7942c5b4e..90c01d5de0a9 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -77,6 +77,10 @@ def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion",
def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
"Mips16 mode">;
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
+def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
+ "Mips DSP-R2 ASE", [FeatureDSP]>;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
@@ -95,9 +99,20 @@ def MipsAsmWriter : AsmWriter {
bit isMCAsmWriter = 1;
}
+def MipsAsmParser : AsmParser {
+ let ShouldEmitMatchRegisterName = 0;
+}
+
+def MipsAsmParserVariant : AsmParserVariant {
+ int Variant = 0;
+
+ // Recognize hard coded registers.
+ string RegisterPrefix = "$";
+}
+
def Mips : Target {
let InstructionSet = MipsInstrInfo;
-
+ let AssemblyParsers = [MipsAsmParser];
let AssemblyWriters = [MipsAsmWriter];
+ let AssemblyParserVariants = [MipsAsmParserVariant];
}
-
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 030042f2e832..4e6b21feb55d 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -41,6 +41,11 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
// Adjust stack.
if (isInt<16>(-StackSize))
BuildMI(MBB, MBBI, dl, TII.get(Mips::SaveRaF16)).addImm(StackSize);
+
+ if (hasFP(MF))
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
+ .addReg(Mips::SP);
+
}
void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
@@ -55,6 +60,10 @@ void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
if (!StackSize)
return;
+ if (hasFP(MF))
+ BuildMI(MBB, MBBI, dl, TII.get(Mips::Move32R16), Mips::SP)
+ .addReg(Mips::S0);
+
// Adjust stack.
if (isInt<16>(StackSize))
// assumes stacksize multiple of 8
@@ -66,19 +75,58 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
- // FIXME: implement.
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *EntryBlock = MF->begin();
+
+ //
+ // Registers RA, S0 and S1 are the callee-saved registers and they
+ // will be saved by the "save" instruction
+ // during emitPrologue.
+ //
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Add the callee-saved register as live-in. Do not add if the register is
+ // RA and return address is taken, because it has already been added in
+ // method MipsTargetLowering::LowerRETURNADDR.
+ // It's killed at the spill, unless the register is RA and return address
+ // is taken.
+ unsigned Reg = CSI[i].getReg();
+ bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
+ && MF->getFrameInfo()->isReturnAddressTaken();
+ if (!IsRAAndRetAddrIsTaken)
+ EntryBlock->addLiveIn(Reg);
+ }
+
+ return true;
+}
+
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ //
+ // Registers RA, S0 and S1 are the callee-saved registers and they will be
+ // restored by the "restore" instruction during emitEpilogue.
+ // We need to override this virtual function, otherwise llvm will try to
+ // restore the registers from the stack on its own.
+ //
+
return true;
}
bool
Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- // FIXME: implement.
- return true;
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Reserve call frame if the size of the maximum call frame fits into 15-bit
+ // immediate field and there are no variable sized objects on the stack.
+ return isInt<15>(MFI->getMaxCallFrameSize()) && !MFI->hasVarSizedObjects();
}
void Mips16FrameLowering::
processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const {
+ MF.getRegInfo().setPhysRegUsed(Mips::RA);
+ MF.getRegInfo().setPhysRegUsed(Mips::S0);
+ MF.getRegInfo().setPhysRegUsed(Mips::S1);
}
const MipsFrameLowering *
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 25cc37b81519..01db71e8def5 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -32,6 +32,11 @@ public:
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
bool hasReservedCallFrame(const MachineFunction &MF) const;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 2bc286b6bb1f..619646b3178a 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -25,7 +25,7 @@
using namespace llvm;
Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
- : MipsInstrInfo(tm, /* FIXME: set mips16 unconditional br */ 0),
+ : MipsInstrInfo(tm, Mips::BimmX16),
RI(*tm.getSubtargetImpl(), *this) {}
const MipsRegisterInfo &Mips16InstrInfo::getRegisterInfo() const {
@@ -58,12 +58,22 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg,
bool KillSrc) const {
- unsigned Opc = 0, ZeroReg = 0;
+ unsigned Opc = 0;
+
+ if (Mips::CPU16RegsRegClass.contains(DestReg) &&
+ Mips::CPURegsRegClass.contains(SrcReg))
+ Opc = Mips::MoveR3216;
+ else if (Mips::CPURegsRegClass.contains(DestReg) &&
+ Mips::CPU16RegsRegClass.contains(SrcReg))
+ Opc = Mips::Move32R16;
+ else if ((SrcReg == Mips::HI) &&
+ (Mips::CPU16RegsRegClass.contains(DestReg)))
+ Opc = Mips::Mfhi16, SrcReg = 0;
+
+ else if ((SrcReg == Mips::LO) &&
+ (Mips::CPU16RegsRegClass.contains(DestReg)))
+ Opc = Mips::Mflo16, SrcReg = 0;
- if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
- if (Mips::CPURegsRegClass.contains(SrcReg))
- Opc = Mips::Mov32R16;
- }
assert(Opc && "Cannot copy registers");
@@ -72,9 +82,6 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (DestReg)
MIB.addReg(DestReg, RegState::Define);
- if (ZeroReg)
- MIB.addReg(ZeroReg);
-
if (SrcReg)
MIB.addReg(SrcReg, getKillRegState(KillSrc));
}
@@ -84,7 +91,15 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert(false && "Implement this function.");
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOStore);
+ unsigned Opc = 0;
+ if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+ Opc = Mips::SwRxSpImmX16;
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc)).addReg(SrcReg, getKillRegState(isKill))
+ .addFrameIndex(FI).addImm(0).addMemOperand(MMO);
}
void Mips16InstrInfo::
@@ -92,7 +107,16 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
unsigned DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
- assert(false && "Implement this function.");
+ DebugLoc DL;
+ if (I != MBB.end()) DL = I->getDebugLoc();
+ MachineMemOperand *MMO = GetMemOperand(MBB, FI, MachineMemOperand::MOLoad);
+ unsigned Opc = 0;
+
+ if (Mips::CPU16RegsRegClass.hasSubClassEq(RC))
+ Opc = Mips::LwRxSpImmX16;
+ assert(Opc && "Register class not handled!");
+ BuildMI(MBB, I, DL, get(Opc), DestReg).addFrameIndex(FI).addImm(0)
+ .addMemOperand(MMO);
}
bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
@@ -102,7 +126,7 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
default:
return false;
case Mips::RetRA16:
- ExpandRetRA16(MBB, MI, Mips::JrRa16);
+ ExpandRetRA16(MBB, MI, Mips::JrcRa16);
break;
}
@@ -113,12 +137,55 @@ bool Mips16InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
/// GetOppositeBranchOpc - Return the inverse of the specified
/// opcode, e.g. turning BEQ to BNE.
unsigned Mips16InstrInfo::GetOppositeBranchOpc(unsigned Opc) const {
+ switch (Opc) {
+ default: llvm_unreachable("Illegal opcode!");
+ case Mips::BeqzRxImmX16: return Mips::BnezRxImmX16;
+ case Mips::BnezRxImmX16: return Mips::BeqzRxImmX16;
+ case Mips::BteqzT8CmpX16: return Mips::BtnezT8CmpX16;
+ case Mips::BteqzT8SltX16: return Mips::BtnezT8SltX16;
+ case Mips::BteqzT8SltiX16: return Mips::BtnezT8SltiX16;
+ case Mips::BtnezX16: return Mips::BteqzX16;
+ case Mips::BtnezT8CmpiX16: return Mips::BteqzT8CmpiX16;
+ case Mips::BtnezT8SltuX16: return Mips::BteqzT8SltuX16;
+ case Mips::BtnezT8SltiuX16: return Mips::BteqzT8SltiuX16;
+ case Mips::BteqzX16: return Mips::BtnezX16;
+ case Mips::BteqzT8CmpiX16: return Mips::BtnezT8CmpiX16;
+ case Mips::BteqzT8SltuX16: return Mips::BtnezT8SltuX16;
+ case Mips::BteqzT8SltiuX16: return Mips::BtnezT8SltiuX16;
+ case Mips::BtnezT8CmpX16: return Mips::BteqzT8CmpX16;
+ case Mips::BtnezT8SltX16: return Mips::BteqzT8SltX16;
+ case Mips::BtnezT8SltiX16: return Mips::BteqzT8SltiX16;
+ }
assert(false && "Implement this function.");
return 0;
}
+/// Adjust SP by Amount bytes.
+void Mips16InstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ if (isInt<16>(Amount)) {
+ if (Amount < 0)
+ BuildMI(MBB, I, DL, get(Mips::SaveDecSpF16)).addImm(-Amount);
+ else if (Amount > 0)
+ BuildMI(MBB, I, DL, get(Mips::RestoreIncSpF16)).addImm(Amount);
+ }
+ else
+ // not implemented for large values yet
+ assert(false && "adjust stack pointer amount exceeded");
+}
+
unsigned Mips16InstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
- return 0;
+ return (Opc == Mips::BeqzRxImmX16 || Opc == Mips::BimmX16 ||
+ Opc == Mips::BnezRxImmX16 || Opc == Mips::BteqzX16 ||
+ Opc == Mips::BteqzT8CmpX16 || Opc == Mips::BteqzT8CmpiX16 ||
+ Opc == Mips::BteqzT8SltX16 || Opc == Mips::BteqzT8SltuX16 ||
+ Opc == Mips::BteqzT8SltiX16 || Opc == Mips::BteqzT8SltiuX16 ||
+ Opc == Mips::BtnezX16 || Opc == Mips::BtnezT8CmpX16 ||
+ Opc == Mips::BtnezT8CmpiX16 || Opc == Mips::BtnezT8SltX16 ||
+ Opc == Mips::BtnezT8SltuX16 || Opc == Mips::BtnezT8SltiX16 ||
+ Opc == Mips::BtnezT8SltiuX16 ) ? Opc : 0;
}
void Mips16InstrInfo::ExpandRetRA16(MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index 260c5b69b25f..e06ccfe61c52 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -64,6 +64,10 @@ public:
virtual unsigned GetOppositeBranchOpc(unsigned Opc) const;
+ /// Adjust SP by Amount bytes.
+ void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
private:
virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 94cf984769b8..5defc75ea6ef 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -10,21 +10,74 @@
// This file describes Mips16 instructions.
//
//===----------------------------------------------------------------------===//
+//
+//
+// Mips Address
+//
+def addr16 :
+ ComplexPattern<iPTR, 3, "SelectAddr16", [frameindex], [SDNPWantParent]>;
//
-// RRR-type instruction format
+// Address operand
+def mem16 : Operand<i32> {
+ let PrintMethod = "printMemOperand";
+ let MIOperandInfo = (ops CPU16Regs, simm16, CPU16Regs);
+ let EncoderMethod = "getMemEncoding";
+}
+
+def mem16_ea : Operand<i32> {
+ let PrintMethod = "printMemOperandEA";
+ let MIOperandInfo = (ops CPU16Regs, simm16);
+ let EncoderMethod = "getMemEncoding";
+}
+
+//
+// Compare a register and immediate and place result in CC
+// Implicit use of T8
//
+// EXT-CCRR Instruction format
+//
+class FEXT_CCRXI16_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$cc), (ins CPU16Regs:$rx, simm16:$imm),
+ !strconcat(asmstr, "\t$rx, $imm\n\tmove\t$cc, $$t8"), [], itin> {
+ let isCodeGenOnly=1;
+}
-class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> :
- FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
- !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
+//
+// EXT-I instruction format
+//
+class FEXT_I16_ins<bits<5> eop, string asmstr, InstrItinClass itin> :
+ FEXT_I16<eop, (outs), (ins brtarget:$imm16),
+ !strconcat(asmstr, "\t$imm16"),[], itin>;
//
-// I8_MOV32R instruction format (used only by MOV32R instruction)
+// EXT-I8 instruction format
//
-class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
- FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz),
- !strconcat(asmstr, "\t$r32, $rz"), [], itin>;
+
+class FEXT_I816_ins_base<bits<3> _func, string asmstr,
+ string asmstr2, InstrItinClass itin>:
+ FEXT_I816<_func, (outs), (ins uimm16:$imm), !strconcat(asmstr, asmstr2),
+ [], itin>;
+
+class FEXT_I816_ins<bits<3> _func, string asmstr,
+ InstrItinClass itin>:
+ FEXT_I816_ins_base<_func, asmstr, "\t$imm", itin>;
+
+//
+// Assembler formats in alphabetical order.
+// Natural and pseudos are mixed together.
+//
+// Compare two registers and place result in CC
+// Implicit use of T8
+//
+// CC-RR Instruction format
+//
+class FCCRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$cc), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$cc, $$t8"), [], itin> {
+ let isCodeGenOnly=1;
+}
//
// EXT-RI instruction format
@@ -42,6 +95,10 @@ class FEXT_RI16_ins<bits<5> _op, string asmstr,
class FEXT_RI16_PC_ins<bits<5> _op, string asmstr, InstrItinClass itin>:
FEXT_RI16_ins_base<_op, asmstr, "\t$rx, $$pc, $imm", itin>;
+class FEXT_RI16_B_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs), (ins CPU16Regs:$rx, brtarget:$imm),
+ !strconcat(asmstr, "\t$rx, $imm"), [], itin>;
class FEXT_2RI16_ins<bits<5> _op, string asmstr,
InstrItinClass itin>:
@@ -51,6 +108,104 @@ class FEXT_2RI16_ins<bits<5> _op, string asmstr,
}
+// This has an explicit SP argument that we ignore, to work around a problem
+// in the compiler.
+class FEXT_RI16_SP_explicit_ins<bits<5> _op, string asmstr,
+ InstrItinClass itin>:
+ FEXT_RI16<_op, (outs CPU16Regs:$rx), (ins CPUSPReg:$ry, simm16:$imm),
+ !strconcat(asmstr, "\t$rx, $imm ( $ry ); "), [], itin>;
+
+//
+// EXT-RRI instruction format
+//
+
+class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI16<op, (outs ), (ins CPU16Regs:$ry, MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+//
+// EXT-RRI-A instruction format
+//
+
+class FEXT_RRI_A16_mem_ins<bits<1> op, string asmstr, Operand MemOpnd,
+ InstrItinClass itin>:
+ FEXT_RRI_A16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr),
+ !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+
+//
+// EXT-SHIFT instruction format
+//
+class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
+ FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
+ !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+
+//
+// EXT-T8I8
+//
+class FEXT_T8I816_ins<bits<3> _func, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FEXT_I816<_func, (outs),
+ (ins CPU16Regs:$rx, CPU16Regs:$ry, brtarget:$imm),
+ !strconcat(asmstr2, !strconcat("\t$rx, $ry\n\t",
+ !strconcat(asmstr, "\t$imm"))),[], itin> {
+ let isCodeGenOnly=1;
+}
+
+//
+// EXT-T8I8I
+//
+class FEXT_T8I8I16_ins<bits<3> _func, string asmstr, string asmstr2,
+ InstrItinClass itin>:
+ FEXT_I816<_func, (outs),
+ (ins CPU16Regs:$rx, simm16:$imm, brtarget:$targ),
+ !strconcat(asmstr2, !strconcat("\t$rx, $imm\n\t",
+ !strconcat(asmstr, "\t$targ"))), [], itin> {
+ let isCodeGenOnly=1;
+}
+//
+
+
+//
+// I8_MOVR32 instruction format (used only by the MOVR32 instruction)
+//
+class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
+ FI8_MOVR3216<(outs CPU16Regs:$rz), (ins CPURegs:$r32),
+ !strconcat(asmstr, "\t$rz, $r32"), [], itin>;
+
+//
+// I8_MOV32R instruction format (used only by MOV32R instruction)
+//
+
+class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
+ FI8_MOV32R16<(outs CPURegs:$r32), (ins CPU16Regs:$rz),
+ !strconcat(asmstr, "\t$r32, $rz"), [], itin>;
+
+//
+// These are pseudo formats for multiply.
+// The first one can be changed to a non-pseudo now.
+//
+// MULT
+//
+class FMULT16_ins<string asmstr, InstrItinClass itin> :
+ MipsPseudo16<(outs), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), []>;
+
+//
+// MULT-LO
+//
+class FMULT16_LO_ins<string asmstr, InstrItinClass itin> :
+ MipsPseudo16<(outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry\n\tmflo\t$rz"), []> {
+ let isCodeGenOnly=1;
+}
+
//
// RR-type instruction format
//
@@ -60,6 +215,27 @@ class FRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
!strconcat(asmstr, "\t$rx, $ry"), [], itin> {
}
+class FRRTR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry\n\tmove\t$rz, $$t8"), [], itin> ;
+
+//
+// Maybe refactor, but we need a $zero as a dummy first parameter.
+//
+class FRR16_div_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs ), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$$zero, $rx, $ry"), [], itin> ;
+
+class FUnaryRR16_ins<bits<5> f, string asmstr, InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rx, $ry"), [], itin> ;
+
+
+class FRR16_M_ins<bits<5> f, string asmstr,
+ InstrItinClass itin> :
+ FRR16<f, (outs CPU16Regs:$rx), (ins),
+ !strconcat(asmstr, "\t$rx"), [], itin>;
+
class FRxRxRy16_ins<bits<5> f, string asmstr,
InstrItinClass itin> :
FRR16<f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
@@ -74,35 +250,109 @@ class FRR16_JALRC_RA_only_ins<bits<1> nd_, bits<1> l_,
FRR16_JALRC<nd_, l_, 1, (outs), (ins), !strconcat(asmstr, "\t $$ra"),
[], itin> ;
+
+class FRR16_JALRC_ins<bits<1> nd, bits<1> l, bits<1> ra,
+ string asmstr, InstrItinClass itin>:
+ FRR16_JALRC<nd, l, ra, (outs), (ins CPU16Regs:$rx),
+ !strconcat(asmstr, "\t $rx"), [], itin> ;
+
//
-// EXT-RRI instruction format
+// RRR-type instruction format
//
-class FEXT_RRI16_mem_ins<bits<5> op, string asmstr, Operand MemOpnd,
- InstrItinClass itin>:
- FEXT_RRI16<op, (outs CPU16Regs:$ry), (ins MemOpnd:$addr),
- !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> :
+ FRRR16<_f, (outs CPU16Regs:$rz), (ins CPU16Regs:$rx, CPU16Regs:$ry),
+ !strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
-class FEXT_RRI16_mem2_ins<bits<5> op, string asmstr, Operand MemOpnd,
- InstrItinClass itin>:
- FEXT_RRI16<op, (outs ), (ins CPU16Regs:$ry, MemOpnd:$addr),
- !strconcat(asmstr, "\t$ry, $addr"), [], itin>;
+//
+// These Sel patterns support the generation of conditional move
+// pseudo instructions.
+//
+// The nomenclature uses the components making up the pseudo and may
+// be a bit counterintuitive when compared with the end result we seek.
+// For example, using a beqz in the example directly below results in the
+// conditional move being done if the tested register is not zero.
+// I considered it easier to check by keeping the pseudo consistent with
+// its components, but it could have been done differently.
+//
+// The simplest case is when we can test an operand directly and do the
+// conditional move based on a simple mips16 conditional
+// branch instruction.
+// for example:
+// if $op == beqz or bnez:
+//
+// $op1 $rt, .+4
+// move $rd, $rs
+//
+// if $op == beqz, then if $rt != 0, then the conditional assignment
+// $rd = $rs is done.
+// if $op == bnez, then if $rt == 0, then the conditional assignment
+// $rd = $rs is done.
//
-// EXT-SHIFT instruction format
+// So this pseudo class only has one operand, i.e. op
//
-class FEXT_SHIFT16_ins<bits<2> _f, string asmstr, InstrItinClass itin>:
- FEXT_SHIFT16<_f, (outs CPU16Regs:$rx), (ins CPU16Regs:$ry, shamt:$sa),
- !strconcat(asmstr, "\t$rx, $ry, $sa"), [], itin>;
+class Sel<bits<5> f1, string op, InstrItinClass itin>:
+ MipsInst16_32<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+ CPU16Regs:$rt),
+ !strconcat(op, "\t$rt, .+4\n\t\n\tmove $rd, $rs"), [], itin,
+ Pseudo16> {
+ let isCodeGenOnly=1;
+ let Constraints = "$rd = $rd_";
+}
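To make the Sel expansion concrete, here is a C-level sketch (illustrative only, not part of the patch) of what the pseudo computes when op is "bnez"; $rd acts as both input and output because of the "$rd = $rd_" constraint:

    #include <cstdint>

    // Sketch of Sel<"bnez">: "bnez $rt, .+4 ; move $rd, $rs".
    int32_t selBnez(int32_t rd, int32_t rs, int32_t rt) {
      if (rt == 0)   // bnez does not branch only when $rt == 0
        rd = rs;     // move $rd, $rs
      return rd;
    }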
//
-// Address operand
-def mem16 : Operand<i32> {
- let PrintMethod = "printMemOperand";
- let MIOperandInfo = (ops CPU16Regs, simm16);
- let EncoderMethod = "getMemEncoding";
+// The next two instruction classes allow for an operation which tests
+// two operands, returns a value in register T8, and
+// then does a conditional branch based on the value of T8.
+//
+
+// op2 can be cmpi or slti/sltiu
+// op1 can be bteqz or btnez
+// the operands for op2 are a register and a signed constant
+//
+// $op2 $t, $imm ;test register t and branch conditionally
+// $op1 .+4 ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SeliT<bits<5> f1, string op1, bits<5> f2, string op2,
+ InstrItinClass itin>:
+ MipsInst16_32<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+ CPU16Regs:$rl, simm16:$imm),
+ !strconcat(op2,
+ !strconcat("\t$rl, $imm\n\t",
+ !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), [], itin,
+ Pseudo16> {
+ let isCodeGenOnly=1;
+ let Constraints = "$rd = $rd_";
+}
+
+//
+// op2 can be cmp or slt/sltu
+// op1 can be bteqz or btnez
+// the operands for op2 are two registers
+// op1 is a conditional branch
+//
+//
+// $op2 $rl, $rr ;test registers rl,rr
+// $op1 .+4 ;op1 is a conditional branch
+// move $rd, $rs
+//
+//
+class SelT<bits<5> f1, string op1, bits<5> f2, string op2,
+ InstrItinClass itin>:
+ MipsInst16_32<(outs CPU16Regs:$rd_), (ins CPU16Regs:$rd, CPU16Regs:$rs,
+ CPU16Regs:$rl, CPU16Regs:$rr),
+ !strconcat(op2,
+ !strconcat("\t$rl, $rr\n\t",
+ !strconcat(op1, "\t.+4\n\tmove $rd, $rs"))), [], itin,
+ Pseudo16> {
+ let isCodeGenOnly=1;
+ let Constraints = "$rd = $rd_";
}
+
//
// Some general instruction class info
//
@@ -115,6 +365,24 @@ class ArithLogic16Defs<bit isCom=0> {
bit neverHasSideEffects = 1;
}
+class branch16 {
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit isBarrier = 1;
+}
+
+class cbranch16 {
+ bit isBranch = 1;
+ bit isTerminator = 1;
+}
+
+class MayLoad {
+ bit mayLoad = 1;
+}
+
+class MayStore {
+ bit mayStore = 1;
+}
//
// Format: ADDIU rx, immediate MIPS16e
@@ -126,6 +394,9 @@ def AddiuRxImmX16: FEXT_RI16_ins<0b01001, "addiu", IIAlu>;
def AddiuRxRxImmX16: FEXT_2RI16_ins<0b01001, "addiu", IIAlu>,
ArithLogic16Defs<0>;
+def AddiuRxRyOffMemX16:
+ FEXT_RRI_A16_mem_ins<0, "addiu", mem16_ea, IIAlu>;
+
//
// Format: ADDIU rx, pc, immediate MIPS16e
@@ -148,6 +419,87 @@ def AdduRxRyRz16: FRRR16_ins<01, "addu", IIAlu>, ArithLogic16Defs<1>;
def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
+
+//
+// Format: BEQZ rx, offset MIPS16e
+// Purpose: Branch on Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BeqzRxImmX16: FEXT_RI16_B_ins<0b00100, "beqz", IIAlu>, cbranch16;
+
+// Format: B offset MIPS16e
+// Purpose: Unconditional Branch
+// To do an unconditional PC-relative branch.
+//
+def BimmX16: FEXT_I16_ins<0b00010, "b", IIAlu>, branch16;
+
+//
+// Format: BNEZ rx, offset MIPS16e
+// Purpose: Branch on Not Equal to Zero (Extended)
+// To test a GPR then do a PC-relative conditional branch.
+//
+def BnezRxImmX16: FEXT_RI16_B_ins<0b00101, "bnez", IIAlu>, cbranch16;
+
+//
+// Format: BTEQZ offset MIPS16e
+// Purpose: Branch on T Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def BteqzX16: FEXT_I816_ins<0b000, "bteqz", IIAlu>, cbranch16;
+
+def BteqzT8CmpX16: FEXT_T8I816_ins<0b000, "bteqz", "cmp", IIAlu>, cbranch16;
+
+def BteqzT8CmpiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "cmpi", IIAlu>,
+ cbranch16;
+
+def BteqzT8SltX16: FEXT_T8I816_ins<0b000, "bteqz", "slt", IIAlu>, cbranch16;
+
+def BteqzT8SltuX16: FEXT_T8I816_ins<0b000, "bteqz", "sltu", IIAlu>, cbranch16;
+
+def BteqzT8SltiX16: FEXT_T8I8I16_ins<0b000, "bteqz", "slti", IIAlu>, cbranch16;
+
+def BteqzT8SltiuX16: FEXT_T8I8I16_ins<0b000, "bteqz", "sltiu", IIAlu>,
+ cbranch16;
+
+//
+// Format: BTNEZ offset MIPS16e
+// Purpose: Branch on T Not Equal to Zero (Extended)
+// To test special register T then do a PC-relative conditional branch.
+//
+def BtnezX16: FEXT_I816_ins<0b001, "btnez", IIAlu> ,cbranch16;
+
+def BtnezT8CmpX16: FEXT_T8I816_ins<0b000, "btnez", "cmp", IIAlu>, cbranch16;
+
+def BtnezT8CmpiX16: FEXT_T8I8I16_ins<0b000, "btnez", "cmpi", IIAlu>, cbranch16;
+
+def BtnezT8SltX16: FEXT_T8I816_ins<0b000, "btnez", "slt", IIAlu>, cbranch16;
+
+def BtnezT8SltuX16: FEXT_T8I816_ins<0b000, "btnez", "sltu", IIAlu>, cbranch16;
+
+def BtnezT8SltiX16: FEXT_T8I8I16_ins<0b000, "btnez", "slti", IIAlu>, cbranch16;
+
+def BtnezT8SltiuX16: FEXT_T8I8I16_ins<0b000, "btnez", "sltiu", IIAlu>,
+ cbranch16;
+
+//
+// Format: DIV rx, ry MIPS16e
+// Purpose: Divide Word
+// To divide 32-bit signed integers.
+//
+def DivRxRy16: FRR16_div_ins<0b11010, "div", IIAlu> {
+ let Defs = [HI, LO];
+}
+
+//
+// Format: DIVU rx, ry MIPS16e
+// Purpose: Divide Unsigned Word
+// To divide 32-bit unsigned integers.
+//
+def DivuRxRy16: FRR16_div_ins<0b11011, "divu", IIAlu> {
+ let Defs = [HI, LO];
+}
+
+
//
// Format: JR ra MIPS16e
// Purpose: Jump Register Through Register ra
@@ -155,35 +507,56 @@ def AndRxRxRy16: FRxRxRy16_ins<0b01100, "and", IIAlu>, ArithLogic16Defs<1>;
// address register.
//
-def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu>;
+def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIAlu> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ let hasDelaySlot = 1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
+
+def JrcRa16: FRR16_JALRC_RA_only_ins<0, 0, "jrc", IIAlu> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
+def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIAlu> {
+ let isBranch = 1;
+ let isIndirectBranch = 1;
+ let isTerminator=1;
+ let isBarrier=1;
+}
//
// Format: LB ry, offset(rx) MIPS16e
// Purpose: Load Byte (Extended)
// To load a byte from memory as a signed value.
//
-def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IIAlu>;
+def LbRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lb", mem16, IILoad>, MayLoad;
//
// Format: LBU ry, offset(rx) MIPS16e
// Purpose: Load Byte Unsigned (Extended)
// To load a byte from memory as a unsigned value.
//
-def LbuRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IIAlu>;
+def LbuRxRyOffMemX16:
+ FEXT_RRI16_mem_ins<0b10100, "lbu", mem16, IILoad>, MayLoad;
//
// Format: LH ry, offset(rx) MIPS16e
// Purpose: Load Halfword signed (Extended)
// To load a halfword from memory as a signed value.
//
-def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IIAlu>;
+def LhRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lh", mem16, IILoad>, MayLoad;
//
// Format: LHU ry, offset(rx) MIPS16e
// Purpose: Load Halfword unsigned (Extended)
// To load a halfword from memory as an unsigned value.
//
-def LhuRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IIAlu>;
+def LhuRxRyOffMemX16:
+ FEXT_RRI16_mem_ins<0b10100, "lhu", mem16, IILoad>, MayLoad;
//
// Format: LI rx, immediate MIPS16e
@@ -197,28 +570,98 @@ def LiRxImmX16: FEXT_RI16_ins<0b01101, "li", IIAlu>;
// Purpose: Load Word (Extended)
// To load a word from memory as a signed value.
//
-def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IIAlu>;
+def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IILoad>, MayLoad;
+
+// Format: LW rx, offset(sp) MIPS16e
+// Purpose: Load Word (SP-Relative, Extended)
+// To load an SP-relative word from memory as a signed value.
+//
+def LwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b10110, "lw", IILoad>, MayLoad;
//
// Format: MOVE r32, rz MIPS16e
// Purpose: Move
// To move the contents of a GPR to a GPR.
//
-def Mov32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+
+//
+// Format: MOVE ry, r32 MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
+
+//
+// Format: MFHI rx MIPS16e
+// Purpose: Move From HI Register
+// To copy the special purpose HI register to a GPR.
+//
+def Mfhi16: FRR16_M_ins<0b10000, "mfhi", IIAlu> {
+ let Uses = [HI];
+ let neverHasSideEffects = 1;
+}
+
+//
+// Format: MFLO rx MIPS16e
+// Purpose: Move From LO Register
+// To copy the special purpose LO register to a GPR.
+//
+def Mflo16: FRR16_M_ins<0b10010, "mflo", IIAlu> {
+ let Uses = [LO];
+ let neverHasSideEffects = 1;
+}
+
+//
+// Pseudo Instruction for mult
+//
+def MultRxRy16: FMULT16_ins<"mult", IIAlu> {
+ let isCommutable = 1;
+ let neverHasSideEffects = 1;
+ let Defs = [HI, LO];
+}
+
+def MultuRxRy16: FMULT16_ins<"multu", IIAlu> {
+ let isCommutable = 1;
+ let neverHasSideEffects = 1;
+ let Defs = [HI, LO];
+}
+
+//
+// Format: MULT rx, ry MIPS16e
+// Purpose: Multiply Word
+// To multiply 32-bit signed integers.
+//
+def MultRxRyRz16: FMULT16_LO_ins<"mult", IIAlu> {
+ let isCommutable = 1;
+ let neverHasSideEffects = 1;
+ let Defs = [HI, LO];
+}
+
+//
+// Format: MULTU rx, ry MIPS16e
+// Purpose: Multiply Unsigned Word
+// To multiply 32-bit unsigned integers.
+//
+def MultuRxRyRz16: FMULT16_LO_ins<"multu", IIAlu> {
+ let isCommutable = 1;
+ let neverHasSideEffects = 1;
+ let Defs = [HI, LO];
+}
//
// Format: NEG rx, ry MIPS16e
// Purpose: Negate
// To negate an integer value.
//
-def NegRxRy16: FRR16_ins<0b11101, "neg", IIAlu>;
+def NegRxRy16: FUnaryRR16_ins<0b11101, "neg", IIAlu>;
//
// Format: NOT rx, ry MIPS16e
// Purpose: Not
// To complement an integer value
//
-def NotRxRy16: FRR16_ins<0b01111, "not", IIAlu>;
+def NotRxRy16: FUnaryRR16_ins<0b01111, "not", IIAlu>;
//
// Format: OR rx, ry MIPS16e
@@ -240,10 +683,22 @@ def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
// for direct object emitter, encoding needs to be adjusted for the
// frame size
//
-let ra=1, s=0,s0=0,s1=0 in
+let ra=1, s=0,s0=1,s1=1 in
def RestoreRaF16:
FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "restore \t$$ra, $frame_size", [], IILoad >;
+ "restore\t$$ra, $$s0, $$s1, $frame_size", [], IILoad >, MayLoad {
+ let isCodeGenOnly = 1;
+}
+
+// Use Restore to increment SP, since SP is not a Mips16 register; this
+// is an easy way to do that which does not require a register.
+//
+let ra=0, s=0,s0=0,s1=0 in
+def RestoreIncSpF16:
+ FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+ "restore\t$frame_size", [], IILoad >, MayLoad {
+ let isCodeGenOnly = 1;
+}
//
// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
@@ -252,24 +707,152 @@ def RestoreRaF16:
// To set up a stack frame on entry to a subroutine,
// saving return address and static registers, and adjusting stack
//
-let ra=1, s=1,s0=0,s1=0 in
+let ra=1, s=1,s0=1,s1=1 in
def SaveRaF16:
FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "save \t$$ra, $frame_size", [], IILoad >;
+ "save\t$$ra, $$s0, $$s1, $frame_size", [], IIStore >, MayStore {
+ let isCodeGenOnly = 1;
+}
//
+// Use Save to decrement the SP by a constant since SP is not
+// a Mips16 register.
+//
+let ra=0, s=0,s0=0,s1=0 in
+def SaveDecSpF16:
+ FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
+ "save\t$frame_size", [], IIStore >, MayStore {
+ let isCodeGenOnly = 1;
+}
+//
// Format: SB ry, offset(rx) MIPS16e
// Purpose: Store Byte (Extended)
// To store a byte to memory.
//
-def SbRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIAlu>;
+def SbRxRyOffMemX16:
+ FEXT_RRI16_mem2_ins<0b11000, "sb", mem16, IIStore>, MayStore;
//
+// The Sel(T) instructions are pseudos
+// T means that they use T8 implicitly.
+//
+//
+// Format: SelBeqZ rd, rs, rt
+// Purpose: if rt==0, do nothing
+// else rd = rs
+//
+def SelBeqZ: Sel<0b00100, "beqz", IIAlu>;
+
+//
+// Format: SelTBteqZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZCmp: SelT<0b000, "bteqz", 0b01010, "cmp", IIAlu>;
+
+//
+// Format: SelTBteqZCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZCmpi: SeliT<0b000, "bteqz", 0b01110, "cmpi", IIAlu>;
+
+//
+// Format: SelTBteqZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSlt: SelT<0b000, "bteqz", 0b00010, "slt", IIAlu>;
+
+//
+// Format: SelTBteqZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSlti: SeliT<0b000, "bteqz", 0b01010, "slti", IIAlu>;
+
+//
+// Format: SelTBteqZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSltu: SelT<0b000, "bteqz", 0b00011, "sltu", IIAlu>;
+
+//
+// Format: SelTBteqZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+// If b==0 then do nothing.
+// if b!=0 then rd = rs
+//
+def SelTBteqZSltiu: SeliT<0b000, "bteqz", 0b01011, "sltiu", IIAlu>;
+
+//
+// Format: SelBnez rd, rs, rt
+// Purpose: if rt!=0, do nothing
+// else rd = rs
+//
+def SelBneZ: Sel<0b00101, "bnez", IIAlu>;
+
+//
+// Format: SelTBtneZCmp rd, rs, rl, rr
+// Purpose: b = Cmp rl, rr.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZCmp: SelT<0b001, "btnez", 0b01010, "cmp", IIAlu>;
+
+//
+// Format: SelTBtnezCmpi rd, rs, rl, rr
+// Purpose: b = Cmpi rl, imm.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZCmpi: SeliT<0b000, "btnez", 0b01110, "cmpi", IIAlu>;
+
+//
+// Format: SelTBtneZSlt rd, rs, rl, rr
+// Purpose: b = Slt rl, rr.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSlt: SelT<0b001, "btnez", 0b00010, "slt", IIAlu>;
+
+//
+// Format: SelTBtneZSlti rd, rs, rl, rr
+// Purpose: b = Slti rl, imm.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSlti: SeliT<0b001, "btnez", 0b01010, "slti", IIAlu>;
+
+//
+// Format: SelTBtneZSltu rd, rs, rl, rr
+// Purpose: b = Sltu rl, rr.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSltu: SelT<0b001, "btnez", 0b00011, "sltu", IIAlu>;
+
+//
+// Format: SelTBtneZSltiu rd, rs, rl, rr
+// Purpose: b = Sltiu rl, imm.
+// If b!=0 then do nothing.
+// if b==0 then rd = rs
+//
+def SelTBtneZSltiu: SeliT<0b001, "btnez", 0b01011, "sltiu", IIAlu>;
+//
+//
// Format: SH ry, offset(rx) MIPS16e
// Purpose: Store Halfword (Extended)
// To store a halfword to memory.
//
-def ShRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, IIAlu>;
+def ShRxRyOffMemX16:
+ FEXT_RRI16_mem2_ins<0b11001, "sh", mem16, IIStore>, MayStore;
//
// Format: SLL rx, ry, sa MIPS16e
@@ -285,8 +868,40 @@ def SllX16: FEXT_SHIFT16_ins<0b00, "sll", IIAlu>;
//
def SllvRxRy16 : FRxRxRy16_ins<0b00100, "sllv", IIAlu>;
+//
+// Format: SLTI rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+def SltiCCRxImmX16: FEXT_CCRXI16_ins<0b01010, "slti", IIAlu>;
//
+// Format: SLTIU rx, immediate MIPS16e
+// Purpose: Set on Less Than Immediate Unsigned (Extended)
+// To record the result of a less-than comparison with a constant.
+//
+def SltiuCCRxImmX16: FEXT_CCRXI16_ins<0b01011, "sltiu", IIAlu>;
+
+//
+// Format: SLT rx, ry MIPS16e
+// Purpose: Set on Less Than
+// To record the result of a less-than comparison.
+//
+def SltRxRy16: FRR16_ins<0b00010, "slt", IIAlu>;
+
+def SltCCRxRy16: FCCRR16_ins<0b00010, "slt", IIAlu>;
+
+// Format: SLTU rx, ry MIPS16e
+// Purpose: Set on Less Than Unsigned
+// To record the result of an unsigned less-than comparison.
+//
+def SltuRxRyRz16: FRRTR16_ins<0b00011, "sltu", IIAlu> {
+ let isCodeGenOnly=1;
+}
+
+
+def SltuCCRxRy16: FCCRR16_ins<0b00011, "sltu", IIAlu>;
+//
// Format: SRAV ry, rx MIPS16e
// Purpose: Shift Word Right Arithmetic Variable
// To execute an arithmetic right-shift of a word by a variable
@@ -333,9 +948,18 @@ def SubuRxRyRz16: FRRR16_ins<0b11, "subu", IIAlu>, ArithLogic16Defs<0>;
// Purpose: Store Word (Extended)
// To store a word to memory.
//
-def SwRxRyOffMemX16: FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, IIAlu>;
+def SwRxRyOffMemX16:
+ FEXT_RRI16_mem2_ins<0b11011, "sw", mem16, IIStore>, MayStore;
//
+// Format: SW rx, offset(sp) MIPS16e
+// Purpose: Store Word rx (SP-Relative)
+// To store an SP-relative word to memory.
+//
+def SwRxSpImmX16: FEXT_RI16_SP_explicit_ins<0b11010, "sw", IIStore>, MayStore;
+
+//
+//
// Format: XOR rx, ry MIPS16e
// Purpose: Xor
// To do a bitwise logical XOR.
@@ -361,6 +985,7 @@ class ArithLogic16_pat<SDNode OpNode, Instruction I> :
def: ArithLogic16_pat<add, AdduRxRyRz16>;
def: ArithLogic16_pat<and, AndRxRxRy16>;
+def: ArithLogic16_pat<mul, MultRxRyRz16>;
def: ArithLogic16_pat<or, OrRxRxRy16>;
def: ArithLogic16_pat<sub, SubuRxRyRz16>;
def: ArithLogic16_pat<xor, XorRxRxRy16>;
@@ -385,35 +1010,533 @@ def: shift_rotate_reg16_pat<sra, SravRxRy16>;
def: shift_rotate_reg16_pat<srl, SrlvRxRy16>;
class LoadM16_pat<PatFrag OpNode, Instruction I> :
- Mips16Pat<(OpNode addr:$addr), (I addr:$addr)>;
+ Mips16Pat<(OpNode addr16:$addr), (I addr16:$addr)>;
def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16>;
def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16>;
-def: LoadM16_pat<sextloadi16_a, LhRxRyOffMemX16>;
-def: LoadM16_pat<zextloadi16_a, LhuRxRyOffMemX16>;
-def: LoadM16_pat<load_a, LwRxRyOffMemX16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16>;
+def: LoadM16_pat<load, LwRxRyOffMemX16>;
class StoreM16_pat<PatFrag OpNode, Instruction I> :
- Mips16Pat<(OpNode CPU16Regs:$r, addr:$addr), (I CPU16Regs:$r, addr:$addr)>;
+ Mips16Pat<(OpNode CPU16Regs:$r, addr16:$addr),
+ (I CPU16Regs:$r, addr16:$addr)>;
def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>;
-def: StoreM16_pat<truncstorei16_a, ShRxRyOffMemX16>;
-def: StoreM16_pat<store_a, SwRxRyOffMemX16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>;
+def: StoreM16_pat<store, SwRxRyOffMemX16>;
+
+// Unconditional branch
+class UncondBranch16_pat<SDNode OpNode, Instruction I>:
+ Mips16Pat<(OpNode bb:$imm16), (I bb:$imm16)> {
+ let Predicates = [RelocPIC, InMips16Mode];
+ }
+
+// Indirect branch
+def: Mips16Pat<
+ (brind CPU16Regs:$rs),
+ (JrcRx16 CPU16Regs:$rs)>;
// Jump and Link (Call)
-let isCall=1, hasDelaySlot=1 in
+let isCall=1, hasDelaySlot=0 in
def JumpLinkReg16:
FRR16_JALRC<0, 0, 0, (outs), (ins CPU16Regs:$rs),
- "jalr \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>;
+ "jalrc \t$rs", [(MipsJmpLink CPU16Regs:$rs)], IIBranch>;
// Mips16 pseudos
let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1,
hasExtraSrcRegAllocReq = 1 in
def RetRA16 : MipsPseudo16<(outs), (ins), "", [(MipsRet)]>;
+
+// setcc patterns
+
+class SetCC_R16<PatFrag cond_op, Instruction I>:
+ Mips16Pat<(cond_op CPU16Regs:$rx, CPU16Regs:$ry),
+ (I CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+class SetCC_I16<PatFrag cond_op, PatLeaf imm_type, Instruction I>:
+ Mips16Pat<(cond_op CPU16Regs:$rx, imm_type:$imm16),
+ (I CPU16Regs:$rx, imm_type:$imm16)>;
+
+
+def: Mips16Pat<(i32 addr16:$addr),
+ (AddiuRxRyOffMemX16 addr16:$addr)>;
+
+
+// Large (>16 bit) immediate loads
+def : Mips16Pat<(i32 imm:$imm),
+ (OrRxRxRy16 (SllX16 (LiRxImmX16 (HI16 imm:$imm)), 16),
+ (LiRxImmX16 (LO16 imm:$imm)))>;
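As a worked example (values are illustrative), the pattern above splits a constant such as 0x00AB1234 into HI16 = 0x00AB and LO16 = 0x1234 and rebuilds it with li/sll/li/or; the following sketch mirrors that sequence:

    #include <cstdint>

    // Sketch of the li/sll/or sequence selected by the pattern above.
    uint32_t materializeImm32(uint32_t imm) {
      uint32_t hi = imm >> 16;       // li   rx, HI16(imm)
      uint32_t lo = imm & 0xffffu;   // li   ry, LO16(imm)
      return (hi << 16) | lo;        // sll  rx, rx, 16 ; or rx, rx, ry
    }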
+
+// Carry MipsPatterns
+def : Mips16Pat<(subc CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (SubuRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (AdduRxRyRz16 CPU16Regs:$lhs, CPU16Regs:$rhs)>;
+def : Mips16Pat<(addc CPU16Regs:$src, immSExt16:$imm),
+ (AddiuRxRxImmX16 CPU16Regs:$src, imm:$imm)>;
+
+//
+// Some branch conditional patterns are not generated by llvm at this time.
+// Some are, for seemingly arbitrary reasons, not used: i.e. with signed number
+// comparison they are used, while for unsigned a different pattern is used.
+// I am pushing upstream from the full mips16 port and it seemed that I needed
+// these earlier, and the mips32 port has these, but now I cannot create test
+// cases that use these patterns. While I sort this all out I will leave these
+// extra patterns commented out, and if I can be sure they are really not used,
+// I will delete the code. I don't want to check the code in uncommented without
+// a valid test case. In some cases, the compiler is generating patterns with
+// setcc instead, and earlier I had implemented setcc first, so that may have
+// masked the problem. The setcc variants are suboptimal for mips16, so I may
+// want to figure out how to enable the brcond patterns or else possibly new
+// combinations of brcond and setcc.
+//
+//
+// bcond-seteq
+//
+def: Mips16Pat
+ <(brcond (i32 (seteq CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BteqzT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+
+def: Mips16Pat
+ <(brcond (i32 (seteq CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+ (BteqzT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (seteq CPU16Regs:$rx, 0)), bb:$targ16),
+ (BeqzRxImmX16 CPU16Regs:$rx, bb:$targ16)
+ >;
+
+//
+// bcond-setgt (do we need to have this pair of setlt, setgt??)
+//
+def: Mips16Pat
+ <(brcond (i32 (setgt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BtnezT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16)
+ >;
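The operand swap in the setgt pattern above relies on a > b being the same test as b < a, so SLT (which only computes a less-than result into T8) plus a branch on T8 being non-zero covers the greater-than case. A one-line sketch of the identity:

    #include <cstdint>

    // Sketch: setgt is lowered as slt with swapped operands plus btnez.
    bool setgtViaSlt(int32_t a, int32_t b) { return b < a; }  // same result as a > b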
+
+//
+// bcond-setge
+//
+def: Mips16Pat
+ <(brcond (i32 (setge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BteqzT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+//
+// never called because compiler transforms a >= k to a > (k-1)
+def: Mips16Pat
+ <(brcond (i32 (setge CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+ (BteqzT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16)
+ >;
+
+//
+// bcond-setlt
+//
+def: Mips16Pat
+ <(brcond (i32 (setlt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BtnezT8SltX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (setlt CPU16Regs:$rx, immSExt16:$imm)), bb:$imm16),
+ (BtnezT8SltiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$imm16)
+ >;
+
+//
+// bcond-setle
+//
+def: Mips16Pat
+ <(brcond (i32 (setle CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BteqzT8SltX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16)
+ >;
+
+//
+// bcond-setne
+//
+def: Mips16Pat
+ <(brcond (i32 (setne CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+ (BtnezT8CmpX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (setne CPU16Regs:$rx, immZExt16:$imm)), bb:$targ16),
+ (BtnezT8CmpiX16 CPU16Regs:$rx, immSExt16:$imm, bb:$targ16)
+ >;
+
+def: Mips16Pat
+ <(brcond (i32 (setne CPU16Regs:$rx, 0)), bb:$targ16),
+ (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+ >;
+
+//
+// This needs to be there but I forget which code will generate it
+//
+def: Mips16Pat
+ <(brcond CPU16Regs:$rx, bb:$targ16),
+ (BnezRxImmX16 CPU16Regs:$rx, bb:$targ16)
+ >;
+
+//
+
+//
+// bcond-setugt
+//
+//def: Mips16Pat
+// <(brcond (i32 (setugt CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+// (BtnezT8SltuX16 CPU16Regs:$ry, CPU16Regs:$rx, bb:$imm16)
+// >;
+
+//
+// bcond-setuge
+//
+//def: Mips16Pat
+// <(brcond (i32 (setuge CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+// (BteqzT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+// >;
+
+
+//
+// bcond-setult
+//
+//def: Mips16Pat
+// <(brcond (i32 (setult CPU16Regs:$rx, CPU16Regs:$ry)), bb:$imm16),
+// (BtnezT8SltuX16 CPU16Regs:$rx, CPU16Regs:$ry, bb:$imm16)
+// >;
+
+def: UncondBranch16_pat<br, BimmX16>;
+
// Small immediates
+def: Mips16Pat<(i32 immSExt16:$in),
+ (AddiuRxRxImmX16 (Move32R16 ZERO), immSExt16:$in)>;
+
def: Mips16Pat<(i32 immZExt16:$in), (LiRxImmX16 immZExt16:$in)>;
+//
+// MipsDivRem
+//
+def: Mips16Pat
+ <(MipsDivRem CPU16Regs:$rx, CPU16Regs:$ry),
+ (DivRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+//
+// MipsDivRemU
+//
+def: Mips16Pat
+ <(MipsDivRemU CPU16Regs:$rx, CPU16Regs:$ry),
+ (DivuRxRy16 CPU16Regs:$rx, CPU16Regs:$ry)>;
+
+// signed a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y
+//
+def : Mips16Pat<(select (i32 (setge CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, CPU16Regs:$b)>;
+
+// signed a,b
+// x = (a>b)?x:y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setgt CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZSlt CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+// unsigned a,b
+// x = (a>=b)?x:y
+//
+// if !(a < b) x = y;
+//
+def : Mips16Pat<
+ (select (i32 (setuge CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, CPU16Regs:$b)>;
+
+// unsigned a,b
+// x = (a>b)?x:y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setugt CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZSltu CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+// signed
+// x = (a >= k)?x:y
+// Due to an llvm optimization, I don't think that this will ever
+// be used. It is transformed into x = (a > k-1)?x:y.
+//
+//
+
+//def : Mips16Pat<
+// (select (i32 (setge CPU16Regs:$lhs, immSExt16:$rhs)),
+// CPU16Regs:$T, CPU16Regs:$F),
+// (SelTBteqZSlti CPU16Regs:$T, CPU16Regs:$F,
+// CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+//def : Mips16Pat<
+// (select (i32 (setuge CPU16Regs:$lhs, immSExt16:$rhs)),
+// CPU16Regs:$T, CPU16Regs:$F),
+// (SelTBteqZSltiu CPU16Regs:$T, CPU16Regs:$F,
+// CPU16Regs:$lhs, immSExt16:$rhs)>;
+
+// signed
+// x = (a < k)?x:y
+//
+// if !(a < k) x = y;
+//
+def : Mips16Pat<
+ (select (i32 (setlt CPU16Regs:$a, immSExt16:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZSlti CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, immSExt16:$b)>;
+
+
+//
+//
+// signed
+// x = (a <= b)? x : y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setle CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSlt CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// unsigned
+// x = (a <= b)? x : y
+//
+// if (b < a) x = y
+//
+def : Mips16Pat<(select (i32 (setule CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZSltu CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == b)? x : y
+//
+// if (a != b) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZCmp CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a == 0)? x : y
+//
+// if (a != 0) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, 0)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelBeqZ CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a == k)? x : y
+//
+// if (a != k) x = y
+//
+def : Mips16Pat<(select (i32 (seteq CPU16Regs:$a, immZExt16:$k)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBteqZCmpi CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, immZExt16:$k)>;
+
+
+//
+// signed/unsigned
+// x = (a != b)? x : y
+//
+// if (a == b) x = y
+//
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, CPU16Regs:$b)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZCmp CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$b, CPU16Regs:$a)>;
+
+//
+// signed/unsigned
+// x = (a != 0)? x : y
+//
+// if (a == 0) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, 0)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a)>;
+
+// signed/unsigned
+// x = (a)? x : y
+//
+// if (!a) x = y
+//
+def : Mips16Pat<(select CPU16Regs:$a,
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelBneZ CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a)>;
+
+
+//
+// signed/unsigned
+// x = (a != k)? x : y
+//
+// if (a == k) x = y
+//
+def : Mips16Pat<(select (i32 (setne CPU16Regs:$a, immZExt16:$k)),
+ CPU16Regs:$x, CPU16Regs:$y),
+ (SelTBtneZCmpi CPU16Regs:$x, CPU16Regs:$y,
+ CPU16Regs:$a, immZExt16:$k)>;
+
+//
+// When writing C code to test these setxx patterns, some of them will be
+// transformed into other things, so we test using C code at both -O3 and -O0.
+//
+// seteq
+//
+def : Mips16Pat
+ <(seteq CPU16Regs:$lhs,CPU16Regs:$rhs),
+ (SltiuCCRxImmX16 (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs), 1)>;
+
+def : Mips16Pat
+ <(seteq CPU16Regs:$lhs, 0),
+ (SltiuCCRxImmX16 CPU16Regs:$lhs, 1)>;
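The seteq lowering above rests on two identities: a == b exactly when (a ^ b) == 0, and an unsigned value is below 1 exactly when it is 0. A sketch of the 0/1 value produced by sltiu(xor(a, b), 1):

    #include <cstdint>

    // Sketch: the result computed by the seteq pattern above.
    uint32_t seteqLowered(uint32_t a, uint32_t b) {
      return (a ^ b) < 1u ? 1u : 0u;   // 1 when a == b, 0 otherwise
    }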
+
+
+//
+// setge
+//
+
+def: Mips16Pat
+ <(setge CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (LiRxImmX16 1))>;
+
+//
+// For constants, llvm transforms this to:
+// x > (k - 1) and then reverses the operands to use setlt. So this pattern
+// is not used now by the compiler. (Presumably it checks that k-1 does not
+// overflow.) The compiler never uses this at the current time, due to
+// other optimizations.
+//
+//def: Mips16Pat
+// <(setge CPU16Regs:$lhs, immSExt16:$rhs),
+// (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, immSExt16:$rhs),
+// (LiRxImmX16 1))>;
+
+// This catches the x >= -32768 case by transforming it to x > -32769
+//
+def: Mips16Pat
+ <(setgt CPU16Regs:$lhs, -32769),
+ (XorRxRxRy16 (SltiCCRxImmX16 CPU16Regs:$lhs, -32768),
+ (LiRxImmX16 1))>;
+
+//
+// setgt
+//
+//
+
+def: Mips16Pat
+ <(setgt CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setle
+//
+def: Mips16Pat
+ <(setle CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>;
+
+//
+// setlt
+//
+def: SetCC_R16<setlt, SltCCRxRy16>;
+
+def: SetCC_I16<setlt, immSExt16, SltiCCRxImmX16>;
+
+//
+// setne
+//
+def : Mips16Pat
+ <(setne CPU16Regs:$lhs,CPU16Regs:$rhs),
+ (SltuCCRxRy16 (LiRxImmX16 0),
+ (XorRxRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs))>;
+
+
+//
+// setuge
+//
+def: Mips16Pat
+ <(setuge CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (LiRxImmX16 1))>;
+
+// this pattern will never be used because the compiler will transform
+// x >= k to x > (k - 1) and then use SLT
+//
+//def: Mips16Pat
+// <(setuge CPU16Regs:$lhs, immZExt16:$rhs),
+// (XorRxRxRy16 (SltiuCCRxImmX16 CPU16Regs:$lhs, immZExt16:$rhs),
+// (LiRxImmX16 1))>;
+
+//
+// setugt
+//
+def: Mips16Pat
+ <(setugt CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs)>;
+
+//
+// setule
+//
+def: Mips16Pat
+ <(setule CPU16Regs:$lhs, CPU16Regs:$rhs),
+ (XorRxRxRy16 (SltuCCRxRy16 CPU16Regs:$rhs, CPU16Regs:$lhs), (LiRxImmX16 1))>;
+
+//
+// setult
+//
+def: SetCC_R16<setult, SltuCCRxRy16>;
+
+def: SetCC_I16<setult, immSExt16, SltiuCCRxImmX16>;
+
def: Mips16Pat<(add CPU16Regs:$hi, (MipsLo tglobaladdr:$lo)),
(AddiuRxRxImmX16 CPU16Regs:$hi, tglobaladdr:$lo)>;
+
+// hi/lo relocs
+
+def : Mips16Pat<(MipsHi tglobaltlsaddr:$in),
+ (SllX16 (LiRxImmX16 tglobaltlsaddr:$in), 16)>;
+
+// wrapper_pic
+class Wrapper16Pat<SDNode node, Instruction ADDiuOp, RegisterClass RC>:
+ Mips16Pat<(MipsWrapper RC:$gp, node:$in),
+ (ADDiuOp RC:$gp, node:$in)>;
+
+
+def : Wrapper16Pat<tglobaladdr, AddiuRxRxImmX16, CPU16Regs>;
+def : Wrapper16Pat<tglobaltlsaddr, AddiuRxRxImmX16, CPU16Regs>;
+
+def : Mips16Pat<(i32 (extloadi8 addr16:$src)),
+ (LbuRxRyOffMemX16 addr16:$src)>;
+def : Mips16Pat<(i32 (extloadi16 addr16:$src)),
+ (LhuRxRyOffMemX16 addr16:$src)>;
\ No newline at end of file
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index c15d1bf52e85..d7397a32f074 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "Mips16RegisterInfo.h"
+#include "Mips16InstrInfo.h"
#include "Mips.h"
#include "MipsAnalyzeImmediate.h"
#include "MipsInstrInfo.h"
@@ -39,15 +40,27 @@
using namespace llvm;
Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST,
- const TargetInstrInfo &TII)
- : MipsRegisterInfo(ST, TII) {}
+ const Mips16InstrInfo &I)
+ : MipsRegisterInfo(ST), TII(I) {}
// This function eliminates ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
void Mips16RegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
- // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
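+ // If the call frame is not reserved in the prologue, materialize the
+ // stack pointer adjustment for this call sequence here before erasing
+ // the pseudo instruction.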
+ if (!TFI->hasReservedCallFrame(MF)) {
+ int64_t Amount = I->getOperand(0).getImm();
+
+ if (I->getOpcode() == Mips::ADJCALLSTACKDOWN)
+ Amount = -Amount;
+
+ const Mips16InstrInfo *II = static_cast<const Mips16InstrInfo*>(&TII);
+
+ II->adjustStackPtr(Mips::SP, Amount, MBB, I);
+ }
+
MBB.erase(I);
}
@@ -55,57 +68,60 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
unsigned OpNo, int FrameIndex,
uint64_t StackSize,
int64_t SPOffset) const {
- MachineInstr &MI = *II;
- MachineFunction &MF = *MI.getParent()->getParent();
- MachineFrameInfo *MFI = MF.getFrameInfo();
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
-
- const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
- int MinCSFI = 0;
- int MaxCSFI = -1;
-
- if (CSI.size()) {
- MinCSFI = CSI[0].getFrameIdx();
- MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
- }
-
- // The following stack frame objects are always
- // referenced relative to $sp:
- // 1. Outgoing arguments.
- // 2. Pointer to dynamically allocated stack space.
- // 3. Locations for callee-saved registers.
- // Everything else is referenced relative to whatever register
- // getFrameRegister() returns.
- unsigned FrameReg;
-
- if (MipsFI->isOutArgFI(FrameIndex) ||
- (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
- FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
+ MachineInstr &MI = *II;
+ MachineFunction &MF = *MI.getParent()->getParent();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+ int MinCSFI = 0;
+ int MaxCSFI = -1;
+
+ if (CSI.size()) {
+ MinCSFI = CSI[0].getFrameIdx();
+ MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+ }
+
+ // The following stack frame objects are always
+ // referenced relative to $sp:
+ // 1. Outgoing arguments.
+ // 2. Pointer to dynamically allocated stack space.
+ // 3. Locations for callee-saved registers.
+ // Everything else is referenced relative to whatever register
+ // getFrameRegister() returns.
+ unsigned FrameReg;
+
+ if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
+ FrameReg = Mips::SP;
+ else {
+ const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+ if (TFI->hasFP(MF)) {
+ FrameReg = Mips::S0;
+ }
+ else {
+ if ((MI.getNumOperands()> OpNo+2) && MI.getOperand(OpNo+2).isReg())
+ FrameReg = MI.getOperand(OpNo+2).getReg();
else
- FrameReg = getFrameRegister(MF);
-
- // Calculate final offset.
- // - There is no need to change the offset if the frame object
- // is one of the
- // following: an outgoing argument, pointer to a dynamically allocated
- // stack space or a $gp restore location,
- // - If the frame object is any of the following,
- // its offset must be adjusted
- // by adding the size of the stack:
- // incoming argument, callee-saved register location or local variable.
- int64_t Offset;
-
- if (MipsFI->isOutArgFI(FrameIndex))
- Offset = SPOffset;
- else
- Offset = SPOffset + (int64_t)StackSize;
-
- Offset += MI.getOperand(OpNo + 1).getImm();
-
- DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
-
- MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
- MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
+ FrameReg = Mips::SP;
+ }
+ }
+ // Calculate final offset.
+ // - There is no need to change the offset if the frame object
+ // is one of the
+ // following: an outgoing argument, pointer to a dynamically allocated
+ // stack space or a $gp restore location,
+ // - If the frame object is any of the following,
+ // its offset must be adjusted
+ // by adding the size of the stack:
+ // incoming argument, callee-saved register location or local variable.
+ int64_t Offset;
+ Offset = SPOffset + (int64_t)StackSize;
+ Offset += MI.getOperand(OpNo + 1).getImm();
+
+
+ DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
+
+ MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
+ MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
}
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index 3f4b3a762a1e..153def20d085 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -17,11 +17,12 @@
#include "MipsRegisterInfo.h"
namespace llvm {
+class Mips16InstrInfo;
class Mips16RegisterInfo : public MipsRegisterInfo {
+ const Mips16InstrInfo &TII;
public:
- Mips16RegisterInfo(const MipsSubtarget &Subtarget,
- const TargetInstrInfo &TII);
+ Mips16RegisterInfo(const MipsSubtarget &Subtarget, const Mips16InstrInfo &TII);
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 20fc17807757..a6111689c7ed 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -83,8 +83,10 @@ let usesCustomInserter = 1, Predicates = [HasMips64, HasStandardEncoding],
//===----------------------------------------------------------------------===//
let DecoderNamespace = "Mips64" in {
/// Arithmetic Instructions (ALU Immediate)
-def DADDiu : ArithLogicI<0x19, "daddiu", add, simm16_64, immSExt16,
+def DADDi : ArithOverflowI<0x18, "daddi", add, simm16_64, immSExt16,
CPU64Regs>;
+def DADDiu : ArithLogicI<0x19, "daddiu", add, simm16_64, immSExt16,
+ CPU64Regs>, IsAsCheapAsAMove;
def DANDi : ArithLogicI<0x0c, "andi", and, uimm16_64, immZExt16, CPU64Regs>;
def SLTi64 : SetCC_I<0x0a, "slti", setlt, simm16_64, immSExt16, CPU64Regs>;
def SLTiu64 : SetCC_I<0x0b, "sltiu", setult, simm16_64, immSExt16, CPU64Regs>;
@@ -93,6 +95,7 @@ def XORi64 : ArithLogicI<0x0e, "xori", xor, uimm16_64, immZExt16, CPU64Regs>;
def LUi64 : LoadUpper<0x0f, "lui", CPU64Regs, uimm16_64>;
/// Arithmetic Instructions (3-Operand, R-Type)
+def DADD : ArithOverflowR<0x00, 0x2C, "dadd", IIAlu, CPU64Regs, 1>;
def DADDu : ArithLogicR<0x00, 0x2d, "daddu", add, IIAlu, CPU64Regs, 1>;
def DSUBu : ArithLogicR<0x00, 0x2f, "dsubu", sub, IIAlu, CPU64Regs>;
def SLT64 : SetCC_R<0x00, 0x2a, "slt", setlt, CPU64Regs>;
@@ -110,9 +113,9 @@ def DSLLV : shift_rotate_reg<0x14, 0x00, "dsllv", shl, CPU64Regs>;
def DSRLV : shift_rotate_reg<0x16, 0x00, "dsrlv", srl, CPU64Regs>;
def DSRAV : shift_rotate_reg<0x17, 0x00, "dsrav", sra, CPU64Regs>;
let Pattern = []<dag> in {
-def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
-def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
-def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
+ def DSLL32 : shift_rotate_imm64<0x3c, 0x00, "dsll32", shl>;
+ def DSRL32 : shift_rotate_imm64<0x3e, 0x00, "dsrl32", srl>;
+ def DSRA32 : shift_rotate_imm64<0x3f, 0x00, "dsra32", sra>;
}
}
// Rotate Instructions
@@ -127,24 +130,15 @@ let DecoderNamespace = "Mips64" in {
/// aligned
defm LB64 : LoadM64<0x20, "lb", sextloadi8>;
defm LBu64 : LoadM64<0x24, "lbu", zextloadi8>;
-defm LH64 : LoadM64<0x21, "lh", sextloadi16_a>;
-defm LHu64 : LoadM64<0x25, "lhu", zextloadi16_a>;
-defm LW64 : LoadM64<0x23, "lw", sextloadi32_a>;
-defm LWu64 : LoadM64<0x27, "lwu", zextloadi32_a>;
+defm LH64 : LoadM64<0x21, "lh", sextloadi16>;
+defm LHu64 : LoadM64<0x25, "lhu", zextloadi16>;
+defm LW64 : LoadM64<0x23, "lw", sextloadi32>;
+defm LWu64 : LoadM64<0x27, "lwu", zextloadi32>;
defm SB64 : StoreM64<0x28, "sb", truncstorei8>;
-defm SH64 : StoreM64<0x29, "sh", truncstorei16_a>;
-defm SW64 : StoreM64<0x2b, "sw", truncstorei32_a>;
-defm LD : LoadM64<0x37, "ld", load_a>;
-defm SD : StoreM64<0x3f, "sd", store_a>;
-
-/// unaligned
-defm ULH64 : LoadM64<0x21, "ulh", sextloadi16_u, 1>;
-defm ULHu64 : LoadM64<0x25, "ulhu", zextloadi16_u, 1>;
-defm ULW64 : LoadM64<0x23, "ulw", sextloadi32_u, 1>;
-defm USH64 : StoreM64<0x29, "ush", truncstorei16_u, 1>;
-defm USW64 : StoreM64<0x2b, "usw", truncstorei32_u, 1>;
-defm ULD : LoadM64<0x37, "uld", load_u, 1>;
-defm USD : StoreM64<0x3f, "usd", store_u, 1>;
+defm SH64 : StoreM64<0x29, "sh", truncstorei16>;
+defm SW64 : StoreM64<0x2b, "sw", truncstorei32>;
+defm LD : LoadM64<0x37, "ld", load>;
+defm SD : StoreM64<0x3f, "sd", store>;
/// load/store left/right
let isCodeGenOnly = 1 in {
@@ -183,6 +177,7 @@ def BLTZ64 : CBranchZero<0x01, 0, "bltz", setlt, CPU64Regs>;
}
let DecoderNamespace = "Mips64" in
def JALR64 : JumpLinkReg<0x00, 0x09, "jalr", CPU64Regs>;
+def TAILCALL64_R : JumpFR<CPU64Regs, MipsTailCall>, IsTailCall;
let DecoderNamespace = "Mips64" in {
/// Multiply and Divide Instructions.
@@ -217,7 +212,15 @@ let DecoderNamespace = "Mips64" in {
def RDHWR64 : ReadHardware<CPU64Regs, HWRegs64>;
def DEXT : ExtBase<3, "dext", CPU64Regs>;
+let Pattern = []<dag> in {
+ def DEXTU : ExtBase<2, "dextu", CPU64Regs>;
+ def DEXTM : ExtBase<1, "dextm", CPU64Regs>;
+}
def DINS : InsBase<7, "dins", CPU64Regs>;
+let Pattern = []<dag> in {
+ def DINSU : InsBase<6, "dinsu", CPU64Regs>;
+ def DINSM : InsBase<5, "dinsm", CPU64Regs>;
+}
let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
def DSLL64_32 : FR<0x00, 0x3c, (outs CPU64Regs:$rd), (ins CPURegs:$rt),
@@ -236,21 +239,14 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
let Predicates = [NotN64, HasStandardEncoding] in {
def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>;
- def : MipsPat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>;
+ def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
}
let Predicates = [IsN64, HasStandardEncoding] in {
def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>;
def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>;
- def : MipsPat<(zextloadi32_u addr:$a),
- (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>;
+ def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64_P8 addr:$src)>;
}
// hi/lo relocs
@@ -315,3 +311,38 @@ def : MipsPat<(i64 (sext_inreg CPU64Regs:$src, i32)),
// bswap MipsPattern
def : MipsPat<(bswap CPU64Regs:$rt), (DSHD (DSBH CPU64Regs:$rt))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : InstAlias<"move $dst,$src", (DADD CPU64Regs:$dst,CPU64Regs:$src,ZERO_64)>;
+
+/// Move between CPU and coprocessor registers
+let DecoderNamespace = "Mips64" in {
+def MFC0_3OP64 : MFC3OP<0x10, 0, (outs CPU64Regs:$rt),
+ (ins CPU64Regs:$rd, uimm16:$sel),"mfc0\t$rt, $rd, $sel">;
+def MTC0_3OP64 : MFC3OP<0x10, 4, (outs CPU64Regs:$rd, uimm16:$sel),
+ (ins CPU64Regs:$rt),"mtc0\t$rt, $rd, $sel">;
+def MFC2_3OP64 : MFC3OP<0x12, 0, (outs CPU64Regs:$rt),
+ (ins CPU64Regs:$rd, uimm16:$sel),"mfc2\t$rt, $rd, $sel">;
+def MTC2_3OP64 : MFC3OP<0x12, 4, (outs CPU64Regs:$rd, uimm16:$sel),
+ (ins CPU64Regs:$rt),"mtc2\t$rt, $rd, $sel">;
+def DMFC0_3OP64 : MFC3OP<0x10, 1, (outs CPU64Regs:$rt),
+ (ins CPU64Regs:$rd, uimm16:$sel),"dmfc0\t$rt, $rd, $sel">;
+def DMTC0_3OP64 : MFC3OP<0x10, 5, (outs CPU64Regs:$rd, uimm16:$sel),
+ (ins CPU64Regs:$rt),"dmtc0\t$rt, $rd, $sel">;
+def DMFC2_3OP64 : MFC3OP<0x12, 1, (outs CPU64Regs:$rt),
+ (ins CPU64Regs:$rd, uimm16:$sel),"dmfc2\t$rt, $rd, $sel">;
+def DMTC2_3OP64 : MFC3OP<0x12, 5, (outs CPU64Regs:$rd, uimm16:$sel),
+ (ins CPU64Regs:$rt),"dmtc2\t$rt, $rd, $sel">;
+}
+// Two operand (implicit 0 selector) versions:
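+// e.g. "mfc0 $rt, $rd" is equivalent to "mfc0 $rt, $rd, 0".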
+def : InstAlias<"mfc0 $rt, $rd", (MFC0_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>;
+def : InstAlias<"mtc0 $rt, $rd", (MTC0_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>;
+def : InstAlias<"mfc2 $rt, $rd", (MFC2_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>;
+def : InstAlias<"mtc2 $rt, $rd", (MTC2_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>;
+def : InstAlias<"dmfc0 $rt, $rd", (DMFC0_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>;
+def : InstAlias<"dmtc0 $rt, $rd", (DMTC0_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>;
+def : InstAlias<"dmfc2 $rt, $rd", (DMFC2_3OP64 CPU64Regs:$rt, CPU64Regs:$rd, 0)>;
+def : InstAlias<"dmtc2 $rt, $rd", (DMTC2_3OP64 CPU64Regs:$rd, 0, CPU64Regs:$rt)>;
+
diff --git a/lib/Target/Mips/MipsAnalyzeImmediate.cpp b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
index dc8fbd0d0370..99b163ec33ac 100644
--- a/lib/Target/Mips/MipsAnalyzeImmediate.cpp
+++ b/lib/Target/Mips/MipsAnalyzeImmediate.cpp
@@ -91,7 +91,7 @@ void MipsAnalyzeImmediate::ReplaceADDiuSLLWithLUi(InstSeq &Seq) {
// Sign-extend and shift operand of ADDiu and see if it still fits in 16-bit.
int64_t Imm = SignExtend64<16>(Seq[0].ImmOpnd);
- int64_t ShiftedImm = Imm << (Seq[1].ImmOpnd - 16);
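+ // Shift as unsigned: left-shifting a negative signed value is undefined
+ // behavior.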
+ int64_t ShiftedImm = (uint64_t)Imm << (Seq[1].ImmOpnd - 16);
if (!isInt<16>(ShiftedImm))
return;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 00ff7545c14a..bf2818d61df0 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -37,7 +37,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
@@ -49,6 +49,13 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return true;
}
+bool MipsAsmPrinter::lowerOperand(const MachineOperand &MO, MCOperand &MCOp) {
+ MCOp = MCInstLowering.LowerOperand(MO);
+ return MCOp.isValid();
+}
+
+#include "MipsGenMCPseudoLowering.inc"
+
void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (MI->isDebugValue()) {
SmallString<128> Str;
@@ -58,24 +65,9 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
- // Direct object specific instruction lowering
- if (!OutStreamer.hasRawTextSupport())
- switch (MI->getOpcode()) {
- case Mips::DSLL:
- case Mips::DSRL:
- case Mips::DSRA:
- assert(MI->getNumOperands() == 3 &&
- "Invalid no. of machine operands for shift!");
- assert(MI->getOperand(2).isImm());
- int64_t Shift = MI->getOperand(2).getImm();
- if (Shift > 31) {
- MCInst TmpInst0;
- MCInstLowering.LowerLargeShift(MI, TmpInst0, Shift - 32);
- OutStreamer.EmitInstruction(TmpInst0);
- return;
- }
- break;
- }
+ // Do any auto-generated pseudo lowerings.
+ if (emitPseudoExpansionLowering(OutStreamer, MI))
+ return;
MachineBasicBlock::const_instr_iterator I = MI;
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -83,8 +75,9 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
do {
MCInst TmpInst0;
MCInstLowering.Lower(I++, TmpInst0);
+
OutStreamer.EmitInstruction(TmpInst0);
- } while ((I != E) && I->isInsideBundle());
+ } while ((I != E) && I->isInsideBundle()); // Delay slot check
}
//===----------------------------------------------------------------------===//
@@ -214,7 +207,7 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
case MipsSubtarget::N32: return "abiN32";
case MipsSubtarget::N64: return "abi64";
case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
- default: llvm_unreachable("Unknown Mips ABI");;
+ default: llvm_unreachable("Unknown Mips ABI");
}
}
@@ -246,8 +239,7 @@ void MipsAsmPrinter::EmitFunctionBodyStart() {
OutStreamer.EmitRawText(StringRef("\t.set\tnoreorder"));
OutStreamer.EmitRawText(StringRef("\t.set\tnomacro"));
- if (MipsFI->getEmitNOAT())
- OutStreamer.EmitRawText(StringRef("\t.set\tnoat"));
+ OutStreamer.EmitRawText(StringRef("\t.set\tnoat"));
}
}
@@ -258,9 +250,7 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() {
// always be at the function end, and we can't emit and
// break with BB logic.
if (OutStreamer.hasRawTextSupport()) {
- if (MipsFI->getEmitNOAT())
- OutStreamer.EmitRawText(StringRef("\t.set\tat"));
-
+ OutStreamer.EmitRawText(StringRef("\t.set\tat"));
OutStreamer.EmitRawText(StringRef("\t.set\tmacro"));
OutStreamer.EmitRawText(StringRef("\t.set\treorder"));
OutStreamer.EmitRawText("\t.end\t" + Twine(CurrentFnSym->getName()));
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 562bf9ce0092..94d8bfa10569 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -32,6 +32,14 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
void EmitInstrWithMacroNoAT(const MachineInstr *MI);
+private:
+ // tblgen'erated function.
+ bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+ const MachineInstr *MI);
+
+ // lowerOperand - Convert a MachineOperand into the equivalent MCOperand.
+ bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
+
public:
const MipsSubtarget *Subtarget;
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 19213fa67305..78cf140def60 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -35,9 +35,6 @@ def RetCC_MipsO32 : CallingConv<[
//===----------------------------------------------------------------------===//
def CC_MipsN : CallingConv<[
- // Handles byval parameters.
- CCIfByVal<CCCustom<"CC_Mips64Byval">>,
-
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
@@ -72,9 +69,6 @@ def CC_MipsN : CallingConv<[
// N32/64 variable arguments.
// All arguments are passed in integer registers.
def CC_MipsN_VarArg : CallingConv<[
- // Handles byval parameters.
- CCIfByVal<CCCustom<"CC_Mips64Byval">>,
-
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
@@ -211,12 +205,6 @@ def CC_Mips_FastCC : CallingConv<[
// Mips Calling Convention Dispatch
//===----------------------------------------------------------------------===//
-def CC_Mips : CallingConv<[
- CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>,
- CCIfSubtarget<"isABI_N32()", CCDelegateTo<CC_MipsN>>,
- CCIfSubtarget<"isABI_N64()", CCDelegateTo<CC_MipsN>>
-]>;
-
def RetCC_Mips : CallingConv<[
CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
CCIfSubtarget<"isABI_N32()", CCDelegateTo<RetCC_MipsN>>,
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index cb7022b9e29e..4bfccd8fdd7d 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -30,7 +30,6 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
#include "llvm/PassManager.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -48,7 +47,7 @@ namespace {
class MipsCodeEmitter : public MachineFunctionPass {
MipsJITInfo *JTI;
const MipsInstrInfo *II;
- const TargetData *TD;
+ const DataLayout *TD;
const MipsSubtarget *Subtarget;
TargetMachine &TM;
JITCodeEmitter &MCE;
@@ -67,7 +66,7 @@ class MipsCodeEmitter : public MachineFunctionPass {
MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce) :
MachineFunctionPass(ID), JTI(0),
II((const MipsInstrInfo *) tm.getInstrInfo()),
- TD(tm.getTargetData()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
+ TD(tm.getDataLayout()), TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {
}
@@ -129,7 +128,7 @@ char MipsCodeEmitter::ID = 0;
bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
JTI = ((MipsTargetMachine&) MF.getTarget()).getJITInfo();
II = ((const MipsTargetMachine&) MF.getTarget()).getInstrInfo();
- TD = ((const MipsTargetMachine&) MF.getTarget()).getTargetData();
+ TD = ((const MipsTargetMachine&) MF.getTarget()).getDataLayout();
Subtarget = &TM.getSubtarget<MipsSubtarget> ();
MCPEs = &MF.getConstantPool()->getConstants();
MJTEs = 0;
@@ -139,7 +138,7 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
do {
DEBUG(errs() << "JITTing function '"
- << MF.getFunction()->getName() << "'\n");
+ << MF.getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
@@ -219,15 +218,9 @@ unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
return getMipsRegisterNumbering(MO.getReg());
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
- else if (MO.isGlobal()) {
- if (MI.getOpcode() == Mips::ULW || MI.getOpcode() == Mips::USW ||
- MI.getOpcode() == Mips::ULH || MI.getOpcode() == Mips::ULHu)
- emitGlobalAddressUnaligned(MO.getGlobal(), getRelocation(MI, MO), 4);
- else if (MI.getOpcode() == Mips::USH)
- emitGlobalAddressUnaligned(MO.getGlobal(), getRelocation(MI, MO), 8);
- else
- emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
- } else if (MO.isSymbol())
+ else if (MO.isGlobal())
+ emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+ else if (MO.isSymbol())
emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
else if (MO.isCPI())
emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
@@ -384,29 +377,8 @@ void MipsCodeEmitter::emitInstruction(const MachineInstr &MI) {
if ((MI.getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo)
return;
-
- switch (MI.getOpcode()) {
- case Mips::USW:
- NumEmitted += emitUSW(MI);
- break;
- case Mips::ULW:
- NumEmitted += emitULW(MI);
- break;
- case Mips::ULH:
- NumEmitted += emitULH(MI);
- break;
- case Mips::ULHu:
- NumEmitted += emitULHu(MI);
- break;
- case Mips::USH:
- NumEmitted += emitUSH(MI);
- break;
-
- default:
- emitWordLE(getBinaryCodeForInstr(MI));
- ++NumEmitted; // Keep track of the # of mi's emitted
- break;
- }
+ emitWordLE(getBinaryCodeForInstr(MI));
+ ++NumEmitted; // Keep track of the # of mi's emitted
MCE.processDebugLoc(MI.getDebugLoc(), false);
}
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
new file mode 100644
index 000000000000..8e01d06596a1
--- /dev/null
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -0,0 +1,309 @@
+//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasDSP : Predicate<"Subtarget.hasDSP()">,
+ AssemblerPredicate<"FeatureDSP">;
+def HasDSPR2 : Predicate<"Subtarget.hasDSPR2()">,
+ AssemblerPredicate<"FeatureDSPR2">;
+
+// Fields.
+class Field6<bits<6> val> {
+ bits<6> V = val;
+}
+
+def SPECIAL3_OPCODE : Field6<0b011111>;
+def REGIMM_OPCODE : Field6<0b000001>;
+
+class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let Predicates = [HasDSP];
+}
+
+class PseudoDSP<dag outs, dag ins, list<dag> pattern>:
+ MipsPseudo<outs, ins, "", pattern> {
+ let Predicates = [HasDSP];
+}
+
+// ADDU.QB sub-class format.
+class ADDU_QB_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010000;
+}
+
+class RADDU_W_QB_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<5> rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = 0;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010000;
+}
+
+// CMPU.EQ.QB sub-class format.
+class CMP_EQ_QB_R2_FMT<bits<5> op> : DSPInst {
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = 0;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010001;
+}
+
+class CMP_EQ_QB_R3_FMT<bits<5> op> : DSPInst {
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> rd;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010001;
+}
+
+class PRECR_SRA_PH_W_FMT<bits<5> op> : DSPInst {
+ bits<5> rs;
+ bits<5> rt;
+ bits<5> sa;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = sa;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010001;
+}
+
+// ABSQ_S.PH sub-class format.
+class ABSQ_S_PH_R2_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = 0;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010010;
+}
+
+
+class REPL_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<10> imm;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-16} = imm;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010010;
+}
+
+// SHLL.QB sub-class format.
+class SHLL_QB_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<5> rt;
+ bits<5> rs_sa;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs_sa;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b010011;
+}
+
+// LX sub-class format.
+class LX_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<5> base;
+ bits<5> index;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = base;
+ let Inst{20-16} = index;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b001010;
+}
+
+// ADDUH.QB sub-class format.
+class ADDUH_QB_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b011000;
+}
+
+// APPEND sub-class format.
+class APPEND_FMT<bits<5> op> : DSPInst {
+ bits<5> rt;
+ bits<5> rs;
+ bits<5> sa;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = sa;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b110001;
+}
+
+// DPA.W.PH sub-class format.
+class DPA_W_PH_FMT<bits<5> op> : DSPInst {
+ bits<2> ac;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b110000;
+}
+
+// MULT sub-class format.
+class MULT_FMT<bits<6> opcode, bits<6> funct> : DSPInst {
+ bits<2> ac;
+ bits<5> rs;
+ bits<5> rt;
+
+ let Opcode = opcode;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = 0;
+ let Inst{5-0} = funct;
+}
+
+// EXTR.W sub-class format (type 1).
+class EXTR_W_TY1_FMT<bits<5> op> : DSPInst {
+ bits<5> rt;
+ bits<2> ac;
+ bits<5> shift_rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = shift_rs;
+ let Inst{20-16} = rt;
+ let Inst{15-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+// SHILO sub-class format.
+class SHILO_R1_FMT<bits<5> op> : DSPInst {
+ bits<2> ac;
+ bits<6> shift;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-20} = shift;
+ let Inst{19-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class SHILO_R2_FMT<bits<5> op> : DSPInst {
+ bits<2> ac;
+ bits<5> rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-13} = 0;
+ let Inst{12-11} = ac;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class RDDSP_FMT<bits<5> op> : DSPInst {
+ bits<5> rd;
+ bits<10> mask;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-16} = mask;
+ let Inst{15-11} = rd;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class WRDSP_FMT<bits<5> op> : DSPInst {
+ bits<5> rs;
+ bits<10> mask;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-11} = mask;
+ let Inst{10-6} = op;
+ let Inst{5-0} = 0b111000;
+}
+
+class BPOSGE32_FMT<bits<5> op> : DSPInst {
+ bits<16> offset;
+
+ let Opcode = REGIMM_OPCODE.V;
+
+ let Inst{25-21} = 0;
+ let Inst{20-16} = op;
+ let Inst{15-0} = offset;
+}
+
+// INSV sub-class format.
+class INSV_FMT<bits<6> op> : DSPInst {
+ bits<5> rt;
+ bits<5> rs;
+
+ let Opcode = SPECIAL3_OPCODE.V;
+
+ let Inst{25-21} = rs;
+ let Inst{20-16} = rt;
+ let Inst{15-6} = 0;
+ let Inst{5-0} = op;
+}
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
new file mode 100644
index 000000000000..ef9402865b0d
--- /dev/null
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -0,0 +1,1319 @@
+//===- MipsDSPInstrInfo.td - DSP ASE instructions -*- tablegen ------------*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips DSP ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// ImmLeaf
+def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
+
+// Mips-specific dsp nodes
+def SDT_MipsExtr : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>]>;
+def SDT_MipsShilo : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def SDT_MipsDPA : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>]>;
+
+class MipsDSPBase<string Opc, SDTypeProfile Prof> :
+ SDNode<!strconcat("MipsISD::", Opc), Prof,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+
+class MipsDSPSideEffectBase<string Opc, SDTypeProfile Prof> :
+ SDNode<!strconcat("MipsISD::", Opc), Prof,
+ [SDNPHasChain, SDNPInGlue, SDNPOutGlue, SDNPSideEffect]>;
+
+def MipsEXTP : MipsDSPSideEffectBase<"EXTP", SDT_MipsExtr>;
+def MipsEXTPDP : MipsDSPSideEffectBase<"EXTPDP", SDT_MipsExtr>;
+def MipsEXTR_S_H : MipsDSPSideEffectBase<"EXTR_S_H", SDT_MipsExtr>;
+def MipsEXTR_W : MipsDSPSideEffectBase<"EXTR_W", SDT_MipsExtr>;
+def MipsEXTR_R_W : MipsDSPSideEffectBase<"EXTR_R_W", SDT_MipsExtr>;
+def MipsEXTR_RS_W : MipsDSPSideEffectBase<"EXTR_RS_W", SDT_MipsExtr>;
+
+def MipsSHILO : MipsDSPBase<"SHILO", SDT_MipsShilo>;
+def MipsMTHLIP : MipsDSPBase<"MTHLIP", SDT_MipsShilo>;
+
+def MipsMULSAQ_S_W_PH : MipsDSPSideEffectBase<"MULSAQ_S_W_PH", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHL : MipsDSPSideEffectBase<"MAQ_S_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_S_W_PHR : MipsDSPSideEffectBase<"MAQ_S_W_PHR", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHL : MipsDSPSideEffectBase<"MAQ_SA_W_PHL", SDT_MipsDPA>;
+def MipsMAQ_SA_W_PHR : MipsDSPSideEffectBase<"MAQ_SA_W_PHR", SDT_MipsDPA>;
+
+def MipsDPAU_H_QBL : MipsDSPBase<"DPAU_H_QBL", SDT_MipsDPA>;
+def MipsDPAU_H_QBR : MipsDSPBase<"DPAU_H_QBR", SDT_MipsDPA>;
+def MipsDPSU_H_QBL : MipsDSPBase<"DPSU_H_QBL", SDT_MipsDPA>;
+def MipsDPSU_H_QBR : MipsDSPBase<"DPSU_H_QBR", SDT_MipsDPA>;
+def MipsDPAQ_S_W_PH : MipsDSPSideEffectBase<"DPAQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQ_S_W_PH : MipsDSPSideEffectBase<"DPSQ_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQ_SA_L_W : MipsDSPSideEffectBase<"DPAQ_SA_L_W", SDT_MipsDPA>;
+def MipsDPSQ_SA_L_W : MipsDSPSideEffectBase<"DPSQ_SA_L_W", SDT_MipsDPA>;
+
+def MipsDPA_W_PH : MipsDSPBase<"DPA_W_PH", SDT_MipsDPA>;
+def MipsDPS_W_PH : MipsDSPBase<"DPS_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_S_W_PH : MipsDSPSideEffectBase<"DPAQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPAQX_SA_W_PH : MipsDSPSideEffectBase<"DPAQX_SA_W_PH", SDT_MipsDPA>;
+def MipsDPAX_W_PH : MipsDSPBase<"DPAX_W_PH", SDT_MipsDPA>;
+def MipsDPSX_W_PH : MipsDSPBase<"DPSX_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_S_W_PH : MipsDSPSideEffectBase<"DPSQX_S_W_PH", SDT_MipsDPA>;
+def MipsDPSQX_SA_W_PH : MipsDSPSideEffectBase<"DPSQX_SA_W_PH", SDT_MipsDPA>;
+def MipsMULSA_W_PH : MipsDSPBase<"MULSA_W_PH", SDT_MipsDPA>;
+
+def MipsMULT : MipsDSPBase<"MULT", SDT_MipsDPA>;
+def MipsMULTU : MipsDSPBase<"MULTU", SDT_MipsDPA>;
+def MipsMADD_DSP : MipsDSPBase<"MADD_DSP", SDT_MipsDPA>;
+def MipsMADDU_DSP : MipsDSPBase<"MADDU_DSP", SDT_MipsDPA>;
+def MipsMSUB_DSP : MipsDSPBase<"MSUB_DSP", SDT_MipsDPA>;
+def MipsMSUBU_DSP : MipsDSPBase<"MSUBU_DSP", SDT_MipsDPA>;
+
+// Flags.
+class IsCommutable {
+ bit isCommutable = 1;
+}
+
+class UseAC {
+ list<Register> Uses = [AC0];
+}
+
+class UseDSPCtrl {
+ list<Register> Uses = [DSPCtrl];
+}
+
+class ClearDefs {
+ list<Register> Defs = [];
+}
+
+// Instruction encoding.
+class ADDU_QB_ENC : ADDU_QB_FMT<0b00000>;
+class ADDU_S_QB_ENC : ADDU_QB_FMT<0b00100>;
+class SUBU_QB_ENC : ADDU_QB_FMT<0b00001>;
+class SUBU_S_QB_ENC : ADDU_QB_FMT<0b00101>;
+class ADDQ_PH_ENC : ADDU_QB_FMT<0b01010>;
+class ADDQ_S_PH_ENC : ADDU_QB_FMT<0b01110>;
+class SUBQ_PH_ENC : ADDU_QB_FMT<0b01011>;
+class SUBQ_S_PH_ENC : ADDU_QB_FMT<0b01111>;
+class ADDQ_S_W_ENC : ADDU_QB_FMT<0b10110>;
+class SUBQ_S_W_ENC : ADDU_QB_FMT<0b10111>;
+class ADDSC_ENC : ADDU_QB_FMT<0b10000>;
+class ADDWC_ENC : ADDU_QB_FMT<0b10001>;
+class MODSUB_ENC : ADDU_QB_FMT<0b10010>;
+class RADDU_W_QB_ENC : RADDU_W_QB_FMT<0b10100>;
+class ABSQ_S_PH_ENC : ABSQ_S_PH_R2_FMT<0b01001>;
+class ABSQ_S_W_ENC : ABSQ_S_PH_R2_FMT<0b10001>;
+class PRECRQ_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01100>;
+class PRECRQ_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10100>;
+class PRECRQ_RS_PH_W_ENC : CMP_EQ_QB_R3_FMT<0b10101>;
+class PRECRQU_S_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01111>;
+class PRECEQ_W_PHL_ENC : ABSQ_S_PH_R2_FMT<0b01100>;
+class PRECEQ_W_PHR_ENC : ABSQ_S_PH_R2_FMT<0b01101>;
+class PRECEQU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b00100>;
+class PRECEQU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b00101>;
+class PRECEQU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b00110>;
+class PRECEQU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b00111>;
+class PRECEU_PH_QBL_ENC : ABSQ_S_PH_R2_FMT<0b11100>;
+class PRECEU_PH_QBR_ENC : ABSQ_S_PH_R2_FMT<0b11101>;
+class PRECEU_PH_QBLA_ENC : ABSQ_S_PH_R2_FMT<0b11110>;
+class PRECEU_PH_QBRA_ENC : ABSQ_S_PH_R2_FMT<0b11111>;
+class SHLL_QB_ENC : SHLL_QB_FMT<0b00000>;
+class SHLLV_QB_ENC : SHLL_QB_FMT<0b00010>;
+class SHRL_QB_ENC : SHLL_QB_FMT<0b00001>;
+class SHRLV_QB_ENC : SHLL_QB_FMT<0b00011>;
+class SHLL_PH_ENC : SHLL_QB_FMT<0b01000>;
+class SHLLV_PH_ENC : SHLL_QB_FMT<0b01010>;
+class SHLL_S_PH_ENC : SHLL_QB_FMT<0b01100>;
+class SHLLV_S_PH_ENC : SHLL_QB_FMT<0b01110>;
+class SHRA_PH_ENC : SHLL_QB_FMT<0b01001>;
+class SHRAV_PH_ENC : SHLL_QB_FMT<0b01011>;
+class SHRA_R_PH_ENC : SHLL_QB_FMT<0b01101>;
+class SHRAV_R_PH_ENC : SHLL_QB_FMT<0b01111>;
+class SHLL_S_W_ENC : SHLL_QB_FMT<0b10100>;
+class SHLLV_S_W_ENC : SHLL_QB_FMT<0b10110>;
+class SHRA_R_W_ENC : SHLL_QB_FMT<0b10101>;
+class SHRAV_R_W_ENC : SHLL_QB_FMT<0b10111>;
+class MULEU_S_PH_QBL_ENC : ADDU_QB_FMT<0b00110>;
+class MULEU_S_PH_QBR_ENC : ADDU_QB_FMT<0b00111>;
+class MULEQ_S_W_PHL_ENC : ADDU_QB_FMT<0b11100>;
+class MULEQ_S_W_PHR_ENC : ADDU_QB_FMT<0b11101>;
+class MULQ_RS_PH_ENC : ADDU_QB_FMT<0b11111>;
+class MULSAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00110>;
+class MAQ_S_W_PHL_ENC : DPA_W_PH_FMT<0b10100>;
+class MAQ_S_W_PHR_ENC : DPA_W_PH_FMT<0b10110>;
+class MAQ_SA_W_PHL_ENC : DPA_W_PH_FMT<0b10000>;
+class MAQ_SA_W_PHR_ENC : DPA_W_PH_FMT<0b10010>;
+class DPAU_H_QBL_ENC : DPA_W_PH_FMT<0b00011>;
+class DPAU_H_QBR_ENC : DPA_W_PH_FMT<0b00111>;
+class DPSU_H_QBL_ENC : DPA_W_PH_FMT<0b01011>;
+class DPSU_H_QBR_ENC : DPA_W_PH_FMT<0b01111>;
+class DPAQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00100>;
+class DPSQ_S_W_PH_ENC : DPA_W_PH_FMT<0b00101>;
+class DPAQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01100>;
+class DPSQ_SA_L_W_ENC : DPA_W_PH_FMT<0b01101>;
+class MULT_DSP_ENC : MULT_FMT<0b000000, 0b011000>;
+class MULTU_DSP_ENC : MULT_FMT<0b000000, 0b011001>;
+class MADD_DSP_ENC : MULT_FMT<0b011100, 0b000000>;
+class MADDU_DSP_ENC : MULT_FMT<0b011100, 0b000001>;
+class MSUB_DSP_ENC : MULT_FMT<0b011100, 0b000100>;
+class MSUBU_DSP_ENC : MULT_FMT<0b011100, 0b000101>;
+class CMPU_EQ_QB_ENC : CMP_EQ_QB_R2_FMT<0b00000>;
+class CMPU_LT_QB_ENC : CMP_EQ_QB_R2_FMT<0b00001>;
+class CMPU_LE_QB_ENC : CMP_EQ_QB_R2_FMT<0b00010>;
+class CMPGU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b00100>;
+class CMPGU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b00101>;
+class CMPGU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b00110>;
+class CMP_EQ_PH_ENC : CMP_EQ_QB_R2_FMT<0b01000>;
+class CMP_LT_PH_ENC : CMP_EQ_QB_R2_FMT<0b01001>;
+class CMP_LE_PH_ENC : CMP_EQ_QB_R2_FMT<0b01010>;
+class BITREV_ENC : ABSQ_S_PH_R2_FMT<0b11011>;
+class PACKRL_PH_ENC : CMP_EQ_QB_R3_FMT<0b01110>;
+class REPL_QB_ENC : REPL_FMT<0b00010>;
+class REPL_PH_ENC : REPL_FMT<0b01010>;
+class REPLV_QB_ENC : ABSQ_S_PH_R2_FMT<0b00011>;
+class REPLV_PH_ENC : ABSQ_S_PH_R2_FMT<0b01011>;
+class PICK_QB_ENC : CMP_EQ_QB_R3_FMT<0b00011>;
+class PICK_PH_ENC : CMP_EQ_QB_R3_FMT<0b01011>;
+class LWX_ENC : LX_FMT<0b00000>;
+class LHX_ENC : LX_FMT<0b00100>;
+class LBUX_ENC : LX_FMT<0b00110>;
+class BPOSGE32_ENC : BPOSGE32_FMT<0b11100>;
+class INSV_ENC : INSV_FMT<0b001100>;
+
+class EXTP_ENC : EXTR_W_TY1_FMT<0b00010>;
+class EXTPV_ENC : EXTR_W_TY1_FMT<0b00011>;
+class EXTPDP_ENC : EXTR_W_TY1_FMT<0b01010>;
+class EXTPDPV_ENC : EXTR_W_TY1_FMT<0b01011>;
+class EXTR_W_ENC : EXTR_W_TY1_FMT<0b00000>;
+class EXTRV_W_ENC : EXTR_W_TY1_FMT<0b00001>;
+class EXTR_R_W_ENC : EXTR_W_TY1_FMT<0b00100>;
+class EXTRV_R_W_ENC : EXTR_W_TY1_FMT<0b00101>;
+class EXTR_RS_W_ENC : EXTR_W_TY1_FMT<0b00110>;
+class EXTRV_RS_W_ENC : EXTR_W_TY1_FMT<0b00111>;
+class EXTR_S_H_ENC : EXTR_W_TY1_FMT<0b01110>;
+class EXTRV_S_H_ENC : EXTR_W_TY1_FMT<0b01111>;
+class SHILO_ENC : SHILO_R1_FMT<0b11010>;
+class SHILOV_ENC : SHILO_R2_FMT<0b11011>;
+class MTHLIP_ENC : SHILO_R2_FMT<0b11111>;
+
+class RDDSP_ENC : RDDSP_FMT<0b10010>;
+class WRDSP_ENC : WRDSP_FMT<0b10011>;
+class ADDU_PH_ENC : ADDU_QB_FMT<0b01000>;
+class ADDU_S_PH_ENC : ADDU_QB_FMT<0b01100>;
+class SUBU_PH_ENC : ADDU_QB_FMT<0b01001>;
+class SUBU_S_PH_ENC : ADDU_QB_FMT<0b01101>;
+class CMPGDU_EQ_QB_ENC : CMP_EQ_QB_R3_FMT<0b11000>;
+class CMPGDU_LT_QB_ENC : CMP_EQ_QB_R3_FMT<0b11001>;
+class CMPGDU_LE_QB_ENC : CMP_EQ_QB_R3_FMT<0b11010>;
+class ABSQ_S_QB_ENC : ABSQ_S_PH_R2_FMT<0b00001>;
+class ADDUH_QB_ENC : ADDUH_QB_FMT<0b00000>;
+class ADDUH_R_QB_ENC : ADDUH_QB_FMT<0b00010>;
+class SUBUH_QB_ENC : ADDUH_QB_FMT<0b00001>;
+class SUBUH_R_QB_ENC : ADDUH_QB_FMT<0b00011>;
+class ADDQH_PH_ENC : ADDUH_QB_FMT<0b01000>;
+class ADDQH_R_PH_ENC : ADDUH_QB_FMT<0b01010>;
+class SUBQH_PH_ENC : ADDUH_QB_FMT<0b01001>;
+class SUBQH_R_PH_ENC : ADDUH_QB_FMT<0b01011>;
+class ADDQH_W_ENC : ADDUH_QB_FMT<0b10000>;
+class ADDQH_R_W_ENC : ADDUH_QB_FMT<0b10010>;
+class SUBQH_W_ENC : ADDUH_QB_FMT<0b10001>;
+class SUBQH_R_W_ENC : ADDUH_QB_FMT<0b10011>;
+class MUL_PH_ENC : ADDUH_QB_FMT<0b01100>;
+class MUL_S_PH_ENC : ADDUH_QB_FMT<0b01110>;
+class MULQ_S_W_ENC : ADDUH_QB_FMT<0b10110>;
+class MULQ_RS_W_ENC : ADDUH_QB_FMT<0b10111>;
+class MULQ_S_PH_ENC : ADDU_QB_FMT<0b11110>;
+class DPA_W_PH_ENC : DPA_W_PH_FMT<0b00000>;
+class DPS_W_PH_ENC : DPA_W_PH_FMT<0b00001>;
+class DPAQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11000>;
+class DPAQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11010>;
+class DPAX_W_PH_ENC : DPA_W_PH_FMT<0b01000>;
+class DPSX_W_PH_ENC : DPA_W_PH_FMT<0b01001>;
+class DPSQX_S_W_PH_ENC : DPA_W_PH_FMT<0b11001>;
+class DPSQX_SA_W_PH_ENC : DPA_W_PH_FMT<0b11011>;
+class MULSA_W_PH_ENC : DPA_W_PH_FMT<0b00010>;
+class PRECR_QB_PH_ENC : CMP_EQ_QB_R3_FMT<0b01101>;
+class PRECR_SRA_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11110>;
+class PRECR_SRA_R_PH_W_ENC : PRECR_SRA_PH_W_FMT<0b11111>;
+class SHRA_QB_ENC : SHLL_QB_FMT<0b00100>;
+class SHRAV_QB_ENC : SHLL_QB_FMT<0b00110>;
+class SHRA_R_QB_ENC : SHLL_QB_FMT<0b00101>;
+class SHRAV_R_QB_ENC : SHLL_QB_FMT<0b00111>;
+class SHRL_PH_ENC : SHLL_QB_FMT<0b11001>;
+class SHRLV_PH_ENC : SHLL_QB_FMT<0b11011>;
+class APPEND_ENC : APPEND_FMT<0b00000>;
+class BALIGN_ENC : APPEND_FMT<0b10000>;
+class PREPEND_ENC : APPEND_FMT<0b00001>;
+
+// Instruction desc.
+class ADDU_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCD,
+ RegisterClass RCS, RegisterClass RCT = RCS> {
+ dag OutOperandList = (outs RCD:$rd);
+ dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class RADDU_W_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCD,
+ RegisterClass RCS = RCD> {
+ dag OutOperandList = (outs RCD:$rd);
+ dag InOperandList = (ins RCS:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs");
+ list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class CMP_EQ_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCS,
+ RegisterClass RCT = RCS> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $rt");
+ list<dag> Pattern = [(OpNode RCS:$rs, RCT:$rt)];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class CMP_EQ_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCD,
+ RegisterClass RCS, RegisterClass RCT = RCS> {
+ dag OutOperandList = (outs RCD:$rd);
+ dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class PRECR_SRA_PH_W_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCT,
+ RegisterClass RCS = RCT> {
+ dag OutOperandList = (outs RCT:$rt);
+ dag InOperandList = (ins RCS:$rs, shamt:$sa, RCS:$src);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+ list<dag> Pattern = [(set RCT:$rt, (OpNode RCS:$src, RCS:$rs, immZExt5:$sa))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+ string Constraints = "$src = $rt";
+}
+
+class ABSQ_S_PH_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCD,
+ RegisterClass RCT = RCD> {
+ dag OutOperandList = (outs RCD:$rd);
+ dag InOperandList = (ins RCT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+ list<dag> Pattern = [(set RCD:$rd, (OpNode RCT:$rt))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class REPL_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ ImmLeaf immPat, InstrItinClass itin, RegisterClass RC> {
+ dag OutOperandList = (outs RC:$rd);
+ dag InOperandList = (ins uimm16:$imm);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $imm");
+ list<dag> Pattern = [(set RC:$rd, (OpNode immPat:$imm))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class SHLL_QB_R3_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RC> {
+ dag OutOperandList = (outs RC:$rd);
+ dag InOperandList = (ins RC:$rt, CPURegs:$rs_sa);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+ list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, CPURegs:$rs_sa))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class SHLL_QB_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SDPatternOperator ImmPat, InstrItinClass itin,
+ RegisterClass RC> {
+ dag OutOperandList = (outs RC:$rd);
+ dag InOperandList = (ins RC:$rt, uimm16:$rs_sa);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rt, $rs_sa");
+ list<dag> Pattern = [(set RC:$rd, (OpNode RC:$rt, ImmPat:$rs_sa))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class LX_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs CPURegs:$rd);
+ dag InOperandList = (ins CPURegs:$base, CPURegs:$index);
+ string AsmString = !strconcat(instr_asm, "\t$rd, ${index}(${base})");
+ list<dag> Pattern = [(set CPURegs:$rd,
+ (OpNode CPURegs:$base, CPURegs:$index))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+ bit mayLoad = 1;
+}
+
+class ADDUH_QB_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin, RegisterClass RCD,
+ RegisterClass RCS = RCD, RegisterClass RCT = RCD> {
+ dag OutOperandList = (outs RCD:$rd);
+ dag InOperandList = (ins RCS:$rs, RCT:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+ list<dag> Pattern = [(set RCD:$rd, (OpNode RCS:$rs, RCT:$rt))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class APPEND_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ SDPatternOperator ImmOp, InstrItinClass itin> {
+ dag OutOperandList = (outs CPURegs:$rt);
+ dag InOperandList = (ins CPURegs:$rs, shamt:$sa, CPURegs:$src);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs, $sa");
+ list<dag> Pattern = [(set CPURegs:$rt,
+ (OpNode CPURegs:$src, CPURegs:$rs, ImmOp:$sa))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+ string Constraints = "$src = $rt";
+}
+
+class EXTR_W_TY1_R2_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs CPURegs:$rt);
+ dag InOperandList = (ins ACRegs:$ac, CPURegs:$shift_rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class EXTR_W_TY1_R1_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs CPURegs:$rt);
+ dag InOperandList = (ins ACRegs:$ac, uimm16:$shift_rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $ac, $shift_rs");
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class SHILO_R1_PSEUDO_BASE<SDPatternOperator OpNode, InstrItinClass itin,
+ Instruction realinst> :
+ PseudoDSP<(outs), (ins simm16:$shift), [(OpNode immSExt6:$shift)]>,
+ PseudoInstExpansion<(realinst AC0, simm16:$shift)> {
+ list<Register> Defs = [DSPCtrl, AC0];
+ list<Register> Uses = [AC0];
+ InstrItinClass Itinerary = itin;
+}
+
+class SHILO_R1_DESC_BASE<string instr_asm> {
+ dag OutOperandList = (outs ACRegs:$ac);
+ dag InOperandList = (ins simm16:$shift);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $shift");
+}
+
+class SHILO_R2_PSEUDO_BASE<SDPatternOperator OpNode, InstrItinClass itin,
+ Instruction realinst> :
+ PseudoDSP<(outs), (ins CPURegs:$rs), [(OpNode CPURegs:$rs)]>,
+ PseudoInstExpansion<(realinst AC0, CPURegs:$rs)> {
+ list<Register> Defs = [DSPCtrl, AC0];
+ list<Register> Uses = [AC0];
+ InstrItinClass Itinerary = itin;
+}
+
+class SHILO_R2_DESC_BASE<string instr_asm> {
+ dag OutOperandList = (outs ACRegs:$ac);
+ dag InOperandList = (ins CPURegs:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs");
+}
+
+class MTHLIP_DESC_BASE<string instr_asm> {
+ dag OutOperandList = (outs ACRegs:$ac);
+ dag InOperandList = (ins CPURegs:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $ac");
+}
+
+class RDDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs CPURegs:$rd);
+ dag InOperandList = (ins uimm16:$mask);
+ string AsmString = !strconcat(instr_asm, "\t$rd, $mask");
+ list<dag> Pattern = [(set CPURegs:$rd, (OpNode immZExt10:$mask))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Uses = [DSPCtrl];
+}
+
+class WRDSP_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins CPURegs:$rs, uimm16:$mask);
+ string AsmString = !strconcat(instr_asm, "\t$rs, $mask");
+ list<dag> Pattern = [(OpNode CPURegs:$rs, immZExt10:$mask)];
+ InstrItinClass Itinerary = itin;
+ list<Register> Defs = [DSPCtrl];
+}
+
+class DPA_W_PH_PSEUDO_BASE<SDPatternOperator OpNode, InstrItinClass itin,
+ Instruction realinst> :
+ PseudoDSP<(outs), (ins CPURegs:$rs, CPURegs:$rt),
+ [(OpNode CPURegs:$rs, CPURegs:$rt)]>,
+ PseudoInstExpansion<(realinst AC0, CPURegs:$rs, CPURegs:$rt)> {
+ list<Register> Defs = [DSPCtrl, AC0];
+ list<Register> Uses = [AC0];
+ InstrItinClass Itinerary = itin;
+}
+
+class DPA_W_PH_DESC_BASE<string instr_asm> {
+ dag OutOperandList = (outs ACRegs:$ac);
+ dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+}
+
+class MULT_PSEUDO_BASE<SDPatternOperator OpNode, InstrItinClass itin,
+ Instruction realinst> :
+ PseudoDSP<(outs), (ins CPURegs:$rs, CPURegs:$rt),
+ [(OpNode CPURegs:$rs, CPURegs:$rt)]>,
+ PseudoInstExpansion<(realinst AC0, CPURegs:$rs, CPURegs:$rt)> {
+ list<Register> Defs = [DSPCtrl, AC0];
+ InstrItinClass Itinerary = itin;
+}
+
+class MULT_DESC_BASE<string instr_asm> {
+ dag OutOperandList = (outs ACRegs:$ac);
+ dag InOperandList = (ins CPURegs:$rs, CPURegs:$rt);
+ string AsmString = !strconcat(instr_asm, "\t$ac, $rs, $rt");
+}
+
+class BPOSGE32_PSEUDO_DESC_BASE<SDPatternOperator OpNode, InstrItinClass itin> :
+ MipsPseudo<(outs CPURegs:$dst), (ins), "", [(set CPURegs:$dst, (OpNode))]> {
+ list<Register> Uses = [DSPCtrl];
+ bit usesCustomInserter = 1;
+}
+
+class BPOSGE32_DESC_BASE<string instr_asm, InstrItinClass itin> {
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins brtarget:$offset);
+ string AsmString = !strconcat(instr_asm, "\t$offset");
+ InstrItinClass Itinerary = itin;
+ list<Register> Uses = [DSPCtrl];
+ bit isBranch = 1;
+ bit isTerminator = 1;
+ bit hasDelaySlot = 1;
+}
+
+class INSV_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
+ InstrItinClass itin> {
+ dag OutOperandList = (outs CPURegs:$rt);
+ dag InOperandList = (ins CPURegs:$src, CPURegs:$rs);
+ string AsmString = !strconcat(instr_asm, "\t$rt, $rs");
+ list<dag> Pattern = [(set CPURegs:$rt, (OpNode CPURegs:$src, CPURegs:$rs))];
+ InstrItinClass Itinerary = itin;
+ list<Register> Uses = [DSPCtrl];
+ string Constraints = "$src = $rt";
+}
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 1
+//===----------------------------------------------------------------------===//
+
+// Addition/subtraction
+class ADDU_QB_DESC : ADDU_QB_DESC_BASE<"addu.qb", int_mips_addu_qb, NoItinerary,
+ DSPRegs, DSPRegs>, IsCommutable;
+
+class ADDU_S_QB_DESC : ADDU_QB_DESC_BASE<"addu_s.qb", int_mips_addu_s_qb,
+ NoItinerary, DSPRegs, DSPRegs>,
+ IsCommutable;
+
+class SUBU_QB_DESC : ADDU_QB_DESC_BASE<"subu.qb", int_mips_subu_qb, NoItinerary,
+ DSPRegs, DSPRegs>;
+
+class SUBU_S_QB_DESC : ADDU_QB_DESC_BASE<"subu_s.qb", int_mips_subu_s_qb,
+ NoItinerary, DSPRegs, DSPRegs>;
+
+class ADDQ_PH_DESC : ADDU_QB_DESC_BASE<"addq.ph", int_mips_addq_ph, NoItinerary,
+ DSPRegs, DSPRegs>, IsCommutable;
+
+class ADDQ_S_PH_DESC : ADDU_QB_DESC_BASE<"addq_s.ph", int_mips_addq_s_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ IsCommutable;
+
+class SUBQ_PH_DESC : ADDU_QB_DESC_BASE<"subq.ph", int_mips_subq_ph, NoItinerary,
+ DSPRegs, DSPRegs>;
+
+class SUBQ_S_PH_DESC : ADDU_QB_DESC_BASE<"subq_s.ph", int_mips_subq_s_ph,
+ NoItinerary, DSPRegs, DSPRegs>;
+
+class ADDQ_S_W_DESC : ADDU_QB_DESC_BASE<"addq_s.w", int_mips_addq_s_w,
+ NoItinerary, CPURegs, CPURegs>,
+ IsCommutable;
+
+class SUBQ_S_W_DESC : ADDU_QB_DESC_BASE<"subq_s.w", int_mips_subq_s_w,
+ NoItinerary, CPURegs, CPURegs>;
+
+class ADDSC_DESC : ADDU_QB_DESC_BASE<"addsc", int_mips_addsc, NoItinerary,
+ CPURegs, CPURegs>, IsCommutable;
+
+class ADDWC_DESC : ADDU_QB_DESC_BASE<"addwc", int_mips_addwc, NoItinerary,
+ CPURegs, CPURegs>,
+ IsCommutable, UseDSPCtrl;
+
+class MODSUB_DESC : ADDU_QB_DESC_BASE<"modsub", int_mips_modsub, NoItinerary,
+ CPURegs, CPURegs>, ClearDefs;
+
+class RADDU_W_QB_DESC : RADDU_W_QB_DESC_BASE<"raddu.w.qb", int_mips_raddu_w_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ ClearDefs;
+
+// Absolute value
+class ABSQ_S_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.ph", int_mips_absq_s_ph,
+ NoItinerary, DSPRegs>;
+
+class ABSQ_S_W_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.w", int_mips_absq_s_w,
+ NoItinerary, CPURegs>;
+
+// Precision reduce/expand
+class PRECRQ_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.qb.ph",
+ int_mips_precrq_qb_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ ClearDefs;
+
+class PRECRQ_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq.ph.w",
+ int_mips_precrq_ph_w,
+ NoItinerary, DSPRegs, CPURegs>,
+ ClearDefs;
+
+class PRECRQ_RS_PH_W_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrq_rs.ph.w",
+ int_mips_precrq_rs_ph_w,
+ NoItinerary, DSPRegs,
+ CPURegs>;
+
+class PRECRQU_S_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precrqu_s.qb.ph",
+ int_mips_precrqu_s_qb_ph,
+ NoItinerary, DSPRegs,
+ DSPRegs>;
+
+class PRECEQ_W_PHL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phl",
+ int_mips_preceq_w_phl,
+ NoItinerary, CPURegs, DSPRegs>,
+ ClearDefs;
+
+class PRECEQ_W_PHR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceq.w.phr",
+ int_mips_preceq_w_phr,
+ NoItinerary, CPURegs, DSPRegs>,
+ ClearDefs;
+
+class PRECEQU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbl",
+ int_mips_precequ_ph_qbl,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEQU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbr",
+ int_mips_precequ_ph_qbr,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEQU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbla",
+ int_mips_precequ_ph_qbla,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEQU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"precequ.ph.qbra",
+ int_mips_precequ_ph_qbra,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEU_PH_QBL_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbl",
+ int_mips_preceu_ph_qbl,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEU_PH_QBR_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbr",
+ int_mips_preceu_ph_qbr,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEU_PH_QBLA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbla",
+ int_mips_preceu_ph_qbla,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class PRECEU_PH_QBRA_DESC : ABSQ_S_PH_R2_DESC_BASE<"preceu.ph.qbra",
+ int_mips_preceu_ph_qbra,
+ NoItinerary, DSPRegs>,
+ ClearDefs;
+
+// Shift
+class SHLL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shll.qb", int_mips_shll_qb, immZExt3,
+ NoItinerary, DSPRegs>;
+
+class SHLLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shllv.qb", int_mips_shll_qb,
+ NoItinerary, DSPRegs>;
+
+class SHRL_QB_DESC : SHLL_QB_R2_DESC_BASE<"shrl.qb", int_mips_shrl_qb, immZExt3,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRLV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.qb", int_mips_shrl_qb,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHLL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll.ph", int_mips_shll_ph, immZExt4,
+ NoItinerary, DSPRegs>;
+
+class SHLLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv.ph", int_mips_shll_ph,
+ NoItinerary, DSPRegs>;
+
+class SHLL_S_PH_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.ph", int_mips_shll_s_ph,
+ immZExt4, NoItinerary, DSPRegs>;
+
+class SHLLV_S_PH_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.ph", int_mips_shll_s_ph,
+ NoItinerary, DSPRegs>;
+
+class SHRA_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra.ph", int_mips_shra_ph, immZExt4,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRAV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav.ph", int_mips_shra_ph,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRA_R_PH_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.ph", int_mips_shra_r_ph,
+ immZExt4, NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class SHRAV_R_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.ph", int_mips_shra_r_ph,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHLL_S_W_DESC : SHLL_QB_R2_DESC_BASE<"shll_s.w", int_mips_shll_s_w,
+ immZExt5, NoItinerary, CPURegs>;
+
+class SHLLV_S_W_DESC : SHLL_QB_R3_DESC_BASE<"shllv_s.w", int_mips_shll_s_w,
+ NoItinerary, CPURegs>;
+
+class SHRA_R_W_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.w", int_mips_shra_r_w,
+ immZExt5, NoItinerary, CPURegs>,
+ ClearDefs;
+
+class SHRAV_R_W_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.w", int_mips_shra_r_w,
+ NoItinerary, CPURegs>;
+
+// Multiplication
+class MULEU_S_PH_QBL_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbl",
+ int_mips_muleu_s_ph_qbl,
+ NoItinerary, DSPRegs, DSPRegs>;
+
+class MULEU_S_PH_QBR_DESC : ADDU_QB_DESC_BASE<"muleu_s.ph.qbr",
+ int_mips_muleu_s_ph_qbr,
+ NoItinerary, DSPRegs, DSPRegs>;
+
+class MULEQ_S_W_PHL_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phl",
+ int_mips_muleq_s_w_phl,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class MULEQ_S_W_PHR_DESC : ADDU_QB_DESC_BASE<"muleq_s.w.phr",
+ int_mips_muleq_s_w_phr,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class MULQ_RS_PH_DESC : ADDU_QB_DESC_BASE<"mulq_rs.ph", int_mips_mulq_rs_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ IsCommutable;
+
+class MULSAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsaq_s.w.ph">;
+
+class MAQ_S_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phl">;
+
+class MAQ_S_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_s.w.phr">;
+
+class MAQ_SA_W_PHL_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phl">;
+
+class MAQ_SA_W_PHR_DESC : DPA_W_PH_DESC_BASE<"maq_sa.w.phr">;
+
+// Dot product with accumulate/subtract
+class DPAU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbl">;
+
+class DPAU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpau.h.qbr">;
+
+class DPSU_H_QBL_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbl">;
+
+class DPSU_H_QBR_DESC : DPA_W_PH_DESC_BASE<"dpsu.h.qbr">;
+
+class DPAQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaq_s.w.ph">;
+
+class DPSQ_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsq_s.w.ph">;
+
+class DPAQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpaq_sa.l.w">;
+
+class DPSQ_SA_L_W_DESC : DPA_W_PH_DESC_BASE<"dpsq_sa.l.w">;
+
+class MULT_DSP_DESC : MULT_DESC_BASE<"mult">;
+
+class MULTU_DSP_DESC : MULT_DESC_BASE<"multu">;
+
+class MADD_DSP_DESC : MULT_DESC_BASE<"madd">;
+
+class MADDU_DSP_DESC : MULT_DESC_BASE<"maddu">;
+
+class MSUB_DSP_DESC : MULT_DESC_BASE<"msub">;
+
+class MSUBU_DSP_DESC : MULT_DESC_BASE<"msubu">;
+
+// Comparison
+class CMPU_EQ_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.eq.qb",
+ int_mips_cmpu_eq_qb, NoItinerary,
+ DSPRegs>, IsCommutable;
+
+class CMPU_LT_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.lt.qb",
+ int_mips_cmpu_lt_qb, NoItinerary,
+ DSPRegs>, IsCommutable;
+
+class CMPU_LE_QB_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmpu.le.qb",
+ int_mips_cmpu_le_qb, NoItinerary,
+ DSPRegs>, IsCommutable;
+
+class CMPGU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.eq.qb",
+ int_mips_cmpgu_eq_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class CMPGU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.lt.qb",
+ int_mips_cmpgu_lt_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class CMPGU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgu.le.qb",
+ int_mips_cmpgu_le_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class CMP_EQ_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.eq.ph", int_mips_cmp_eq_ph,
+ NoItinerary, DSPRegs>,
+ IsCommutable;
+
+class CMP_LT_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.lt.ph", int_mips_cmp_lt_ph,
+ NoItinerary, DSPRegs>,
+ IsCommutable;
+
+class CMP_LE_PH_DESC : CMP_EQ_QB_R2_DESC_BASE<"cmp.le.ph", int_mips_cmp_le_ph,
+ NoItinerary, DSPRegs>,
+ IsCommutable;
+
+// Misc
+class BITREV_DESC : ABSQ_S_PH_R2_DESC_BASE<"bitrev", int_mips_bitrev,
+ NoItinerary, CPURegs>, ClearDefs;
+
+class PACKRL_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"packrl.ph", int_mips_packrl_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ ClearDefs;
+
+class REPL_QB_DESC : REPL_DESC_BASE<"repl.qb", int_mips_repl_qb, immZExt8,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class REPL_PH_DESC : REPL_DESC_BASE<"repl.ph", int_mips_repl_ph, immZExt10,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class REPLV_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.qb", int_mips_repl_qb,
+ NoItinerary, DSPRegs, CPURegs>,
+ ClearDefs;
+
+class REPLV_PH_DESC : ABSQ_S_PH_R2_DESC_BASE<"replv.ph", int_mips_repl_ph,
+ NoItinerary, DSPRegs, CPURegs>,
+ ClearDefs;
+
+class PICK_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.qb", int_mips_pick_qb,
+ NoItinerary, DSPRegs, DSPRegs>,
+ ClearDefs, UseDSPCtrl;
+
+class PICK_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"pick.ph", int_mips_pick_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ ClearDefs, UseDSPCtrl;
+
+class LWX_DESC : LX_DESC_BASE<"lwx", int_mips_lwx, NoItinerary>, ClearDefs;
+
+class LHX_DESC : LX_DESC_BASE<"lhx", int_mips_lhx, NoItinerary>, ClearDefs;
+
+class LBUX_DESC : LX_DESC_BASE<"lbux", int_mips_lbux, NoItinerary>, ClearDefs;
+
+class BPOSGE32_DESC : BPOSGE32_DESC_BASE<"bposge32", NoItinerary>;
+
+// Extr
+class EXTP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extp", MipsEXTP, NoItinerary>;
+
+class EXTPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpv", MipsEXTP, NoItinerary>;
+
+class EXTPDP_DESC : EXTR_W_TY1_R1_DESC_BASE<"extpdp", MipsEXTPDP, NoItinerary>;
+
+class EXTPDPV_DESC : EXTR_W_TY1_R2_DESC_BASE<"extpdpv", MipsEXTPDP,
+ NoItinerary>;
+
+class EXTR_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr.w", MipsEXTR_W, NoItinerary>;
+
+class EXTRV_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv.w", MipsEXTR_W,
+ NoItinerary>;
+
+class EXTR_R_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_r.w", MipsEXTR_R_W,
+ NoItinerary>;
+
+class EXTRV_R_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_r.w", MipsEXTR_R_W,
+ NoItinerary>;
+
+class EXTR_RS_W_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_rs.w", MipsEXTR_RS_W,
+ NoItinerary>;
+
+class EXTRV_RS_W_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_rs.w", MipsEXTR_RS_W,
+ NoItinerary>;
+
+class EXTR_S_H_DESC : EXTR_W_TY1_R1_DESC_BASE<"extr_s.h", MipsEXTR_S_H,
+ NoItinerary>;
+
+class EXTRV_S_H_DESC : EXTR_W_TY1_R2_DESC_BASE<"extrv_s.h", MipsEXTR_S_H,
+ NoItinerary>;
+
+class SHILO_DESC : SHILO_R1_DESC_BASE<"shilo">;
+
+class SHILOV_DESC : SHILO_R2_DESC_BASE<"shilov">;
+
+class MTHLIP_DESC : MTHLIP_DESC_BASE<"mthlip">;
+
+class RDDSP_DESC : RDDSP_DESC_BASE<"rddsp", int_mips_rddsp, NoItinerary>;
+
+class WRDSP_DESC : WRDSP_DESC_BASE<"wrdsp", int_mips_wrdsp, NoItinerary>;
+
+class INSV_DESC : INSV_DESC_BASE<"insv", int_mips_insv, NoItinerary>;
+
+//===----------------------------------------------------------------------===//
+// MIPS DSP Rev 2
+// Addition/subtraction
+class ADDU_PH_DESC : ADDU_QB_DESC_BASE<"addu.ph", int_mips_addu_ph, NoItinerary,
+ DSPRegs, DSPRegs>, IsCommutable;
+
+class ADDU_S_PH_DESC : ADDU_QB_DESC_BASE<"addu_s.ph", int_mips_addu_s_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ IsCommutable;
+
+class SUBU_PH_DESC : ADDU_QB_DESC_BASE<"subu.ph", int_mips_subu_ph, NoItinerary,
+ DSPRegs, DSPRegs>;
+
+class SUBU_S_PH_DESC : ADDU_QB_DESC_BASE<"subu_s.ph", int_mips_subu_s_ph,
+ NoItinerary, DSPRegs, DSPRegs>;
+
+class ADDUH_QB_DESC : ADDUH_QB_DESC_BASE<"adduh.qb", int_mips_adduh_qb,
+ NoItinerary, DSPRegs>,
+ ClearDefs, IsCommutable;
+
+class ADDUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"adduh_r.qb", int_mips_adduh_r_qb,
+ NoItinerary, DSPRegs>,
+ ClearDefs, IsCommutable;
+
+class SUBUH_QB_DESC : ADDUH_QB_DESC_BASE<"subuh.qb", int_mips_subuh_qb,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SUBUH_R_QB_DESC : ADDUH_QB_DESC_BASE<"subuh_r.qb", int_mips_subuh_r_qb,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class ADDQH_PH_DESC : ADDUH_QB_DESC_BASE<"addqh.ph", int_mips_addqh_ph,
+ NoItinerary, DSPRegs>,
+ ClearDefs, IsCommutable;
+
+class ADDQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"addqh_r.ph", int_mips_addqh_r_ph,
+ NoItinerary, DSPRegs>,
+ ClearDefs, IsCommutable;
+
+class SUBQH_PH_DESC : ADDUH_QB_DESC_BASE<"subqh.ph", int_mips_subqh_ph,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SUBQH_R_PH_DESC : ADDUH_QB_DESC_BASE<"subqh_r.ph", int_mips_subqh_r_ph,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class ADDQH_W_DESC : ADDUH_QB_DESC_BASE<"addqh.w", int_mips_addqh_w,
+ NoItinerary, CPURegs>,
+ ClearDefs, IsCommutable;
+
+class ADDQH_R_W_DESC : ADDUH_QB_DESC_BASE<"addqh_r.w", int_mips_addqh_r_w,
+ NoItinerary, CPURegs>,
+ ClearDefs, IsCommutable;
+
+class SUBQH_W_DESC : ADDUH_QB_DESC_BASE<"subqh.w", int_mips_subqh_w,
+ NoItinerary, CPURegs>, ClearDefs;
+
+class SUBQH_R_W_DESC : ADDUH_QB_DESC_BASE<"subqh_r.w", int_mips_subqh_r_w,
+ NoItinerary, CPURegs>, ClearDefs;
+
+// Comparison
+class CMPGDU_EQ_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.eq.qb",
+ int_mips_cmpgdu_eq_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class CMPGDU_LT_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.lt.qb",
+ int_mips_cmpgdu_lt_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+class CMPGDU_LE_QB_DESC : CMP_EQ_QB_R3_DESC_BASE<"cmpgdu.le.qb",
+ int_mips_cmpgdu_le_qb,
+ NoItinerary, CPURegs, DSPRegs>,
+ IsCommutable;
+
+// Absolute
+class ABSQ_S_QB_DESC : ABSQ_S_PH_R2_DESC_BASE<"absq_s.qb", int_mips_absq_s_qb,
+ NoItinerary, DSPRegs>;
+
+// Multiplication
+class MUL_PH_DESC : ADDUH_QB_DESC_BASE<"mul.ph", int_mips_mul_ph, NoItinerary,
+ DSPRegs>, IsCommutable;
+
+class MUL_S_PH_DESC : ADDUH_QB_DESC_BASE<"mul_s.ph", int_mips_mul_s_ph,
+ NoItinerary, DSPRegs>, IsCommutable;
+
+class MULQ_S_W_DESC : ADDUH_QB_DESC_BASE<"mulq_s.w", int_mips_mulq_s_w,
+ NoItinerary, CPURegs>, IsCommutable;
+
+class MULQ_RS_W_DESC : ADDUH_QB_DESC_BASE<"mulq_rs.w", int_mips_mulq_rs_w,
+ NoItinerary, CPURegs>, IsCommutable;
+
+class MULQ_S_PH_DESC : ADDU_QB_DESC_BASE<"mulq_s.ph", int_mips_mulq_s_ph,
+ NoItinerary, DSPRegs, DSPRegs>,
+ IsCommutable;
+
+// Dot product with accumulate/subtract
+class DPA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpa.w.ph">;
+
+class DPS_W_PH_DESC : DPA_W_PH_DESC_BASE<"dps.w.ph">;
+
+class DPAQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_s.w.ph">;
+
+class DPAQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpaqx_sa.w.ph">;
+
+class DPAX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpax.w.ph">;
+
+class DPSX_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsx.w.ph">;
+
+class DPSQX_S_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_s.w.ph">;
+
+class DPSQX_SA_W_PH_DESC : DPA_W_PH_DESC_BASE<"dpsqx_sa.w.ph">;
+
+class MULSA_W_PH_DESC : DPA_W_PH_DESC_BASE<"mulsa.w.ph">;
+
+// Precision reduce/expand
+class PRECR_QB_PH_DESC : CMP_EQ_QB_R3_DESC_BASE<"precr.qb.ph",
+ int_mips_precr_qb_ph,
+ NoItinerary, DSPRegs, DSPRegs>;
+
+class PRECR_SRA_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra.ph.w",
+ int_mips_precr_sra_ph_w,
+ NoItinerary, DSPRegs,
+ CPURegs>, ClearDefs;
+
+class PRECR_SRA_R_PH_W_DESC : PRECR_SRA_PH_W_DESC_BASE<"precr_sra_r.ph.w",
+ int_mips_precr_sra_r_ph_w,
+ NoItinerary, DSPRegs,
+ CPURegs>, ClearDefs;
+
+// Shift
+class SHRA_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra.qb", int_mips_shra_qb, immZExt3,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRAV_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav.qb", int_mips_shra_qb,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRA_R_QB_DESC : SHLL_QB_R2_DESC_BASE<"shra_r.qb", int_mips_shra_r_qb,
+ immZExt3, NoItinerary, DSPRegs>,
+ ClearDefs;
+
+class SHRAV_R_QB_DESC : SHLL_QB_R3_DESC_BASE<"shrav_r.qb", int_mips_shra_r_qb,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRL_PH_DESC : SHLL_QB_R2_DESC_BASE<"shrl.ph", int_mips_shrl_ph, immZExt4,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+class SHRLV_PH_DESC : SHLL_QB_R3_DESC_BASE<"shrlv.ph", int_mips_shrl_ph,
+ NoItinerary, DSPRegs>, ClearDefs;
+
+// Misc
+class APPEND_DESC : APPEND_DESC_BASE<"append", int_mips_append, immZExt5,
+ NoItinerary>, ClearDefs;
+
+class BALIGN_DESC : APPEND_DESC_BASE<"balign", int_mips_balign, immZExt2,
+ NoItinerary>, ClearDefs;
+
+class PREPEND_DESC : APPEND_DESC_BASE<"prepend", int_mips_prepend, immZExt5,
+ NoItinerary>, ClearDefs;
+
+// Pseudos.
+def BPOSGE32_PSEUDO : BPOSGE32_PSEUDO_DESC_BASE<int_mips_bposge32, NoItinerary>;
+
+// Instruction defs.
+// MIPS DSP Rev 1
+def ADDU_QB : ADDU_QB_ENC, ADDU_QB_DESC;
+def ADDU_S_QB : ADDU_S_QB_ENC, ADDU_S_QB_DESC;
+def SUBU_QB : SUBU_QB_ENC, SUBU_QB_DESC;
+def SUBU_S_QB : SUBU_S_QB_ENC, SUBU_S_QB_DESC;
+def ADDQ_PH : ADDQ_PH_ENC, ADDQ_PH_DESC;
+def ADDQ_S_PH : ADDQ_S_PH_ENC, ADDQ_S_PH_DESC;
+def SUBQ_PH : SUBQ_PH_ENC, SUBQ_PH_DESC;
+def SUBQ_S_PH : SUBQ_S_PH_ENC, SUBQ_S_PH_DESC;
+def ADDQ_S_W : ADDQ_S_W_ENC, ADDQ_S_W_DESC;
+def SUBQ_S_W : SUBQ_S_W_ENC, SUBQ_S_W_DESC;
+def ADDSC : ADDSC_ENC, ADDSC_DESC;
+def ADDWC : ADDWC_ENC, ADDWC_DESC;
+def MODSUB : MODSUB_ENC, MODSUB_DESC;
+def RADDU_W_QB : RADDU_W_QB_ENC, RADDU_W_QB_DESC;
+def ABSQ_S_PH : ABSQ_S_PH_ENC, ABSQ_S_PH_DESC;
+def ABSQ_S_W : ABSQ_S_W_ENC, ABSQ_S_W_DESC;
+def PRECRQ_QB_PH : PRECRQ_QB_PH_ENC, PRECRQ_QB_PH_DESC;
+def PRECRQ_PH_W : PRECRQ_PH_W_ENC, PRECRQ_PH_W_DESC;
+def PRECRQ_RS_PH_W : PRECRQ_RS_PH_W_ENC, PRECRQ_RS_PH_W_DESC;
+def PRECRQU_S_QB_PH : PRECRQU_S_QB_PH_ENC, PRECRQU_S_QB_PH_DESC;
+def PRECEQ_W_PHL : PRECEQ_W_PHL_ENC, PRECEQ_W_PHL_DESC;
+def PRECEQ_W_PHR : PRECEQ_W_PHR_ENC, PRECEQ_W_PHR_DESC;
+def PRECEQU_PH_QBL : PRECEQU_PH_QBL_ENC, PRECEQU_PH_QBL_DESC;
+def PRECEQU_PH_QBR : PRECEQU_PH_QBR_ENC, PRECEQU_PH_QBR_DESC;
+def PRECEQU_PH_QBLA : PRECEQU_PH_QBLA_ENC, PRECEQU_PH_QBLA_DESC;
+def PRECEQU_PH_QBRA : PRECEQU_PH_QBRA_ENC, PRECEQU_PH_QBRA_DESC;
+def PRECEU_PH_QBL : PRECEU_PH_QBL_ENC, PRECEU_PH_QBL_DESC;
+def PRECEU_PH_QBR : PRECEU_PH_QBR_ENC, PRECEU_PH_QBR_DESC;
+def PRECEU_PH_QBLA : PRECEU_PH_QBLA_ENC, PRECEU_PH_QBLA_DESC;
+def PRECEU_PH_QBRA : PRECEU_PH_QBRA_ENC, PRECEU_PH_QBRA_DESC;
+def SHLL_QB : SHLL_QB_ENC, SHLL_QB_DESC;
+def SHLLV_QB : SHLLV_QB_ENC, SHLLV_QB_DESC;
+def SHRL_QB : SHRL_QB_ENC, SHRL_QB_DESC;
+def SHRLV_QB : SHRLV_QB_ENC, SHRLV_QB_DESC;
+def SHLL_PH : SHLL_PH_ENC, SHLL_PH_DESC;
+def SHLLV_PH : SHLLV_PH_ENC, SHLLV_PH_DESC;
+def SHLL_S_PH : SHLL_S_PH_ENC, SHLL_S_PH_DESC;
+def SHLLV_S_PH : SHLLV_S_PH_ENC, SHLLV_S_PH_DESC;
+def SHRA_PH : SHRA_PH_ENC, SHRA_PH_DESC;
+def SHRAV_PH : SHRAV_PH_ENC, SHRAV_PH_DESC;
+def SHRA_R_PH : SHRA_R_PH_ENC, SHRA_R_PH_DESC;
+def SHRAV_R_PH : SHRAV_R_PH_ENC, SHRAV_R_PH_DESC;
+def SHLL_S_W : SHLL_S_W_ENC, SHLL_S_W_DESC;
+def SHLLV_S_W : SHLLV_S_W_ENC, SHLLV_S_W_DESC;
+def SHRA_R_W : SHRA_R_W_ENC, SHRA_R_W_DESC;
+def SHRAV_R_W : SHRAV_R_W_ENC, SHRAV_R_W_DESC;
+def MULEU_S_PH_QBL : MULEU_S_PH_QBL_ENC, MULEU_S_PH_QBL_DESC;
+def MULEU_S_PH_QBR : MULEU_S_PH_QBR_ENC, MULEU_S_PH_QBR_DESC;
+def MULEQ_S_W_PHL : MULEQ_S_W_PHL_ENC, MULEQ_S_W_PHL_DESC;
+def MULEQ_S_W_PHR : MULEQ_S_W_PHR_ENC, MULEQ_S_W_PHR_DESC;
+def MULQ_RS_PH : MULQ_RS_PH_ENC, MULQ_RS_PH_DESC;
+def MULSAQ_S_W_PH : MULSAQ_S_W_PH_ENC, MULSAQ_S_W_PH_DESC;
+def MAQ_S_W_PHL : MAQ_S_W_PHL_ENC, MAQ_S_W_PHL_DESC;
+def MAQ_S_W_PHR : MAQ_S_W_PHR_ENC, MAQ_S_W_PHR_DESC;
+def MAQ_SA_W_PHL : MAQ_SA_W_PHL_ENC, MAQ_SA_W_PHL_DESC;
+def MAQ_SA_W_PHR : MAQ_SA_W_PHR_ENC, MAQ_SA_W_PHR_DESC;
+def DPAU_H_QBL : DPAU_H_QBL_ENC, DPAU_H_QBL_DESC;
+def DPAU_H_QBR : DPAU_H_QBR_ENC, DPAU_H_QBR_DESC;
+def DPSU_H_QBL : DPSU_H_QBL_ENC, DPSU_H_QBL_DESC;
+def DPSU_H_QBR : DPSU_H_QBR_ENC, DPSU_H_QBR_DESC;
+def DPAQ_S_W_PH : DPAQ_S_W_PH_ENC, DPAQ_S_W_PH_DESC;
+def DPSQ_S_W_PH : DPSQ_S_W_PH_ENC, DPSQ_S_W_PH_DESC;
+def DPAQ_SA_L_W : DPAQ_SA_L_W_ENC, DPAQ_SA_L_W_DESC;
+def DPSQ_SA_L_W : DPSQ_SA_L_W_ENC, DPSQ_SA_L_W_DESC;
+def MULT_DSP : MULT_DSP_ENC, MULT_DSP_DESC;
+def MULTU_DSP : MULTU_DSP_ENC, MULTU_DSP_DESC;
+def MADD_DSP : MADD_DSP_ENC, MADD_DSP_DESC;
+def MADDU_DSP : MADDU_DSP_ENC, MADDU_DSP_DESC;
+def MSUB_DSP : MSUB_DSP_ENC, MSUB_DSP_DESC;
+def MSUBU_DSP : MSUBU_DSP_ENC, MSUBU_DSP_DESC;
+def CMPU_EQ_QB : CMPU_EQ_QB_ENC, CMPU_EQ_QB_DESC;
+def CMPU_LT_QB : CMPU_LT_QB_ENC, CMPU_LT_QB_DESC;
+def CMPU_LE_QB : CMPU_LE_QB_ENC, CMPU_LE_QB_DESC;
+def CMPGU_EQ_QB : CMPGU_EQ_QB_ENC, CMPGU_EQ_QB_DESC;
+def CMPGU_LT_QB : CMPGU_LT_QB_ENC, CMPGU_LT_QB_DESC;
+def CMPGU_LE_QB : CMPGU_LE_QB_ENC, CMPGU_LE_QB_DESC;
+def CMP_EQ_PH : CMP_EQ_PH_ENC, CMP_EQ_PH_DESC;
+def CMP_LT_PH : CMP_LT_PH_ENC, CMP_LT_PH_DESC;
+def CMP_LE_PH : CMP_LE_PH_ENC, CMP_LE_PH_DESC;
+def BITREV : BITREV_ENC, BITREV_DESC;
+def PACKRL_PH : PACKRL_PH_ENC, PACKRL_PH_DESC;
+def REPL_QB : REPL_QB_ENC, REPL_QB_DESC;
+def REPL_PH : REPL_PH_ENC, REPL_PH_DESC;
+def REPLV_QB : REPLV_QB_ENC, REPLV_QB_DESC;
+def REPLV_PH : REPLV_PH_ENC, REPLV_PH_DESC;
+def PICK_QB : PICK_QB_ENC, PICK_QB_DESC;
+def PICK_PH : PICK_PH_ENC, PICK_PH_DESC;
+def LWX : LWX_ENC, LWX_DESC;
+def LHX : LHX_ENC, LHX_DESC;
+def LBUX : LBUX_ENC, LBUX_DESC;
+def BPOSGE32 : BPOSGE32_ENC, BPOSGE32_DESC;
+def INSV : INSV_ENC, INSV_DESC;
+def EXTP : EXTP_ENC, EXTP_DESC;
+def EXTPV : EXTPV_ENC, EXTPV_DESC;
+def EXTPDP : EXTPDP_ENC, EXTPDP_DESC;
+def EXTPDPV : EXTPDPV_ENC, EXTPDPV_DESC;
+def EXTR_W : EXTR_W_ENC, EXTR_W_DESC;
+def EXTRV_W : EXTRV_W_ENC, EXTRV_W_DESC;
+def EXTR_R_W : EXTR_R_W_ENC, EXTR_R_W_DESC;
+def EXTRV_R_W : EXTRV_R_W_ENC, EXTRV_R_W_DESC;
+def EXTR_RS_W : EXTR_RS_W_ENC, EXTR_RS_W_DESC;
+def EXTRV_RS_W : EXTRV_RS_W_ENC, EXTRV_RS_W_DESC;
+def EXTR_S_H : EXTR_S_H_ENC, EXTR_S_H_DESC;
+def EXTRV_S_H : EXTRV_S_H_ENC, EXTRV_S_H_DESC;
+def SHILO : SHILO_ENC, SHILO_DESC;
+def SHILOV : SHILOV_ENC, SHILOV_DESC;
+def MTHLIP : MTHLIP_ENC, MTHLIP_DESC;
+def RDDSP : RDDSP_ENC, RDDSP_DESC;
+def WRDSP : WRDSP_ENC, WRDSP_DESC;
+
+// MIPS DSP Rev 2
+let Predicates = [HasDSPR2] in {
+
+def ADDU_PH : ADDU_PH_ENC, ADDU_PH_DESC;
+def ADDU_S_PH : ADDU_S_PH_ENC, ADDU_S_PH_DESC;
+def SUBU_PH : SUBU_PH_ENC, SUBU_PH_DESC;
+def SUBU_S_PH : SUBU_S_PH_ENC, SUBU_S_PH_DESC;
+def CMPGDU_EQ_QB : CMPGDU_EQ_QB_ENC, CMPGDU_EQ_QB_DESC;
+def CMPGDU_LT_QB : CMPGDU_LT_QB_ENC, CMPGDU_LT_QB_DESC;
+def CMPGDU_LE_QB : CMPGDU_LE_QB_ENC, CMPGDU_LE_QB_DESC;
+def ABSQ_S_QB : ABSQ_S_QB_ENC, ABSQ_S_QB_DESC;
+def ADDUH_QB : ADDUH_QB_ENC, ADDUH_QB_DESC;
+def ADDUH_R_QB : ADDUH_R_QB_ENC, ADDUH_R_QB_DESC;
+def SUBUH_QB : SUBUH_QB_ENC, SUBUH_QB_DESC;
+def SUBUH_R_QB : SUBUH_R_QB_ENC, SUBUH_R_QB_DESC;
+def ADDQH_PH : ADDQH_PH_ENC, ADDQH_PH_DESC;
+def ADDQH_R_PH : ADDQH_R_PH_ENC, ADDQH_R_PH_DESC;
+def SUBQH_PH : SUBQH_PH_ENC, SUBQH_PH_DESC;
+def SUBQH_R_PH : SUBQH_R_PH_ENC, SUBQH_R_PH_DESC;
+def ADDQH_W : ADDQH_W_ENC, ADDQH_W_DESC;
+def ADDQH_R_W : ADDQH_R_W_ENC, ADDQH_R_W_DESC;
+def SUBQH_W : SUBQH_W_ENC, SUBQH_W_DESC;
+def SUBQH_R_W : SUBQH_R_W_ENC, SUBQH_R_W_DESC;
+def MUL_PH : MUL_PH_ENC, MUL_PH_DESC;
+def MUL_S_PH : MUL_S_PH_ENC, MUL_S_PH_DESC;
+def MULQ_S_W : MULQ_S_W_ENC, MULQ_S_W_DESC;
+def MULQ_RS_W : MULQ_RS_W_ENC, MULQ_RS_W_DESC;
+def MULQ_S_PH : MULQ_S_PH_ENC, MULQ_S_PH_DESC;
+def DPA_W_PH : DPA_W_PH_ENC, DPA_W_PH_DESC;
+def DPS_W_PH : DPS_W_PH_ENC, DPS_W_PH_DESC;
+def DPAQX_S_W_PH : DPAQX_S_W_PH_ENC, DPAQX_S_W_PH_DESC;
+def DPAQX_SA_W_PH : DPAQX_SA_W_PH_ENC, DPAQX_SA_W_PH_DESC;
+def DPAX_W_PH : DPAX_W_PH_ENC, DPAX_W_PH_DESC;
+def DPSX_W_PH : DPSX_W_PH_ENC, DPSX_W_PH_DESC;
+def DPSQX_S_W_PH : DPSQX_S_W_PH_ENC, DPSQX_S_W_PH_DESC;
+def DPSQX_SA_W_PH : DPSQX_SA_W_PH_ENC, DPSQX_SA_W_PH_DESC;
+def MULSA_W_PH : MULSA_W_PH_ENC, MULSA_W_PH_DESC;
+def PRECR_QB_PH : PRECR_QB_PH_ENC, PRECR_QB_PH_DESC;
+def PRECR_SRA_PH_W : PRECR_SRA_PH_W_ENC, PRECR_SRA_PH_W_DESC;
+def PRECR_SRA_R_PH_W : PRECR_SRA_R_PH_W_ENC, PRECR_SRA_R_PH_W_DESC;
+def SHRA_QB : SHRA_QB_ENC, SHRA_QB_DESC;
+def SHRAV_QB : SHRAV_QB_ENC, SHRAV_QB_DESC;
+def SHRA_R_QB : SHRA_R_QB_ENC, SHRA_R_QB_DESC;
+def SHRAV_R_QB : SHRAV_R_QB_ENC, SHRAV_R_QB_DESC;
+def SHRL_PH : SHRL_PH_ENC, SHRL_PH_DESC;
+def SHRLV_PH : SHRLV_PH_ENC, SHRLV_PH_DESC;
+def APPEND : APPEND_ENC, APPEND_DESC;
+def BALIGN : BALIGN_ENC, BALIGN_DESC;
+def PREPEND : PREPEND_ENC, PREPEND_DESC;
+
+}
+
+// Pseudos.
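+// Each pseudo pairs a MIPS DSP SelectionDAG node with the real instruction it
+// corresponds to.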
+def MULSAQ_S_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsMULSAQ_S_W_PH, NoItinerary,
+ MULSAQ_S_W_PH>;
+def MAQ_S_W_PHL_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsMAQ_S_W_PHL, NoItinerary,
+ MAQ_S_W_PHL>;
+def MAQ_S_W_PHR_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsMAQ_S_W_PHR, NoItinerary,
+ MAQ_S_W_PHR>;
+def MAQ_SA_W_PHL_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsMAQ_SA_W_PHL, NoItinerary,
+ MAQ_SA_W_PHL>;
+def MAQ_SA_W_PHR_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsMAQ_SA_W_PHR, NoItinerary,
+ MAQ_SA_W_PHR>;
+def DPAU_H_QBL_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAU_H_QBL, NoItinerary,
+ DPAU_H_QBL>;
+def DPAU_H_QBR_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAU_H_QBR, NoItinerary,
+ DPAU_H_QBR>;
+def DPSU_H_QBL_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSU_H_QBL, NoItinerary,
+ DPSU_H_QBL>;
+def DPSU_H_QBR_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSU_H_QBR, NoItinerary,
+ DPSU_H_QBR>;
+def DPAQ_S_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAQ_S_W_PH, NoItinerary,
+ DPAQ_S_W_PH>;
+def DPSQ_S_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSQ_S_W_PH, NoItinerary,
+ DPSQ_S_W_PH>;
+def DPAQ_SA_L_W_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAQ_SA_L_W, NoItinerary,
+ DPAQ_SA_L_W>;
+def DPSQ_SA_L_W_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSQ_SA_L_W, NoItinerary,
+ DPSQ_SA_L_W>;
+
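+// Multiply/multiply-accumulate pseudos. The MADD/MADDU/MSUB/MSUBU variants
+// also read the accumulator (UseAC).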
+def MULT_DSP_PSEUDO : MULT_PSEUDO_BASE<MipsMULT, NoItinerary, MULT_DSP>,
+ IsCommutable;
+def MULTU_DSP_PSEUDO : MULT_PSEUDO_BASE<MipsMULTU, NoItinerary, MULTU_DSP>,
+ IsCommutable;
+def MADD_DSP_PSEUDO : MULT_PSEUDO_BASE<MipsMADD_DSP, NoItinerary, MADD_DSP>,
+ IsCommutable, UseAC;
+def MADDU_DSP_PSEUDO : MULT_PSEUDO_BASE<MipsMADDU_DSP, NoItinerary, MADDU_DSP>,
+ IsCommutable, UseAC;
+def MSUB_DSP_PSEUDO : MULT_PSEUDO_BASE<MipsMSUB_DSP, NoItinerary, MSUB_DSP>,
+ UseAC;
+def MSUBU_DSP_PSEUDO : MULT_PSEUDO_BASE<MipsMSUBU_DSP, NoItinerary, MSUBU_DSP>,
+ UseAC;
+
+def SHILO_PSEUDO : SHILO_R1_PSEUDO_BASE<MipsSHILO, NoItinerary, SHILO>;
+def SHILOV_PSEUDO : SHILO_R2_PSEUDO_BASE<MipsSHILO, NoItinerary, SHILOV>;
+def MTHLIP_PSEUDO : SHILO_R2_PSEUDO_BASE<MipsMTHLIP, NoItinerary, MTHLIP>;
+
+let Predicates = [HasDSPR2] in {
+
+def DPA_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPA_W_PH, NoItinerary, DPA_W_PH>;
+def DPS_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPS_W_PH, NoItinerary, DPS_W_PH>;
+def DPAQX_S_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAQX_S_W_PH, NoItinerary,
+ DPAQX_S_W_PH>;
+def DPAQX_SA_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAQX_SA_W_PH, NoItinerary,
+ DPAQX_SA_W_PH>;
+def DPAX_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPAX_W_PH, NoItinerary,
+ DPAX_W_PH>;
+def DPSX_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSX_W_PH, NoItinerary,
+ DPSX_W_PH>;
+def DPSQX_S_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSQX_S_W_PH, NoItinerary,
+ DPSQX_S_W_PH>;
+def DPSQX_SA_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsDPSQX_SA_W_PH, NoItinerary,
+ DPSQX_SA_W_PH>;
+def MULSA_W_PH_PSEUDO : DPA_W_PH_PSEUDO_BASE<MipsMULSA_W_PH, NoItinerary,
+ MULSA_W_PH>;
+
+}
+
+// Patterns.
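+// DSPPat is a Pat that is additionally guarded by a DSP predicate (HasDSP by
+// default).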
+class DSPPat<dag pattern, dag result, Predicate pred = HasDSP> :
+ Pat<pattern, result>, Requires<[pred]>;
+
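+// A bitconvert between i32 and the DSP vector types is selected as a plain
+// register-class copy (COPY_TO_REGCLASS).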
+class BitconvertPat<ValueType DstVT, ValueType SrcVT, RegisterClass DstRC,
+ RegisterClass SrcRC> :
+ DSPPat<(DstVT (bitconvert (SrcVT SrcRC:$src))),
+ (COPY_TO_REGCLASS SrcRC:$src, DstRC)>;
+
+def : BitconvertPat<i32, v2i16, CPURegs, DSPRegs>;
+def : BitconvertPat<i32, v4i8, CPURegs, DSPRegs>;
+def : BitconvertPat<v2i16, i32, DSPRegs, CPURegs>;
+def : BitconvertPat<v4i8, i32, DSPRegs, CPURegs>;
+
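+// v2i16/v4i8 loads and stores are selected as word loads/stores (LW/SW) plus
+// a register-class copy.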
+def : DSPPat<(v2i16 (load addr:$a)),
+ (v2i16 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+def : DSPPat<(v4i8 (load addr:$a)),
+ (v4i8 (COPY_TO_REGCLASS (LW addr:$a), DSPRegs))>;
+def : DSPPat<(store (v2i16 DSPRegs:$val), addr:$a),
+ (SW (COPY_TO_REGCLASS DSPRegs:$val, CPURegs), addr:$a)>;
+def : DSPPat<(store (v4i8 DSPRegs:$val), addr:$a),
+ (SW (COPY_TO_REGCLASS DSPRegs:$val, CPURegs), addr:$a)>;
+
+// Extr patterns.
+class EXTR_W_TY1_R2_Pat<SDPatternOperator OpNode, Instruction Instr> :
+ DSPPat<(i32 (OpNode CPURegs:$rs)), (Instr AC0, CPURegs:$rs)>;
+
+class EXTR_W_TY1_R1_Pat<SDPatternOperator OpNode, Instruction Instr> :
+ DSPPat<(i32 (OpNode immZExt5:$shift)), (Instr AC0, immZExt5:$shift)>;
+
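+// These patterns always read from accumulator ac0.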
+def : EXTR_W_TY1_R1_Pat<MipsEXTP, EXTP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTP, EXTPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTPDP, EXTPDP>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTPDP, EXTPDPV>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_W, EXTR_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_W, EXTRV_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_R_W, EXTR_R_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_R_W, EXTRV_R_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_RS_W, EXTR_RS_W>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_RS_W, EXTRV_RS_W>;
+def : EXTR_W_TY1_R1_Pat<MipsEXTR_S_H, EXTR_S_H>;
+def : EXTR_W_TY1_R2_Pat<MipsEXTR_S_H, EXTRV_S_H>;
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 2bba8a38024d..e3c8ed75cf91 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -30,10 +30,11 @@ STATISTIC(FilledSlots, "Number of delay slots filled");
STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
" are not NOP.");
-static cl::opt<bool> EnableDelaySlotFiller(
- "enable-mips-delay-filler",
+static cl::opt<bool> DisableDelaySlotFiller(
+ "disable-mips-delay-filler",
cl::init(false),
- cl::desc("Fill the Mips delay slots useful instructions."),
+ cl::desc("Disable the delay slot filler, which attempts to fill the Mips"
+ "delay slots with useful instructions."),
cl::Hidden);
// This option can be used to silence complaints by machine verifier passes.
@@ -114,7 +115,9 @@ runOnMachineBasicBlock(MachineBasicBlock &MBB) {
InstrIter D;
- if (EnableDelaySlotFiller && findDelayInstr(MBB, I, D)) {
+ // Delay slot filling is disabled at -O0.
+ if (!DisableDelaySlotFiller && (TM.getOptLevel() != CodeGenOpt::None) &&
+ findDelayInstr(MBB, I, D)) {
MBB.splice(llvm::next(I), &MBB, D);
++UsefulSlots;
} else
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index 8c0474b0eec7..2cad2a6264ab 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -98,3 +98,37 @@ bool MipsFrameLowering::hasFP(const MachineFunction &MF) const {
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
}
+
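+// Estimate the stack frame size: fixed objects, callee-saved spills, other
+// stack objects and the outgoing call frame, each rounded up to the relevant
+// alignment.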
+uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo &TRI = *MF.getTarget().getRegisterInfo();
+
+ int64_t Offset = 0;
+
+ // Iterate over fixed-size objects.
+ for (int I = MFI->getObjectIndexBegin(); I != 0; ++I)
+ Offset = std::max(Offset, -MFI->getObjectOffset(I));
+
+ // Conservatively assume all callee-saved registers will be saved.
+ for (const uint16_t *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
+ unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
+ Offset = RoundUpToAlignment(Offset + Size, Size);
+ }
+
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ // Check that MaxAlign is not zero if there is a stack object that is not a
+ // callee-saved spill.
+ assert(!MFI->getObjectIndexEnd() || MaxAlign);
+
+ // Iterate over other objects.
+ for (unsigned I = 0, E = MFI->getObjectIndexEnd(); I != E; ++I)
+ Offset = RoundUpToAlignment(Offset + MFI->getObjectSize(I), MaxAlign);
+
+ // Call frame.
+ if (MFI->adjustsStack() && hasReservedCallFrame(MF))
+ Offset = RoundUpToAlignment(Offset + MFI->getMaxCallFrameSize(),
+ std::max(MaxAlign, getStackAlignment()));
+
+ return RoundUpToAlignment(Offset, getStackAlignment());
+}
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index ed7b7fe76c2b..df52d92da830 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -34,6 +34,9 @@ public:
const MipsSubtarget &ST);
bool hasFP(const MachineFunction &MF) const;
+
+protected:
+ uint64_t estimateStackSize(const MachineFunction &MF) const;
};
/// Create MipsInstrInfo objects.
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 5a97c17ec851..c5fca7f4b27a 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -86,6 +86,10 @@ private:
SDNode *getGlobalBaseReg();
+ SDValue getMips16SPAliasReg();
+
+ void getMips16SPRefReg(SDNode *parent, SDValue &AliasReg);
+
std::pair<SDNode*, SDNode*> SelectMULT(SDNode *N, unsigned Opc, DebugLoc dl,
EVT Ty, bool HasLo, bool HasHi);
@@ -94,6 +98,9 @@ private:
// Complex Pattern.
bool SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Offset);
+ bool SelectAddr16(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Offset,
+ SDValue &Alias);
+
// getImm - Return a target constant with the specified value.
inline SDValue getImm(const SDNode *Node, unsigned Imm) {
return CurDAG->getTargetConstant(Imm, Node->getValueType(0));
@@ -102,6 +109,7 @@ private:
void ProcessFunctionAfterISel(MachineFunction &MF);
bool ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI, const MachineInstr&);
void InitGlobalBaseReg(MachineFunction &MF);
+ void InitMips16SPAliasReg(MachineFunction &MF);
virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
char ConstraintCode,
@@ -220,6 +228,26 @@ void MipsDAGToDAGISel::InitGlobalBaseReg(MachineFunction &MF) {
.addReg(Mips::V0).addReg(Mips::T9);
}
+// Insert instructions to initialize the Mips16 SP Alias register in the
+// first MBB of the function.
+//
+void MipsDAGToDAGISel::InitMips16SPAliasReg(MachineFunction &MF) {
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ if (!MipsFI->mips16SPAliasRegSet())
+ return;
+
+ MachineBasicBlock &MBB = MF.front();
+ MachineBasicBlock::iterator I = MBB.begin();
+ const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+ DebugLoc DL = I != MBB.end() ? I->getDebugLoc() : DebugLoc();
+ unsigned Mips16SPAliasReg = MipsFI->getMips16SPAliasReg();
+
+ BuildMI(MBB, I, DL, TII.get(Mips::MoveR3216), Mips16SPAliasReg)
+ .addReg(Mips::SP);
+}
+
+
bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI,
const MachineInstr& MI) {
unsigned DstReg = 0, ZeroReg = 0;
@@ -260,6 +288,7 @@ bool MipsDAGToDAGISel::ReplaceUsesWithZeroReg(MachineRegisterInfo *MRI,
void MipsDAGToDAGISel::ProcessFunctionAfterISel(MachineFunction &MF) {
InitGlobalBaseReg(MF);
+ InitMips16SPAliasReg(MF);
MachineRegisterInfo *MRI = &MF.getRegInfo();
@@ -284,6 +313,14 @@ SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
}
+/// getMips16SPAliasReg - Return the register that aliases the SP so that it
+/// is accessible from Mips16 instructions.
+SDValue MipsDAGToDAGISel::getMips16SPAliasReg() {
+ unsigned Mips16SPAliasReg =
+ MF->getInfo<MipsFunctionInfo>()->getMips16SPAliasReg();
+ return CurDAG->getRegister(Mips16SPAliasReg, TLI.getPointerTy());
+}
+
/// ComplexPattern used on MipsInstrInfo
/// Used on Mips Load/Store instructions
bool MipsDAGToDAGISel::
@@ -337,8 +374,9 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
// Generate:
// lui $2, %hi($CPI1_0)
// lwc1 $f0, %lo($CPI1_0)($2)
- if (Addr.getOperand(1).getOpcode() == MipsISD::Lo) {
- SDValue LoVal = Addr.getOperand(1), Opnd0 = LoVal.getOperand(0);
+ if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+ Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+ SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
isa<JumpTableSDNode>(Opnd0)) {
Base = Addr.getOperand(0);
@@ -361,6 +399,115 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) {
return true;
}
+void MipsDAGToDAGISel::getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg) {
+ SDValue AliasFPReg = CurDAG->getRegister(Mips::S0, TLI.getPointerTy());
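+ // For 8- and 16-bit loads and stores, use $s0 when the function has a frame
+ // pointer and the Mips16 SP alias register otherwise; all other accesses
+ // address off $sp directly.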
+ if (Parent) {
+ switch (Parent->getOpcode()) {
+ case ISD::LOAD: {
+ LoadSDNode *SD = dyn_cast<LoadSDNode>(Parent);
+ switch (SD->getMemoryVT().getSizeInBits()) {
+ case 8:
+ case 16:
+ AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+ AliasFPReg: getMips16SPAliasReg();
+ return;
+ }
+ break;
+ }
+ case ISD::STORE: {
+ StoreSDNode *SD = dyn_cast<StoreSDNode>(Parent);
+ switch (SD->getMemoryVT().getSizeInBits()) {
+ case 8:
+ case 16:
+ AliasReg = TM.getFrameLowering()->hasFP(*MF)?
+ AliasFPReg: getMips16SPAliasReg();
+ return;
+ }
+ break;
+ }
+ }
+ }
+ AliasReg = CurDAG->getRegister(Mips::SP, TLI.getPointerTy());
+ return;
+
+}
+bool MipsDAGToDAGISel::SelectAddr16(
+ SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset,
+ SDValue &Alias) {
+ EVT ValTy = Addr.getValueType();
+
+ Alias = CurDAG->getTargetConstant(0, ValTy);
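+ // Alias defaults to constant 0; getMips16SPRefReg overrides it for
+ // frame-index addresses.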
+
+ // if Address is FI, get the TargetFrameIndex.
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ Offset = CurDAG->getTargetConstant(0, ValTy);
+ getMips16SPRefReg(Parent, Alias);
+ return true;
+ }
+ // In PIC code, the global address is wrapped in a MipsISD::Wrapper node.
+ if (Addr.getOpcode() == MipsISD::Wrapper) {
+ Base = Addr.getOperand(0);
+ Offset = Addr.getOperand(1);
+ return true;
+ }
+ if (TM.getRelocationModel() != Reloc::PIC_) {
+ if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress))
+ return false;
+ }
+ // Addresses of the form FI+const or FI|const
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isInt<16>(CN->getSExtValue())) {
+
+ // If the first operand is a FI, get the TargetFI Node
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
+ (Addr.getOperand(0))) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), ValTy);
+ getMips16SPRefReg(Parent, Alias);
+ }
+ else
+ Base = Addr.getOperand(0);
+
+ Offset = CurDAG->getTargetConstant(CN->getZExtValue(), ValTy);
+ return true;
+ }
+ }
+ // Operand is a result from an ADD.
+ if (Addr.getOpcode() == ISD::ADD) {
+ // When loading from constant pools, load the lower address part in
+ // the instruction itself. Example, instead of:
+ // lui $2, %hi($CPI1_0)
+ // addiu $2, $2, %lo($CPI1_0)
+ // lwc1 $f0, 0($2)
+ // Generate:
+ // lui $2, %hi($CPI1_0)
+ // lwc1 $f0, %lo($CPI1_0)($2)
+ if (Addr.getOperand(1).getOpcode() == MipsISD::Lo ||
+ Addr.getOperand(1).getOpcode() == MipsISD::GPRel) {
+ SDValue Opnd0 = Addr.getOperand(1).getOperand(0);
+ if (isa<ConstantPoolSDNode>(Opnd0) || isa<GlobalAddressSDNode>(Opnd0) ||
+ isa<JumpTableSDNode>(Opnd0)) {
+ Base = Addr.getOperand(0);
+ Offset = Opnd0;
+ return true;
+ }
+ }
+
+ // If an indexed floating point load/store can be emitted, return false.
+ const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
+
+ if (LS &&
+ (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
+ Subtarget.hasMips32r2Or64())
+ return false;
+ }
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, ValTy);
+ return true;
+}
+
/// Select multiply instructions.
std::pair<SDNode*, SDNode*>
MipsDAGToDAGISel::SelectMULT(SDNode *N, unsigned Opc, DebugLoc dl, EVT Ty,
@@ -371,14 +518,16 @@ MipsDAGToDAGISel::SelectMULT(SDNode *N, unsigned Opc, DebugLoc dl, EVT Ty,
SDValue InFlag = SDValue(Mul, 0);
if (HasLo) {
- Lo = CurDAG->getMachineNode(Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64, dl,
- Ty, MVT::Glue, InFlag);
+ unsigned Opcode = Subtarget.inMips16Mode() ? Mips::Mflo16 :
+ (Ty == MVT::i32 ? Mips::MFLO : Mips::MFLO64);
+ Lo = CurDAG->getMachineNode(Opcode, dl, Ty, MVT::Glue, InFlag);
InFlag = SDValue(Lo, 1);
}
- if (HasHi)
- Hi = CurDAG->getMachineNode(Ty == MVT::i32 ? Mips::MFHI : Mips::MFHI64, dl,
- Ty, InFlag);
-
+ if (HasHi) {
+ unsigned Opcode = Subtarget.inMips16Mode() ? Mips::Mfhi16 :
+ (Ty == MVT::i32 ? Mips::MFHI : Mips::MFHI64);
+ Hi = CurDAG->getMachineNode(Opcode, dl, Ty, InFlag);
+ }
return std::make_pair(Lo, Hi);
}
@@ -410,6 +559,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
case ISD::SUBE:
case ISD::ADDE: {
+ bool inMips16Mode = Subtarget.inMips16Mode();
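+ // In Mips16 mode, use the 16-bit forms of ADDu, SUBu and SLTu.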
SDValue InFlag = Node->getOperand(2), CmpLHS;
unsigned Opc = InFlag.getOpcode(); (void)Opc;
assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) ||
@@ -419,10 +569,16 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
unsigned MOp;
if (Opcode == ISD::ADDE) {
CmpLHS = InFlag.getValue(0);
- MOp = Mips::ADDu;
+ if (inMips16Mode)
+ MOp = Mips::AdduRxRyRz16;
+ else
+ MOp = Mips::ADDu;
} else {
CmpLHS = InFlag.getOperand(0);
- MOp = Mips::SUBu;
+ if (inMips16Mode)
+ MOp = Mips::SubuRxRyRz16;
+ else
+ MOp = Mips::SUBu;
}
SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };
@@ -431,8 +587,11 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
SDValue RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
- SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, dl, VT, Ops, 2);
- SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT,
+
+ unsigned Sltu_op = inMips16Mode? Mips::SltuRxRyRz16: Mips::SLTu;
+ SDNode *Carry = CurDAG->getMachineNode(Sltu_op, dl, VT, Ops, 2);
+ unsigned Addu_op = inMips16Mode? Mips::AdduRxRyRz16 : Mips::ADDu;
+ SDNode *AddCarry = CurDAG->getMachineNode(Addu_op, dl, VT,
SDValue(Carry,0), RHS);
return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue,
@@ -442,8 +601,13 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
/// Mul with two results
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI: {
- if (NodeTy == MVT::i32)
- MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
+ if (NodeTy == MVT::i32) {
+ if (Subtarget.inMips16Mode())
+ MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MultuRxRy16 :
+ Mips::MultRxRy16);
+ else
+ MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
+ }
else
MultOpc = (Opcode == ISD::UMUL_LOHI ? Mips::DMULTu : Mips::DMULT);
@@ -469,8 +633,13 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
}
case ISD::MULHS:
case ISD::MULHU: {
- if (NodeTy == MVT::i32)
- MultOpc = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+ if (NodeTy == MVT::i32) {
+ if (Subtarget.inMips16Mode())
+ MultOpc = (Opcode == ISD::MULHU ?
+ Mips::MultuRxRy16 : Mips::MultRxRy16);
+ else
+ MultOpc = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
+ }
else
MultOpc = (Opcode == ISD::MULHU ? Mips::DMULTu : Mips::DMULT);
@@ -539,6 +708,15 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
return RegOpnd;
}
+#ifndef NDEBUG
+ case ISD::LOAD:
+ case ISD::STORE:
+ assert(cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+ cast<MemSDNode>(Node)->getAlignment() &&
+ "Unexpected unaligned loads/stores.");
+ break;
+#endif
+
case MipsISD::ThreadPointer: {
EVT PtrVT = TLI.getPointerTy();
unsigned RdhwrOpc, SrcReg, DestReg;
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index c5207c67376d..e225b6c28eb6 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -25,6 +25,7 @@
#include "llvm/GlobalVariable.h"
#include "llvm/Intrinsics.h"
#include "llvm/CallingConv.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -32,12 +33,33 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+static cl::opt<bool>
+EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
+ cl::desc("MIPS: Enable tail calls."), cl::init(false));
+
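+// Argument registers used by the O32 and 64-bit calling conventions.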
+static const uint16_t O32IntRegs[4] = {
+ Mips::A0, Mips::A1, Mips::A2, Mips::A3
+};
+
+static const uint16_t Mips64IntRegs[8] = {
+ Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
+ Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64
+};
+
+static const uint16_t Mips64DPRegs[8] = {
+ Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
+ Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
+};
+
// If I is a shifted mask, set the size (Size) and the first bit of the
// mask (Pos), and return true.
// For example, if I is 0x003ff800, (Pos, Size) = (11, 11).
@@ -58,6 +80,7 @@ static SDValue GetGlobalReg(SelectionDAG &DAG, EVT Ty) {
const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch (Opcode) {
case MipsISD::JmpLink: return "MipsISD::JmpLink";
+ case MipsISD::TailCall: return "MipsISD::TailCall";
case MipsISD::Hi: return "MipsISD::Hi";
case MipsISD::Lo: return "MipsISD::Lo";
case MipsISD::GPRel: return "MipsISD::GPRel";
@@ -89,6 +112,20 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::LDR: return "MipsISD::LDR";
case MipsISD::SDL: return "MipsISD::SDL";
case MipsISD::SDR: return "MipsISD::SDR";
+ case MipsISD::EXTP: return "MipsISD::EXTP";
+ case MipsISD::EXTPDP: return "MipsISD::EXTPDP";
+ case MipsISD::EXTR_S_H: return "MipsISD::EXTR_S_H";
+ case MipsISD::EXTR_W: return "MipsISD::EXTR_W";
+ case MipsISD::EXTR_R_W: return "MipsISD::EXTR_R_W";
+ case MipsISD::EXTR_RS_W: return "MipsISD::EXTR_RS_W";
+ case MipsISD::SHILO: return "MipsISD::SHILO";
+ case MipsISD::MTHLIP: return "MipsISD::MTHLIP";
+ case MipsISD::MULT: return "MipsISD::MULT";
+ case MipsISD::MULTU: return "MipsISD::MULTU";
+ case MipsISD::MADD_DSP: return "MipsISD::MADD_DSP";
+ case MipsISD::MADDU_DSP: return "MipsISD::MADDU_DSP";
+ case MipsISD::MSUB_DSP: return "MipsISD::MSUB_DSP";
+ case MipsISD::MSUBU_DSP: return "MipsISD::MSUBU_DSP";
default: return NULL;
}
}
@@ -113,7 +150,22 @@ MipsTargetLowering(MipsTargetMachine &TM)
if (Subtarget->inMips16Mode()) {
addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
- addRegisterClass(MVT::i32, &Mips::CPURARegRegClass);
+ }
+
+ if (Subtarget->hasDSP()) {
+ MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+ for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+ addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+
+ // Expand all builtin opcodes.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, VecTys[i], Expand);
+
+ setOperationAction(ISD::LOAD, VecTys[i], Legal);
+ setOperationAction(ISD::STORE, VecTys[i], Legal);
+ setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+ }
}
if (!TM.Options.UseSoftFloat) {
@@ -160,10 +212,18 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
- setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
- setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
+ if (Subtarget->inMips16Mode()) {
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Expand);
+ }
+ else {
+ setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
+ setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
+ }
+ if (!Subtarget->inMips16Mode()) {
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ }
if (!TM.Options.NoNaNsFPMath) {
setOperationAction(ISD::FABS, MVT::f32, Custom);
@@ -187,6 +247,10 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ if (HasMips64)
+ setOperationAction(ISD::ADD, MVT::i64, Custom);
+
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
@@ -254,6 +318,9 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+
// Use the default for now
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -263,6 +330,21 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
+ if (Subtarget->inMips16Mode()) {
+ setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i32, Expand);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i32, Expand);
+ }
+
setInsertFencesForAtomic(true);
if (!Subtarget->hasSEInReg()) {
@@ -310,6 +392,9 @@ MipsTargetLowering(MipsTargetMachine &TM)
bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
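+ // Mips16 does not support unaligned memory accesses.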
+ if (Subtarget->inMips16Mode())
+ return false;
+
switch (SVT) {
case MVT::i64:
case MVT::i32:
@@ -785,6 +870,26 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return SDValue();
}
+void
+MipsTargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
+void
+MipsTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
SDValue MipsTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
@@ -811,6 +916,9 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::SRL_PARTS: return LowerShiftRightParts(Op, DAG, false);
case ISD::LOAD: return LowerLOAD(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+ case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
+ case ISD::ADD: return LowerADD(Op, DAG);
}
return SDValue();
}
@@ -919,6 +1027,70 @@ static MachineBasicBlock* ExpandCondMov(MachineInstr *MI, MachineBasicBlock *BB,
return BB;
}
*/
+
+MachineBasicBlock *
+MipsTargetLowering::EmitBPOSGE32(MachineInstr *MI, MachineBasicBlock *BB) const{
+ // $bb:
+ // bposge32_pseudo $vr0
+ // =>
+ // $bb:
+ // bposge32 $tbb
+ // $fbb:
+ // li $vr2, 0
+ // b $sink
+ // $tbb:
+ // li $vr1, 1
+ // $sink:
+ // $vr0 = phi($vr2, $fbb, $vr1, $tbb)
+
+ MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ const TargetRegisterClass *RC = &Mips::CPURegsRegClass;
+ DebugLoc DL = MI->getDebugLoc();
+ const BasicBlock *LLVM_BB = BB->getBasicBlock();
+ MachineFunction::iterator It = llvm::next(MachineFunction::iterator(BB));
+ MachineFunction *F = BB->getParent();
+ MachineBasicBlock *FBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *TBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *Sink = F->CreateMachineBasicBlock(LLVM_BB);
+ F->insert(It, FBB);
+ F->insert(It, TBB);
+ F->insert(It, Sink);
+
+ // Transfer the remainder of BB and its successor edges to Sink.
+ Sink->splice(Sink->begin(), BB, llvm::next(MachineBasicBlock::iterator(MI)),
+ BB->end());
+ Sink->transferSuccessorsAndUpdatePHIs(BB);
+
+ // Add successors.
+ BB->addSuccessor(FBB);
+ BB->addSuccessor(TBB);
+ FBB->addSuccessor(Sink);
+ TBB->addSuccessor(Sink);
+
+ // Insert the real bposge32 instruction to $BB.
+ BuildMI(BB, DL, TII->get(Mips::BPOSGE32)).addMBB(TBB);
+
+ // Fill $FBB.
+ unsigned VR2 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::ADDiu), VR2)
+ .addReg(Mips::ZERO).addImm(0);
+ BuildMI(*FBB, FBB->end(), DL, TII->get(Mips::B)).addMBB(Sink);
+
+ // Fill $TBB.
+ unsigned VR1 = RegInfo.createVirtualRegister(RC);
+ BuildMI(*TBB, TBB->end(), DL, TII->get(Mips::ADDiu), VR1)
+ .addReg(Mips::ZERO).addImm(1);
+
+ // Insert phi function to $Sink.
+ BuildMI(*Sink, Sink->begin(), DL, TII->get(Mips::PHI),
+ MI->getOperand(0).getReg())
+ .addReg(VR2).addMBB(FBB).addReg(VR1).addMBB(TBB);
+
+ MI->eraseFromParent(); // The pseudo instruction is gone now.
+ return Sink;
+}
+
MachineBasicBlock *
MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
@@ -1027,6 +1199,8 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case Mips::ATOMIC_CMP_SWAP_I64:
case Mips::ATOMIC_CMP_SWAP_I64_P8:
return EmitAtomicCmpSwap(MI, BB, 8);
+ case Mips::BPOSGE32_PSEUDO:
+ return EmitBPOSGE32(MI, BB);
}
}
@@ -1571,15 +1745,16 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
SDVTList VTs = DAG.getVTList(MVT::i32);
- MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
+ const MipsTargetObjectFile &TLOF =
+ (const MipsTargetObjectFile&)getObjFileLowering();
// %gp_rel relocation
if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
SDValue GA = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
MipsII::MO_GPREL);
SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, dl, VTs, &GA, 1);
- SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
- return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRelNode);
+ SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
+ return DAG.getNode(ISD::ADD, dl, MVT::i32, GPReg, GPRelNode);
}
// %hi/%lo relocation
SDValue GAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0,
@@ -1620,8 +1795,10 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
// %hi/%lo relocation
- SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true, MipsII::MO_ABS_HI);
- SDValue BALo = DAG.getBlockAddress(BA, MVT::i32, true, MipsII::MO_ABS_LO);
+ SDValue BAHi =
+ DAG.getTargetBlockAddress(BA, MVT::i32, 0, MipsII::MO_ABS_HI);
+ SDValue BALo =
+ DAG.getTargetBlockAddress(BA, MVT::i32, 0, MipsII::MO_ABS_LO);
SDValue Hi = DAG.getNode(MipsISD::Hi, dl, MVT::i32, BAHi);
SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALo);
return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo);
@@ -1630,10 +1807,10 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
EVT ValTy = Op.getValueType();
unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
unsigned OFSTFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
- SDValue BAGOTOffset = DAG.getBlockAddress(BA, ValTy, true, GOTFlag);
+ SDValue BAGOTOffset = DAG.getTargetBlockAddress(BA, ValTy, 0, GOTFlag);
BAGOTOffset = DAG.getNode(MipsISD::Wrapper, dl, ValTy,
GetGlobalReg(DAG, ValTy), BAGOTOffset);
- SDValue BALOOffset = DAG.getBlockAddress(BA, ValTy, true, OFSTFlag);
+ SDValue BALOOffset = DAG.getTargetBlockAddress(BA, ValTy, 0, OFSTFlag);
SDValue Load = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), BAGOTOffset,
MachinePointerInfo(), false, false, false, 0);
SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, BALOOffset);
@@ -2224,6 +2401,172 @@ SDValue MipsTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return CreateStoreLR(MipsISD::SDR, DAG, SD, SDL, IsLittle ? 0 : 7);
}
+// This function expands mips intrinsic nodes which have 64-bit input operands
+// or output values.
+//
+// out64 = intrinsic-node in64
+// =>
+// lo = copy (extract-element (in64, 0))
+// hi = copy (extract-element (in64, 1))
+// mips-specific-node
+// v0 = copy lo
+// v1 = copy hi
+// out64 = merge-values (v0, v1)
+//
+static SDValue LowerDSPIntr(SDValue Op, SelectionDAG &DAG,
+ unsigned Opc, bool HasI64In, bool HasI64Out) {
+ DebugLoc DL = Op.getDebugLoc();
+ bool HasChainIn = Op->getOperand(0).getValueType() == MVT::Other;
+ SDValue Chain = HasChainIn ? Op->getOperand(0) : DAG.getEntryNode();
+ SmallVector<SDValue, 3> Ops;
+
+ if (HasI64In) {
+ SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ Op->getOperand(1 + HasChainIn),
+ DAG.getConstant(0, MVT::i32));
+ SDValue InHi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32,
+ Op->getOperand(1 + HasChainIn),
+ DAG.getConstant(1, MVT::i32));
+
+ Chain = DAG.getCopyToReg(Chain, DL, Mips::LO, InLo, SDValue());
+ Chain = DAG.getCopyToReg(Chain, DL, Mips::HI, InHi, Chain.getValue(1));
+
+ Ops.push_back(Chain);
+ Ops.append(Op->op_begin() + HasChainIn + 2, Op->op_end());
+ Ops.push_back(Chain.getValue(1));
+ } else {
+ Ops.push_back(Chain);
+ Ops.append(Op->op_begin() + HasChainIn + 1, Op->op_end());
+ }
+
+ if (!HasI64Out)
+ return DAG.getNode(Opc, DL, Op->value_begin(), Op->getNumValues(),
+ Ops.begin(), Ops.size());
+
+ SDValue Intr = DAG.getNode(Opc, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+ Ops.begin(), Ops.size());
+ SDValue OutLo = DAG.getCopyFromReg(Intr.getValue(0), DL, Mips::LO, MVT::i32,
+ Intr.getValue(1));
+ SDValue OutHi = DAG.getCopyFromReg(OutLo.getValue(1), DL, Mips::HI, MVT::i32,
+ OutLo.getValue(2));
+ SDValue Out = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, OutLo, OutHi);
+
+ if (!HasChainIn)
+ return Out;
+
+ SDValue Vals[] = { Out, OutHi.getValue(1) };
+ return DAG.getMergeValues(Vals, 2, DL);
+}
+
+SDValue MipsTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (cast<ConstantSDNode>(Op->getOperand(0))->getZExtValue()) {
+ default:
+ return SDValue();
+ case Intrinsic::mips_shilo:
+ return LowerDSPIntr(Op, DAG, MipsISD::SHILO, true, true);
+ case Intrinsic::mips_dpau_h_qbl:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBL, true, true);
+ case Intrinsic::mips_dpau_h_qbr:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAU_H_QBR, true, true);
+ case Intrinsic::mips_dpsu_h_qbl:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBL, true, true);
+ case Intrinsic::mips_dpsu_h_qbr:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSU_H_QBR, true, true);
+ case Intrinsic::mips_dpa_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPA_W_PH, true, true);
+ case Intrinsic::mips_dps_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPS_W_PH, true, true);
+ case Intrinsic::mips_dpax_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAX_W_PH, true, true);
+ case Intrinsic::mips_dpsx_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSX_W_PH, true, true);
+ case Intrinsic::mips_mulsa_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::MULSA_W_PH, true, true);
+ case Intrinsic::mips_mult:
+ return LowerDSPIntr(Op, DAG, MipsISD::MULT, false, true);
+ case Intrinsic::mips_multu:
+ return LowerDSPIntr(Op, DAG, MipsISD::MULTU, false, true);
+ case Intrinsic::mips_madd:
+ return LowerDSPIntr(Op, DAG, MipsISD::MADD_DSP, true, true);
+ case Intrinsic::mips_maddu:
+ return LowerDSPIntr(Op, DAG, MipsISD::MADDU_DSP, true, true);
+ case Intrinsic::mips_msub:
+ return LowerDSPIntr(Op, DAG, MipsISD::MSUB_DSP, true, true);
+ case Intrinsic::mips_msubu:
+ return LowerDSPIntr(Op, DAG, MipsISD::MSUBU_DSP, true, true);
+ }
+}
+
+SDValue MipsTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ switch (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue()) {
+ default:
+ return SDValue();
+ case Intrinsic::mips_extp:
+ return LowerDSPIntr(Op, DAG, MipsISD::EXTP, true, false);
+ case Intrinsic::mips_extpdp:
+ return LowerDSPIntr(Op, DAG, MipsISD::EXTPDP, true, false);
+ case Intrinsic::mips_extr_w:
+ return LowerDSPIntr(Op, DAG, MipsISD::EXTR_W, true, false);
+ case Intrinsic::mips_extr_r_w:
+ return LowerDSPIntr(Op, DAG, MipsISD::EXTR_R_W, true, false);
+ case Intrinsic::mips_extr_rs_w:
+ return LowerDSPIntr(Op, DAG, MipsISD::EXTR_RS_W, true, false);
+ case Intrinsic::mips_extr_s_h:
+ return LowerDSPIntr(Op, DAG, MipsISD::EXTR_S_H, true, false);
+ case Intrinsic::mips_mthlip:
+ return LowerDSPIntr(Op, DAG, MipsISD::MTHLIP, true, true);
+ case Intrinsic::mips_mulsaq_s_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::MULSAQ_S_W_PH, true, true);
+ case Intrinsic::mips_maq_s_w_phl:
+ return LowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHL, true, true);
+ case Intrinsic::mips_maq_s_w_phr:
+ return LowerDSPIntr(Op, DAG, MipsISD::MAQ_S_W_PHR, true, true);
+ case Intrinsic::mips_maq_sa_w_phl:
+ return LowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHL, true, true);
+ case Intrinsic::mips_maq_sa_w_phr:
+ return LowerDSPIntr(Op, DAG, MipsISD::MAQ_SA_W_PHR, true, true);
+ case Intrinsic::mips_dpaq_s_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAQ_S_W_PH, true, true);
+ case Intrinsic::mips_dpsq_s_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSQ_S_W_PH, true, true);
+ case Intrinsic::mips_dpaq_sa_l_w:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAQ_SA_L_W, true, true);
+ case Intrinsic::mips_dpsq_sa_l_w:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSQ_SA_L_W, true, true);
+ case Intrinsic::mips_dpaqx_s_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAQX_S_W_PH, true, true);
+ case Intrinsic::mips_dpaqx_sa_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPAQX_SA_W_PH, true, true);
+ case Intrinsic::mips_dpsqx_s_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSQX_S_W_PH, true, true);
+ case Intrinsic::mips_dpsqx_sa_w_ph:
+ return LowerDSPIntr(Op, DAG, MipsISD::DPSQX_SA_W_PH, true, true);
+ }
+}
+
+SDValue MipsTargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+ if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
+ || cast<ConstantSDNode>
+ (Op->getOperand(0).getOperand(0))->getZExtValue() != 0
+ || Op->getOperand(1).getOpcode() != ISD::FRAME_TO_ARGS_OFFSET)
+ return SDValue();
+
+ // The pattern
+ // (add (frameaddr 0), (frame_to_args_offset))
+ // results from lowering the llvm.eh.dwarf.cfa intrinsic. Transform it to
+ // (add FrameObject, 0)
+ // where FrameObject is a fixed StackObject with offset 0 which points to
+ // the old stack pointer.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ EVT ValTy = Op->getValueType(0);
+ int FI = MFI->CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
+ SDValue InArgsAddr = DAG.getFrameIndex(FI, ValTy);
+ return DAG.getNode(ISD::ADD, Op->getDebugLoc(), ValTy, InArgsAddr,
+ DAG.getConstant(0, ValTy));
+}
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -2259,16 +2602,9 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
Mips::D6, Mips::D7
};
- // ByVal Args
- if (ArgFlags.isByVal()) {
- State.HandleByVal(ValNo, ValVT, LocVT, LocInfo,
- 1 /*MinSize*/, 4 /*MinAlign*/, ArgFlags);
- unsigned NextReg = (State.getNextStackOffset() + 3) / 4;
- for (unsigned r = State.getFirstUnallocated(IntRegs, IntRegsSize);
- r < std::min(IntRegsSize, NextReg); ++r)
- State.AllocateReg(IntRegs[r]);
- return false;
- }
+ // Do not process byval args here.
+ if (ArgFlags.isByVal())
+ return true;
// Promote i8 and i16
if (LocVT == MVT::i8 || LocVT == MVT::i16) {
@@ -2323,279 +2659,72 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT,
} else
llvm_unreachable("Cannot handle this ValVT.");
- unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
- unsigned Offset = State.AllocateStack(SizeInBytes, OrigAlign);
-
- if (!Reg)
+ if (!Reg) {
+ unsigned Offset = State.AllocateStack(ValVT.getSizeInBits() >> 3,
+ OrigAlign);
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
- else
+ } else
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- return false; // CC must always match
-}
-
-static const uint16_t Mips64IntRegs[8] =
- {Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
- Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64};
-static const uint16_t Mips64DPRegs[8] =
- {Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
- Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64};
-
-static bool CC_Mips64Byval(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- unsigned Align = std::max(ArgFlags.getByValAlign(), (unsigned)8);
- unsigned Size = (ArgFlags.getByValSize() + 7) / 8 * 8;
- unsigned FirstIdx = State.getFirstUnallocated(Mips64IntRegs, 8);
-
- assert(Align <= 16 && "Cannot handle alignments larger than 16.");
-
- // If byval is 16-byte aligned, the first arg register must be even.
- if ((Align == 16) && (FirstIdx % 2)) {
- State.AllocateReg(Mips64IntRegs[FirstIdx], Mips64DPRegs[FirstIdx]);
- ++FirstIdx;
- }
-
- // Mark the registers allocated.
- for (unsigned I = FirstIdx; Size && (I < 8); Size -= 8, ++I)
- State.AllocateReg(Mips64IntRegs[I], Mips64DPRegs[I]);
-
- // Allocate space on caller's stack.
- unsigned Offset = State.AllocateStack(Size, Align);
-
- if (FirstIdx < 8)
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Mips64IntRegs[FirstIdx],
- LocVT, LocInfo));
- else
- State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
-
- return true;
+ return false;
}
#include "MipsGenCallingConv.inc"
-static void
-AnalyzeMips64CallOperands(CCState &CCInfo,
- const SmallVectorImpl<ISD::OutputArg> &Outs) {
- unsigned NumOps = Outs.size();
- for (unsigned i = 0; i != NumOps; ++i) {
- MVT ArgVT = Outs[i].VT;
- ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- bool R;
-
- if (Outs[i].IsFixed)
- R = CC_MipsN(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
- else
- R = CC_MipsN_VarArg(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
-
- if (R) {
-#ifndef NDEBUG
- dbgs() << "Call operand #" << i << " has unhandled type "
- << EVT(ArgVT).getEVTString();
-#endif
- llvm_unreachable(0);
- }
- }
-}
-
//===----------------------------------------------------------------------===//
// Call Calling Convention Implementation
//===----------------------------------------------------------------------===//
static const unsigned O32IntRegsSize = 4;
-static const uint16_t O32IntRegs[] = {
- Mips::A0, Mips::A1, Mips::A2, Mips::A3
-};
-
// Return next O32 integer argument register.
static unsigned getNextIntArgReg(unsigned Reg) {
assert((Reg == Mips::A0) || (Reg == Mips::A2));
return (Reg == Mips::A0) ? Mips::A1 : Mips::A3;
}
-// Write ByVal Arg to arg registers and stack.
-static void
-WriteByValArg(SDValue Chain, DebugLoc dl,
- SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass,
- SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
- MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
- const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
- MVT PtrType, bool isLittle) {
- unsigned LocMemOffset = VA.getLocMemOffset();
- unsigned Offset = 0;
- uint32_t RemainingSize = Flags.getByValSize();
- unsigned ByValAlign = Flags.getByValAlign();
-
- // Copy the first 4 words of byval arg to registers A0 - A3.
- // FIXME: Use a stricter alignment if it enables better optimization in passes
- // run later.
- for (; RemainingSize >= 4 && LocMemOffset < 4 * 4;
- Offset += 4, RemainingSize -= 4, LocMemOffset += 4) {
- SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
- DAG.getConstant(Offset, MVT::i32));
- SDValue LoadVal = DAG.getLoad(MVT::i32, dl, Chain, LoadPtr,
- MachinePointerInfo(), false, false, false,
- std::min(ByValAlign, (unsigned )4));
- MemOpChains.push_back(LoadVal.getValue(1));
- unsigned DstReg = O32IntRegs[LocMemOffset / 4];
- RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
- }
-
- if (RemainingSize == 0)
- return;
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization.
+bool MipsTargetLowering::
+IsEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+ unsigned NextStackOffset,
+ const MipsFunctionInfo& FI) const {
+ if (!EnableMipsTailCalls)
+ return false;
- // If there still is a register available for argument passing, write the
- // remaining part of the structure to it using subword loads and shifts.
- if (LocMemOffset < 4 * 4) {
- assert(RemainingSize <= 3 && RemainingSize >= 1 &&
- "There must be one to three bytes remaining.");
- unsigned LoadSize = (RemainingSize == 3 ? 2 : RemainingSize);
- SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
- DAG.getConstant(Offset, MVT::i32));
- unsigned Alignment = std::min(ByValAlign, (unsigned )4);
- SDValue LoadVal = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
- LoadPtr, MachinePointerInfo(),
- MVT::getIntegerVT(LoadSize * 8), false,
- false, Alignment);
- MemOpChains.push_back(LoadVal.getValue(1));
-
- // If target is big endian, shift it to the most significant half-word or
- // byte.
- if (!isLittle)
- LoadVal = DAG.getNode(ISD::SHL, dl, MVT::i32, LoadVal,
- DAG.getConstant(32 - LoadSize * 8, MVT::i32));
-
- Offset += LoadSize;
- RemainingSize -= LoadSize;
-
- // Read second subword if necessary.
- if (RemainingSize != 0) {
- assert(RemainingSize == 1 && "There must be one byte remaining.");
- LoadPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
- DAG.getConstant(Offset, MVT::i32));
- unsigned Alignment = std::min(ByValAlign, (unsigned )2);
- SDValue Subword = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
- LoadPtr, MachinePointerInfo(),
- MVT::i8, false, false, Alignment);
- MemOpChains.push_back(Subword.getValue(1));
- // Insert the loaded byte to LoadVal.
- // FIXME: Use INS if supported by target.
- unsigned ShiftAmt = isLittle ? 16 : 8;
- SDValue Shift = DAG.getNode(ISD::SHL, dl, MVT::i32, Subword,
- DAG.getConstant(ShiftAmt, MVT::i32));
- LoadVal = DAG.getNode(ISD::OR, dl, MVT::i32, LoadVal, Shift);
- }
+ // No tail call optimization for mips16.
+ if (Subtarget->inMips16Mode())
+ return false;
- unsigned DstReg = O32IntRegs[LocMemOffset / 4];
- RegsToPass.push_back(std::make_pair(DstReg, LoadVal));
- return;
- }
+ // Return false if either the callee or caller has a byval argument.
+ if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
+ return false;
- // Copy remaining part of byval arg using memcpy.
- SDValue Src = DAG.getNode(ISD::ADD, dl, MVT::i32, Arg,
- DAG.getConstant(Offset, MVT::i32));
- SDValue Dst = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr,
- DAG.getIntPtrConstant(LocMemOffset));
- Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
- DAG.getConstant(RemainingSize, MVT::i32),
- std::min(ByValAlign, (unsigned)4),
- /*isVolatile=*/false, /*AlwaysInline=*/false,
- MachinePointerInfo(0), MachinePointerInfo(0));
- MemOpChains.push_back(Chain);
+ // Return true if the callee's argument area is no larger than the
+ // caller's.
+ return NextStackOffset <= FI.getIncomingArgSize();
}
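
The eligibility rules above reduce to a simple predicate; a minimal sketch in plain C++ (names and parameters are illustrative, not part of the patch):

#include <cassert>

// Mirrors IsEligibleForTailCallOptimization: tail calls must be enabled,
// mips16 is excluded, byval arguments on either side disqualify the call,
// and the callee's outgoing argument area must fit in the caller's
// incoming one.
static bool eligibleForTailCall(bool EnableTailCalls, bool InMips16Mode,
                                bool CalleeHasByVal, bool CallerHasByVal,
                                unsigned NextStackOffset,
                                unsigned CallerIncomingArgSize) {
  if (!EnableTailCalls || InMips16Mode)
    return false;
  if (CalleeHasByVal || CallerHasByVal)
    return false;
  return NextStackOffset <= CallerIncomingArgSize;
}

int main() {
  // An O32 caller with the default 16-byte argument area can tail-call a
  // callee that also needs at most 16 bytes of outgoing arguments.
  assert(eligibleForTailCall(true, false, false, false, 16, 16));
  assert(!eligibleForTailCall(true, false, false, false, 24, 16));
  return 0;
}
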
-// Copy Mips64 byVal arg to registers and stack.
-void static
-PassByValArg64(SDValue Chain, DebugLoc dl,
- SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass,
- SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
- MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
- const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
- EVT PtrTy, bool isLittle) {
- unsigned ByValSize = Flags.getByValSize();
- unsigned Alignment = std::min(Flags.getByValAlign(), (unsigned)8);
- bool IsRegLoc = VA.isRegLoc();
- unsigned Offset = 0; // Offset in # of bytes from the beginning of struct.
- unsigned LocMemOffset = 0;
- unsigned MemCpySize = ByValSize;
-
- if (!IsRegLoc)
- LocMemOffset = VA.getLocMemOffset();
- else {
- const uint16_t *Reg = std::find(Mips64IntRegs, Mips64IntRegs + 8,
- VA.getLocReg());
- const uint16_t *RegEnd = Mips64IntRegs + 8;
-
- // Copy double words to registers.
- for (; (Reg != RegEnd) && (ByValSize >= Offset + 8); ++Reg, Offset += 8) {
- SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, PtrTy, Arg,
- DAG.getConstant(Offset, PtrTy));
- SDValue LoadVal = DAG.getLoad(MVT::i64, dl, Chain, LoadPtr,
- MachinePointerInfo(), false, false, false,
- Alignment);
- MemOpChains.push_back(LoadVal.getValue(1));
- RegsToPass.push_back(std::make_pair(*Reg, LoadVal));
- }
-
- // Return if the struct has been fully copied.
- if (!(MemCpySize = ByValSize - Offset))
- return;
-
- // If there is an argument register available, copy the remainder of the
- // byval argument with sub-doubleword loads and shifts.
- if (Reg != RegEnd) {
- assert((ByValSize < Offset + 8) &&
- "Size of the remainder should be smaller than 8-byte.");
- SDValue Val;
- for (unsigned LoadSize = 4; Offset < ByValSize; LoadSize /= 2) {
- unsigned RemSize = ByValSize - Offset;
-
- if (RemSize < LoadSize)
- continue;
-
- SDValue LoadPtr = DAG.getNode(ISD::ADD, dl, PtrTy, Arg,
- DAG.getConstant(Offset, PtrTy));
- SDValue LoadVal =
- DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i64, Chain, LoadPtr,
- MachinePointerInfo(), MVT::getIntegerVT(LoadSize * 8),
- false, false, Alignment);
- MemOpChains.push_back(LoadVal.getValue(1));
-
- // Offset in number of bits from double word boundary.
- unsigned OffsetDW = (Offset % 8) * 8;
- unsigned Shamt = isLittle ? OffsetDW : 64 - (OffsetDW + LoadSize * 8);
- SDValue Shift = DAG.getNode(ISD::SHL, dl, MVT::i64, LoadVal,
- DAG.getConstant(Shamt, MVT::i32));
-
- Val = Val.getNode() ? DAG.getNode(ISD::OR, dl, MVT::i64, Val, Shift) :
- Shift;
- Offset += LoadSize;
- Alignment = std::min(Alignment, LoadSize);
- }
-
- RegsToPass.push_back(std::make_pair(*Reg, Val));
- return;
- }
+SDValue
+MipsTargetLowering::passArgOnStack(SDValue StackPtr, unsigned Offset,
+ SDValue Chain, SDValue Arg, DebugLoc DL,
+ bool IsTailCall, SelectionDAG &DAG) const {
+ if (!IsTailCall) {
+ SDValue PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr,
+ DAG.getIntPtrConstant(Offset));
+ return DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo(), false,
+ false, 0);
}
- assert(MemCpySize && "MemCpySize must not be zero.");
-
- // Copy remainder of byval arg to it with memcpy.
- SDValue Src = DAG.getNode(ISD::ADD, dl, PtrTy, Arg,
- DAG.getConstant(Offset, PtrTy));
- SDValue Dst = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr,
- DAG.getIntPtrConstant(LocMemOffset));
- Chain = DAG.getMemcpy(Chain, dl, Dst, Src,
- DAG.getConstant(MemCpySize, PtrTy), Alignment,
- /*isVolatile=*/false, /*AlwaysInline=*/false,
- MachinePointerInfo(0), MachinePointerInfo(0));
- MemOpChains.push_back(Chain);
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ int FI = MFI->CreateFixedObject(Arg.getValueSizeInBits() / 8, Offset, false);
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+ return DAG.getStore(Chain, DL, Arg, FIN, MachinePointerInfo(),
+ /*isVolatile=*/ true, false, 0);
}
/// LowerCall - function arguments are copied from virtual regs to
/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-/// TODO: isTailCall.
SDValue
MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
@@ -2610,56 +2739,49 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
- // MIPs target does not yet support tail call optimization.
- isTailCall = false;
-
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo *MFI = MF.getFrameInfo();
const TargetFrameLowering *TFL = MF.getTarget().getFrameLowering();
bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
+ MipsCC MipsCCInfo(CallConv, isVarArg, IsO32, CCInfo);
- if (CallConv == CallingConv::Fast)
- CCInfo.AnalyzeCallOperands(Outs, CC_Mips_FastCC);
- else if (IsO32)
- CCInfo.AnalyzeCallOperands(Outs, CC_MipsO32);
- else if (HasMips64)
- AnalyzeMips64CallOperands(CCInfo, Outs);
- else
- CCInfo.AnalyzeCallOperands(Outs, CC_Mips);
+ MipsCCInfo.analyzeCallOperands(Outs);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NextStackOffset = CCInfo.getNextStackOffset();
- unsigned StackAlignment = TFL->getStackAlignment();
- NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
- // Update size of the maximum argument space.
- // For O32, a minimum of four words (16 bytes) of argument space is
- // allocated.
- if (IsO32 && (CallConv != CallingConv::Fast))
- NextStackOffset = std::max(NextStackOffset, (unsigned)16);
+ // Check if it's really possible to do a tail call.
+ if (isTailCall)
+ isTailCall =
+ IsEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
+ *MF.getInfo<MipsFunctionInfo>());
+
+ if (isTailCall)
+ ++NumTailCalls;
// Chain is the output chain of the last Load/Store or CopyToReg node.
// ByValChain is the output chain of the last Memcpy node created for copying
// byval arguments to the stack.
+ unsigned StackAlignment = TFL->getStackAlignment();
+ NextStackOffset = RoundUpToAlignment(NextStackOffset, StackAlignment);
SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, true);
- Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
+
+ if (!isTailCall)
+ Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal);
SDValue StackPtr = DAG.getCopyFromReg(Chain, dl,
IsN64 ? Mips::SP_64 : Mips::SP,
getPointerTy());
- if (MipsFI->getMaxCallFrameSize() < NextStackOffset)
- MipsFI->setMaxCallFrameSize(NextStackOffset);
-
// With EABI it is possible to have 16 args in registers.
SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
+ MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -2672,14 +2794,12 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Flags.isByVal()) {
assert(Flags.getByValSize() &&
"ByVal args of size 0 should have been ignored by front-end.");
- if (IsO32)
- WriteByValArg(Chain, dl, RegsToPass, MemOpChains, StackPtr,
- MFI, DAG, Arg, VA, Flags, getPointerTy(),
- Subtarget->isLittle());
- else
- PassByValArg64(Chain, dl, RegsToPass, MemOpChains, StackPtr,
- MFI, DAG, Arg, VA, Flags, getPointerTy(),
- Subtarget->isLittle());
+ assert(ByValArg != MipsCCInfo.byval_end());
+ assert(!isTailCall &&
+ "Do not tail-call optimize if there is a byval argument.");
+ passByValArg(Chain, dl, RegsToPass, MemOpChains, StackPtr, MFI, DAG, Arg,
+ MipsCCInfo, *ByValArg, Flags, Subtarget->isLittle());
+ ++ByValArg;
continue;
}
@@ -2729,10 +2849,8 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// emit ISD::STORE which stores the
// parameter value to a stack location
- SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
- DAG.getIntPtrConstant(VA.getLocMemOffset()));
- MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
- MachinePointerInfo(), false, false, 0));
+ MemOpChains.push_back(passArgOnStack(StackPtr, VA.getLocMemOffset(),
+ Chain, Arg, dl, isTailCall, DAG));
}
// Transform all store nodes into one single node because all store
@@ -2861,6 +2979,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (InFlag.getNode())
Ops.push_back(InFlag);
+ if (isTailCall)
+ return DAG.getNode(MipsISD::TailCall, dl, MVT::Other, &Ops[0], Ops.size());
+
Chain = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
@@ -2904,70 +3025,6 @@ MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
//===----------------------------------------------------------------------===//
// Formal Arguments Calling Convention Implementation
//===----------------------------------------------------------------------===//
-static void ReadByValArg(MachineFunction &MF, SDValue Chain, DebugLoc dl,
- std::vector<SDValue> &OutChains,
- SelectionDAG &DAG, unsigned NumWords, SDValue FIN,
- const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
- const Argument *FuncArg) {
- unsigned LocMem = VA.getLocMemOffset();
- unsigned FirstWord = LocMem / 4;
-
- // copy register A0 - A3 to frame object
- for (unsigned i = 0; i < NumWords; ++i) {
- unsigned CurWord = FirstWord + i;
- if (CurWord >= O32IntRegsSize)
- break;
-
- unsigned SrcReg = O32IntRegs[CurWord];
- unsigned Reg = AddLiveIn(MF, SrcReg, &Mips::CPURegsRegClass);
- SDValue StorePtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIN,
- DAG.getConstant(i * 4, MVT::i32));
- SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(Reg, MVT::i32),
- StorePtr, MachinePointerInfo(FuncArg, i * 4),
- false, false, 0);
- OutChains.push_back(Store);
- }
-}
-
-// Create frame object on stack and copy registers used for byval passing to it.
-static unsigned
-CopyMips64ByValRegs(MachineFunction &MF, SDValue Chain, DebugLoc dl,
- std::vector<SDValue> &OutChains, SelectionDAG &DAG,
- const CCValAssign &VA, const ISD::ArgFlagsTy &Flags,
- MachineFrameInfo *MFI, bool IsRegLoc,
- SmallVectorImpl<SDValue> &InVals, MipsFunctionInfo *MipsFI,
- EVT PtrTy, const Argument *FuncArg) {
- const uint16_t *Reg = Mips64IntRegs + 8;
- int FOOffset; // Frame object offset from virtual frame pointer.
-
- if (IsRegLoc) {
- Reg = std::find(Mips64IntRegs, Mips64IntRegs + 8, VA.getLocReg());
- FOOffset = (Reg - Mips64IntRegs) * 8 - 8 * 8;
- }
- else
- FOOffset = VA.getLocMemOffset();
-
- // Create frame object.
- unsigned NumRegs = (Flags.getByValSize() + 7) / 8;
- unsigned LastFI = MFI->CreateFixedObject(NumRegs * 8, FOOffset, true);
- SDValue FIN = DAG.getFrameIndex(LastFI, PtrTy);
- InVals.push_back(FIN);
-
- // Copy arg registers.
- for (unsigned I = 0; (Reg != Mips64IntRegs + 8) && (I < NumRegs);
- ++Reg, ++I) {
- unsigned VReg = AddLiveIn(MF, *Reg, &Mips::CPU64RegsRegClass);
- SDValue StorePtr = DAG.getNode(ISD::ADD, dl, PtrTy, FIN,
- DAG.getConstant(I * 8, PtrTy));
- SDValue Store = DAG.getStore(Chain, dl, DAG.getRegister(VReg, MVT::i64),
- StorePtr, MachinePointerInfo(FuncArg, I * 8),
- false, false, 0);
- OutChains.push_back(Store);
- }
-
- return LastFI;
-}
-
/// LowerFormalArguments - transform physical registers into virtual registers
/// and generate load operations for arguments placed on the stack.
SDValue
@@ -2991,20 +3048,21 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
getTargetMachine(), ArgLocs, *DAG.getContext());
+ MipsCC MipsCCInfo(CallConv, isVarArg, IsO32, CCInfo);
- if (CallConv == CallingConv::Fast)
- CCInfo.AnalyzeFormalArguments(Ins, CC_Mips_FastCC);
- else if (IsO32)
- CCInfo.AnalyzeFormalArguments(Ins, CC_MipsO32);
- else
- CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
+ MipsCCInfo.analyzeFormalArguments(Ins);
+ MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
+ MipsCCInfo.hasByValArg());
Function::const_arg_iterator FuncArg =
DAG.getMachineFunction().getFunction()->arg_begin();
- int LastFI = 0;// MipsFI->LastInArgFI is 0 at the entry of this function.
+ unsigned CurArgIdx = 0;
+ MipsCC::byval_iterator ByValArg = MipsCCInfo.byval_begin();
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++FuncArg) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
+ std::advance(FuncArg, Ins[i].OrigArgIndex - CurArgIdx);
+ CurArgIdx = Ins[i].OrigArgIndex;
EVT ValVT = VA.getValVT();
ISD::ArgFlagsTy Flags = Ins[i].Flags;
bool IsRegLoc = VA.isRegLoc();
@@ -3012,18 +3070,10 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
if (Flags.isByVal()) {
assert(Flags.getByValSize() &&
"ByVal args of size 0 should have been ignored by front-end.");
- if (IsO32) {
- unsigned NumWords = (Flags.getByValSize() + 3) / 4;
- LastFI = MFI->CreateFixedObject(NumWords * 4, VA.getLocMemOffset(),
- true);
- SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy());
- InVals.push_back(FIN);
- ReadByValArg(MF, Chain, dl, OutChains, DAG, NumWords, FIN, VA, Flags,
- &*FuncArg);
- } else // N32/64
- LastFI = CopyMips64ByValRegs(MF, Chain, dl, OutChains, DAG, VA, Flags,
- MFI, IsRegLoc, InVals, MipsFI,
- getPointerTy(), &*FuncArg);
+ assert(ByValArg != MipsCCInfo.byval_end());
+ copyByValRegs(Chain, dl, OutChains, DAG, Flags, InVals, &*FuncArg,
+ MipsCCInfo, *ByValArg);
+ ++ByValArg;
continue;
}
@@ -3085,13 +3135,13 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
assert(VA.isMemLoc());
// The stack pointer offset is relative to the caller stack frame.
- LastFI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+ int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
VA.getLocMemOffset(), true);
// Create load nodes to retrieve arguments from the stack
- SDValue FIN = DAG.getFrameIndex(LastFI, getPointerTy());
+ SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
InVals.push_back(DAG.getLoad(ValVT, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(LastFI),
+ MachinePointerInfo::getFixedStack(FI),
false, false, false, 0));
}
}
@@ -3102,55 +3152,16 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
unsigned Reg = MipsFI->getSRetReturnReg();
if (!Reg) {
- Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
+ Reg = MF.getRegInfo().
+ createVirtualRegister(getRegClassFor(IsN64 ? MVT::i64 : MVT::i32));
MipsFI->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
}
- if (isVarArg) {
- unsigned NumOfRegs = IsO32 ? 4 : 8;
- const uint16_t *ArgRegs = IsO32 ? O32IntRegs : Mips64IntRegs;
- unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumOfRegs);
- int FirstRegSlotOffset = IsO32 ? 0 : -64 ; // offset of $a0's slot.
- const TargetRegisterClass *RC = IsO32 ?
- (const TargetRegisterClass*)&Mips::CPURegsRegClass :
- (const TargetRegisterClass*)&Mips::CPU64RegsRegClass;
- unsigned RegSize = RC->getSize();
- int RegSlotOffset = FirstRegSlotOffset + Idx * RegSize;
-
- // Offset of the first variable argument from stack pointer.
- int FirstVaArgOffset;
-
- if (IsO32 || (Idx == NumOfRegs)) {
- FirstVaArgOffset =
- (CCInfo.getNextStackOffset() + RegSize - 1) / RegSize * RegSize;
- } else
- FirstVaArgOffset = RegSlotOffset;
-
- // Record the frame index of the first variable argument
- // which is a value necessary to VASTART.
- LastFI = MFI->CreateFixedObject(RegSize, FirstVaArgOffset, true);
- MipsFI->setVarArgsFrameIndex(LastFI);
-
- // Copy the integer registers that have not been used for argument passing
- // to the argument register save area. For O32, the save area is allocated
- // in the caller's stack frame, while for N32/64, it is allocated in the
- // callee's stack frame.
- for (int StackOffset = RegSlotOffset;
- Idx < NumOfRegs; ++Idx, StackOffset += RegSize) {
- unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegs[Idx], RC);
- SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
- MVT::getIntegerVT(RegSize * 8));
- LastFI = MFI->CreateFixedObject(RegSize, StackOffset, true);
- SDValue PtrOff = DAG.getFrameIndex(LastFI, getPointerTy());
- OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff,
- MachinePointerInfo(), false, false, 0));
- }
- }
-
- MipsFI->setLastInArgFI(LastFI);
+ if (isVarArg)
+ writeVarArgRegs(OutChains, MipsCCInfo, Chain, dl, DAG);
// All stores are grouped in one node to allow the matching between
// the size of Ins and InVals. This only happens when on varg functions
@@ -3167,6 +3178,17 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
+bool
+MipsTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+ MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
+ RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_Mips);
+}
+
SDValue
MipsTargetLowering::LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -3219,9 +3241,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+ unsigned V0 = IsN64 ? Mips::V0_64 : Mips::V0;
- Chain = DAG.getCopyToReg(Chain, dl, Mips::V0, Val, Flag);
+ Chain = DAG.getCopyToReg(Chain, dl, V0, Val, Flag);
Flag = Chain.getValue(1);
+ MF.getRegInfo().addLiveOut(V0);
}
// Return on Mips is always a "jr $ra"
@@ -3325,8 +3349,11 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
case 'y': // Same as 'r'. Exists for compatibility.
case 'r':
- if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8)
+ if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+ if (Subtarget->inMips16Mode())
+ return std::make_pair(0U, &Mips::CPU16RegsRegClass);
return std::make_pair(0U, &Mips::CPURegsRegClass);
+ }
if (VT == MVT::i64 && !HasMips64)
return std::make_pair(0U, &Mips::CPURegsRegClass);
if (VT == MVT::i64 && HasMips64)
@@ -3485,3 +3512,316 @@ unsigned MipsTargetLowering::getJumpTableEncoding() const {
return TargetLowering::getJumpTableEncoding();
}
+
+MipsTargetLowering::MipsCC::MipsCC(CallingConv::ID CallConv, bool IsVarArg,
+ bool IsO32, CCState &Info) : CCInfo(Info) {
+ UseRegsForByval = true;
+
+ if (IsO32) {
+ RegSize = 4;
+ NumIntArgRegs = array_lengthof(O32IntRegs);
+ ReservedArgArea = 16;
+ IntArgRegs = ShadowRegs = O32IntRegs;
+ FixedFn = VarFn = CC_MipsO32;
+ } else {
+ RegSize = 8;
+ NumIntArgRegs = array_lengthof(Mips64IntRegs);
+ ReservedArgArea = 0;
+ IntArgRegs = Mips64IntRegs;
+ ShadowRegs = Mips64DPRegs;
+ FixedFn = CC_MipsN;
+ VarFn = CC_MipsN_VarArg;
+ }
+
+ if (CallConv == CallingConv::Fast) {
+ assert(!IsVarArg);
+ UseRegsForByval = false;
+ ReservedArgArea = 0;
+ FixedFn = VarFn = CC_Mips_FastCC;
+ }
+
+ // Pre-allocate reserved argument area.
+ CCInfo.AllocateStack(ReservedArgArea, 1);
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args) {
+ unsigned NumOpnds = Args.size();
+
+ for (unsigned I = 0; I != NumOpnds; ++I) {
+ MVT ArgVT = Args[I].VT;
+ ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+ bool R;
+
+ if (ArgFlags.isByVal()) {
+ handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
+ continue;
+ }
+
+ if (Args[I].IsFixed)
+ R = FixedFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+ else
+ R = VarFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+
+ if (R) {
+#ifndef NDEBUG
+ dbgs() << "Call operand #" << I << " has unhandled type "
+ << EVT(ArgVT).getEVTString();
+#endif
+ llvm_unreachable(0);
+ }
+ }
+}
+
+void MipsTargetLowering::MipsCC::
+analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args) {
+ unsigned NumArgs = Args.size();
+
+ for (unsigned I = 0; I != NumArgs; ++I) {
+ MVT ArgVT = Args[I].VT;
+ ISD::ArgFlagsTy ArgFlags = Args[I].Flags;
+
+ if (ArgFlags.isByVal()) {
+ handleByValArg(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags);
+ continue;
+ }
+
+ if (!FixedFn(I, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo))
+ continue;
+
+#ifndef NDEBUG
+ dbgs() << "Formal Arg #" << I << " has unhandled type "
+ << EVT(ArgVT).getEVTString();
+#endif
+ llvm_unreachable(0);
+ }
+}
+
+void
+MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
+ MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags) {
+ assert(ArgFlags.getByValSize() && "Byval argument's size shouldn't be 0.");
+
+ struct ByValArgInfo ByVal;
+ unsigned ByValSize = RoundUpToAlignment(ArgFlags.getByValSize(), RegSize);
+ unsigned Align = std::min(std::max(ArgFlags.getByValAlign(), RegSize),
+ RegSize * 2);
+
+ if (UseRegsForByval)
+ allocateRegs(ByVal, ByValSize, Align);
+
+ // Allocate space on caller's stack.
+ ByVal.Address = CCInfo.AllocateStack(ByValSize - RegSize * ByVal.NumRegs,
+ Align);
+ CCInfo.addLoc(CCValAssign::getMem(ValNo, ValVT, ByVal.Address, LocVT,
+ LocInfo));
+ ByValArgs.push_back(ByVal);
+}
+
+void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal,
+ unsigned ByValSize,
+ unsigned Align) {
+ assert(!(ByValSize % RegSize) && !(Align % RegSize) &&
+ "Byval argument's size and alignment should be a multiple of"
+ "RegSize.");
+
+ ByVal.FirstIdx = CCInfo.getFirstUnallocated(IntArgRegs, NumIntArgRegs);
+
+ // If Align > RegSize, the first arg register must be even.
+ if ((Align > RegSize) && (ByVal.FirstIdx % 2)) {
+ CCInfo.AllocateReg(IntArgRegs[ByVal.FirstIdx], ShadowRegs[ByVal.FirstIdx]);
+ ++ByVal.FirstIdx;
+ }
+
+ // Mark the registers allocated.
+ for (unsigned I = ByVal.FirstIdx; ByValSize && (I < NumIntArgRegs);
+ ByValSize -= RegSize, ++I, ++ByVal.NumRegs)
+ CCInfo.AllocateReg(IntArgRegs[I], ShadowRegs[I]);
+}
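
A worked example of the bookkeeping in handleByValArg and allocateRegs, done with plain integer arithmetic (a sketch only; the constants assume the N32/64 configuration set up in the MipsCC constructor, RegSize = 8 and eight integer argument registers, with one register already taken by an earlier argument):

#include <algorithm>
#include <cassert>

int main() {
  const unsigned RegSize = 8, NumIntArgRegs = 8;
  unsigned ByValSize = 20, ByValAlign = 16;
  unsigned FirstUnallocated = 1;           // $a1 is the next free register.

  // handleByValArg: round the size up and clamp the alignment.
  unsigned Size  = (ByValSize + RegSize - 1) / RegSize * RegSize;      // 24
  unsigned Align = std::min(std::max(ByValAlign, RegSize), RegSize * 2);

  // allocateRegs: a 16-byte-aligned byval must start in an even register.
  unsigned FirstIdx = FirstUnallocated;
  if (Align > RegSize && (FirstIdx % 2))
    ++FirstIdx;                            // skip $a1, start at $a2

  unsigned NumRegs = 0;
  for (unsigned I = FirstIdx; Size && I < NumIntArgRegs;
       Size -= RegSize, ++I)
    ++NumRegs;                             // $a2, $a3, $t0

  unsigned StackBytes = Size;              // what AllocateStack still gets

  assert(FirstIdx == 2 && NumRegs == 3 && StackBytes == 0);
  return 0;
}
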
+
+void MipsTargetLowering::
+copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains,
+ SelectionDAG &DAG, const ISD::ArgFlagsTy &Flags,
+ SmallVectorImpl<SDValue> &InVals, const Argument *FuncArg,
+ const MipsCC &CC, const ByValArgInfo &ByVal) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ unsigned RegAreaSize = ByVal.NumRegs * CC.regSize();
+ unsigned FrameObjSize = std::max(Flags.getByValSize(), RegAreaSize);
+ int FrameObjOffset;
+
+ if (RegAreaSize)
+ FrameObjOffset = (int)CC.reservedArgArea() -
+ (int)((CC.numIntArgRegs() - ByVal.FirstIdx) * CC.regSize());
+ else
+ FrameObjOffset = ByVal.Address;
+
+ // Create frame object.
+ EVT PtrTy = getPointerTy();
+ int FI = MFI->CreateFixedObject(FrameObjSize, FrameObjOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+ InVals.push_back(FIN);
+
+ if (!ByVal.NumRegs)
+ return;
+
+ // Copy arg registers.
+ EVT RegTy = MVT::getIntegerVT(CC.regSize() * 8);
+ const TargetRegisterClass *RC = getRegClassFor(RegTy);
+
+ for (unsigned I = 0; I < ByVal.NumRegs; ++I) {
+ unsigned ArgReg = CC.intArgRegs()[ByVal.FirstIdx + I];
+ unsigned VReg = AddLiveIn(MF, ArgReg, RC);
+ unsigned Offset = I * CC.regSize();
+ SDValue StorePtr = DAG.getNode(ISD::ADD, DL, PtrTy, FIN,
+ DAG.getConstant(Offset, PtrTy));
+ SDValue Store = DAG.getStore(Chain, DL, DAG.getRegister(VReg, RegTy),
+ StorePtr, MachinePointerInfo(FuncArg, Offset),
+ false, false, 0);
+ OutChains.push_back(Store);
+ }
+}
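
The frame-object offset computed above places the register-passed part of the byval argument at its home slot inside the caller-reserved argument area; a small arithmetic sketch (plain C++, O32 numbers assumed: four argument registers, RegSize = 4, ReservedArgArea = 16):

#include <cassert>

int main() {
  const int ReservedArgArea = 16, NumIntArgRegs = 4, RegSize = 4;
  int FirstIdx = 1;   // byval starts in $a1; $a0 went to an earlier argument

  int FrameObjOffset =
      ReservedArgArea - (NumIntArgRegs - FirstIdx) * RegSize;    // 16 - 12

  // The fixed object starts at the home slot of $a1 inside the 16-byte
  // argument area that the caller reserves.
  assert(FrameObjOffset == 4);
  return 0;
}
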
+
+// Copy byval arg to registers and stack.
+void MipsTargetLowering::
+passByValArg(SDValue Chain, DebugLoc DL,
+ SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass,
+ SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
+ MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+ const MipsCC &CC, const ByValArgInfo &ByVal,
+ const ISD::ArgFlagsTy &Flags, bool isLittle) const {
+ unsigned ByValSize = Flags.getByValSize();
+ unsigned Offset = 0; // Offset in # of bytes from the beginning of struct.
+ unsigned RegSize = CC.regSize();
+ unsigned Alignment = std::min(Flags.getByValAlign(), RegSize);
+ EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSize * 8);
+
+ if (ByVal.NumRegs) {
+ const uint16_t *ArgRegs = CC.intArgRegs();
+ bool LeftoverBytes = (ByVal.NumRegs * RegSize > ByValSize);
+ unsigned I = 0;
+
+ // Copy words to registers.
+ for (; I < ByVal.NumRegs - LeftoverBytes; ++I, Offset += RegSize) {
+ SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+ DAG.getConstant(Offset, PtrTy));
+ SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
+ MachinePointerInfo(), false, false, false,
+ Alignment);
+ MemOpChains.push_back(LoadVal.getValue(1));
+ unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+ RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
+ }
+
+ // Return if the struct has been fully copied.
+ if (ByValSize == Offset)
+ return;
+
+ // Copy the remainder of the byval argument with sub-word loads and shifts.
+ if (LeftoverBytes) {
+ assert((ByValSize > Offset) && (ByValSize < Offset + RegSize) &&
+ "Size of the remainder should be smaller than RegSize.");
+ SDValue Val;
+
+ for (unsigned LoadSize = RegSize / 2, TotalSizeLoaded = 0;
+ Offset < ByValSize; LoadSize /= 2) {
+ unsigned RemSize = ByValSize - Offset;
+
+ if (RemSize < LoadSize)
+ continue;
+
+ // Load subword.
+ SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+ DAG.getConstant(Offset, PtrTy));
+ SDValue LoadVal =
+ DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr,
+ MachinePointerInfo(), MVT::getIntegerVT(LoadSize * 8),
+ false, false, Alignment);
+ MemOpChains.push_back(LoadVal.getValue(1));
+
+ // Shift the loaded value.
+ unsigned Shamt;
+
+ if (isLittle)
+ Shamt = TotalSizeLoaded * 8;
+ else
+ Shamt = (RegSize - (TotalSizeLoaded + LoadSize)) * 8;
+
+ SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
+ DAG.getConstant(Shamt, MVT::i32));
+
+ if (Val.getNode())
+ Val = DAG.getNode(ISD::OR, DL, RegTy, Val, Shift);
+ else
+ Val = Shift;
+
+ Offset += LoadSize;
+ TotalSizeLoaded += LoadSize;
+ Alignment = std::min(Alignment, LoadSize);
+ }
+
+ unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
+ RegsToPass.push_back(std::make_pair(ArgReg, Val));
+ return;
+ }
+ }
+
+ // Copy the remainder of the byval argument to the stack with memcpy.
+ unsigned MemCpySize = ByValSize - Offset;
+ SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
+ DAG.getConstant(Offset, PtrTy));
+ SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
+ DAG.getIntPtrConstant(ByVal.Address));
+ Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
+ DAG.getConstant(MemCpySize, PtrTy), Alignment,
+ /*isVolatile=*/false, /*AlwaysInline=*/false,
+ MachinePointerInfo(0), MachinePointerInfo(0));
+ MemOpChains.push_back(Chain);
+}
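
The sub-word path above assembles the leftover bytes of a struct into one register with zero-extending loads, shifts and ORs. A standalone sketch of that shift arithmetic for a 3-byte remainder (plain C++, illustrative names, O32 assumed with RegSize = 4):

#include <cassert>
#include <cstdint>

// Each piece is loaded with a zero-extending load, shifted into place and
// OR'd together, exactly like the SHL/OR nodes built in passByValArg.
static uint32_t packRemainder(const uint8_t *Bytes, unsigned RemSize,
                              bool IsLittle) {
  const unsigned RegSize = 4;
  uint32_t Val = 0;
  unsigned Offset = 0, TotalSizeLoaded = 0;

  for (unsigned LoadSize = RegSize / 2; Offset < RemSize; LoadSize /= 2) {
    if (RemSize - Offset < LoadSize)
      continue;

    // Zero-extending load of LoadSize bytes, respecting endianness.
    uint32_t Piece = 0;
    for (unsigned I = 0; I < LoadSize; ++I)
      Piece |= uint32_t(Bytes[Offset + I])
               << (IsLittle ? 8 * I : 8 * (LoadSize - 1 - I));

    unsigned Shamt = IsLittle ? TotalSizeLoaded * 8
                              : (RegSize - (TotalSizeLoaded + LoadSize)) * 8;
    Val |= Piece << Shamt;
    Offset += LoadSize;
    TotalSizeLoaded += LoadSize;
  }
  return Val;
}

int main() {
  const uint8_t Bytes[3] = {0x11, 0x22, 0x33};
  assert(packRemainder(Bytes, 3, true)  == 0x00332211u); // little endian
  assert(packRemainder(Bytes, 3, false) == 0x11223300u); // big endian
  return 0;
}
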
+
+void
+MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
+ const MipsCC &CC, SDValue Chain,
+ DebugLoc DL, SelectionDAG &DAG) const {
+ unsigned NumRegs = CC.numIntArgRegs();
+ const uint16_t *ArgRegs = CC.intArgRegs();
+ const CCState &CCInfo = CC.getCCInfo();
+ unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs);
+ unsigned RegSize = CC.regSize();
+ EVT RegTy = MVT::getIntegerVT(RegSize * 8);
+ const TargetRegisterClass *RC = getRegClassFor(RegTy);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+ // Offset of the first variable argument from stack pointer.
+ int VaArgOffset;
+
+ if (NumRegs == Idx)
+ VaArgOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), RegSize);
+ else
+ VaArgOffset =
+ (int)CC.reservedArgArea() - (int)(RegSize * (NumRegs - Idx));
+
+ // Record the frame index of the first variable argument,
+ // which is needed by VASTART.
+ int FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+ MipsFI->setVarArgsFrameIndex(FI);
+
+ // Copy the integer registers that have not been used for argument passing
+ // to the argument register save area. For O32, the save area is allocated
+ // in the caller's stack frame, while for N32/64, it is allocated in the
+ // callee's stack frame.
+ for (unsigned I = Idx; I < NumRegs; ++I, VaArgOffset += RegSize) {
+ unsigned Reg = AddLiveIn(MF, ArgRegs[I], RC);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegTy);
+ FI = MFI->CreateFixedObject(RegSize, VaArgOffset, true);
+ SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+ SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
+ MachinePointerInfo(), false, false, 0);
+ cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(0);
+ OutChains.push_back(Store);
+ }
+}
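
The va_start offset computed above depends only on the ABI constants; a short worked example in plain C++ (O32 assumed: four argument registers, RegSize = 4, a 16-byte reserved area, and two fixed i32 arguments occupying $a0 and $a1):

#include <cassert>

int main() {
  const int NumRegs = 4, RegSize = 4, ReservedArgArea = 16;
  int Idx = 2;                        // first unallocated argument register
  int NextStackOffset = 16;           // O32 always reserves 16 bytes

  int VaArgOffset;
  if (Idx == NumRegs)
    VaArgOffset = (NextStackOffset + RegSize - 1) / RegSize * RegSize;
  else
    VaArgOffset = ReservedArgArea - RegSize * (NumRegs - Idx);

  // The first variadic argument is homed in $a2's slot, 8 bytes into the
  // caller-reserved argument area.
  assert(VaArgOffset == 8);
  return 0;
}
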
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 95ea8fa885fb..43f97e89a7bf 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -17,6 +17,7 @@
#include "Mips.h"
#include "MipsSubtarget.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
@@ -29,6 +30,9 @@ namespace llvm {
// Jump and link (call)
JmpLink,
+ // Tail call
+ TailCall,
+
// Get the Higher 16 bits from a 32-bit immediate
// No relation with Mips Hi register
Hi,
@@ -81,6 +85,47 @@ namespace llvm {
Ext,
Ins,
+ // EXTR.W intrinsic nodes.
+ EXTP,
+ EXTPDP,
+ EXTR_S_H,
+ EXTR_W,
+ EXTR_R_W,
+ EXTR_RS_W,
+ SHILO,
+ MTHLIP,
+
+ // DPA.W intrinsic nodes.
+ MULSAQ_S_W_PH,
+ MAQ_S_W_PHL,
+ MAQ_S_W_PHR,
+ MAQ_SA_W_PHL,
+ MAQ_SA_W_PHR,
+ DPAU_H_QBL,
+ DPAU_H_QBR,
+ DPSU_H_QBL,
+ DPSU_H_QBR,
+ DPAQ_S_W_PH,
+ DPSQ_S_W_PH,
+ DPAQ_SA_L_W,
+ DPSQ_SA_L_W,
+ DPA_W_PH,
+ DPS_W_PH,
+ DPAQX_S_W_PH,
+ DPAQX_SA_W_PH,
+ DPAX_W_PH,
+ DPSX_W_PH,
+ DPSQX_S_W_PH,
+ DPSQX_SA_W_PH,
+ MULSA_W_PH,
+
+ MULT,
+ MULTU,
+ MADD_DSP,
+ MADDU_DSP,
+ MSUB_DSP,
+ MSUBU_DSP,
+
// Load/Store Left/Right nodes.
LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
LWR,
@@ -96,6 +141,7 @@ namespace llvm {
//===--------------------------------------------------------------------===//
// TargetLowering Implementation
//===--------------------------------------------------------------------===//
+ class MipsFunctionInfo;
class MipsTargetLowering : public TargetLowering {
public:
@@ -105,9 +151,19 @@ namespace llvm {
virtual bool allowsUnalignedMemoryAccesses (EVT VT) const;
+ virtual void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+
/// LowerOperation - Provide custom lowering hooks for some operations.
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+ /// ReplaceNodeResults - Replace the results of a node with an illegal result
+ /// type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+ SelectionDAG &DAG) const;
+
/// getTargetNodeName - This method returns the name of a target specific
// DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const;
@@ -117,6 +173,69 @@ namespace llvm {
virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
private:
+
+ /// ByValArgInfo - Byval argument information.
+ struct ByValArgInfo {
+ unsigned FirstIdx; // Index of the first register used.
+ unsigned NumRegs; // Number of registers used for this argument.
+ unsigned Address; // Offset of the stack area used to pass this argument.
+
+ ByValArgInfo() : FirstIdx(0), NumRegs(0), Address(0) {}
+ };
+
+ /// MipsCC - This class provides methods used to analyze formal and call
+ /// arguments and inquire about calling convention information.
+ class MipsCC {
+ public:
+ MipsCC(CallingConv::ID CallConv, bool IsVarArg, bool IsO32,
+ CCState &Info);
+
+ void analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs);
+ void analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins);
+ void handleByValArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags);
+
+ const CCState &getCCInfo() const { return CCInfo; }
+
+ /// hasByValArg - Returns true if function has byval arguments.
+ bool hasByValArg() const { return !ByValArgs.empty(); }
+
+ /// useRegsForByval - Returns true if the calling convention allows the
+ /// use of registers to pass byval arguments.
+ bool useRegsForByval() const { return UseRegsForByval; }
+
+ /// regSize - Size (in number of bytes) of integer registers.
+ unsigned regSize() const { return RegSize; }
+
+ /// numIntArgRegs - Number of integer registers available for calls.
+ unsigned numIntArgRegs() const { return NumIntArgRegs; }
+
+ /// reservedArgArea - The size of the area the caller reserves for
+ /// register arguments. This is 16 bytes if the ABI is O32.
+ unsigned reservedArgArea() const { return ReservedArgArea; }
+
+ /// intArgRegs - Pointer to array of integer registers.
+ const uint16_t *intArgRegs() const { return IntArgRegs; }
+
+ typedef SmallVector<ByValArgInfo, 2>::const_iterator byval_iterator;
+ byval_iterator byval_begin() const { return ByValArgs.begin(); }
+ byval_iterator byval_end() const { return ByValArgs.end(); }
+
+ private:
+ void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize,
+ unsigned Align);
+
+ CCState &CCInfo;
+ bool UseRegsForByval;
+ unsigned RegSize;
+ unsigned NumIntArgRegs;
+ unsigned ReservedArgArea;
+ const uint16_t *IntArgRegs, *ShadowRegs;
+ SmallVector<ByValArgInfo, 2> ByValArgs;
+ llvm::CCAssignFn *FixedFn, *VarFn;
+ };
+
// Subtarget Info
const MipsSubtarget *Subtarget;
@@ -151,6 +270,39 @@ namespace llvm {
bool IsSRA) const;
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
+
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+ /// for tail call optimization.
+ bool IsEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+ unsigned NextStackOffset,
+ const MipsFunctionInfo& FI) const;
+
+ /// copyByValRegs - Copy argument registers which were used to pass a byval
+ /// argument to the stack. Create a stack frame object for the byval
+ /// argument.
+ void copyByValRegs(SDValue Chain, DebugLoc DL,
+ std::vector<SDValue> &OutChains, SelectionDAG &DAG,
+ const ISD::ArgFlagsTy &Flags,
+ SmallVectorImpl<SDValue> &InVals,
+ const Argument *FuncArg,
+ const MipsCC &CC, const ByValArgInfo &ByVal) const;
+
+ /// passByValArg - Pass a byval argument in registers or on stack.
+ void passByValArg(SDValue Chain, DebugLoc DL,
+ SmallVector<std::pair<unsigned, SDValue>, 16> &RegsToPass,
+ SmallVector<SDValue, 8> &MemOpChains, SDValue StackPtr,
+ MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
+ const MipsCC &CC, const ByValArgInfo &ByVal,
+ const ISD::ArgFlagsTy &Flags, bool isLittle) const;
+
+ /// writeVarArgRegs - Write variable function arguments passed in registers
+ /// to the stack. Also create a stack frame object for the first variable
+ /// argument.
+ void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
+ SDValue Chain, DebugLoc DL, SelectionDAG &DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
@@ -159,10 +311,20 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
+ SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
+ SDValue Arg, DebugLoc DL, bool IsTailCall,
+ SelectionDAG &DAG) const;
+
virtual SDValue
LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const;
+ virtual bool
+ CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ LLVMContext &Context) const;
+
virtual SDValue
LowerReturn(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
@@ -209,6 +371,8 @@ namespace llvm {
virtual unsigned getJumpTableEncoding() const;
+ MachineBasicBlock *EmitBPOSGE32(MachineInstr *MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
unsigned Size, unsigned BinOpcode, bool Nand = false) const;
MachineBasicBlock *EmitAtomicBinaryPartword(MachineInstr *MI,
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 3e78c4564310..33ee02068946 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -90,20 +90,20 @@ def fpimm0neg : PatLeaf<(fpimm), [{
let DecoderMethod = "DecodeFMem" in {
class FPLoad<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>:
FMem<op, (outs RC:$ft), (ins MemOpnd:$addr),
- !strconcat(opstr, "\t$ft, $addr"), [(set RC:$ft, (load_a addr:$addr))],
+ !strconcat(opstr, "\t$ft, $addr"), [(set RC:$ft, (load addr:$addr))],
IILoad>;
// FP store.
class FPStore<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>:
FMem<op, (outs), (ins RC:$ft, MemOpnd:$addr),
- !strconcat(opstr, "\t$ft, $addr"), [(store_a RC:$ft, addr:$addr)],
+ !strconcat(opstr, "\t$ft, $addr"), [(store RC:$ft, addr:$addr)],
IIStore>;
}
// FP indexed load.
class FPIdxLoad<bits<6> funct, string opstr, RegisterClass DRC,
RegisterClass PRC, SDPatternOperator FOp = null_frag>:
FFMemIdx<funct, (outs DRC:$fd), (ins PRC:$base, PRC:$index),
- !strconcat(opstr, "\t$fd, $index($base)"),
+ !strconcat(opstr, "\t$fd, ${index}(${base})"),
[(set DRC:$fd, (FOp (add PRC:$base, PRC:$index)))]> {
let fs = 0;
}
@@ -112,7 +112,7 @@ class FPIdxLoad<bits<6> funct, string opstr, RegisterClass DRC,
class FPIdxStore<bits<6> funct, string opstr, RegisterClass DRC,
RegisterClass PRC, SDPatternOperator FOp= null_frag>:
FFMemIdx<funct, (outs), (ins DRC:$fs, PRC:$base, PRC:$index),
- !strconcat(opstr, "\t$fs, $index($base)"),
+ !strconcat(opstr, "\t$fs, ${index}(${base})"),
[(FOp DRC:$fs, (add PRC:$base, PRC:$index))]> {
let fd = 0;
}
@@ -182,20 +182,21 @@ defm CEIL_W : FFR1_W_M<0xe, "ceil">;
defm CEIL_L : FFR1_L_M<0xa, "ceil">;
defm FLOOR_W : FFR1_W_M<0xf, "floor">;
defm FLOOR_L : FFR1_L_M<0xb, "floor">;
-defm CVT_W : FFR1_W_M<0x24, "cvt">;
+defm CVT_W : FFR1_W_M<0x24, "cvt">, NeverHasSideEffects;
//defm CVT_L : FFR1_L_M<0x25, "cvt">;
-def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>;
-def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>;
-def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>;
+def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>, NeverHasSideEffects;
+def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>, NeverHasSideEffects;
+def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>, NeverHasSideEffects;
-let Predicates = [NotFP64bit, HasStandardEncoding] in {
+let Predicates = [NotFP64bit, HasStandardEncoding], neverHasSideEffects = 1 in {
def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>;
def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>;
def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>;
}
-let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64" in {
+let Predicates = [IsFP64bit, HasStandardEncoding], DecoderNamespace = "Mips64",
+ neverHasSideEffects = 1 in {
def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>;
def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>;
def CVT_D64_W : FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>;
@@ -282,26 +283,26 @@ let Predicates = [NotN64, NotMips64, HasStandardEncoding] in {
// Indexed loads and stores.
let Predicates = [HasMips32r2Or64, HasStandardEncoding] in {
- def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load_a>;
- def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store_a>;
+ def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load>;
+ def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store>;
}
let Predicates = [HasMips32r2, NotMips64, HasStandardEncoding] in {
- def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load_a>;
- def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store_a>;
+ def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load>;
+ def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store>;
}
let Predicates = [HasMips64, NotN64, HasStandardEncoding], DecoderNamespace="Mips64" in {
- def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load_a>;
- def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store_a>;
+ def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load>;
+ def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store>;
}
// n64
let Predicates = [IsN64, HasStandardEncoding], isCodeGenOnly=1 in {
- def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load_a>;
- def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load_a>;
- def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store_a>;
- def SDXC164_P8 : FPIdxStore<0x9, "sdxc1", FGR64, CPU64Regs, store_a>;
+ def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load>;
+ def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load>;
+ def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store>;
+ def SDXC164_P8 : FPIdxStore<0x9, "sdxc1", FGR64, CPU64Regs, store>;
}
// Load/store doubleword indexed unaligned.
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index 8feb853572ff..1ecbdc2474b3 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -92,6 +92,14 @@ class PseudoSE<dag outs, dag ins, string asmstr, list<dag> pattern>:
let Predicates = [HasStandardEncoding];
}
+// Pseudo-instructions for alternate assembly syntax (never used by codegen).
+// These are aliases that require C++ handling to convert to the target
+// instruction, while InstAliases can be handled directly by tblgen.
+class MipsAsmPseudoInst<dag outs, dag ins, string asmstr>:
+ MipsInst<outs, ins, asmstr, [], IIPseudo, Pseudo> {
+ let isPseudo = 1;
+ let Pattern = [];
+}
//===----------------------------------------------------------------------===//
// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
//===----------------------------------------------------------------------===//
@@ -163,6 +171,27 @@ class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
let Inst{25-0} = addr;
}
+//===----------------------------------------------------------------------===//
+// MFC instruction class in Mips : <|op|mf|rt|rd|0000000|sel|>
+//===----------------------------------------------------------------------===//
+class MFC3OP<bits<6> op, bits<5> _mfmt, dag outs, dag ins, string asmstr>:
+ InstSE<outs, ins, asmstr, [], NoItinerary, FrmFR>
+{
+ bits<5> mfmt;
+ bits<5> rt;
+ bits<5> rd;
+ bits<3> sel;
+
+ let Opcode = op;
+ let mfmt = _mfmt;
+
+ let Inst{25-21} = mfmt;
+ let Inst{20-16} = rt;
+ let Inst{15-11} = rd;
+ let Inst{10-3} = 0;
+ let Inst{2-0} = sel;
+}
+
//===----------------------------------------------------------------------===//
//
// FLOATING POINT INSTRUCTION FORMATS
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 50e3eb534e88..ca80d43f36f1 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -95,6 +95,7 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const
{
+
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
// Skip all the debug instructions.
@@ -177,9 +178,14 @@ void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB,
const MCInstrDesc &MCID = get(Opc);
MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
- for (unsigned i = 1; i < Cond.size(); ++i)
- MIB.addReg(Cond[i].getReg());
-
+ for (unsigned i = 1; i < Cond.size(); ++i) {
+ if (Cond[i].isReg())
+ MIB.addReg(Cond[i].getReg());
+ else if (Cond[i].isImm())
+ MIB.addImm(Cond[i].getImm());
+ else
+ assert(false && "Cannot copy operand");
+ }
MIB.addMBB(TBB);
}
@@ -262,46 +268,3 @@ unsigned MipsInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
}
}
}
-
-unsigned
-llvm::Mips::loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
- MachineBasicBlock& MBB,
- MachineBasicBlock::iterator II, DebugLoc DL,
- bool LastInstrIsADDiu,
- MipsAnalyzeImmediate::Inst *LastInst) {
- MipsAnalyzeImmediate AnalyzeImm;
- unsigned Size = IsN64 ? 64 : 32;
- unsigned LUi = IsN64 ? Mips::LUi64 : Mips::LUi;
- unsigned ZEROReg = IsN64 ? Mips::ZERO_64 : Mips::ZERO;
- unsigned ATReg = IsN64 ? Mips::AT_64 : Mips::AT;
-
- const MipsAnalyzeImmediate::InstSeq &Seq =
- AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
- MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
-
- if (LastInst && (Seq.size() == 1)) {
- *LastInst = *Inst;
- return 0;
- }
-
- // The first instruction can be a LUi, which is different from other
- // instructions (ADDiu, ORI and SLL) in that it does not have a register
- // operand.
- if (Inst->Opc == LUi)
- BuildMI(MBB, II, DL, TII.get(LUi), ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
- else
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ZEROReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- // Build the remaining instructions in Seq. Skip the last instruction if
- // LastInst is not 0.
- for (++Inst; Inst != Seq.end() - !!LastInst; ++Inst)
- BuildMI(MBB, II, DL, TII.get(Inst->Opc), ATReg).addReg(ATReg)
- .addImm(SignExtend64<16>(Inst->ImmOpnd));
-
- if (LastInst)
- *LastInst = *Inst;
-
- return Seq.size() - !!LastInst;
-}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 7d5625906248..aca2bc7ae98d 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -88,18 +88,6 @@ private:
const SmallVectorImpl<MachineOperand>& Cond) const;
};
-namespace Mips {
- /// Emit a series of instructions to load an immediate. All instructions
- /// except for the last one are emitted. The function returns the number of
- /// MachineInstrs generated. The opcode-immediate pair of the last
- /// instruction is returned in LastInst, if it is not 0.
- unsigned
- loadImmediate(int64_t Imm, bool IsN64, const TargetInstrInfo &TII,
- MachineBasicBlock& MBB, MachineBasicBlock::iterator II,
- DebugLoc DL, bool LastInstrIsADDiu,
- MipsAnalyzeImmediate::Inst *LastInst);
-}
-
/// Create MipsInstrInfo objects.
const MipsInstrInfo *createMips16InstrInfo(MipsTargetMachine &TM);
const MipsInstrInfo *createMipsSEInstrInfo(MipsTargetMachine &TM);
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index da15d4de22e8..f16b5f9ee7ff 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -52,6 +52,10 @@ def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink,
[SDNPHasChain, SDNPOutGlue, SDNPOptInGlue,
SDNPVariadic]>;
+// Tail call
+def MipsTailCall : SDNode<"MipsISD::TailCall", SDT_MipsJmpLink,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
// Hi and Lo nodes are used to handle global addresses. Used on
// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol
// static model. (nothing to do with Mips Registers Hi and Lo)
@@ -74,9 +78,10 @@ def MipsRet : SDNode<"MipsISD::Ret", SDTNone, [SDNPHasChain, SDNPOptInGlue]>;
// These are target-independent nodes, but have target-specific formats.
def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
- [SDNPHasChain, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>;
def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
- [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPOptInGlue, SDNPOutGlue]>;
// MAdd*/MSub* nodes
def MipsMAdd : SDNode<"MipsISD::MAdd", SDT_MipsMAddMSub,
@@ -110,7 +115,7 @@ def MipsWrapper : SDNode<"MipsISD::Wrapper", SDTIntBinOp>;
def MipsDynAlloc : SDNode<"MipsISD::DynAlloc", SDT_MipsDynAlloc,
[SDNPHasChain, SDNPInGlue]>;
-def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain]>;
+def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
def MipsExt : SDNode<"MipsISD::Ext", SDT_Ext>;
def MipsIns : SDNode<"MipsISD::Ins", SDT_Ins>;
@@ -174,6 +179,35 @@ class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
let Predicates = [HasStandardEncoding];
}
+class IsBranch {
+ bit isBranch = 1;
+}
+
+class IsReturn {
+ bit isReturn = 1;
+}
+
+class IsCall {
+ bit isCall = 1;
+}
+
+class IsTailCall {
+ bit isCall = 1;
+ bit isTerminator = 1;
+ bit isReturn = 1;
+ bit isBarrier = 1;
+ bit hasExtraSrcRegAllocReq = 1;
+ bit isCodeGenOnly = 1;
+}
+
+class IsAsCheapAsAMove {
+ bit isAsCheapAsAMove = 1;
+}
+
+class NeverHasSideEffects {
+ bit neverHasSideEffects = 1;
+}
+
//===----------------------------------------------------------------------===//
// Instruction format superclass
//===----------------------------------------------------------------------===//
@@ -208,17 +242,24 @@ def uimm16 : Operand<i32> {
let PrintMethod = "printUnsignedImm";
}
+def MipsMemAsmOperand : AsmOperandClass {
+ let Name = "Mem";
+ let ParserMethod = "parseMemOperand";
+}
+
// Address operand
def mem : Operand<i32> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops CPURegs, simm16);
let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemAsmOperand;
}
def mem64 : Operand<i64> {
let PrintMethod = "printMemOperand";
let MIOperandInfo = (ops CPU64Regs, simm16_64);
let EncoderMethod = "getMemEncoding";
+ let ParserMatchClass = MipsMemAsmOperand;
}
def mem_ea : Operand<i32> {
@@ -285,57 +326,25 @@ def addr :
ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>;
//===----------------------------------------------------------------------===//
-// Pattern fragment for load/store
+// Instructions specific format
//===----------------------------------------------------------------------===//
-class UnalignedLoad<PatFrag Node> :
- PatFrag<(ops node:$ptr), (Node node:$ptr), [{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- return LD->getMemoryVT().getSizeInBits()/8 > LD->getAlignment();
-}]>;
-class AlignedLoad<PatFrag Node> :
- PatFrag<(ops node:$ptr), (Node node:$ptr), [{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- return LD->getMemoryVT().getSizeInBits()/8 <= LD->getAlignment();
-}]>;
-
-class UnalignedStore<PatFrag Node> :
- PatFrag<(ops node:$val, node:$ptr), (Node node:$val, node:$ptr), [{
- StoreSDNode *SD = cast<StoreSDNode>(N);
- return SD->getMemoryVT().getSizeInBits()/8 > SD->getAlignment();
-}]>;
+/// Move Control Registers From/To CPU Registers
+def MFC0_3OP : MFC3OP<0x10, 0, (outs CPURegs:$rt),
+ (ins CPURegs:$rd, uimm16:$sel),"mfc0\t$rt, $rd, $sel">;
+def : InstAlias<"mfc0 $rt, $rd", (MFC0_3OP CPURegs:$rt, CPURegs:$rd, 0)>;
-class AlignedStore<PatFrag Node> :
- PatFrag<(ops node:$val, node:$ptr), (Node node:$val, node:$ptr), [{
- StoreSDNode *SD = cast<StoreSDNode>(N);
- return SD->getMemoryVT().getSizeInBits()/8 <= SD->getAlignment();
-}]>;
+def MTC0_3OP : MFC3OP<0x10, 4, (outs CPURegs:$rd, uimm16:$sel),
+ (ins CPURegs:$rt),"mtc0\t$rt, $rd, $sel">;
+def : InstAlias<"mtc0 $rt, $rd", (MTC0_3OP CPURegs:$rd, 0, CPURegs:$rt)>;
-// Load/Store PatFrags.
-def sextloadi16_a : AlignedLoad<sextloadi16>;
-def zextloadi16_a : AlignedLoad<zextloadi16>;
-def extloadi16_a : AlignedLoad<extloadi16>;
-def load_a : AlignedLoad<load>;
-def sextloadi32_a : AlignedLoad<sextloadi32>;
-def zextloadi32_a : AlignedLoad<zextloadi32>;
-def extloadi32_a : AlignedLoad<extloadi32>;
-def truncstorei16_a : AlignedStore<truncstorei16>;
-def store_a : AlignedStore<store>;
-def truncstorei32_a : AlignedStore<truncstorei32>;
-def sextloadi16_u : UnalignedLoad<sextloadi16>;
-def zextloadi16_u : UnalignedLoad<zextloadi16>;
-def extloadi16_u : UnalignedLoad<extloadi16>;
-def load_u : UnalignedLoad<load>;
-def sextloadi32_u : UnalignedLoad<sextloadi32>;
-def zextloadi32_u : UnalignedLoad<zextloadi32>;
-def extloadi32_u : UnalignedLoad<extloadi32>;
-def truncstorei16_u : UnalignedStore<truncstorei16>;
-def store_u : UnalignedStore<store>;
-def truncstorei32_u : UnalignedStore<truncstorei32>;
+def MFC2_3OP : MFC3OP<0x12, 0, (outs CPURegs:$rt),
+ (ins CPURegs:$rd, uimm16:$sel),"mfc2\t$rt, $rd, $sel">;
+def : InstAlias<"mfc2 $rt, $rd", (MFC2_3OP CPURegs:$rt, CPURegs:$rd, 0)>;
-//===----------------------------------------------------------------------===//
-// Instructions specific format
-//===----------------------------------------------------------------------===//
+def MTC2_3OP : MFC3OP<0x12, 4, (outs CPURegs:$rd, uimm16:$sel),
+ (ins CPURegs:$rt),"mtc2\t$rt, $rd, $sel">;
+def : InstAlias<"mtc2 $rt, $rd", (MTC2_3OP CPURegs:$rd, 0, CPURegs:$rt)>;
// Arithmetic and logical instructions with 3 register operands.
class ArithLogicR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
@@ -416,7 +425,7 @@ class shift_rotate_reg<bits<6> func, bits<5> isRotate, string instr_asm,
// Load Upper Immediate
class LoadUpper<bits<6> op, string instr_asm, RegisterClass RC, Operand Imm>:
FI<op, (outs RC:$rt), (ins Imm:$imm16),
- !strconcat(instr_asm, "\t$rt, $imm16"), [], IIAlu> {
+ !strconcat(instr_asm, "\t$rt, $imm16"), [], IIAlu>, IsAsCheapAsAMove {
let rs = 0;
let neverHasSideEffects = 1;
let isReMaterializable = 1;
@@ -597,14 +606,13 @@ class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op, Operand Od,
IIAlu>;
// Jump
-class JumpFJ<bits<6> op, string instr_asm>:
- FJ<op, (outs), (ins jmptarget:$target),
- !strconcat(instr_asm, "\t$target"), [(br bb:$target)], IIBranch> {
- let isBranch=1;
+class JumpFJ<bits<6> op, DAGOperand opnd, string instr_asm,
+ SDPatternOperator operator, SDPatternOperator targetoperator>:
+ FJ<op, (outs), (ins opnd:$target), !strconcat(instr_asm, "\t$target"),
+ [(operator targetoperator:$target)], IIBranch> {
let isTerminator=1;
let isBarrier=1;
let hasDelaySlot = 1;
- let Predicates = [RelocStatic, HasStandardEncoding];
let DecoderMethod = "DecodeJumpTarget";
let Defs = [AT];
}
@@ -625,21 +633,21 @@ class UncondBranch<bits<6> op, string instr_asm>:
// Base class for indirect branch and return instruction classes.
let isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
-class JumpFR<RegisterClass RC, list<dag> pattern>:
- FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", pattern, IIBranch> {
+class JumpFR<RegisterClass RC, SDPatternOperator operator = null_frag>:
+ FR<0, 0x8, (outs), (ins RC:$rs), "jr\t$rs", [(operator RC:$rs)], IIBranch> {
let rt = 0;
let rd = 0;
let shamt = 0;
}
// Indirect branch
-class IndirectBranch<RegisterClass RC>: JumpFR<RC, [(brind RC:$rs)]> {
+class IndirectBranch<RegisterClass RC>: JumpFR<RC, brind> {
let isBranch = 1;
let isIndirectBranch = 1;
}
// Return instruction
-class RetBase<RegisterClass RC>: JumpFR<RC, []> {
+class RetBase<RegisterClass RC>: JumpFR<RC> {
let isReturn = 1;
let isCodeGenOnly = 1;
let hasCtrlDep = 1;
@@ -905,12 +913,28 @@ let usesCustomInserter = 1 in {
// Instruction definition
//===----------------------------------------------------------------------===//
+class LoadImm32< string instr_asm, Operand Od, RegisterClass RC> :
+ MipsAsmPseudoInst<(outs RC:$rt), (ins Od:$imm32),
+ !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadImm32Reg : LoadImm32<"li", shamt,CPURegs>;
+
+class LoadAddress<string instr_asm, Operand MemOpnd, RegisterClass RC> :
+ MipsAsmPseudoInst<(outs RC:$rt), (ins MemOpnd:$addr),
+ !strconcat(instr_asm, "\t$rt, $addr")> ;
+def LoadAddr32Reg : LoadAddress<"la", mem, CPURegs>;
+
+class LoadAddressImm<string instr_asm, Operand Od, RegisterClass RC> :
+ MipsAsmPseudoInst<(outs RC:$rt), (ins Od:$imm32),
+ !strconcat(instr_asm, "\t$rt, $imm32")> ;
+def LoadAddr32Imm : LoadAddressImm<"la", shamt,CPURegs>;
+
//===----------------------------------------------------------------------===//
// MipsI Instructions
//===----------------------------------------------------------------------===//
/// Arithmetic Instructions (ALU Immediate)
-def ADDiu : ArithLogicI<0x09, "addiu", add, simm16, immSExt16, CPURegs>;
+def ADDiu : ArithLogicI<0x09, "addiu", add, simm16, immSExt16, CPURegs>,
+ IsAsCheapAsAMove;
def ADDi : ArithOverflowI<0x08, "addi", add, simm16, immSExt16, CPURegs>;
def SLTi : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16, CPURegs>;
def SLTiu : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16, CPURegs>;
@@ -949,19 +973,12 @@ let Predicates = [HasMips32r2, HasStandardEncoding] in {
/// aligned
defm LB : LoadM32<0x20, "lb", sextloadi8>;
defm LBu : LoadM32<0x24, "lbu", zextloadi8>;
-defm LH : LoadM32<0x21, "lh", sextloadi16_a>;
-defm LHu : LoadM32<0x25, "lhu", zextloadi16_a>;
-defm LW : LoadM32<0x23, "lw", load_a>;
+defm LH : LoadM32<0x21, "lh", sextloadi16>;
+defm LHu : LoadM32<0x25, "lhu", zextloadi16>;
+defm LW : LoadM32<0x23, "lw", load>;
defm SB : StoreM32<0x28, "sb", truncstorei8>;
-defm SH : StoreM32<0x29, "sh", truncstorei16_a>;
-defm SW : StoreM32<0x2b, "sw", store_a>;
-
-/// unaligned
-defm ULH : LoadM32<0x21, "ulh", sextloadi16_u, 1>;
-defm ULHu : LoadM32<0x25, "ulhu", zextloadi16_u, 1>;
-defm ULW : LoadM32<0x23, "ulw", load_u, 1>;
-defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>;
-defm USW : StoreM32<0x2b, "usw", store_u, 1>;
+defm SH : StoreM32<0x29, "sh", truncstorei16>;
+defm SW : StoreM32<0x2b, "sw", store>;
/// load/store left/right
defm LWL : LoadLeftRightM32<0x22, "lwl", MipsLWL>;
@@ -996,7 +1013,8 @@ def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>,
}
/// Jump and Branch Instructions
-def J : JumpFJ<0x02, "j">;
+def J : JumpFJ<0x02, jmptarget, "j", br, bb>,
+ Requires<[RelocStatic, HasStandardEncoding]>, IsBranch;
def JR : IndirectBranch<CPURegs>;
def B : UncondBranch<0x04, "b">;
def BEQ : CBranch<0x04, "beq", seteq, CPURegs>;
@@ -1014,6 +1032,8 @@ def JAL : JumpLink<0x03, "jal">;
def JALR : JumpLinkReg<0x00, 0x09, "jalr", CPURegs>;
def BGEZAL : BranchLink<"bgezal", 0x11, CPURegs>;
def BLTZAL : BranchLink<"bltzal", 0x10, CPURegs>;
+def TAILCALL : JumpFJ<0x02, calltarget, "j", MipsTailCall, imm>, IsTailCall;
+def TAILCALL_R : JumpFR<CPURegs, MipsTailCall>, IsTailCall;
def RET : RetBase<CPURegs>;
@@ -1072,6 +1092,26 @@ def EXT : ExtBase<0, "ext", CPURegs>;
def INS : InsBase<4, "ins", CPURegs>;
//===----------------------------------------------------------------------===//
+// Instruction aliases
+//===----------------------------------------------------------------------===//
+def : InstAlias<"move $dst,$src", (ADD CPURegs:$dst,CPURegs:$src,ZERO)>;
+def : InstAlias<"bal $offset", (BGEZAL RA,brtarget:$offset)>;
+def : InstAlias<"addu $rs,$rt,$imm",
+ (ADDiu CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"add $rs,$rt,$imm",
+ (ADDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"and $rs,$rt,$imm",
+ (ANDi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"j $rs", (JR CPURegs:$rs)>;
+def : InstAlias<"not $rt,$rs", (NOR CPURegs:$rt,CPURegs:$rs,ZERO)>;
+def : InstAlias<"neg $rt,$rs", (SUB CPURegs:$rt,ZERO,CPURegs:$rs)>;
+def : InstAlias<"negu $rt,$rs", (SUBu CPURegs:$rt,ZERO,CPURegs:$rs)>;
+def : InstAlias<"slt $rs,$rt,$imm",
+ (SLTi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+def : InstAlias<"xor $rs,$rt,$imm",
+ (XORi CPURegs:$rs,CPURegs:$rt,simm16:$imm)>;
+
+//===----------------------------------------------------------------------===//
// Arbitrary patterns that map to one or more instructions
//===----------------------------------------------------------------------===//
@@ -1103,6 +1143,11 @@ def : MipsPat<(MipsJmpLink (i32 texternalsym:$dst)),
//def : MipsPat<(MipsJmpLink CPURegs:$dst),
// (JALR CPURegs:$dst)>;
+// Tail call
+def : MipsPat<(MipsTailCall (iPTR tglobaladdr:$dst)),
+ (TAILCALL tglobaladdr:$dst)>;
+def : MipsPat<(MipsTailCall (iPTR texternalsym:$dst)),
+ (TAILCALL texternalsym:$dst)>;
// hi/lo relocs
def : MipsPat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
def : MipsPat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>;
@@ -1153,24 +1198,20 @@ def : MipsPat<(not CPURegs:$in),
let Predicates = [NotN64, HasStandardEncoding] in {
def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
- def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>;
- def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>;
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
}
let Predicates = [IsN64, HasStandardEncoding] in {
def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>;
def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>;
- def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>;
- def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>;
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_P8 addr:$src)>;
}
// peepholes
let Predicates = [NotN64, HasStandardEncoding] in {
- def : MipsPat<(store_a (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
- def : MipsPat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>;
+ def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
}
let Predicates = [IsN64, HasStandardEncoding] in {
- def : MipsPat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
- def : MipsPat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>;
+ def : MipsPat<(store (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
}
// brcond patterns
@@ -1265,3 +1306,8 @@ include "MipsCondMov.td"
include "Mips16InstrFormats.td"
include "Mips16InstrInfo.td"
+
+// DSP
+include "MipsDSPInstrFormats.td"
+include "MipsDSPInstrInfo.td"
+
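
Note: the Aligned*/Unaligned* PatFrags deleted above classified a load or store by comparing its access size in bytes against the alignment the node guarantees. A minimal standalone sketch of that predicate (illustrative names, not LLVM API):

    // Sketch of the predicate used by the removed UnalignedLoad/UnalignedStore
    // PatFrags: an access is "unaligned" when its size in bytes exceeds its
    // guaranteed alignment.
    #include <cassert>

    static bool isUnalignedAccess(unsigned SizeInBits, unsigned AlignInBytes) {
      return SizeInBits / 8 > AlignInBytes;
    }

    int main() {
      assert(isUnalignedAccess(32, 2));   // 4-byte load, 2-byte alignment -> was lowered to ULW
      assert(!isUnalignedAccess(16, 2));  // 2-byte load, 2-byte alignment -> plain LH
      return 0;
    }
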
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index f78203f70531..5d9f0cffb749 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -10,6 +10,10 @@
// This pass expands a branch or jump instruction into a long branch if its
// offset is too large to fit into its immediate field.
//
+// FIXME:
+// 1. Fix pc-region jump instructions which cross 256MB segment boundaries.
+// 2. If the program has inline assembly statements whose size cannot be
+// determined accurately, load branch target addresses from the GOT.
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "mips-long-branch"
@@ -48,7 +52,7 @@ namespace {
typedef MachineBasicBlock::reverse_iterator ReverseIter;
struct MBBInfo {
- uint64_t Size;
+ uint64_t Size, Address;
bool HasLongBranch;
MachineInstr *Br;
@@ -61,7 +65,10 @@ namespace {
static char ID;
MipsLongBranch(TargetMachine &tm)
: MachineFunctionPass(ID), TM(tm),
- TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())) {}
+ TII(static_cast<const MipsInstrInfo*>(tm.getInstrInfo())),
+ IsPIC(TM.getRelocationModel() == Reloc::PIC_),
+ ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
+ LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
virtual const char *getPassName() const {
return "Mips Long Branch";
@@ -81,6 +88,9 @@ namespace {
const MipsInstrInfo *TII;
MachineFunction *MF;
SmallVector<MBBInfo, 16> MBBInfos;
+ bool IsPIC;
+ unsigned ABI;
+ unsigned LongBranchSeqSize;
};
char MipsLongBranch::ID = 0;
@@ -230,12 +240,6 @@ void MipsLongBranch::replaceBranch(MachineBasicBlock &MBB, Iter Br,
// Expand branch instructions to long branches.
void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
- I.HasLongBranch = true;
-
- bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
- unsigned ABI = TM.getSubtarget<MipsSubtarget>().getTargetABI();
- bool N64 = ABI == MipsSubtarget::N64;
-
MachineBasicBlock::iterator Pos;
MachineBasicBlock *MBB = I.Br->getParent(), *TgtMBB = getTargetMBB(*I.Br);
DebugLoc DL = I.Br->getDebugLoc();
@@ -248,101 +252,105 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
MBB->addSuccessor(LongBrMBB);
if (IsPIC) {
- // $longbr:
- // addiu $sp, $sp, -regsize * 2
- // sw $ra, 0($sp)
- // bal $baltgt
- // sw $a3, regsize($sp)
- // $baltgt:
- // lui $a3, %hi($baltgt)
- // lui $at, %hi($tgt)
- // addiu $a3, $a3, %lo($baltgt)
- // addiu $at, $at, %lo($tgt)
- // subu $at, $at, $a3
- // addu $at, $ra, $at
- //
- // if n64:
- // lui $a3, %highest($baltgt)
- // lui $ra, %highest($tgt)
- // addiu $a3, $a3, %higher($baltgt)
- // addiu $ra, $ra, %higher($tgt)
- // dsll $a3, $a3, 32
- // dsll $ra, $ra, 32
- // subu $at, $at, $a3
- // addu $at, $at, $ra
- //
- // lw $ra, 0($sp)
- // lw $a3, regsize($sp)
- // jr $at
- // addiu $sp, $sp, regsize * 2
- // $fallthrough:
- //
- MF->getInfo<MipsFunctionInfo>()->setEmitNOAT();
MachineBasicBlock *BalTgtMBB = MF->CreateMachineBasicBlock(BB);
MF->insert(FallThroughMBB, BalTgtMBB);
LongBrMBB->addSuccessor(BalTgtMBB);
BalTgtMBB->addSuccessor(TgtMBB);
- int RegSize = N64 ? 8 : 4;
- unsigned AT = N64 ? Mips::AT_64 : Mips::AT;
- unsigned A3 = N64 ? Mips::A3_64 : Mips::A3;
- unsigned SP = N64 ? Mips::SP_64 : Mips::SP;
- unsigned RA = N64 ? Mips::RA_64 : Mips::RA;
- unsigned Load = N64 ? Mips::LD_P8 : Mips::LW;
- unsigned Store = N64 ? Mips::SD_P8 : Mips::SW;
- unsigned LUi = N64 ? Mips::LUi64 : Mips::LUi;
- unsigned ADDiu = N64 ? Mips::DADDiu : Mips::ADDiu;
- unsigned ADDu = N64 ? Mips::DADDu : Mips::ADDu;
- unsigned SUBu = N64 ? Mips::SUBu : Mips::SUBu;
- unsigned JR = N64 ? Mips::JR64 : Mips::JR;
-
- Pos = LongBrMBB->begin();
-
- BuildMI(*LongBrMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
- .addImm(-RegSize * 2);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(RA).addReg(SP)
- .addImm(0);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
- BuildMI(*LongBrMBB, Pos, DL, TII->get(Store)).addReg(A3).addReg(SP)
- .addImm(RegSize)->setIsInsideBundle();
-
- Pos = BalTgtMBB->begin();
-
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
- .addMBB(BalTgtMBB, MipsII::MO_ABS_HI);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), AT)
- .addMBB(TgtMBB, MipsII::MO_ABS_HI);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
- .addMBB(BalTgtMBB, MipsII::MO_ABS_LO);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), AT).addReg(AT)
- .addMBB(TgtMBB, MipsII::MO_ABS_LO);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(RA).addReg(AT);
-
- if (N64) {
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), A3)
- .addMBB(BalTgtMBB, MipsII::MO_HIGHEST);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(LUi), RA)
- .addMBB(TgtMBB, MipsII::MO_HIGHEST);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), A3).addReg(A3)
- .addMBB(BalTgtMBB, MipsII::MO_HIGHER);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), RA).addReg(RA)
- .addMBB(TgtMBB, MipsII::MO_HIGHER);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), A3).addReg(A3)
- .addImm(32);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DSLL), RA).addReg(RA)
- .addImm(32);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(SUBu), AT).addReg(AT).addReg(A3);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDu), AT).addReg(AT).addReg(RA);
- I.Size += 4 * 8;
+ int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address;
+ int64_t Offset = TgtAddress - (I.Address + I.Size - 20);
+ int64_t Lo = SignExtend64<16>(Offset & 0xffff);
+ int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff);
+
+ if (ABI != MipsSubtarget::N64) {
+ // $longbr:
+ // addiu $sp, $sp, -8
+ // sw $ra, 0($sp)
+ // bal $baltgt
+ // lui $at, %hi($tgt - $baltgt)
+ // $baltgt:
+ // addiu $at, $at, %lo($tgt - $baltgt)
+ // addu $at, $ra, $at
+ // lw $ra, 0($sp)
+ // jr $at
+ // addiu $sp, $sp, 8
+ // $fallthrough:
+ //
+
+ Pos = LongBrMBB->begin();
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(-8);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
+ .addReg(Mips::SP).addImm(0);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi)
+ ->setIsInsideBundle();
+
+ Pos = BalTgtMBB->begin();
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT)
+ .addReg(Mips::AT).addImm(Lo);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
+ .addReg(Mips::RA).addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
+ .addReg(Mips::SP).addImm(0);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR)).addReg(Mips::AT);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::SP)
+ .addReg(Mips::SP).addImm(8)->setIsInsideBundle();
+ } else {
+ // $longbr:
+ // daddiu $sp, $sp, -16
+ // sd $ra, 0($sp)
+ // lui64 $at, %highest($tgt - $baltgt)
+ // daddiu $at, $at, %higher($tgt - $baltgt)
+ // dsll $at, $at, 16
+ // daddiu $at, $at, %hi($tgt - $baltgt)
+ // bal $baltgt
+ // dsll $at, $at, 16
+ // $baltgt:
+ // daddiu $at, $at, %lo($tgt - $baltgt)
+ // daddu $at, $ra, $at
+ // ld $ra, 0($sp)
+ // jr64 $at
+ // daddiu $sp, $sp, 16
+ // $fallthrough:
+ //
+
+ int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff);
+ int64_t Highest =
+ SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff);
+
+ Pos = LongBrMBB->begin();
+
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(-16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
+ .addReg(Mips::SP_64).addImm(0);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64)
+ .addImm(Highest);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(Higher);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(16);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(Hi);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB);
+ BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(16)->setIsInsideBundle();
+
+ Pos = BalTgtMBB->begin();
+
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
+ .addReg(Mips::AT_64).addImm(Lo);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
+ .addReg(Mips::RA_64).addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
+ .addReg(Mips::SP_64).addImm(0);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::JR64)).addReg(Mips::AT_64);
+ BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::SP_64)
+ .addReg(Mips::SP_64).addImm(16)->setIsInsideBundle();
}
-
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), RA).addReg(SP).addImm(0);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(Load), A3).addReg(SP).addImm(RegSize);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(JR)).addReg(AT);
- BuildMI(*BalTgtMBB, Pos, DL, TII->get(ADDiu), SP).addReg(SP)
- .addImm(RegSize * 2)->setIsInsideBundle();
- I.Size += 4 * 14;
} else {
// $longbr:
// j $tgt
@@ -353,7 +361,6 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
LongBrMBB->addSuccessor(TgtMBB);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::J)).addMBB(TgtMBB);
BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::NOP))->setIsInsideBundle();
- I.Size += 4 * 2;
}
if (I.Br->isUnconditionalBranch()) {
@@ -401,19 +408,34 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
if (!I->Br || I->HasLongBranch)
continue;
- if (!ForceLongBranch)
- // Check if offset fits into 16-bit immediate field of branches.
- if (isInt<16>(computeOffset(I->Br) / 4))
- continue;
+ // Check if offset fits into 16-bit immediate field of branches.
+ if (!ForceLongBranch && isInt<16>(computeOffset(I->Br) / 4))
+ continue;
- expandToLongBranch(*I);
+ I->HasLongBranch = true;
+ I->Size += LongBranchSeqSize * 4;
++LongBranches;
EverMadeChange = MadeChange = true;
}
}
- if (EverMadeChange)
- MF->RenumberBlocks();
+ if (!EverMadeChange)
+ return true;
+
+ // Compute basic block addresses.
+ if (TM.getRelocationModel() == Reloc::PIC_) {
+ uint64_t Address = 0;
+
+ for (I = MBBInfos.begin(); I != E; Address += I->Size, ++I)
+ I->Address = Address;
+ }
+
+ // Do the expansion.
+ for (I = MBBInfos.begin(); I != E; ++I)
+ if (I->HasLongBranch)
+ expandToLongBranch(*I);
+
+ MF->RenumberBlocks();
return true;
}
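
Note: the rewritten PIC expansion computes the branch displacement numerically from the precomputed block addresses and splits it into sign-extended 16-bit pieces (Lo and Hi, plus Higher and Highest for N64), using rounding constants so the sign-extended parts recombine exactly. A standalone sketch of that decomposition and a reconstruction check (illustrative code, not the pass itself; it assumes arithmetic right shift of negative values, as on the compilers LLVM targets):

    // Sketch: split a 64-bit offset into the %lo/%hi/%higher/%highest pieces
    // emitted by the N64 long-branch sequence and check they recombine.
    #include <cassert>
    #include <cstdint>

    static int64_t signExtend16(int64_t V) {
      V &= 0xffff;
      return V >= 0x8000 ? V - 0x10000 : V;
    }

    int main() {
      const int64_t Tests[] = {0x12345678, -4, 0x8000, 0x7fff8000, -0x123456789abcLL};
      for (int64_t Offset : Tests) {
        int64_t Lo      = signExtend16(Offset);
        int64_t Hi      = signExtend16((Offset + 0x8000) >> 16);
        int64_t Higher  = signExtend16((Offset + 0x80008000LL) >> 32);
        int64_t Highest = signExtend16((Offset + 0x800080008000LL) >> 48);

        int64_t Acc = Highest << 16;   // lui    $at, %highest
        Acc += Higher;                 // daddiu $at, $at, %higher
        Acc <<= 16;                    // dsll   $at, $at, 16
        Acc += Hi;                     // daddiu $at, $at, %hi
        Acc <<= 16;                    // dsll   $at, $at, 16
        Acc += Lo;                     // daddiu $at, $at, %lo
        assert(Acc == Offset);
      }
      return 0;
    }
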
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index d4c5e6dd74cf..5fa633933838 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -11,7 +11,6 @@
// MCInst records.
//
//===----------------------------------------------------------------------===//
-
#include "MipsMCInstLower.h"
#include "MipsAsmPrinter.h"
#include "MipsInstrInfo.h"
@@ -161,31 +160,3 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
}
}
-// If the D<shift> instruction has a shift amount that is greater
-// than 31 (checked in calling routine), lower it to a D<shift>32 instruction
-void MipsMCInstLower::LowerLargeShift(const MachineInstr *MI,
- MCInst& Inst,
- int64_t Shift) {
- // rt
- Inst.addOperand(LowerOperand(MI->getOperand(0)));
- // rd
- Inst.addOperand(LowerOperand(MI->getOperand(1)));
- // saminus32
- Inst.addOperand(MCOperand::CreateImm(Shift));
-
- switch (MI->getOpcode()) {
- default:
- // Calling function is not synchronized
- llvm_unreachable("Unexpected shift instruction");
- break;
- case Mips::DSLL:
- Inst.setOpcode(Mips::DSLL32);
- break;
- case Mips::DSRL:
- Inst.setOpcode(Mips::DSRL32);
- break;
- case Mips::DSRA:
- Inst.setOpcode(Mips::DSRA32);
- break;
- }
-}
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index 0abb996a6877..c4a6016105b2 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -33,12 +33,11 @@ public:
MipsMCInstLower(MipsAsmPrinter &asmprinter);
void Initialize(Mangler *mang, MCContext *C);
void Lower(const MachineInstr *MI, MCInst &OutMI) const;
- void LowerLargeShift(const MachineInstr *MI, MCInst &Inst, int64_t Shift);
+ MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
private:
MCOperand LowerSymbolOperand(const MachineOperand &MO,
MachineOperandType MOTy, unsigned Offset) const;
- MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const;
};
}
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index 362173eda3a4..5ff19aba0267 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -43,4 +43,17 @@ unsigned MipsFunctionInfo::getGlobalBaseReg() {
return GlobalBaseReg = MF.getRegInfo().createVirtualRegister(RC);
}
+bool MipsFunctionInfo::mips16SPAliasRegSet() const {
+ return Mips16SPAliasReg;
+}
+unsigned MipsFunctionInfo::getMips16SPAliasReg() {
+ // Return if it has already been initialized.
+ if (Mips16SPAliasReg)
+ return Mips16SPAliasReg;
+
+ const TargetRegisterClass *RC;
+  RC = (const TargetRegisterClass *)&Mips::CPU16RegsRegClass;
+ return Mips16SPAliasReg = MF.getRegInfo().createVirtualRegister(RC);
+}
+
void MipsFunctionInfo::anchor() { }
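
Note: getMips16SPAliasReg follows the same create-once-and-cache pattern as getGlobalBaseReg: the field starts at 0 (never a valid virtual register), and the first call creates the register and stores it. A small standalone sketch of that pattern with illustrative names:

    // Sketch of the lazily-created, cached register pattern used by
    // getGlobalBaseReg()/getMips16SPAliasReg(). createVirtualRegister() here
    // is a stand-in counter, not the LLVM MachineRegisterInfo API.
    #include <cassert>

    struct FunctionInfoSketch {
      unsigned Mips16SPAliasReg = 0;   // 0 means "not created yet"
      unsigned NextVReg = 1;           // stand-in allocator state

      unsigned createVirtualRegister() { return NextVReg++; }

      bool mips16SPAliasRegSet() const { return Mips16SPAliasReg != 0; }

      unsigned getMips16SPAliasReg() {
        if (Mips16SPAliasReg)          // already initialized
          return Mips16SPAliasReg;
        return Mips16SPAliasReg = createVirtualRegister();
      }
    };

    int main() {
      FunctionInfoSketch FI;
      assert(!FI.mips16SPAliasRegSet());
      unsigned R = FI.getMips16SPAliasReg();
      assert(FI.mips16SPAliasRegSet() && FI.getMips16SPAliasReg() == R);
      return 0;
    }
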
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index df3c4c0de0fb..bb45f92f18fd 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -39,55 +39,45 @@ class MipsFunctionInfo : public MachineFunctionInfo {
/// relocation models.
unsigned GlobalBaseReg;
+  /// Mips16SPAliasReg - keeps track of the virtual register initialized for
+  /// use as an alias for SP when loading or storing halfwords and bytes
+  /// to/from the stack.
+ unsigned Mips16SPAliasReg;
+
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
- // Range of frame object indices.
- // InArgFIRange: Range of indices of all frame objects created during call to
- // LowerFormalArguments.
- // OutArgFIRange: Range of indices of all frame objects created during call to
- // LowerCall except for the frame object for restoring $gp.
- std::pair<int, int> InArgFIRange, OutArgFIRange;
- unsigned MaxCallFrameSize;
+ /// True if function has a byval argument.
+ bool HasByvalArg;
- bool EmitNOAT;
+ /// Size of incoming argument area.
+ unsigned IncomingArgSize;
public:
MipsFunctionInfo(MachineFunction& MF)
- : MF(MF), SRetReturnReg(0), GlobalBaseReg(0),
- VarArgsFrameIndex(0), InArgFIRange(std::make_pair(-1, 0)),
- OutArgFIRange(std::make_pair(-1, 0)), MaxCallFrameSize(0), EmitNOAT(false)
+ : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), Mips16SPAliasReg(0),
+ VarArgsFrameIndex(0)
{}
- bool isInArgFI(int FI) const {
- return FI <= InArgFIRange.first && FI >= InArgFIRange.second;
- }
- void setLastInArgFI(int FI) { InArgFIRange.second = FI; }
-
- bool isOutArgFI(int FI) const {
- return FI <= OutArgFIRange.first && FI >= OutArgFIRange.second;
- }
- void extendOutArgFIRange(int FirstFI, int LastFI) {
- if (!OutArgFIRange.second)
- // this must be the first time this function was called.
- OutArgFIRange.first = FirstFI;
- OutArgFIRange.second = LastFI;
- }
-
unsigned getSRetReturnReg() const { return SRetReturnReg; }
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
bool globalBaseRegSet() const;
unsigned getGlobalBaseReg();
+ bool mips16SPAliasRegSet() const;
+ unsigned getMips16SPAliasReg();
+
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
- unsigned getMaxCallFrameSize() const { return MaxCallFrameSize; }
- void setMaxCallFrameSize(unsigned S) { MaxCallFrameSize = S; }
+ bool hasByvalArg() const { return HasByvalArg; }
+ void setFormalArgInfo(unsigned Size, bool HasByval) {
+ IncomingArgSize = Size;
+ HasByvalArg = HasByval;
+ }
- bool getEmitNOAT() const { return EmitNOAT; }
- void setEmitNOAT() { EmitNOAT = true; }
+ unsigned getIncomingArgSize() const { return IncomingArgSize; }
};
} // end of namespace llvm
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index ae6ae3a59005..d8e0dd436a95 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -22,7 +22,6 @@
#include "llvm/Constants.h"
#include "llvm/DebugInfo.h"
#include "llvm/Type.h"
-#include "llvm/Function.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -43,9 +42,8 @@
using namespace llvm;
-MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST,
- const TargetInstrInfo &tii)
- : MipsGenRegisterInfo(Mips::RA), Subtarget(ST), TII(tii) {}
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
+ : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {}
unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
@@ -83,11 +81,11 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
BitVector MipsRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
static const uint16_t ReservedCPURegs[] = {
- Mips::ZERO, Mips::AT, Mips::K0, Mips::K1, Mips::SP
+ Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
};
static const uint16_t ReservedCPU64Regs[] = {
- Mips::ZERO_64, Mips::AT_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
+ Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
};
BitVector Reserved(getNumRegs());
@@ -96,41 +94,49 @@ getReservedRegs(const MachineFunction &MF) const {
for (unsigned I = 0; I < array_lengthof(ReservedCPURegs); ++I)
Reserved.set(ReservedCPURegs[I]);
- if (Subtarget.hasMips64()) {
- for (unsigned I = 0; I < array_lengthof(ReservedCPU64Regs); ++I)
- Reserved.set(ReservedCPU64Regs[I]);
+ for (unsigned I = 0; I < array_lengthof(ReservedCPU64Regs); ++I)
+ Reserved.set(ReservedCPU64Regs[I]);
+ if (Subtarget.hasMips64()) {
// Reserve all registers in AFGR64.
for (RegIter Reg = Mips::AFGR64RegClass.begin(),
EReg = Mips::AFGR64RegClass.end(); Reg != EReg; ++Reg)
Reserved.set(*Reg);
} else {
- // Reserve all registers in CPU64Regs & FGR64.
- for (RegIter Reg = Mips::CPU64RegsRegClass.begin(),
- EReg = Mips::CPU64RegsRegClass.end(); Reg != EReg; ++Reg)
- Reserved.set(*Reg);
-
+ // Reserve all registers in FGR64.
for (RegIter Reg = Mips::FGR64RegClass.begin(),
EReg = Mips::FGR64RegClass.end(); Reg != EReg; ++Reg)
Reserved.set(*Reg);
}
-
// Reserve FP if this function should have a dedicated frame pointer register.
if (MF.getTarget().getFrameLowering()->hasFP(MF)) {
- Reserved.set(Mips::FP);
- Reserved.set(Mips::FP_64);
+ if (Subtarget.inMips16Mode())
+ Reserved.set(Mips::S0);
+ else {
+ Reserved.set(Mips::FP);
+ Reserved.set(Mips::FP_64);
+ }
}
// Reserve hardware registers.
Reserved.set(Mips::HWR29);
Reserved.set(Mips::HWR29_64);
+ // Reserve DSP control register.
+ Reserved.set(Mips::DSPCtrl);
+
// Reserve RA if in mips16 mode.
if (Subtarget.inMips16Mode()) {
Reserved.set(Mips::RA);
Reserved.set(Mips::RA_64);
}
+ // Reserve GP if small section is used.
+ if (Subtarget.useSmallSection()) {
+ Reserved.set(Mips::GP);
+ Reserved.set(Mips::GP_64);
+ }
+
return Reserved;
}
@@ -160,7 +166,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
"Instr doesn't have FrameIndex operand!");
}
- DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+ DEBUG(errs() << "\nFunction : " << MF.getName() << "\n";
errs() << "<--------->\n" << MI);
int FrameIndex = MI.getOperand(i).getIndex();
@@ -179,8 +185,12 @@ getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
bool IsN64 = Subtarget.isABI_N64();
- return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
- (IsN64 ? Mips::SP_64 : Mips::SP);
+ if (Subtarget.inMips16Mode())
+ return TFI->hasFP(MF) ? Mips::S0 : Mips::SP;
+ else
+ return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) :
+ (IsN64 ? Mips::SP_64 : Mips::SP);
+
}
unsigned MipsRegisterInfo::
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 9a05e94be991..78adf7f18bf2 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -22,16 +22,14 @@
namespace llvm {
class MipsSubtarget;
-class TargetInstrInfo;
class Type;
class MipsRegisterInfo : public MipsGenRegisterInfo {
protected:
const MipsSubtarget &Subtarget;
- const TargetInstrInfo &TII;
public:
- MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii);
+ MipsRegisterInfo(const MipsSubtarget &Subtarget);
/// getRegisterNumbering - Given the enum value for some register, e.g.
/// Mips::RA, return the number that it corresponds to (e.g. 31).
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index b255e4222b7e..391c19e07e33 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -14,6 +14,8 @@ let Namespace = "Mips" in {
def sub_fpeven : SubRegIndex;
def sub_fpodd : SubRegIndex;
def sub_32 : SubRegIndex;
+def sub_lo : SubRegIndex;
+def sub_hi : SubRegIndex;
}
// We have banks of 32 registers each.
@@ -71,7 +73,7 @@ class HWR<bits<5> num, string n> : MipsReg<n> {
let Namespace = "Mips" in {
// General Purpose Registers
def ZERO : MipsGPRReg< 0, "zero">, DwarfRegNum<[0]>;
- def AT : MipsGPRReg< 1, "at">, DwarfRegNum<[1]>;
+ def AT : MipsGPRReg< 1, "1">, DwarfRegNum<[1]>;
def V0 : MipsGPRReg< 2, "2">, DwarfRegNum<[2]>;
def V1 : MipsGPRReg< 3, "3">, DwarfRegNum<[3]>;
def A0 : MipsGPRReg< 4, "4">, DwarfRegNum<[4]>;
@@ -105,7 +107,7 @@ let Namespace = "Mips" in {
// General Purpose 64-bit Registers
def ZERO_64 : Mips64GPRReg< 0, "zero", [ZERO]>, DwarfRegNum<[0]>;
- def AT_64 : Mips64GPRReg< 1, "at", [AT]>, DwarfRegNum<[1]>;
+ def AT_64 : Mips64GPRReg< 1, "1", [AT]>, DwarfRegNum<[1]>;
def V0_64 : Mips64GPRReg< 2, "2", [V0]>, DwarfRegNum<[2]>;
def V1_64 : Mips64GPRReg< 3, "3", [V1]>, DwarfRegNum<[3]>;
def A0_64 : Mips64GPRReg< 4, "4", [A0]>, DwarfRegNum<[4]>;
@@ -239,16 +241,29 @@ let Namespace = "Mips" in {
// fcc0 register
def FCC0 : Register<"fcc0">;
+ // PC register
+ def PC : Register<"pc">;
+
// Hardware register $29
def HWR29 : Register<"29">;
def HWR29_64 : Register<"29">;
+
+ // Accum registers
+ let SubRegIndices = [sub_lo, sub_hi] in
+ def AC0 : RegisterWithSubRegs<"ac0", [LO, HI]>;
+ def AC1 : Register<"ac1">;
+ def AC2 : Register<"ac2">;
+ def AC3 : Register<"ac3">;
+
+ def DSPCtrl : Register<"dspctrl">;
}
//===----------------------------------------------------------------------===//
// Register Classes
//===----------------------------------------------------------------------===//
-def CPURegs : RegisterClass<"Mips", [i32], 32, (add
+class CPURegsClass<list<ValueType> regTypes> :
+ RegisterClass<"Mips", regTypes, 32, (add
// Reserved
ZERO, AT,
// Return Values and Arguments
@@ -262,6 +277,9 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, (add
// Reserved
K0, K1, GP, SP, FP, RA)>;
+def CPURegs : CPURegsClass<[i32]>;
+def DSPRegs : CPURegsClass<[v4i8, v2i16]>;
+
def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
// Reserved
ZERO_64, AT_64,
@@ -284,6 +302,7 @@ def CPU16Regs : RegisterClass<"Mips", [i32], 32, (add
def CPURAReg : RegisterClass<"Mips", [i32], 32, (add RA)>;
+def CPUSPReg : RegisterClass<"Mips", [i32], 32, (add SP)>;
// 64bit fp:
// * FGR64 - 32 64-bit registers
@@ -319,3 +338,5 @@ def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>;
def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
def HWRegs64 : RegisterClass<"Mips", [i64], 32, (add HWR29_64)>;
+// Accumulator Registers
+def ACRegs : RegisterClass<"Mips", [i64], 64, (sequence "AC%u", 0, 3)>;
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 1c598471a0aa..03f5176b2974 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -22,7 +22,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
@@ -202,6 +203,19 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
// Mark $fp as used if function has dedicated frame pointer.
if (hasFP(MF))
MRI.setPhysRegUsed(FP);
+
+ // Set scavenging frame index if necessary.
+ uint64_t MaxSPOffset = MF.getInfo<MipsFunctionInfo>()->getIncomingArgSize() +
+ estimateStackSize(MF);
+
+ if (isInt<16>(MaxSPOffset))
+ return;
+
+ const TargetRegisterClass *RC = STI.isABI_N64() ?
+ &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+ int FI = MF.getFrameInfo()->CreateStackObject(RC->getSize(),
+ RC->getAlignment(), false);
+ RS->setScavengingFrameIndex(FI);
}
const MipsFrameLowering *
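
Note: the new code in processFunctionBeforeCalleeSavedScan reserves an emergency spill slot only when the largest possible SP-relative offset (incoming argument area plus estimated stack size) no longer fits the signed 16-bit immediate of Mips load/store instructions. A standalone sketch of that range check, mirroring llvm::isInt<16> (illustrative reimplementation, not the LLVM header):

    // Sketch of the signed-16-bit range test that decides whether frame
    // lowering needs a register-scavenging spill slot.
    #include <cassert>
    #include <cstdint>

    template <unsigned N> static bool isIntN(int64_t X) {
      return X >= -(INT64_C(1) << (N - 1)) && X < (INT64_C(1) << (N - 1));
    }

    int main() {
      assert(isIntN<16>(32767) && isIntN<16>(-32768));
      assert(!isIntN<16>(32768));  // offsets this large force a scavenged
                                   // base register instead of a plain imm16
      return 0;
    }
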
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index eeb1de36efc6..fb0f9df038c3 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -260,14 +260,55 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
if (isInt<16>(Amount))// addi sp, sp, amount
BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
else { // Expand immediate that doesn't fit in 16-bit.
- unsigned ATReg = STI.isABI_N64() ? Mips::AT_64 : Mips::AT;
-
- MBB.getParent()->getInfo<MipsFunctionInfo>()->setEmitNOAT();
- Mips::loadImmediate(Amount, STI.isABI_N64(), *this, MBB, I, DL, false, 0);
- BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(ATReg);
+ unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0);
+ BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg, RegState::Kill);
}
}
+/// This function generates the sequence of instructions needed to get the
+/// result of adding register REG and immediate IMM.
+unsigned
+MipsSEInstrInfo::loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, DebugLoc DL,
+ unsigned *NewImm) const {
+ MipsAnalyzeImmediate AnalyzeImm;
+ const MipsSubtarget &STI = TM.getSubtarget<MipsSubtarget>();
+ MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo();
+ unsigned Size = STI.isABI_N64() ? 64 : 32;
+ unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi;
+ unsigned ZEROReg = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO;
+ const TargetRegisterClass *RC = STI.isABI_N64() ?
+ &Mips::CPU64RegsRegClass : &Mips::CPURegsRegClass;
+ bool LastInstrIsADDiu = NewImm;
+
+ const MipsAnalyzeImmediate::InstSeq &Seq =
+ AnalyzeImm.Analyze(Imm, Size, LastInstrIsADDiu);
+ MipsAnalyzeImmediate::InstSeq::const_iterator Inst = Seq.begin();
+
+ assert(Seq.size() && (!LastInstrIsADDiu || (Seq.size() > 1)));
+
+ // The first instruction can be a LUi, which is different from other
+ // instructions (ADDiu, ORI and SLL) in that it does not have a register
+ // operand.
+ unsigned Reg = RegInfo.createVirtualRegister(RC);
+
+ if (Inst->Opc == LUi)
+ BuildMI(MBB, II, DL, get(LUi), Reg).addImm(SignExtend64<16>(Inst->ImmOpnd));
+ else
+ BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(ZEROReg)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ // Build the remaining instructions in Seq.
+ for (++Inst; Inst != Seq.end() - LastInstrIsADDiu; ++Inst)
+ BuildMI(MBB, II, DL, get(Inst->Opc), Reg).addReg(Reg, RegState::Kill)
+ .addImm(SignExtend64<16>(Inst->ImmOpnd));
+
+ if (LastInstrIsADDiu)
+ *NewImm = Inst->ImmOpnd;
+
+ return Reg;
+}
+
unsigned MipsSEInstrInfo::GetAnalyzableBrOpc(unsigned Opc) const {
return (Opc == Mips::BEQ || Opc == Mips::BNE || Opc == Mips::BGTZ ||
Opc == Mips::BGEZ || Opc == Mips::BLTZ || Opc == Mips::BLEZ ||
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 346e74dba485..55b78b2cfb97 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -15,7 +15,6 @@
#define MIPSSEINSTRUCTIONINFO_H
#include "MipsInstrInfo.h"
-#include "MipsAnalyzeImmediate.h"
#include "MipsSERegisterInfo.h"
namespace llvm {
@@ -70,6 +69,13 @@ public:
void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const;
+ /// Emit a series of instructions to load an immediate. If NewImm is a
+ /// non-NULL parameter, the last instruction is not emitted, but instead
+ /// its immediate operand is returned in NewImm.
+ unsigned loadImmediate(int64_t Imm, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator II, DebugLoc DL,
+ unsigned *NewImm) const;
+
private:
virtual unsigned GetAnalyzableBrOpc(unsigned Opc) const;
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 043a1ef6833b..56b9ba95e5de 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -26,6 +26,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -40,8 +41,18 @@
using namespace llvm;
MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST,
- const TargetInstrInfo &TII)
- : MipsRegisterInfo(ST, TII) {}
+ const MipsSEInstrInfo &I)
+ : MipsRegisterInfo(ST), TII(I) {}
+
+bool MipsSERegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+ return true;
+}
+
+bool MipsSERegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+ return true;
+}
// This function eliminates ADJCALLSTACKDOWN,
// ADJCALLSTACKUP pseudo instructions
@@ -72,7 +83,6 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
MachineFrameInfo *MFI = MF.getFrameInfo();
- MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
int MinCSFI = 0;
@@ -91,8 +101,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
// getFrameRegister() returns.
unsigned FrameReg;
- if (MipsFI->isOutArgFI(FrameIndex) ||
- (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI))
+ if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP;
else
FrameReg = getFrameRegister(MF);
@@ -104,14 +113,11 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
// - If the frame object is any of the following, its offset must be adjusted
// by adding the size of the stack:
// incoming argument, callee-saved register location or local variable.
+ bool IsKill = false;
int64_t Offset;
- if (MipsFI->isOutArgFI(FrameIndex))
- Offset = SPOffset;
- else
- Offset = SPOffset + (int64_t)StackSize;
-
- Offset += MI.getOperand(OpNo + 1).getImm();
+ Offset = SPOffset + (int64_t)StackSize;
+ Offset += MI.getOperand(OpNo + 1).getImm();
DEBUG(errs() << "Offset : " << Offset << "\n" << "<--------->\n");
@@ -121,18 +127,17 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = II->getDebugLoc();
unsigned ADDu = Subtarget.isABI_N64() ? Mips::DADDu : Mips::ADDu;
- unsigned ATReg = Subtarget.isABI_N64() ? Mips::AT_64 : Mips::AT;
- MipsAnalyzeImmediate::Inst LastInst(0, 0);
+ unsigned NewImm;
- MipsFI->setEmitNOAT();
- Mips::loadImmediate(Offset, Subtarget.isABI_N64(), TII, MBB, II, DL, true,
- &LastInst);
- BuildMI(MBB, II, DL, TII.get(ADDu), ATReg).addReg(FrameReg).addReg(ATReg);
+ unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL, &NewImm);
+ BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
+ .addReg(Reg, RegState::Kill);
- FrameReg = ATReg;
- Offset = SignExtend64<16>(LastInst.ImmOpnd);
+ FrameReg = Reg;
+ Offset = SignExtend64<16>(NewImm);
+ IsKill = true;
}
- MI.getOperand(OpNo).ChangeToRegister(FrameReg, false);
+ MI.getOperand(OpNo).ChangeToRegister(FrameReg, false, false, IsKill);
MI.getOperand(OpNo + 1).ChangeToImmediate(Offset);
}
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index 4b17b33e9a21..7437bd36c333 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -18,11 +18,18 @@
#include "MipsRegisterInfo.h"
namespace llvm {
+class MipsSEInstrInfo;
class MipsSERegisterInfo : public MipsRegisterInfo {
+ const MipsSEInstrInfo &TII;
+
public:
MipsSERegisterInfo(const MipsSubtarget &Subtarget,
- const TargetInstrInfo &TII);
+ const MipsSEInstrInfo &TII);
+
+ bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+ bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
void eliminateCallFramePseudoInstr(MachineFunction &MF,
MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 11ff8092af2e..930af4dda159 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -25,12 +25,14 @@ using namespace llvm;
void MipsSubtarget::anchor() { }
MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool little) :
+ const std::string &FS, bool little,
+ Reloc::Model RM) :
MipsGenSubtargetInfo(TT, CPU, FS),
MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false),
- HasMinMax(false), HasSwap(false), HasBitCount(false), InMips16Mode(false)
+ HasMinMax(false), HasSwap(false), HasBitCount(false), InMips16Mode(false),
+ HasDSP(false), HasDSPR2(false), IsAndroid(false)
{
std::string CPUName = CPU;
if (CPUName.empty())
@@ -54,6 +56,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
// Is the target system Linux?
if (TT.find("linux") == std::string::npos)
IsLinux = false;
+
+ // Set UseSmallSection.
+ UseSmallSection = !IsLinux && (RM == Reloc::Static);
}
bool
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index ba15362f07b0..ff69237ec2bd 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -65,6 +65,9 @@ protected:
// isLinux - Target system is Linux. If false, we consider ELFOS for now.
bool IsLinux;
+ // UseSmallSection - Small section is used.
+ bool UseSmallSection;
+
/// Features related to the presence of specific instructions.
// HasSEInReg - SEB and SEH (signext in register) instructions.
@@ -89,6 +92,9 @@ protected:
// InMips16 -- can process Mips16 instructions
bool InMips16Mode;
+ // HasDSP, HasDSPR2 -- supports DSP ASE.
+ bool HasDSP, HasDSPR2;
+
// IsAndroid -- target is android
bool IsAndroid;
@@ -109,7 +115,7 @@ public:
/// This constructor initializes the data members to match that
/// of the specified triple.
MipsSubtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS, bool little);
+ const std::string &FS, bool little, Reloc::Model RM);
/// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
@@ -131,8 +137,11 @@ public:
bool isNotSingleFloat() const { return !IsSingleFloat; }
bool hasVFPU() const { return HasVFPU; }
bool inMips16Mode() const { return InMips16Mode; }
+ bool hasDSP() const { return HasDSP; }
+ bool hasDSPR2() const { return HasDSPR2; }
bool isAndroid() const { return IsAndroid; }
bool isLinux() const { return IsLinux; }
+ bool useSmallSection() const { return UseSmallSection; }
bool hasStandardEncoding() const { return !inMips16Mode(); }
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index 03a024a361d5..983ee219412b 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -42,8 +42,8 @@ MipsTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL,
bool isLittle)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
- Subtarget(TT, CPU, FS, isLittle),
- DataLayout(isLittle ?
+ Subtarget(TT, CPU, FS, isLittle, RM),
+ DL(isLittle ?
(Subtarget.isABI_N64() ?
"e-p:64:64:64-i8:8:32-i16:16:32-i64:64:64-f128:128:128-n32" :
"e-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32") :
@@ -52,7 +52,8 @@ MipsTargetMachine(const Target &T, StringRef TT,
"E-p:32:32:32-i8:8:32-i16:16:32-i64:64:64-n32")),
InstrInfo(MipsInstrInfo::create(*this)),
FrameLowering(MipsFrameLowering::create(*this, Subtarget)),
- TLInfo(*this), TSInfo(*this), JITInfo() {
+ TLInfo(*this), TSInfo(*this), JITInfo(),
+ STTI(&TLInfo), VTTI(&TLInfo) {
}
void MipsebTargetMachine::anchor() { }
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 21b49e673ca9..b54f5cee6d4d 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -21,8 +21,9 @@
#include "MipsSelectionDAGInfo.h"
#include "MipsSubtarget.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
class formatted_raw_ostream;
@@ -30,12 +31,14 @@ class MipsRegisterInfo;
class MipsTargetMachine : public LLVMTargetMachine {
MipsSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
const MipsInstrInfo *InstrInfo;
const MipsFrameLowering *FrameLowering;
MipsTargetLowering TLInfo;
MipsSelectionDAGInfo TSInfo;
MipsJITInfo JITInfo;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
MipsTargetMachine(const Target &T, StringRef TT,
@@ -52,8 +55,8 @@ public:
{ return FrameLowering; }
virtual const MipsSubtarget *getSubtargetImpl() const
{ return &Subtarget; }
- virtual const TargetData *getTargetData() const
- { return &DataLayout;}
+ virtual const DataLayout *getDataLayout() const
+ { return &DL;}
virtual MipsJITInfo *getJITInfo()
{ return &JITInfo; }
@@ -69,6 +72,13 @@ public:
return &TSInfo;
}
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
+
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
index 04dc60aa6b45..881908b82c91 100644
--- a/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -13,7 +13,7 @@
#include "llvm/GlobalVariable.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ELF.h"
@@ -26,6 +26,7 @@ SSThreshold("mips-ssection-threshold", cl::Hidden,
void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ InitializeELF(TM.Options.UseInitArray);
SmallDataSection =
getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
@@ -60,9 +61,10 @@ bool MipsTargetObjectFile::
IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
SectionKind Kind) const {
- // Only use small section for non linux targets.
const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
- if (Subtarget.isLinux())
+
+ // Return if small section is not available.
+ if (!Subtarget.useSmallSection())
return false;
// Only global variables, not functions.
@@ -80,7 +82,7 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
return false;
Type *Ty = GV->getType()->getElementType();
- return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
+ return IsInSmallSection(TM.getDataLayout()->getTypeAllocSize(Ty));
}
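
Note: with this change, .sdata placement is gated first on MipsSubtarget::useSmallSection() (set for static, non-Linux targets, per the MipsSubtarget.cpp hunk above) and then on the object's allocated size against the -mips-ssection-threshold value. A standalone sketch of that decision; the threshold comparison is an assumption about IsInSmallSection(), whose body is not part of this hunk:

    // Sketch of the small-data-section decision after this patch. Only the
    // useSmallSection() gating is visible above; the size predicate is an
    // assumed shape.
    #include <cassert>
    #include <cstdint>

    static bool useSmallSection(bool IsLinux, bool IsStaticReloc) {
      return !IsLinux && IsStaticReloc;       // mirrors MipsSubtarget.cpp above
    }

    static bool isInSmallSection(uint64_t Size, uint64_t Threshold) {
      return Size > 0 && Size <= Threshold;   // assumed predicate shape
    }

    int main() {
      const uint64_t Threshold = 8;           // illustrative -mips-ssection-threshold value
      assert(useSmallSection(/*IsLinux=*/false, /*IsStaticReloc=*/true));
      assert(isInSmallSection(4, Threshold) && !isInSmallSection(64, Threshold));
      return 0;
    }
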
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index ae7710e54f08..7aee3595c625 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -24,7 +24,30 @@ include "NVPTXInstrInfo.td"
// - Need at least one feature to avoid generating zero sized array by
// TableGen in NVPTXGenSubtarget.inc.
//===----------------------------------------------------------------------===//
-def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
+
+// SM Versions
+def SM10 : SubtargetFeature<"sm_10", "SmVersion", "10",
+ "Target SM 1.0">;
+def SM11 : SubtargetFeature<"sm_11", "SmVersion", "11",
+ "Target SM 1.1">;
+def SM12 : SubtargetFeature<"sm_12", "SmVersion", "12",
+ "Target SM 1.2">;
+def SM13 : SubtargetFeature<"sm_13", "SmVersion", "13",
+ "Target SM 1.3">;
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+ "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+ "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+ "Target SM 3.0">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+ "Target SM 3.5">;
+
+// PTX Versions
+def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
+ "Use PTX version 3.0">;
+def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
+ "Use PTX version 3.1">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -33,7 +56,14 @@ def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
-def : Proc<"sm_10", [FeatureDummy]>;
+def : Proc<"sm_10", [SM10]>;
+def : Proc<"sm_11", [SM11]>;
+def : Proc<"sm_12", [SM12]>;
+def : Proc<"sm_13", [SM13]>;
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_35", [SM35]>;
def NVPTXInstrInfo : InstrInfo {
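
Note: each sm_NN processor now carries a SubtargetFeature that writes the numeric architecture level into an SmVersion field, replacing the single FeatureDummy. A rough standalone illustration of the resulting processor-to-version mapping (plain C++, not the TableGen-generated subtarget code):

    // Rough illustration of what the sm_* features amount to: a processor
    // name selects a numeric SmVersion. Not the generated code.
    #include <cassert>
    #include <cstring>

    static unsigned smVersionFor(const char *CPU) {
      struct { const char *Name; unsigned Ver; } Table[] = {
        {"sm_10", 10}, {"sm_11", 11}, {"sm_12", 12}, {"sm_13", 13},
        {"sm_20", 20}, {"sm_21", 21}, {"sm_30", 30}, {"sm_35", 35},
      };
      for (const auto &E : Table)
        if (std::strcmp(CPU, E.Name) == 0)
          return E.Ver;
      return 10; // illustrative default
    }

    int main() {
      assert(smVersionFor("sm_20") == 20 && smVersionFor("sm_35") == 35);
      return 0;
    }
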
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 24b3bd589812..c7cabf695311 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
namespace llvm {
@@ -31,7 +31,7 @@ public:
NVPTXAllocaHoisting() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetData>();
+ AU.addRequired<DataLayout>();
AU.addPreserved<MachineFunctionAnalysis>();
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index f2b96163f43d..0a885ce1c4a6 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -68,7 +68,54 @@ static cl::opt<bool, true>InterleaveSrc("nvptx-emit-src",
cl::location(llvm::InterleaveSrcInPtx));
+namespace {
+/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
+/// depends.
+void DiscoverDependentGlobals(Value *V,
+ DenseSet<GlobalVariable*> &Globals) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ Globals.insert(GV);
+ else {
+ if (User *U = dyn_cast<User>(V)) {
+ for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
+ DiscoverDependentGlobals(U->getOperand(i), Globals);
+ }
+ }
+ }
+}
+/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
+/// instances to be emitted, but only after any dependents have been added
+/// first.
+void VisitGlobalVariableForEmission(GlobalVariable *GV,
+ SmallVectorImpl<GlobalVariable*> &Order,
+ DenseSet<GlobalVariable*> &Visited,
+ DenseSet<GlobalVariable*> &Visiting) {
+ // Have we already visited this one?
+ if (Visited.count(GV)) return;
+
+ // Do we have a circular dependency?
+ if (Visiting.count(GV))
+ report_fatal_error("Circular dependency found in global variable set");
+
+ // Start visiting this global
+ Visiting.insert(GV);
+
+ // Make sure we visit all dependents first
+ DenseSet<GlobalVariable*> Others;
+ for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
+ DiscoverDependentGlobals(GV->getOperand(i), Others);
+
+ for (DenseSet<GlobalVariable*>::iterator I = Others.begin(),
+ E = Others.end(); I != E; ++I)
+ VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
+
+ // Now we can visit ourself
+ Order.push_back(GV);
+ Visited.insert(GV);
+ Visiting.erase(GV);
+}
+}
// @TODO: This is a copy from AsmPrinter.cpp. The function is static, so we
// cannot just link to the existing version.
@@ -98,10 +145,10 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
switch (CE->getOpcode()) {
default:
// If the code isn't optimized, there may be outstanding folding
- // opportunities. Attempt to fold the expression using TargetData as a
+ // opportunities. Attempt to fold the expression using DataLayout as a
// last resort before giving up.
if (Constant *C =
- ConstantFoldConstantExpression(CE, AP.TM.getTargetData()))
+ ConstantFoldConstantExpression(CE, AP.TM.getDataLayout()))
if (C != CE)
return LowerConstant(C, AP);
@@ -115,7 +162,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
report_fatal_error(OS.str());
}
case Instruction::GetElementPtr: {
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
// Generate a symbolic expression for the byte address
const Constant *PtrVal = CE->getOperand(0);
SmallVector<Value*, 8> IdxVec(CE->op_begin()+1, CE->op_end());
@@ -145,7 +192,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
return LowerConstant(CE->getOperand(0), AP);
case Instruction::IntToPtr: {
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
// Handle casts to pointers by changing them into casts to the appropriate
// integer type. This promotes constant folding and simplifies this code.
Constant *Op = CE->getOperand(0);
@@ -155,7 +202,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
}
case Instruction::PtrToInt: {
- const TargetData &TD = *AP.TM.getTargetData();
+ const DataLayout &TD = *AP.TM.getDataLayout();
// Support only foldable casts to/from pointers that can be eliminated by
// changing the pointer to the appropriately sized integer type.
Constant *Op = CE->getOperand(0);
@@ -270,7 +317,7 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
void NVPTXAsmPrinter::printReturnValStr(const Function *F,
raw_ostream &O)
{
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
const TargetLowering *TLI = TM.getTargetLowering();
Type *Ty = F->getReturnType();
@@ -874,7 +921,7 @@ bool NVPTXAsmPrinter::doInitialization (Module &M) {
const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
.Initialize(OutContext, TM);
- Mang = new Mangler(OutContext, *TM.getTargetData());
+ Mang = new Mangler(OutContext, *TM.getDataLayout());
// Emit header before any dwarf directives are emitted below.
emitHeader(M, OS1);
@@ -893,10 +940,27 @@ bool NVPTXAsmPrinter::doInitialization (Module &M) {
emitDeclarations(M, OS2);
- // Print out module-level global variables here.
+ // As ptxas does not support forward references of globals, we need to first
+ // sort the list of module-level globals in def-use order. We visit each
+ // global variable in order, and ensure that we emit it *after* its dependent
+ // globals. We use a little extra memory maintaining both a set and a list to
+ // have fast searches while maintaining a strict ordering.
+ SmallVector<GlobalVariable*,8> Globals;
+ DenseSet<GlobalVariable*> GVVisited;
+ DenseSet<GlobalVariable*> GVVisiting;
+
+ // Visit each global variable, in order
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I)
- printModuleLevelGV(I, OS2);
+ I != E; ++I)
+ VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
+
+ assert(GVVisited.size() == M.getGlobalList().size() &&
+ "Missed a global variable");
+ assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
+
+ // Print out module-level global variables in proper order
+ for (unsigned i = 0, e = Globals.size(); i != e; ++i)
+ printModuleLevelGV(Globals[i], OS2);
OS2 << '\n';
@@ -910,7 +974,8 @@ void NVPTXAsmPrinter::emitHeader (Module &M, raw_ostream &O) {
O << "//\n";
O << "\n";
- O << ".version 3.0\n";
+ unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+ O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
O << ".target ";
O << nvptxSubtarget.getTargetName();
@@ -1023,7 +1088,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable* GVar, raw_ostream &O,
return;
}
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
// GlobalVariables are always constant pointers themselves.
const PointerType *PTy = GVar->getType();
@@ -1296,7 +1361,7 @@ std::string NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty,
void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable* GVar,
raw_ostream &O) {
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
// GlobalVariables are always constant pointers themselves.
const PointerType *PTy = GVar->getType();
@@ -1342,7 +1407,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable* GVar,
static unsigned int
-getOpenCLAlignment(const TargetData *TD,
+getOpenCLAlignment(const DataLayout *TD,
Type *Ty) {
if (Ty->isPrimitiveType() || Ty->isIntegerTy() || isa<PointerType>(Ty))
return TD->getPrefTypeAlignment(Ty);
@@ -1421,7 +1486,7 @@ void NVPTXAsmPrinter::printParamName(int paramIndex, raw_ostream &O) {
void NVPTXAsmPrinter::emitFunctionParamList(const Function *F,
raw_ostream &O) {
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
const AttrListPtr &PAL = F->getAttributes();
const TargetLowering *TLI = TM.getTargetLowering();
Function::const_arg_iterator I, E;
@@ -1456,7 +1521,8 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F,
continue;
}
- if (PAL.paramHasAttr(paramIndex+1, Attribute::ByVal) == false) {
+ if (PAL.getParamAttributes(paramIndex+1).
+ hasAttribute(Attributes::ByVal) == false) {
// Just a scalar
const PointerType *PTy = dyn_cast<PointerType>(Ty);
if (isKernelFunc) {
@@ -1524,6 +1590,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F,
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex+1);
+ if (align == 0)
+ align = TD->getABITypeAlignment(ETy);
+
unsigned sz = TD->getTypeAllocSize(ETy);
O << "\t.param .align " << align
<< " .b8 ";
@@ -1714,7 +1783,7 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
AggBuffer *aggBuffer) {
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
if (isa<UndefValue>(CPV) || CPV->isNullValue()) {
int s = TD->getTypeAllocSize(CPV->getType());
@@ -1843,7 +1912,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV,
AggBuffer *aggBuffer) {
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
int Bytes;
// Old constants
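
The ordering helpers added above are a plain post-order DFS with cycle detection; stripped of the LLVM types, the pattern looks roughly like the following sketch over a generic dependency map (not the NVPTX code itself):

#include <map>
#include <set>
#include <stdexcept>
#include <vector>

using Node = int;

// Emit each node only after everything it depends on, and reject cycles.
void visit(Node N, const std::map<Node, std::vector<Node>> &Deps,
           std::vector<Node> &Order, std::set<Node> &Visited,
           std::set<Node> &Visiting) {
  if (Visited.count(N)) return;
  if (Visiting.count(N)) throw std::runtime_error("circular dependency");
  Visiting.insert(N);
  if (auto It = Deps.find(N); It != Deps.end())
    for (Node D : It->second)
      visit(D, Deps, Order, Visited, Visiting);
  Order.push_back(N);   // all dependencies are already in Order
  Visited.insert(N);
  Visiting.erase(N);
}
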
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 6ea10ea14ad6..f1a99d77be9d 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -174,10 +174,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// PTX does not support load / store predicate registers
- setOperationAction(ISD::LOAD, MVT::i1, Expand);
+ setOperationAction(ISD::LOAD, MVT::i1, Custom);
+ setOperationAction(ISD::STORE, MVT::i1, Custom);
+
setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
- setOperationAction(ISD::STORE, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i32, MVT::i1, Expand);
setTruncStoreAction(MVT::i16, MVT::i1, Expand);
@@ -402,7 +403,7 @@ std::string NVPTXTargetLowering::getPrototype(Type *retTy,
if (isABI) {
unsigned align = Outs[i].Flags.getByValAlign();
- unsigned sz = getTargetData()->getTypeAllocSize(ETy);
+ unsigned sz = getDataLayout()->getTypeAllocSize(ETy);
O << ".param .align " << align
<< " .b8 ";
O << "_";
@@ -655,11 +656,11 @@ NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else {
if (Func) { // direct call
if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment))
- retAlignment = getTargetData()->getABITypeAlignment(retTy);
+ retAlignment = getDataLayout()->getABITypeAlignment(retTy);
} else { // indirect call
const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction());
if (!llvm::getAlign(*CallI, 0, retAlignment))
- retAlignment = getTargetData()->getABITypeAlignment(retTy);
+ retAlignment = getDataLayout()->getABITypeAlignment(retTy);
}
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue DeclareRetOps[] = { Chain, DAG.getConstant(retAlignment,
@@ -856,11 +857,64 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::EXTRACT_SUBVECTOR:
return Op;
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+ case ISD::STORE: return LowerSTORE(Op, DAG);
+ case ISD::LOAD: return LowerLOAD(Op, DAG);
default:
llvm_unreachable("Custom lowering not defined for operation");
}
}
+
+// v = ld i1* addr
+// =>
+// v1 = ld i8* addr
+// v = trunc v1 to i1
+SDValue NVPTXTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ DebugLoc dl = Node->getDebugLoc();
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
+ assert(Node->getValueType(0) == MVT::i1 &&
+ "Custom lowering for i1 load only");
+ SDValue newLD = DAG.getLoad(MVT::i8, dl, LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(),
+ LD->isVolatile(), LD->isNonTemporal(),
+ LD->isInvariant(),
+ LD->getAlignment());
+ SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+ // The legalizer (the caller) is expecting two values from the legalized
+ // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+ // in LegalizeDAG.cpp which also uses MergeValues.
+ SDValue Ops[] = {result, LD->getChain()};
+ return DAG.getMergeValues(Ops, 2, dl);
+}
+
+// st i1 v, addr
+// =>
+// v1 = zxt v to i8
+// st i8, addr
+SDValue NVPTXTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ DebugLoc dl = Node->getDebugLoc();
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ SDValue Tmp1 = ST->getChain();
+ SDValue Tmp2 = ST->getBasePtr();
+ SDValue Tmp3 = ST->getValue();
+ assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
+ unsigned Alignment = ST->getAlignment();
+ bool isVolatile = ST->isVolatile();
+ bool isNonTemporal = ST->isNonTemporal();
+ Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::i8, Tmp3);
+ SDValue Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2,
+ ST->getPointerInfo(), isVolatile,
+ isNonTemporal, Alignment);
+ return Result;
+}
+
+
SDValue
NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, int idx,
EVT v) const {
@@ -916,7 +970,7 @@ NVPTXTargetLowering::LowerFormalArguments(SDValue Chain,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
- const TargetData *TD = getTargetData();
+ const DataLayout *TD = getDataLayout();
const Function *F = MF.getFunction();
const AttrListPtr &PAL = F->getAttributes();
@@ -965,7 +1019,7 @@ NVPTXTargetLowering::LowerFormalArguments(SDValue Chain,
// to newly created nodes. The SDNOdes for params have to
// appear in the same order as their order of appearance
// in the original function. "idx+1" holds that order.
- if (PAL.paramHasAttr(i+1, Attribute::ByVal) == false) {
+ if (PAL.getParamAttributes(i+1).hasAttribute(Attributes::ByVal) == false) {
// A plain scalar.
if (isABI || isKernel) {
// If ABI, load from the param symbol
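
The custom i1 LOAD/STORE lowering above is the DAG-level equivalent of keeping a flag in a full byte: zero-extend on the way out, truncate on the way back in. As a plain-C++ analogy (not NVPTX code):

#include <cstdint>

// st i1 v, addr  =>  zext v to i8, then byte store
void storeI1(uint8_t *Addr, bool V) { *Addr = static_cast<uint8_t>(V); }

// v = ld i1* addr  =>  byte load, then trunc i8 to i1
bool loadI1(const uint8_t *Addr) { return (*Addr & 1) != 0; }
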
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 86246e6449d2..94a177ceb00a 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -138,6 +138,9 @@ private:
SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
};
} // namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 56b237252d0a..9273931e9919 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -21,7 +21,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Support/InstIterator.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
@@ -110,7 +110,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
SmallVector<MemTransferInst *, 4> aggrMemcpys;
SmallVector<MemSetInst *, 4> aggrMemsets;
- TargetData *TD = &getAnalysis<TargetData>();
+ DataLayout *TD = &getAnalysis<DataLayout>();
LLVMContext &Context = F.getParent()->getContext();
//
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index ac7f1509f215..b150c69815dd 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -17,7 +17,7 @@
#include "llvm/Pass.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
namespace llvm {
@@ -28,7 +28,7 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
NVPTXLowerAggrCopies() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetData>();
+ AU.addRequired<DataLayout>();
AU.addPreserved<MachineFunctionAnalysis>();
}
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 6aadd43e9446..7b62cce2c65c 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -34,16 +34,18 @@ DriverInterface(cl::desc("Choose driver interface:"),
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64Bit)
-:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget,
- // because we don't register all
- // nvptx targets.
- Is64Bit(is64Bit) {
+: NVPTXGenSubtargetInfo(TT, CPU, FS),
+ Is64Bit(is64Bit),
+ PTXVersion(0),
+ SmVersion(10) {
drvInterface = DriverInterface;
// Provide the default CPU if none
std::string defCPU = "sm_10";
+ ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS);
+
// Get the TargetName from the FS if available
if (FS.empty() && CPU.empty())
TargetName = defCPU;
@@ -52,6 +54,12 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
else
llvm_unreachable("we are not using FeatureStr");
- // Set up the SmVersion
- SmVersion = atoi(TargetName.c_str()+3);
+ // We default to PTX 3.1, but we cannot just default to it in the initializer
+ // since the attribute parser checks if the given option is >= the default.
+ // So if we set ptx31 as the default, the ptx30 attribute would never match.
+ // Instead, we use 0 as the default and manually set 31 if the default is
+ // used.
+ if (PTXVersion == 0) {
+ PTXVersion = 31;
+ }
}
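
The PTXVersion handling above relies on the feature parser only ever raising the value from its initial 0, so the real default (3.1) has to be applied afterwards. The same sentinel-then-default pattern in isolation, as a sketch:

// Feature bits can only set PTXVersion to 30 or 31; starting from a 0
// sentinel keeps both attributes ">= default", and the true default is
// filled in only if neither attribute fired.
unsigned resolvePTXVersion(unsigned ParsedVersion) {
  return ParsedVersion == 0 ? 31 : ParsedVersion;
}
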
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 8f2a629d229b..3cfd9718e541 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -25,13 +25,17 @@
namespace llvm {
class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
-
- unsigned int SmVersion;
+
std::string TargetName;
NVPTX::DrvInterface drvInterface;
- bool dummy; // For the 'dummy' feature, see NVPTX.td
bool Is64Bit;
+ // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned PTXVersion;
+
+ // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned int SmVersion;
+
public:
/// This constructor initializes the data members to match that
/// of the specified module.
@@ -69,6 +73,8 @@ public:
NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
std::string getTargetName() const { return TargetName; }
+ unsigned getPTXVersion() const { return PTXVersion; }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
std::string getDataLayout() const {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 433f415a8786..cbb490003d37 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -32,7 +32,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -71,8 +71,9 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T,
bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, is64bit),
- DataLayout(Subtarget.getDataLayout()),
- InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit)
+ DL(Subtarget.getDataLayout()),
+ InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit),
+ STTI(&TLInfo), VTTI(&TLInfo)
/*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ {
}
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index b3f9cace6bf4..11bc9d4fa698 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -21,10 +21,11 @@
#include "NVPTXSubtarget.h"
#include "NVPTXFrameLowering.h"
#include "ManagedStringPool.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetSelectionDAGInfo.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
@@ -32,7 +33,7 @@ namespace llvm {
///
class NVPTXTargetMachine : public LLVMTargetMachine {
NVPTXSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
NVPTXInstrInfo InstrInfo;
NVPTXTargetLowering TLInfo;
TargetSelectionDAGInfo TSInfo;
@@ -44,6 +45,9 @@ class NVPTXTargetMachine : public LLVMTargetMachine {
// Hold Strings that can be free'd all together with NVPTXTargetMachine
ManagedStringPool ManagedStrPool;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
+
//bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level,
// bool DisableVerify, MCContext *&OutCtx);
@@ -58,7 +62,7 @@ public:
return &FrameLowering;
}
virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
- virtual const TargetData *getTargetData() const { return &DataLayout;}
+ virtual const DataLayout *getDataLayout() const { return &DL;}
virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget;}
virtual const NVPTXRegisterInfo *getRegisterInfo() const {
@@ -72,6 +76,12 @@ public:
virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
return &TSInfo;
}
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
//virtual bool addInstSelector(PassManagerBase &PM,
// CodeGenOpt::Level OptLevel);
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index d175e3e79eb6..3d583060d1ef 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -136,21 +136,21 @@ void PPCInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNo,
void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- char Value = MI->getOperand(OpNo).getImm();
- Value = (Value << (32-5)) >> (32-5);
+ int Value = MI->getOperand(OpNo).getImm();
+ Value = SignExtend32<5>(Value);
O << (int)Value;
}
void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- unsigned char Value = MI->getOperand(OpNo).getImm();
+ unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 31 && "Invalid u5imm argument!");
O << (unsigned int)Value;
}
void PPCInstPrinter::printU6ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- unsigned char Value = MI->getOperand(OpNo).getImm();
+ unsigned int Value = MI->getOperand(OpNo).getImm();
assert(Value <= 63 && "Invalid u6imm argument!");
O << (unsigned int)Value;
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 48de583afdf1..87ecb13a4c76 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -29,9 +29,14 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
+ case FK_Data_8:
+ case PPC::fixup_ppc_toc:
return Value;
+ case PPC::fixup_ppc_lo14:
+ case PPC::fixup_ppc_toc16_ds:
+ return (Value & 0xffff) << 2;
case PPC::fixup_ppc_brcond14:
- return Value & 0x3ffc;
+ return Value & 0xfffc;
case PPC::fixup_ppc_br24:
return Value & 0x3fffffc;
#if 0
@@ -41,6 +46,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) {
case PPC::fixup_ppc_ha16:
return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff;
case PPC::fixup_ppc_lo16:
+ case PPC::fixup_ppc_toc16:
return Value & 0xffff;
}
}
@@ -72,7 +78,10 @@ public:
{ "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_lo16", 16, 16, 0 },
{ "fixup_ppc_ha16", 16, 16, 0 },
- { "fixup_ppc_lo14", 16, 14, 0 }
+ { "fixup_ppc_lo14", 16, 14, 0 },
+ { "fixup_ppc_toc", 0, 64, 0 },
+ { "fixup_ppc_toc16", 16, 16, 0 },
+ { "fixup_ppc_toc16_ds", 16, 14, 0 }
};
if (Kind < FirstTargetFixupKind)
@@ -181,7 +190,7 @@ namespace {
-MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
if (Triple(TT).isOSDarwin())
return new DarwinPPCAsmBackend(T);
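
adjustFixupValue is just masking and shifting the resolved symbol value into the instruction's immediate field. A worked example for the conditional-branch case, sketched outside the backend: the 14-bit displacement sits in a 16-bit-aligned field whose two low bits are implied zero, so the adjusted value keeps bits 2..15.

#include <cassert>
#include <cstdint>

// fixup_ppc_brcond14: word-aligned target, i.e. Value & 0xfffc.
uint64_t adjustBrcond14(uint64_t Value) { return Value & 0xfffc; }

int main() {
  assert(adjustBrcond14(0x1234) == 0x1234);  // already word aligned
  assert(adjustBrcond14(0xABCDE) == 0xBCDC); // high bits and low 2 bits dropped
}
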
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index a19798157bf3..dc93f7124a52 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -11,6 +11,8 @@
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
using namespace llvm;
@@ -21,9 +23,15 @@ namespace {
virtual ~PPCELFObjectWriter();
protected:
+ virtual unsigned getRelocTypeInner(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const;
virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
bool IsPCRel, bool IsRelocWithSymbol,
int64_t Addend) const;
+ virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const;
virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset);
};
}
@@ -36,11 +44,13 @@ PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
PPCELFObjectWriter::~PPCELFObjectWriter() {
}
-unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
- const MCFixup &Fixup,
- bool IsPCRel,
- bool IsRelocWithSymbol,
- int64_t Addend) const {
+unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const
+{
+ MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
+ MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
// determine the type of the relocation
unsigned Type;
if (IsPCRel) {
@@ -61,17 +71,53 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
Type = ELF::R_PPC_ADDR24;
break;
case PPC::fixup_ppc_brcond14:
- Type = ELF::R_PPC_ADDR14_BRTAKEN; // XXX: or BRNTAKEN?_
+ Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_
break;
case PPC::fixup_ppc_ha16:
- Type = ELF::R_PPC_ADDR16_HA;
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_PPC_TPREL16_HA:
+ Type = ELF::R_PPC_TPREL16_HA;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC_ADDR16_HA;
+ break;
+ }
break;
case PPC::fixup_ppc_lo16:
- Type = ELF::R_PPC_ADDR16_LO;
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_PPC_TPREL16_LO:
+ Type = ELF::R_PPC_TPREL16_LO;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC_ADDR16_LO;
+ break;
+ }
break;
case PPC::fixup_ppc_lo14:
Type = ELF::R_PPC_ADDR14;
break;
+ case PPC::fixup_ppc_toc:
+ Type = ELF::R_PPC64_TOC;
+ break;
+ case PPC::fixup_ppc_toc16:
+ Type = ELF::R_PPC64_TOC16;
+ break;
+ case PPC::fixup_ppc_toc16_ds:
+ Type = ELF::R_PPC64_TOC16_DS;
+ break;
+ case FK_Data_8:
+ switch (Modifier) {
+ default: llvm_unreachable("Unsupported Modifier");
+ case MCSymbolRefExpr::VK_PPC_TOC:
+ Type = ELF::R_PPC64_TOC;
+ break;
+ case MCSymbolRefExpr::VK_None:
+ Type = ELF::R_PPC64_ADDR64;
+ break;
+ }
+ break;
case FK_Data_4:
Type = ELF::R_PPC_ADDR32;
break;
@@ -83,11 +129,41 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
return Type;
}
+unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel,
+ bool IsRelocWithSymbol,
+ int64_t Addend) const {
+ return getRelocTypeInner(Target, Fixup, IsPCRel);
+}
+
+const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
+ assert(Target.getSymA() && "SymA cannot be 0");
+ const MCSymbol &Symbol = Target.getSymA()->getSymbol().AliasedSymbol();
+
+ unsigned RelocType = getRelocTypeInner(Target, Fixup, IsPCRel);
+
+  // The .opd creation emits a relocation against the symbol ".TOC.", which
+  // creates a R_PPC64_TOC relocation. However, the relocation symbol name
+  // in the final object should be NULL, since the symbol does not really
+  // exist; it is just the reference to the TOC base for the current
+  // object file.
+ bool EmitThisSym = RelocType != ELF::R_PPC64_TOC;
+
+ if (EmitThisSym && !Symbol.isTemporary())
+ return &Symbol;
+ return NULL;
+}
+
void PPCELFObjectWriter::
adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) {
switch ((unsigned)Fixup.getKind()) {
case PPC::fixup_ppc_ha16:
case PPC::fixup_ppc_lo16:
+ case PPC::fixup_ppc_toc16:
+ case PPC::fixup_ppc_toc16_ds:
RelocOffset += 2;
break;
default:
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index b3c889e3f8da..37b265e7fd38 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -34,6 +34,16 @@ enum Fixups {
/// fixup_ppc_lo14 - A 14-bit fixup corresponding to lo16(_foo) for instrs
/// like 'std'.
fixup_ppc_lo14,
+
+ /// fixup_ppc_toc - Insert value of TOC base (.TOC.).
+ fixup_ppc_toc,
+
+ /// fixup_ppc_toc16 - A 16-bit signed fixup relative to the TOC base.
+ fixup_ppc_toc16,
+
+ /// fixup_ppc_toc16_ds - A 14-bit signed fixup relative to the TOC base with
+ /// implied 2 zero bits
+ fixup_ppc_toc16_ds,
// Marker
LastTargetFixupKind,
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 245b4578bf28..215aa40c4afd 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -59,12 +59,10 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
// Exceptions handling
- if (!is64Bit)
- ExceptionsType = ExceptionHandling::DwarfCFI;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
ZeroDirective = "\t.space\t";
Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
- LCOMMDirectiveType = LCOMM::NoAlignment;
AssemblerDialect = 0; // Old-Style mnemonics.
}
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index f6524222fd79..21183024a509 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -15,7 +15,9 @@
#include "MCTargetDesc/PPCBaseInfo.h"
#include "MCTargetDesc/PPCFixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ErrorHandling.h"
@@ -25,16 +27,28 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
class PPCMCCodeEmitter : public MCCodeEmitter {
- PPCMCCodeEmitter(const PPCMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const PPCMCCodeEmitter &); // DO NOT IMPLEMENT
-
+ PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+
+ const MCSubtargetInfo &STI;
+ Triple TT;
+
public:
PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
- MCContext &ctx) {
+ MCContext &ctx)
+ : STI(sti), TT(STI.getTargetTriple()) {
}
~PPCMCCodeEmitter() {}
+ bool is64BitMode() const {
+ return (STI.getFeatureBits() & PPC::Feature64Bit) != 0;
+ }
+
+ bool isSVR4ABI() const {
+ return TT.isMacOSX() == 0;
+ }
+
unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const;
unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo,
@@ -61,11 +75,19 @@ public:
SmallVectorImpl<MCFixup> &Fixups) const;
void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const {
- unsigned Bits = getBinaryCodeForInstr(MI, Fixups);
+ uint64_t Bits = getBinaryCodeForInstr(MI, Fixups);
+
+    // BL8_NOP_ELF and BLA8_NOP_ELF are both 8 bytes in size because of the
+    // following 'nop'.
+ unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value!
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode == PPC::BL8_NOP_ELF || Opcode == PPC::BLA8_NOP_ELF)
+ Size = 8;
// Output the constant in big endian byte order.
- for (unsigned i = 0; i != 4; ++i) {
- OS << (char)(Bits >> 24);
+ int ShiftValue = (Size * 8) - 8;
+ for (unsigned i = 0; i != Size; ++i) {
+ OS << (char)(Bits >> ShiftValue);
Bits <<= 8;
}
@@ -140,8 +162,12 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits;
// Add a fixup for the displacement field.
- Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_lo16));
+ if (isSVR4ABI() && is64BitMode())
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_toc16));
+ else
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_lo16));
return RegBits;
}
@@ -158,8 +184,12 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo,
return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits;
// Add a fixup for the branch target.
- Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_lo14));
+ if (isSVR4ABI() && is64BitMode())
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_toc16_ds));
+ else
+ Fixups.push_back(MCFixup::Create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_lo14));
return RegBits;
}
@@ -168,7 +198,9 @@ unsigned PPCMCCodeEmitter::
get_crbitm_encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups) const {
const MCOperand &MO = MI.getOperand(OpNo);
- assert((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
+ assert((MI.getOpcode() == PPC::MTCRF ||
+ MI.getOpcode() == PPC::MFOCRF ||
+ MI.getOpcode() == PPC::MTCRF8) &&
(MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7));
return 0x80 >> getPPCRegisterNumbering(MO.getReg());
}
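
With the 8-byte BL8_NOP_ELF/BLA8_NOP_ELF forms, the encoder can no longer assume a fixed 4-byte instruction, so the emission loop walks Size bytes starting from the most significant end. The byte-ordering logic on its own, as a small sketch (shifting down instead of shifting Bits up, which is equivalent):

#include <cstdint>
#include <string>

// Append the low `Size` bytes of `Bits` in big-endian order.
void emitBigEndian(std::string &OS, uint64_t Bits, unsigned Size) {
  int Shift = (Size * 8) - 8;
  for (unsigned i = 0; i != Size; ++i, Shift -= 8)
    OS.push_back(static_cast<char>((Bits >> Shift) & 0xff));
}
// emitBigEndian(S, 0x0102030405060708ULL, 8) appends 01 02 ... 08;
// emitBigEndian(S, 0x48000001, 4) appends 48 00 00 01.
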
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 6568e82e2bf0..4c2578d5dc53 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -70,7 +70,7 @@ static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) {
// Initial state of the frame pointer is R1.
MachineLocation Dst(MachineLocation::VirtualFP);
- MachineLocation Src(PPC::R1, 0);
+ MachineLocation Src(isPPC64? PPC::X1 : PPC::R1, 0);
MAI->addInitialFrameState(0, Dst, Src);
return MAI;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 7162e158f033..a0e4cf3005f2 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -36,7 +36,7 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU);
/// createPPCELFObjectWriter - Construct an PPC ELF object writer.
MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
index b7f16884363a..cb15dadb7e99 100644
--- a/lib/Target/PowerPC/PPC.td
+++ b/lib/Target/PowerPC/PPC.td
@@ -35,6 +35,10 @@ def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
def Directive32 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
def Directive64 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
def DirectiveA2 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_A2", "">;
+def DirectiveE500mc : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E500mc", "">;
+def DirectiveE5500 : SubtargetFeature<"", "DarwinDirective",
+ "PPC::DIR_E5500", "">;
def DirectivePwr6: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR6", "">;
def DirectivePwr7: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_PWR7", "">;
@@ -94,6 +98,12 @@ def : Processor<"g5", G5Itineraries,
[Directive970, FeatureAltivec,
FeatureMFOCRF, FeatureFSqrt, FeatureSTFIWX,
Feature64Bit /*, Feature64BitRegs */]>;
+def : ProcessorModel<"e500mc", PPCE500mcModel,
+ [DirectiveE500mc, FeatureMFOCRF,
+ FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
+def : ProcessorModel<"e5500", PPCE5500Model,
+ [DirectiveE5500, FeatureMFOCRF, Feature64Bit,
+ FeatureSTFIWX, FeatureBookE, FeatureISEL]>;
def : Processor<"a2", PPCA2Itineraries, [DirectiveA2, FeatureBookE,
FeatureMFOCRF, FeatureFSqrt,
FeatureSTFIWX, FeatureISEL,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f76b89c803ab..15d690bd8970 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -54,12 +54,13 @@
#include "llvm/Support/ELF.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/MapVector.h"
using namespace llvm;
namespace {
class PPCAsmPrinter : public AsmPrinter {
protected:
- DenseMap<MCSymbol*, MCSymbol*> TOC;
+ MapVector<MCSymbol*, MCSymbol*> TOC;
const PPCSubtarget &Subtarget;
uint64_t TOCLabelID;
public:
@@ -109,6 +110,8 @@ namespace {
bool doFinalization(Module &M);
virtual void EmitFunctionEntryLabel();
+
+ void EmitFunctionBodyEnd();
};
/// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -282,8 +285,22 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0])
- return true; // Unknown modifier.
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'y': // A memory reference for an X-form instruction
+ {
+ const char *RegName = "r0";
+ if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+ O << RegName << ", ";
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+ }
+ }
+
assert(MI->getOperand(OpNo).isReg());
O << "0(";
printOperand(MI, OpNo, O);
@@ -345,23 +362,37 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutStreamer.EmitLabel(PICBase);
return;
}
+ case PPC::LDtocJTI:
+ case PPC::LDtocCPT:
case PPC::LDtoc: {
// Transform %X3 = LDtoc <ga:@min1>, %X2
LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin());
-
+
// Change the opcode to LD, and the global address operand to be a
// reference to the TOC entry we will synthesize later.
TmpInst.setOpcode(PPC::LD);
const MachineOperand &MO = MI->getOperand(1);
- assert(MO.isGlobal());
-
- // Map symbol -> label of TOC entry.
- MCSymbol *&TOCEntry = TOC[Mang->getSymbol(MO.getGlobal())];
- if (TOCEntry == 0)
- TOCEntry = GetTempSymbol("C", TOCLabelID++);
-
+
+ // Map symbol -> label of TOC entry
+ assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
+ MCSymbol *MOSymbol = 0;
+ if (MO.isGlobal())
+ MOSymbol = Mang->getSymbol(MO.getGlobal());
+ else if (MO.isCPI())
+ MOSymbol = GetCPISymbol(MO.getIndex());
+ else if (MO.isJTI())
+ MOSymbol = GetJTISymbol(MO.getIndex());
+ MCSymbol *&TOCEntry = TOC[MOSymbol];
+      // To avoid a name clash, check whether the name already exists.
+ while (TOCEntry == 0) {
+ if (OutContext.LookupSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
+ "C" + Twine(TOCLabelID++)) == 0) {
+ TOCEntry = GetTempSymbol("C", TOCLabelID);
+ }
+ }
+
const MCExpr *Exp =
- MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC,
+ MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY,
OutContext);
TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp);
OutStreamer.EmitInstruction(TmpInst);
@@ -404,11 +435,17 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
OutStreamer.EmitValueToAlignment(8);
MCSymbol *Symbol1 =
OutContext.GetOrCreateSymbol(".L." + Twine(CurrentFnSym->getName()));
- MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.@tocbase"));
+  // Generates a R_PPC64_ADDR64 (from FK_Data_8) relocation for the function
+ // entry point.
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext),
- Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/);
- OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, OutContext),
- Subtarget.isPPC64() ? 8 : 4/*size*/, 0/*addrspace*/);
+ 8/*size*/, 0/*addrspace*/);
+ MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC."));
+ // Generates a R_PPC64_TOC relocation for TOC base insertion.
+ OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2,
+ MCSymbolRefExpr::VK_PPC_TOC, OutContext),
+ 8/*size*/, 0/*addrspace*/);
+ // Emit a null environment pointer.
+ OutStreamer.EmitIntValue(0, 8 /* size */, 0 /* addrspace */);
OutStreamer.SwitchSection(Current);
MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
@@ -419,7 +456,7 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
bool isPPC64 = TD->getPointerSizeInBits() == 64;
@@ -429,18 +466,34 @@ bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
SectionKind::getReadOnly());
OutStreamer.SwitchSection(Section);
- // FIXME: This is nondeterminstic!
- for (DenseMap<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
+ for (MapVector<MCSymbol*, MCSymbol*>::iterator I = TOC.begin(),
E = TOC.end(); I != E; ++I) {
OutStreamer.EmitLabel(I->second);
- OutStreamer.EmitRawText("\t.tc " + Twine(I->first->getName()) +
- "[TC]," + I->first->getName());
+ MCSymbol *S = OutContext.GetOrCreateSymbol(I->first->getName());
+ OutStreamer.EmitTCEntry(*S);
}
}
return AsmPrinter::doFinalization(M);
}
+/// EmitFunctionBodyEnd - Print the traceback table before the .size
+/// directive.
+///
+void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+ // Only the 64-bit target requires a traceback table. For now,
+ // we only emit the word of zeroes that GDB requires to find
+ // the end of the function, and zeroes for the eight-byte
+ // mandatory fields.
+ // FIXME: We should fill in the eight-byte mandatory fields as described in
+ // the PPC64 ELF ABI (this is a low-priority item because GDB does not
+ // currently make use of these fields).
+ if (Subtarget.isPPC64()) {
+ OutStreamer.EmitIntValue(0, 4/*size*/);
+ OutStreamer.EmitIntValue(0, 8/*size*/);
+ }
+}
+
void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
static const char *const CPUDirectives[] = {
"",
@@ -453,6 +506,8 @@ void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
"ppc750",
"ppc970",
"ppcA2",
+ "ppce500mc",
+ "ppce5500",
"power6",
"power7",
"ppc64"
@@ -508,7 +563,7 @@ static MCSymbol *GetAnonSym(MCSymbol *Sym, MCContext &Ctx) {
void PPCDarwinAsmPrinter::
EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
- bool isPPC64 = TM.getTargetData()->getPointerSizeInBits() == 64;
+ bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
const TargetLoweringObjectFileMachO &TLOFMacho =
static_cast<const TargetLoweringObjectFileMachO &>(getObjFileLowering());
@@ -603,7 +658,7 @@ EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
- bool isPPC64 = TM.getTargetData()->getPointerSizeInBits() == 64;
+ bool isPPC64 = TM.getDataLayout()->getPointerSizeInBits() == 64;
// Darwin/PPC always uses mach-o.
const TargetLoweringObjectFileMachO &TLOFMacho =
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index b2b53648561f..3f87e883b1e4 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -12,12 +12,19 @@
//
//===----------------------------------------------------------------------===//
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
//===----------------------------------------------------------------------===//
// Return Value Calling Convention
//===----------------------------------------------------------------------===//
// Return-value convention for PowerPC
def RetCC_PPC : CallingConv<[
+ // On PPC64, integer return values are always promoted to i64
+ CCIfType<[i32], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+
CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c24afa908d69..caf7bf2be793 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -13,6 +13,7 @@
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
+#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -49,6 +50,11 @@ static const uint16_t VRRegNo[] = {
/// to manipulate the VRSAVE register, even though it uses vector registers.
/// This can happen when the only registers used are known to be live in or out
/// of the function. Remove all of the VRSAVE related code from the function.
+/// FIXME: The removal of the code results in a compile failure at -O0 when the
+/// function contains a function call, as the GPR containing original VRSAVE
+/// contents is spilled and reloaded around the call. Without the prolog code,
+/// the spill instruction refers to an undefined register. This code needs
+/// to account for all uses of that GPR.
static void RemoveVRSaveCode(MachineInstr *MI) {
MachineBasicBlock *Entry = MI->getParent();
MachineFunction *MF = Entry->getParent();
@@ -168,6 +174,11 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
MI->eraseFromParent();
}
+static bool spillsCR(const MachineFunction &MF) {
+ const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ return FuncInfo->isCRSpilled();
+}
+
/// determineFrameLayout - Determine the size of the frame and maximum call
/// frame size.
void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
@@ -184,13 +195,22 @@ void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
// If we are a leaf function, and use up to 224 bytes of stack space,
// don't have a frame pointer, calls, or dynamic alloca then we do not need
- // to adjust the stack pointer (we fit in the Red Zone).
- bool DisableRedZone = MF.getFunction()->hasFnAttr(Attribute::NoRedZone);
- // FIXME SVR4 The 32-bit SVR4 ABI has no red zone.
+ // to adjust the stack pointer (we fit in the Red Zone). For 64-bit
+ // SVR4, we also require a stack frame if we need to spill the CR,
+ // since this spill area is addressed relative to the stack pointer.
+ bool DisableRedZone = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::NoRedZone);
+ // FIXME SVR4 The 32-bit SVR4 ABI has no red zone. However, it can
+ // still generate stackless code if all local vars are reg-allocated.
+ // Try: (FrameSize <= 224
+ // || (FrameSize == 0 && Subtarget.isPPC32 && Subtarget.isSVR4ABI()))
if (!DisableRedZone &&
FrameSize <= 224 && // Fits in red zone.
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->adjustsStack() && // No calls.
+ !(Subtarget.isPPC64() && // No 64-bit SVR4 CRsave.
+ Subtarget.isSVR4ABI()
+ && spillsCR(MF)) &&
(!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
// No need for frame
MFI->setStackSize(0);
@@ -241,7 +261,7 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
// Naked functions have no stack frame pushed, so we don't have a frame
// pointer.
- if (MF.getFunction()->hasFnAttr(Attribute::Naked))
+ if (MF.getFunction()->getFnAttributes().hasAttribute(Attributes::Naked))
return false;
return MF.getTarget().Options.DisableFramePointerElim(MF) ||
@@ -268,12 +288,13 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
// process it.
- for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
- if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
- HandleVRSaveUpdate(MBBI, TII);
- break;
+ if (!Subtarget.isSVR4ABI())
+ for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
+ if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
+ HandleVRSaveUpdate(MBBI, TII);
+ break;
+ }
}
- }
// Move MBBI back to the beginning of the function.
MBBI = MBB.begin();
@@ -488,7 +509,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
- int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
unsigned Reg = CSI[I].getReg();
if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
@@ -497,6 +517,25 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
if (PPC::CRBITRCRegClass.contains(Reg))
continue;
+ // For SVR4, don't emit a move for the CR spill slot if we haven't
+ // spilled CRs.
+ if (Subtarget.isSVR4ABI()
+ && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+ && !spillsCR(MF))
+ continue;
+
+ // For 64-bit SVR4 when we have spilled CRs, the spill location
+ // is SP+8, not a frame-relative slot.
+ if (Subtarget.isSVR4ABI()
+ && Subtarget.isPPC64()
+ && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ MachineLocation CSDst(PPC::X1, 8);
+ MachineLocation CSSrc(PPC::CR2);
+ Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ continue;
+ }
+
+ int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
MachineLocation CSSrc(Reg);
Moves.push_back(MachineMove(Label, CSDst, CSSrc));
@@ -714,11 +753,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-static bool spillsCR(const MachineFunction &MF) {
- const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- return FuncInfo->isCRSpilled();
-}
-
/// MustSaveLR - Return true if this function requires that we save the LR
/// register onto the stack in the prolog and restore it in the epilog of the
/// function.
@@ -808,7 +842,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
bool HasGPSaveArea = false;
bool HasG8SaveArea = false;
bool HasFPSaveArea = false;
- bool HasCRSaveArea = false;
bool HasVRSAVESaveArea = false;
bool HasVRSaveArea = false;
@@ -843,10 +876,9 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
if (Reg < MinFPR) {
MinFPR = Reg;
}
-// FIXME SVR4: Disable CR save area for now.
} else if (PPC::CRBITRCRegClass.contains(Reg) ||
PPC::CRRCRegClass.contains(Reg)) {
-// HasCRSaveArea = true;
+ ; // do nothing, as we already know whether CRs are spilled
} else if (PPC::VRSAVERCRegClass.contains(Reg)) {
HasVRSAVESaveArea = true;
} else if (PPC::VRRCRegClass.contains(Reg)) {
@@ -926,16 +958,21 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
}
- // The CR save area is below the general register save area.
- if (HasCRSaveArea) {
- // FIXME SVR4: Is it actually possible to have multiple elements in CSI
- // which have the CR/CRBIT register class?
+ // For 32-bit only, the CR save area is below the general register
+ // save area. For 64-bit SVR4, the CR save area is addressed relative
+ // to the stack pointer and hence does not need an adjustment here.
+ // Only CR2 (the first nonvolatile spilled) has an associated frame
+ // index so that we have a single uniform save area.
+ if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
// Adjust the frame index of the CR spill slot.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::CRBITRCRegClass.contains(Reg) ||
- PPC::CRRCRegClass.contains(Reg)) {
+ if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
+ // Leave Darwin logic as-is.
+ || (!Subtarget.isSVR4ABI() &&
+ (PPC::CRBITRCRegClass.contains(Reg) ||
+ PPC::CRRCRegClass.contains(Reg)))) {
int FI = CSI[i].getFrameIdx();
FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
@@ -973,3 +1010,184 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
}
}
+
+bool
+PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+
+ // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+ // Return false otherwise to maintain pre-existing behavior.
+ if (!Subtarget.isSVR4ABI())
+ return false;
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+ DebugLoc DL;
+ bool CRSpilled = false;
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ // CR2 through CR4 are the nonvolatile CR fields.
+ bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
+
+ if (CRSpilled && IsCRField)
+ continue;
+
+ // Add the callee-saved register as live-in; it's killed at the spill.
+ MBB.addLiveIn(Reg);
+
+ // Insert the spill to the stack frame.
+ if (IsCRField) {
+ CRSpilled = true;
+ // The first time we see a CR field, store the whole CR into the
+ // save slot via GPR12 (available in the prolog for 32- and 64-bit).
+ if (Subtarget.isPPC64()) {
+ // 64-bit: SP+8
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::X12));
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::STW))
+ .addReg(PPC::X12,
+ getKillRegState(true))
+ .addImm(8)
+ .addReg(PPC::X1));
+ } else {
+ // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have
+ // the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12));
+ MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
+ .addReg(PPC::R12,
+ getKillRegState(true)),
+ CSI[i].getFrameIdx()));
+ }
+
+ // Record that we spill the CR in this function.
+ PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+ FuncInfo->setSpillsCR();
+ } else {
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true,
+ CSI[i].getFrameIdx(), RC, TRI);
+ }
+ }
+ return true;
+}
+
+static void
+restoreCRs(bool isPPC64, bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+ DebugLoc DL;
+ unsigned RestoreOp, MoveReg;
+
+ if (isPPC64) {
+ // 64-bit: SP+8
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::LWZ), PPC::X12)
+ .addImm(8)
+ .addReg(PPC::X1));
+ RestoreOp = PPC::MTCRF8;
+ MoveReg = PPC::X12;
+ } else {
+ // 32-bit: FP-relative
+ MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
+ PPC::R12),
+ CSI[CSIIndex].getFrameIdx()));
+ RestoreOp = PPC::MTCRF;
+ MoveReg = PPC::R12;
+ }
+
+ if (CR2Spilled)
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
+ .addReg(MoveReg));
+
+ if (CR3Spilled)
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3)
+ .addReg(MoveReg));
+
+ if (CR4Spilled)
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4)
+ .addReg(MoveReg));
+}
+
+bool
+PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+
+ // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+ // Return false otherwise to maintain pre-existing behavior.
+ if (!Subtarget.isSVR4ABI())
+ return false;
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+ bool CR2Spilled = false;
+ bool CR3Spilled = false;
+ bool CR4Spilled = false;
+ unsigned CSIIndex = 0;
+
+ // Initialize insertion-point logic; we will be restoring in reverse
+ // order of spill.
+ MachineBasicBlock::iterator I = MI, BeforeI = I;
+ bool AtStart = I == MBB.begin();
+
+ if (!AtStart)
+ --BeforeI;
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+
+ if (Reg == PPC::CR2) {
+ CR2Spilled = true;
+ // The spill slot is associated only with CR2, which is the
+      // first nonvolatile CR field spilled. Save its CSI index here.
+ CSIIndex = i;
+ continue;
+ } else if (Reg == PPC::CR3) {
+ CR3Spilled = true;
+ continue;
+ } else if (Reg == PPC::CR4) {
+ CR4Spilled = true;
+ continue;
+ } else {
+ // When we first encounter a non-CR register after seeing at
+ // least one CR register, restore all spilled CRs together.
+ if ((CR2Spilled || CR3Spilled || CR4Spilled)
+ && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ restoreCRs(Subtarget.isPPC64(), CR2Spilled, CR3Spilled, CR4Spilled,
+ MBB, I, CSI, CSIIndex);
+ CR2Spilled = CR3Spilled = CR4Spilled = false;
+ }
+
+ // Default behavior for non-CR saves.
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+ RC, TRI);
+ assert(I != MBB.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ }
+
+ // Insert in reverse order.
+ if (AtStart)
+ I = MBB.begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+
+  // If we haven't yet restored the CRs, do so now.
+ if (CR2Spilled || CR3Spilled || CR4Spilled)
+ restoreCRs(Subtarget.isPPC64(), CR2Spilled, CR3Spilled, CR4Spilled,
+ MBB, I, CSI, CSIIndex);
+
+ return true;
+}
+
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index d708541c6686..4d957b91c7bb 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -45,6 +45,16 @@ public:
RegScavenger *RS = NULL) const;
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
/// time).
@@ -170,23 +180,11 @@ public:
{PPC::R15, -68},
{PPC::R14, -72},
- // CR save area offset.
- // FIXME SVR4: Disable CR save area for now.
-// {PPC::CR2, -4},
-// {PPC::CR3, -4},
-// {PPC::CR4, -4},
-// {PPC::CR2LT, -4},
-// {PPC::CR2GT, -4},
-// {PPC::CR2EQ, -4},
-// {PPC::CR2UN, -4},
-// {PPC::CR3LT, -4},
-// {PPC::CR3GT, -4},
-// {PPC::CR3EQ, -4},
-// {PPC::CR3UN, -4},
-// {PPC::CR4LT, -4},
-// {PPC::CR4GT, -4},
-// {PPC::CR4EQ, -4},
-// {PPC::CR4UN, -4},
+ // CR save area offset. We map each of the nonvolatile CR fields
+ // to the slot for CR2, which is the first of the nonvolatile CR
+ // fields to be assigned, so that we only allocate one save slot.
+ // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+ {PPC::CR2, -4},
// VRSAVE save area offset.
{PPC::VRSAVE, -4},
@@ -228,27 +226,6 @@ public:
{PPC::F14, -144},
// General register save area offsets.
- // FIXME 64-bit SVR4: Are 32-bit registers actually allocated in 64-bit
- // mode?
- {PPC::R31, -4},
- {PPC::R30, -12},
- {PPC::R29, -20},
- {PPC::R28, -28},
- {PPC::R27, -36},
- {PPC::R26, -44},
- {PPC::R25, -52},
- {PPC::R24, -60},
- {PPC::R23, -68},
- {PPC::R22, -76},
- {PPC::R21, -84},
- {PPC::R20, -92},
- {PPC::R19, -100},
- {PPC::R18, -108},
- {PPC::R17, -116},
- {PPC::R16, -124},
- {PPC::R15, -132},
- {PPC::R14, -140},
-
{PPC::X31, -8},
{PPC::X30, -16},
{PPC::X29, -24},
@@ -268,24 +245,6 @@ public:
{PPC::X15, -136},
{PPC::X14, -144},
- // CR save area offset.
- // FIXME SVR4: Disable CR save area for now.
-// {PPC::CR2, -4},
-// {PPC::CR3, -4},
-// {PPC::CR4, -4},
-// {PPC::CR2LT, -4},
-// {PPC::CR2GT, -4},
-// {PPC::CR2EQ, -4},
-// {PPC::CR2UN, -4},
-// {PPC::CR3LT, -4},
-// {PPC::CR3GT, -4},
-// {PPC::CR3EQ, -4},
-// {PPC::CR3UN, -4},
-// {PPC::CR4LT, -4},
-// {PPC::CR4GT, -4},
-// {PPC::CR4EQ, -4},
-// {PPC::CR4UN, -4},
-
// VRSAVE save area offset.
{PPC::VRSAVE, -4},
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index a00f686adce1..254fea67fc4e 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -53,7 +53,9 @@ namespace {
GlobalBaseReg = 0;
SelectionDAGISel::runOnMachineFunction(MF);
- InsertVRSaveCode(MF);
+ if (!PPCSubTarget.isSVR4ABI())
+ InsertVRSaveCode(MF);
+
return true;
}
@@ -621,6 +623,88 @@ static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
}
}
+// getVCmpInst: return the vector compare instruction for the specified
+// vector type and condition code. Since this is for Altivec-specific code,
+// only the Altivec types (v16i8, v8i16, v4i32, and v4f32) are supported.
+static unsigned int getVCmpInst(MVT::SimpleValueType VecVT, ISD::CondCode CC) {
+ switch (CC) {
+ case ISD::SETEQ:
+ case ISD::SETUEQ:
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ if (VecVT == MVT::v16i8)
+ return PPC::VCMPEQUB;
+ else if (VecVT == MVT::v8i16)
+ return PPC::VCMPEQUH;
+ else if (VecVT == MVT::v4i32)
+ return PPC::VCMPEQUW;
+    // v4f32 != v4f32 can be translated to an unordered not-equal
+ else if (VecVT == MVT::v4f32)
+ return PPC::VCMPEQFP;
+ break;
+ case ISD::SETLT:
+ case ISD::SETGT:
+ case ISD::SETLE:
+ case ISD::SETGE:
+ if (VecVT == MVT::v16i8)
+ return PPC::VCMPGTSB;
+ else if (VecVT == MVT::v8i16)
+ return PPC::VCMPGTSH;
+ else if (VecVT == MVT::v4i32)
+ return PPC::VCMPGTSW;
+ else if (VecVT == MVT::v4f32)
+ return PPC::VCMPGTFP;
+ break;
+ case ISD::SETULT:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULE:
+ if (VecVT == MVT::v16i8)
+ return PPC::VCMPGTUB;
+ else if (VecVT == MVT::v8i16)
+ return PPC::VCMPGTUH;
+ else if (VecVT == MVT::v4i32)
+ return PPC::VCMPGTUW;
+ break;
+ case ISD::SETOEQ:
+ if (VecVT == MVT::v4f32)
+ return PPC::VCMPEQFP;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETOGT:
+ case ISD::SETOLE:
+ if (VecVT == MVT::v4f32)
+ return PPC::VCMPGTFP;
+ break;
+ case ISD::SETOGE:
+ if (VecVT == MVT::v4f32)
+ return PPC::VCMPGEFP;
+ break;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid integer vector compare condition");
+}
+
+// getVCmpEQInst: return the equal-compare instruction for the specified vector
+// type. Since this is for Altivec-specific code, only the Altivec types
+// (v16i8, v8i16, v4i32, and v4f32) are supported.
+static unsigned int getVCmpEQInst(MVT::SimpleValueType VecVT) {
+ switch (VecVT) {
+ case MVT::v16i8:
+ return PPC::VCMPEQUB;
+ case MVT::v8i16:
+ return PPC::VCMPEQUH;
+ case MVT::v4i32:
+ return PPC::VCMPEQUW;
+ case MVT::v4f32:
+ return PPC::VCMPEQFP;
+ default:
+ llvm_unreachable("Invalid integer vector compare condition");
+ }
+}
+
+
SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
DebugLoc dl = N->getDebugLoc();
unsigned Imm;
@@ -701,10 +785,67 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
}
}
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+  // Altivec vector compare instructions do not set any CR register by
+  // default, and vector compare operations return the same type as their
+  // operands.
+ if (LHS.getValueType().isVector()) {
+ EVT VecVT = LHS.getValueType();
+ MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy;
+ unsigned int VCmpInst = getVCmpInst(VT, CC);
+
+ switch (CC) {
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+ case ISD::SETNE:
+ case ISD::SETONE:
+ case ISD::SETUNE: {
+ SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+ return CurDAG->SelectNodeTo(N, PPC::VNOR, VecVT, VCmp, VCmp);
+ }
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ case ISD::SETULT:
+ return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, RHS, LHS);
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ case ISD::SETUGT:
+ return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ case ISD::SETUGE: {
+ // Small optimization: Altivec provides a 'Vector Compare Greater Than
+ // or Equal To' instruction (vcmpgefp), so in this case there is no
+ // need for extra logic for the equal compare.
+ if (VecVT.getSimpleVT().isFloatingPoint()) {
+ return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
+ } else {
+ SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
+ unsigned int VCmpEQInst = getVCmpEQInst(VT);
+ SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
+ return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpGT, VCmpEQ);
+ }
+ }
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ case ISD::SETULE: {
+ SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0);
+ unsigned int VCmpEQInst = getVCmpEQInst(VT);
+ SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
+ return CurDAG->SelectNodeTo(N, PPC::VOR, VecVT, VCmpLE, VCmpEQ);
+ }
+ default:
+ llvm_unreachable("Invalid vector compare type: should be expanded by legalize");
+ }
+ }
+
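The selection logic above leans on a few lane-wise identities, because AltiVec only provides equal and greater-than compares directly: GE is built from GT and EQ, LE from GT with swapped operands plus EQ, and NE from EQ followed by a NOR of the result with itself. A minimal standalone sketch (not part of the patch) of those identities on plain 32-bit lanes:

#include <cstdint>
#include <cstdio>

int main() {
  const int N = 4;
  int32_t a[N] = {1, 5, 7, -2};
  int32_t b[N] = {1, 6, 3, -2};
  for (int i = 0; i < N; ++i) {
    uint32_t eq = (a[i] == b[i]) ? 0xFFFFFFFFu : 0u; // vcmpequw lane result
    uint32_t gt = (a[i] >  b[i]) ? 0xFFFFFFFFu : 0u; // vcmpgtsw lane result
    uint32_t lt = (b[i] >  a[i]) ? 0xFFFFFFFFu : 0u; // vcmpgtsw, operands swapped
    uint32_t ge = gt | eq;    // SETGE lowered as VOR(VCMPGT, VCMPEQ)
    uint32_t le = lt | eq;    // SETLE lowered as VOR(swapped VCMPGT, VCMPEQ)
    uint32_t ne = ~(eq | eq); // SETNE lowered as VNOR(VCMPEQ, VCMPEQ)
    std::printf("lane %d: ge=%08X le=%08X ne=%08X\n",
                i, (unsigned)ge, (unsigned)le, (unsigned)ne);
  }
  return 0;
}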
bool Inv;
int OtherCondIdx;
unsigned Idx = getCRIdxForSetCC(CC, Inv, OtherCondIdx);
- SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+ SDValue CCReg = SelectCC(LHS, RHS, CC, dl);
SDValue IntCR;
// Force the ccreg into CR7.
@@ -717,7 +858,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
if (PPCSubTarget.hasMFOCRF() && OtherCondIdx == -1)
IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
CCReg), 0);
- else
+ else
IntCR = SDValue(CurDAG->getMachineNode(PPC::MFCRpseud, dl, MVT::i32,
CR7Reg, CCReg), 0);
@@ -975,6 +1116,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
case ISD::AND: {
unsigned Imm, Imm2, SH, MB, ME;
+ uint64_t Imm64;
// If this is an and of a value rotated between 0 and 31 bits and then and'd
// with a mask, emit rlwinm
@@ -993,6 +1135,14 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
}
+ // If this is a 64-bit zero-extension mask, emit rldicl.
+ if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+ isMask_64(Imm64)) {
+ SDValue Val = N->getOperand(0);
+ MB = 64 - CountTrailingOnes_64(Imm64);
+ SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB) };
+ return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
+ }
// AND X, 0 -> 0, not "rlwinm 32".
if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
ReplaceUses(SDValue(N, 0), N->getOperand(1));
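The new rldicl selection above turns an AND with a mask of K low ones into a rotate-and-clear-left with MB = 64 - K. A small standalone sketch (not part of the patch; the helpers only mimic the LLVM ones) of that computation:

#include <cstdint>
#include <cstdio>

// True if v is a non-empty run of ones starting at bit 0 (e.g. 0x3F, 0xFFFFFFFF).
static bool isLowMask64(uint64_t v) { return v != 0 && ((v + 1) & v) == 0; }

static unsigned countTrailingOnes64(uint64_t v) {
  unsigned n = 0;
  while (v & 1) { v >>= 1; ++n; }
  return n;
}

int main() {
  uint64_t Imm64 = 0xFFFFFFFFull; // AND with a 32-bit zero-extension mask
  if (isLowMask64(Imm64)) {
    unsigned MB = 64 - countTrailingOnes64(Imm64);
    // rldicl val, val, 0, MB keeps the low 64 - MB bits and clears the rest.
    std::printf("emit rldicl val, val, 0, %u\n", MB); // prints MB = 32
  }
  return 0;
}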
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 61d44c52d438..adf78d5233ae 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -361,6 +361,22 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
setOperationAction(ISD::CTTZ, VT, Expand);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
+
+ for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+ j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
+ MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
+ setTruncStoreAction(VT, InnerVT, Expand);
+ }
+ setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, Expand);
+ }
+
+ for (unsigned i = (unsigned)MVT::FIRST_FP_VECTOR_VALUETYPE;
+ i <= (unsigned)MVT::LAST_FP_VECTOR_VALUETYPE; ++i) {
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+ setOperationAction(ISD::FSQRT, VT, Expand);
}
// We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
@@ -373,6 +389,10 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::LOAD , MVT::v4i32, Legal);
setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
@@ -392,6 +412,14 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+
+    // Altivec does not provide unordered floating-point compare instructions
+ setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
+ setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);
}
if (Subtarget->has64BitSupport()) {
@@ -449,6 +477,21 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
setSchedulingPreference(Sched::Hybrid);
computeRegisterProperties();
+
+  // The Freescale cores do better with aggressive inlining of memcpy and
+  // friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
+ if (Subtarget->getDarwinDirective() == PPC::DIR_E500mc ||
+ Subtarget->getDarwinDirective() == PPC::DIR_E5500) {
+ maxStoresPerMemset = 32;
+ maxStoresPerMemsetOptSize = 16;
+ maxStoresPerMemcpy = 32;
+ maxStoresPerMemcpyOptSize = 8;
+ maxStoresPerMemmove = 32;
+ maxStoresPerMemmoveOptSize = 8;
+
+ setPrefFunctionAlignment(4);
+ benefitFromCodePlacementOpt = true;
+ }
}
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
@@ -517,11 +560,15 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
case PPCISD::MTFSF: return "PPCISD::MTFSF";
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
+ case PPCISD::CR6SET: return "PPCISD::CR6SET";
+ case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
}
}
EVT PPCTargetLowering::getSetCCResultType(EVT VT) const {
- return MVT::i32;
+ if (!VT.isVector())
+ return MVT::i32;
+ return VT.changeVectorElementTypeToInteger();
}
//===----------------------------------------------------------------------===//
@@ -811,14 +858,13 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
}
// Properly sign extend the value.
- int ShAmt = (4-ByteSize)*8;
- int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+ int MaskVal = SignExtend32(Value, ByteSize * 8);
// If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
if (MaskVal == 0) return SDValue();
// Finally, if this value fits in a 5 bit sext field, return it
- if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+ if (SignExtend32<5>(MaskVal) == MaskVal)
return DAG.getTargetConstant(MaskVal, MVT::i32);
return SDValue();
}
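The rewritten check above uses the SignExtend32<N> idiom to test whether a value fits in an N-bit signed immediate field (5 bits here, and 26 bits further down for BLA-compatible call targets). A small standalone sketch (not part of the patch) of the same idiom:

#include <cstdint>
#include <cstdio>

template <unsigned B> static int32_t signExtend32(uint32_t x) {
  return int32_t(x << (32 - B)) >> (32 - B);
}

int main() {
  const int32_t vals[] = {15, -16, 16, -17};
  for (int32_t v : vals)
    std::printf("%3d fits in a 5-bit signed immediate: %s\n", v,
                signExtend32<5>(uint32_t(v)) == v ? "yes" : "no");
  return 0;
}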
@@ -1204,6 +1250,14 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
const Constant *C = CP->getConstVal();
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the constant pool entry is stored in the TOC.
+ if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+ return DAG.getNode(PPCISD::TOC_ENTRY, CP->getDebugLoc(), MVT::i64, GA,
+ DAG.getRegister(PPC::X2, MVT::i64));
+ }
+
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
SDValue CPIHi =
@@ -1217,6 +1271,14 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  // 64-bit SVR4 ABI code is always position-independent.
+  // The actual address of the jump table is stored in the TOC.
+ if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
+ SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+ return DAG.getNode(PPCISD::TOC_ENTRY, JT->getDebugLoc(), MVT::i64, GA,
+ DAG.getRegister(PPC::X2, MVT::i64));
+ }
+
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
@@ -1232,8 +1294,8 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
- SDValue TgtBAHi = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOHiFlag);
- SDValue TgtBALo = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOLoFlag);
+ SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+ SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
}
@@ -1441,7 +1503,7 @@ SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
MachinePointerInfo(),
MVT::i32, false, false, 0);
- return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
+ return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
false, false, false, 0);
}
@@ -1461,7 +1523,7 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = (PtrVT == MVT::i64);
Type *IntPtrTy =
- DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
+ DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
*DAG.getContext());
TargetLowering::ArgListTy Args;
@@ -1684,9 +1746,13 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals)
const {
- if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
- return LowerFormalArguments_SVR4(Chain, CallConv, isVarArg, Ins,
- dl, DAG, InVals);
+ if (PPCSubTarget.isSVR4ABI()) {
+ if (PPCSubTarget.isPPC64())
+ return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
+ else
+ return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
} else {
return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
dl, DAG, InVals);
@@ -1694,7 +1760,7 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
}
SDValue
-PPCTargetLowering::LowerFormalArguments_SVR4(
+PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
@@ -1911,6 +1977,334 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
return Chain;
}
+// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+// value to MVT::i64 and then truncate to the correct register size.
+SDValue
+PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
+ SelectionDAG &DAG, SDValue ArgVal,
+ DebugLoc dl) const {
+ if (Flags.isSExt())
+ ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
+ DAG.getValueType(ObjectVT));
+ else if (Flags.isZExt())
+ ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
+ DAG.getValueType(ObjectVT));
+
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
+}
+
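extendArgForPPC64 above models how an i8/i16/i32 argument arrives already widened in a 64-bit GPR and only needs an assertion about its extension plus a truncate. A standalone sketch (not part of the patch; the values are illustrative) of the two register images involved:

#include <cstdint>
#include <cstdio>

int main() {
  int32_t arg = -5;
  uint64_t asSext = (uint64_t)(int64_t)arg;   // image of a signext i32 in a GPR
  uint64_t asZext = (uint64_t)(uint32_t)arg;  // image of a zeroext i32 in a GPR
  // Truncating back to 32 bits recovers the same value either way, which is
  // what the ISD::TRUNCATE at the end of extendArgForPPC64 models.
  std::printf("sext image: %016llx -> %d\n",
              (unsigned long long)asSext, (int32_t)asSext);
  std::printf("zext image: %016llx -> %d\n",
              (unsigned long long)asZext, (int32_t)asZext);
  return 0;
}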
+// Set the size that is at least reserved in caller of this function. Tail
+// call optimized functions' reserved stack space needs to be aligned so that
+// taking the difference between two stack areas will result in an aligned
+// stack.
+void
+PPCTargetLowering::setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+ unsigned nAltivecParamsAtEnd,
+ unsigned MinReservedArea,
+ bool isPPC64) const {
+ PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+ // Add the Altivec parameters at the end, if needed.
+ if (nAltivecParamsAtEnd) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += 16*nAltivecParamsAtEnd;
+ }
+ MinReservedArea =
+ std::max(MinReservedArea,
+ PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
+ unsigned TargetAlign
+ = DAG.getMachineFunction().getTarget().getFrameLowering()->
+ getStackAlignment();
+ unsigned AlignMask = TargetAlign-1;
+ MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
+ FI->setMinReservedArea(MinReservedArea);
+}
+
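setMinReservedArea above rounds the reserved area up twice, once with the divide/multiply form and once with an alignment mask. A tiny standalone sketch (not part of the patch) showing the two idioms agree for power-of-two alignments:

#include <cstdio>

int main() {
  const unsigned Align = 16;
  const unsigned AlignMask = Align - 1;
  const unsigned sizes[] = {0, 1, 15, 16, 17, 100};
  for (unsigned s : sizes) {
    unsigned a = ((s + 15) / 16) * 16;          // divide/multiply form
    unsigned b = (s + AlignMask) & ~AlignMask;  // mask form
    std::printf("%3u -> %3u / %3u\n", s, a, b); // the two columns match
  }
  return 0;
}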
+SDValue
+PPCTargetLowering::LowerFormalArguments_64SVR4(
+ SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg>
+ &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ // TODO: add description of PPC stack frame format, or at least some docs.
+ //
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ // Potential tail calls could cause overwriting of argument stack slots.
+ bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ unsigned PtrByteSize = 8;
+
+ unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+ // Area that is at least reserved in caller of this function.
+ unsigned MinReservedArea = ArgOffset;
+
+ static const uint16_t GPR[] = {
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+
+ static const uint16_t *FPR = GetFPR();
+
+ static const uint16_t VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+
+ const unsigned Num_GPR_Regs = array_lengthof(GPR);
+ const unsigned Num_FPR_Regs = 13;
+ const unsigned Num_VR_Regs = array_lengthof(VR);
+
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+ // Add DAG nodes to load the arguments or copy them out of registers. On
+ // entry to a function on PPC, the arguments start after the linkage area,
+ // although the first ones are often in registers.
+
+ SmallVector<SDValue, 8> MemOps;
+ unsigned nAltivecParamsAtEnd = 0;
+ Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
+ SDValue ArgVal;
+ bool needsLoad = false;
+ EVT ObjectVT = Ins[ArgNo].VT;
+ unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+ unsigned ArgSize = ObjSize;
+ ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
+
+ unsigned CurArgOffset = ArgOffset;
+
+ // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
+ if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
+ ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
+ if (isVarArg) {
+ MinReservedArea = ((MinReservedArea+15)/16)*16;
+ MinReservedArea += CalculateStackSlotSize(ObjectVT,
+ Flags,
+ PtrByteSize);
+ } else
+ nAltivecParamsAtEnd++;
+ } else
+ // Calculate min reserved area.
+ MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
+ Flags,
+ PtrByteSize);
+
+ // FIXME the codegen can be much improved in some cases.
+ // We do not have to keep everything in memory.
+ if (Flags.isByVal()) {
+ // ObjSize is the true size, ArgSize rounded up to multiple of registers.
+ ObjSize = Flags.getByValSize();
+      // ObjSize is the true size; ArgSize is ObjSize rounded up to a multiple
+      // of the pointer size.
+ // Empty aggregate parameters do not take up registers. Examples:
+ // struct { } a;
+ // union { } b;
+ // int c[0];
+ // etc. However, we have to provide a place-holder in InVals, so
+ // pretend we have an 8-byte item at the current address for that
+ // purpose.
+ if (!ObjSize) {
+ int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+ continue;
+ }
+ // All aggregates smaller than 8 bytes must be passed right-justified.
+ if (ObjSize < PtrByteSize)
+ CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
+ // The value of the object is its address.
+ int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+
+ if (ObjSize < 8) {
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store;
+
+ if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
+ EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+ (ObjSize == 2 ? MVT::i16 : MVT::i32));
+ Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(FuncArg, CurArgOffset),
+ ObjType, false, false, 0);
+ } else {
+ // For sizes that don't fit a truncating store (3, 5, 6, 7),
+ // store the whole register as-is to the parameter save area
+ // slot. The address of the parameter was already calculated
+ // above (InVals.push_back(FIN)) to be the right-justified
+ // offset within the slot. For this store, we need a new
+ // frame index that points at the beginning of the slot.
+ int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(FuncArg, ArgOffset),
+ false, false, 0);
+ }
+
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ }
+ // Whether we copied from a register or not, advance the offset
+ // into the parameter save area by a full doubleword.
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+
+ for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
+ // Store whatever pieces of the object are in registers
+ // to memory. ArgOffset will be the address of the beginning
+ // of the object.
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg;
+ VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(FuncArg, ArgOffset),
+ false, false, 0);
+ MemOps.push_back(Store);
+ ++GPR_idx;
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ArgSize - j;
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (ObjectVT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled argument type!");
+ case MVT::i32:
+ case MVT::i64:
+ if (GPR_idx != Num_GPR_Regs) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+
+ if (ObjectVT == MVT::i32)
+ // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
+ // value to MVT::i64 and then truncate to the correct register size.
+ ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
+
+ ++GPR_idx;
+ } else {
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+ ArgOffset += 8;
+ break;
+
+ case MVT::f32:
+ case MVT::f64:
+ // Every 8 bytes of argument space consumes one of the GPRs available for
+ // argument passing.
+ if (GPR_idx != Num_GPR_Regs) {
+ ++GPR_idx;
+ }
+ if (FPR_idx != Num_FPR_Regs) {
+ unsigned VReg;
+
+ if (ObjectVT == MVT::f32)
+ VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
+ else
+ VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);
+
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ ++FPR_idx;
+ } else {
+ needsLoad = true;
+ ArgSize = PtrByteSize;
+ }
+
+ ArgOffset += 8;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ // Note that vector arguments in registers don't reserve stack space,
+ // except in varargs functions.
+ if (VR_idx != Num_VR_Regs) {
+ unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
+ ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+ if (isVarArg) {
+ while ((ArgOffset % 16) != 0) {
+ ArgOffset += PtrByteSize;
+ if (GPR_idx != Num_GPR_Regs)
+ GPR_idx++;
+ }
+ ArgOffset += 16;
+ GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
+ }
+ ++VR_idx;
+ } else {
+ // Vectors are aligned.
+ ArgOffset = ((ArgOffset+15)/16)*16;
+ CurArgOffset = ArgOffset;
+ ArgOffset += 16;
+ needsLoad = true;
+ }
+ break;
+ }
+
+ // We need to load the argument to a virtual register if we determined
+ // above that we ran out of physical registers of the appropriate type.
+ if (needsLoad) {
+ int FI = MFI->CreateFixedObject(ObjSize,
+ CurArgOffset + (ArgSize - ObjSize),
+ isImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
+ false, false, false, 0);
+ }
+
+ InVals.push_back(ArgVal);
+ }
+
+ // Set the size that is at least reserved in caller of this function. Tail
+ // call optimized functions' reserved stack space needs to be aligned so that
+ // taking the difference between two stack areas will result in an aligned
+ // stack.
+ setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, true);
+
+ // If the function takes variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start.
+ if (isVarArg) {
+ int Depth = ArgOffset;
+
+ FuncInfo->setVarArgsFrameIndex(
+ MFI->CreateFixedObject(PtrByteSize, Depth, true));
+ SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+    // If this function is vararg, store any remaining integer argument regs
+    // to their spots on the stack so that they may be loaded by dereferencing
+    // the result of va_next.
+ for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
+ unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ MachinePointerInfo(), false, false, 0);
+ MemOps.push_back(Store);
+      // Increment the address by the pointer size (eight bytes) for the next
+      // argument to store.
+ SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl,
+ MVT::Other, &MemOps[0], MemOps.size());
+
+ return Chain;
+}
+
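LowerFormalArguments_64SVR4 above right-justifies aggregates smaller than a doubleword inside their 8-byte parameter-save-area slot. A standalone sketch (not part of the patch; the 48-byte starting offset is only an illustrative assumption) of the offsets that logic produces:

#include <cstdio>

int main() {
  const unsigned PtrByteSize = 8;
  const unsigned objSizes[] = {1, 2, 3, 4, 7, 8, 12};
  unsigned ArgOffset = 48; // hypothetical start of the parameter save area
  for (unsigned ObjSize : objSizes) {
    unsigned ArgSize = ((ObjSize + PtrByteSize - 1) / PtrByteSize) * PtrByteSize;
    unsigned CurArgOffset = ArgOffset;
    if (ObjSize < PtrByteSize)
      CurArgOffset += PtrByteSize - ObjSize; // right-justify in the slot
    std::printf("size %2u: slot at %u, value at %u, advance by %u\n",
                ObjSize, ArgOffset, CurArgOffset, ArgSize);
    ArgOffset += ArgSize;
  }
  return 0;
}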
SDValue
PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue Chain,
@@ -1987,10 +2381,12 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
default: llvm_unreachable("Unhandled argument type!");
case MVT::i32:
case MVT::f32:
- VecArgOffset += isPPC64 ? 8 : 4;
+ VecArgOffset += 4;
break;
case MVT::i64: // PPC64
case MVT::f64:
+ // FIXME: We are guaranteed to be !isPPC64 at this point.
+ // Does MVT::i64 apply?
VecArgOffset += 8;
break;
case MVT::v4f32:
@@ -2013,7 +2409,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SmallVector<SDValue, 8> MemOps;
unsigned nAltivecParamsAtEnd = 0;
- for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+ Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
+ for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo, ++FuncArg) {
SDValue ArgVal;
bool needsLoad = false;
EVT ObjectVT = Ins[ArgNo].VT;
@@ -2061,10 +2458,11 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(),
- ObjSize==1 ? MVT::i8 : MVT::i16,
- false, false, 0);
+ MachinePointerInfo(FuncArg,
+ CurArgOffset),
+ ObjType, false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
}
@@ -2075,8 +2473,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
}
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
// Store whatever pieces of the object are in registers
- // to memory. ArgVal will be address of the beginning of
- // the object.
+ // to memory. ArgOffset will be the address of the beginning
+ // of the object.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
@@ -2087,7 +2485,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo(),
+ MachinePointerInfo(FuncArg, ArgOffset),
false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
@@ -2122,18 +2520,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
- if (ObjectVT == MVT::i32) {
+ if (ObjectVT == MVT::i32)
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
// value to MVT::i64 and then truncate to the correct register size.
- if (Flags.isSExt())
- ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
- DAG.getValueType(ObjectVT));
- else if (Flags.isZExt())
- ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
- DAG.getValueType(ObjectVT));
-
- ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
- }
+ ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
++GPR_idx;
} else {
@@ -2220,23 +2610,10 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
}
// Set the size that is at least reserved in caller of this function. Tail
- // call optimized function's reserved stack space needs to be aligned so that
+ // call optimized functions' reserved stack space needs to be aligned so that
// taking the difference between two stack areas will result in an aligned
// stack.
- PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
- // Add the Altivec parameters at the end, if needed.
- if (nAltivecParamsAtEnd) {
- MinReservedArea = ((MinReservedArea+15)/16)*16;
- MinReservedArea += 16*nAltivecParamsAtEnd;
- }
- MinReservedArea =
- std::max(MinReservedArea,
- PPCFrameLowering::getMinCallFrameSize(isPPC64, true));
- unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameLowering()->
- getStackAlignment();
- unsigned AlignMask = TargetAlign-1;
- MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
- FI->setMinReservedArea(MinReservedArea);
+ setMinReservedArea(MF, DAG, nAltivecParamsAtEnd, MinReservedArea, isPPC64);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
@@ -2276,8 +2653,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
return Chain;
}
-/// CalculateParameterAndLinkageAreaSize - Get the size of the paramter plus
-/// linkage area for the Darwin ABI.
+/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
+/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI.
static unsigned
CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
bool isPPC64,
@@ -2408,7 +2785,7 @@ static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
int Addr = C->getZExtValue();
if ((Addr & 3) != 0 || // Low 2 bits are implicitly zero.
- (Addr << 6 >> 6) != Addr)
+ SignExtend32<26>(Addr) != Addr)
return 0; // Top 6 bits have to be sext of immediate.
return DAG.getConstant((int)C->getZExtValue() >> 2,
@@ -2686,7 +3063,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
// Thus for a call through a function pointer, the following actions need
// to be performed:
// 1. Save the TOC of the caller in the TOC save area of its stack
- // frame (this is done in LowerCall_Darwin()).
+ // frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
// 2. Load the address of the function entry point from the function
// descriptor.
// 3. Load the TOC of the callee from the function descriptor into r2.
@@ -2776,6 +3153,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
return CallOpc;
}
+static
+bool isLocalCall(const SDValue &Callee)
+{
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ return !G->getGlobal()->isDeclaration() &&
+ !G->getGlobal()->isWeakForLinker();
+ return false;
+}
+
SDValue
PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
@@ -2791,12 +3177,32 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
- EVT VT = VA.getValVT();
assert(VA.isRegLoc() && "Can only return in registers!");
- Chain = DAG.getCopyFromReg(Chain, dl,
- VA.getLocReg(), VT, InFlag).getValue(1);
- InVals.push_back(Chain.getValue(0));
- InFlag = Chain.getValue(2);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl,
+ VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ break;
+ }
+
+ InVals.push_back(Val);
}
return Chain;
@@ -2819,6 +3225,10 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
isTailCall, RegsToPass, Ops, NodeTys,
PPCSubTarget);
+ // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
+ if (isVarArg && PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+ Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
+
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCRegisterInfo::eliminateCallFramePseudoInstr.
@@ -2880,8 +3290,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
// from allocating it), resulting in an additional register being
// allocated and an unnecessary move instruction being generated.
needsTOCRestore = true;
- } else if (CallOpc == PPCISD::CALL_SVR4) {
- // Otherwise insert NOP.
+ } else if ((CallOpc == PPCISD::CALL_SVR4) && !isLocalCall(Callee)) {
+ // Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP_SVR4;
}
}
@@ -2923,10 +3333,16 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
- if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
- return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, Outs, OutVals, Ins,
- dl, DAG, InVals);
+ if (PPCSubTarget.isSVR4ABI()) {
+ if (PPCSubTarget.isPPC64())
+ return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, Outs, OutVals, Ins,
+ dl, DAG, InVals);
+ else
+ return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, Outs, OutVals, Ins,
+ dl, DAG, InVals);
+ }
return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
isTailCall, Outs, OutVals, Ins,
@@ -2934,15 +3350,15 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
SDValue
-PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- // See PPCTargetLowering::LowerFormalArguments_SVR4() for a description
+PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
assert((CallConv == CallingConv::C ||
@@ -3116,12 +3532,406 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Set CR6 to true if this is a vararg call with floating args passed in
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into the appropriate regs.
+ SDValue InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+ RegsToPass[i].second, InFlag);
+ InFlag = Chain.getValue(1);
+ }
+
+ // Set CR bit 6 to true if this is a vararg call with floating args passed in
// registers.
if (isVarArg) {
- SDValue SetCR(DAG.getMachineNode(seenFloatArg ? PPC::CRSET : PPC::CRUNSET,
- dl, MVT::i32), 0);
- RegsToPass.push_back(std::make_pair(unsigned(PPC::CR1EQ), SetCR));
+ SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Ops[] = { Chain, InFlag };
+
+ Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
+ dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+
+ InFlag = Chain.getValue(1);
+ }
+
+ if (isTailCall)
+ PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+ false, TailCallArguments);
+
+ return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+ RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+ Ins, InVals);
+}
+
+// Copy an argument into memory, being careful to do this outside the
+// call sequence for the call to which the argument belongs.
+SDValue
+PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+ SDValue CallSeqStart,
+ ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG,
+ DebugLoc dl) const {
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, dl);
+ // The MEMCPY must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ return NewCallSeqStart;
+}
+
+SDValue
+PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+
+ unsigned NumOps = Outs.size();
+
+ EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+ unsigned PtrByteSize = 8;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a tail call. As a
+  // consequence the frame pointer will be used for dynamic stack allocation
+  // and for restoring the caller's stack pointer in this function's epilogue.
+  // This is done because, by tail calling, the called function might overwrite
+  // the value in this function's (MF) stack pointer stack slot 0(SP).
+ if (getTargetMachine().Options.GuaranteedTailCallOpt &&
+ CallConv == CallingConv::Fast)
+ MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+ unsigned nAltivecParamsAtEnd = 0;
+
+ // Count how many bytes are to be pushed on the stack, including the linkage
+ // area, and parameter passing area. We start with at least 48 bytes, which
+ // is reserved space for [SP][CR][LR][3 x unused].
+ // NOTE: For PPC64, nAltivecParamsAtEnd always remains zero as a result
+ // of this call.
+ unsigned NumBytes =
+ CalculateParameterAndLinkageAreaSize(DAG, true, isVarArg, CallConv,
+ Outs, OutVals, nAltivecParamsAtEnd);
+
+ // Calculate by how many bytes the stack has to be adjusted in case of tail
+ // call optimization.
+ int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+ // To protect arguments on the stack from being clobbered in a tail call,
+ // force all the loads to happen before doing any other lowering.
+ if (isTailCall)
+ Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+ // Adjust the stack pointer for the new arguments...
+ // These operations are automatically eliminated by the prolog/epilog pass
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+ SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so they can be moved somewhere
+  // else later.
+ SDValue LROp, FPOp;
+ Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+ dl);
+
+ // Set up a copy of the stack pointer for use loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+
+ // Figure out which arguments are going to go in registers, and which in
+ // memory. Also, if this is a vararg function, floating point operations
+ // must be stored to our stack, and loaded into integer regs as well, if
+ // any integer regs are available for argument passing.
+ unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
+ unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+ static const uint16_t GPR[] = {
+ PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+ };
+ static const uint16_t *FPR = GetFPR();
+
+ static const uint16_t VR[] = {
+ PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+ PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+ };
+ const unsigned NumGPRs = array_lengthof(GPR);
+ const unsigned NumFPRs = 13;
+ const unsigned NumVRs = array_lengthof(VR);
+
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+ SmallVector<SDValue, 8> MemOpChains;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+ // PtrOff will be used to store the current argument to the stack if a
+ // register cannot be found for it.
+ SDValue PtrOff;
+
+ PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+ // Promote integers to 64-bit values.
+ if (Arg.getValueType() == MVT::i32) {
+ // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+ unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+ }
+
+ // FIXME memcpy is used way more than necessary. Correctness first.
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
+ if (Flags.isByVal()) {
+ // Note: Size includes alignment padding, so
+ // struct x { short a; char b; }
+ // will have Size = 4. With #pragma pack(1), it will have Size = 3.
+ // These are the proper values we need for right-justifying the
+ // aggregate in a parameter register.
+ unsigned Size = Flags.getByValSize();
+
+ // An empty aggregate parameter takes up no storage and no
+ // registers.
+ if (Size == 0)
+ continue;
+
+ // All aggregates smaller than 8 bytes must be passed right-justified.
+ if (Size==1 || Size==2 || Size==4) {
+ EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+ MachinePointerInfo(), VT,
+ false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+ }
+
+ if (GPR_idx == NumGPRs && Size < 8) {
+ SDValue Const = DAG.getConstant(PtrByteSize - Size,
+ PtrOff.getValueType());
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+ CallSeqStart,
+ Flags, DAG, dl);
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+ // Copy entire object into memory. There are cases where gcc-generated
+ // code assumes it is there, even if it could be put entirely into
+ // registers. (This is not what the doc says.)
+
+ // FIXME: The above statement is likely due to a misunderstanding of the
+ // documents. All arguments must be copied into the parameter area BY
+ // THE CALLEE in the event that the callee takes the address of any
+ // formal argument. That has not yet been implemented. However, it is
+ // reasonable to use the stack area as a staging area for the register
+ // load.
+
+ // Skip this for small aggregates, as we will use the same slot for a
+ // right-justified copy, below.
+ if (Size >= 8)
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+ CallSeqStart,
+ Flags, DAG, dl);
+
+ // When a register is available, pass a small aggregate right-justified.
+ if (Size < 8 && GPR_idx != NumGPRs) {
+ // The easiest way to get this right-justified in a register
+ // is to copy the structure into the rightmost portion of a
+ // local variable slot, then load the whole slot into the
+ // register.
+ // FIXME: The memcpy seems to produce pretty awful code for
+ // small aggregates, particularly for packed ones.
+ // FIXME: It would be preferable to use the slot in the
+ // parameter save area instead of a new local variable.
+ SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+ CallSeqStart,
+ Flags, DAG, dl);
+
+ // Load the slot into the register.
+ SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+ // Done with this argument.
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+
+ // For aggregates larger than PtrByteSize, copy the pieces of the
+ // object that fit into registers from the parameter save area.
+ for (unsigned j=0; j<Size; j+=PtrByteSize) {
+ SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+ SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ ArgOffset += PtrByteSize;
+ } else {
+ ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+ break;
+ }
+ }
+ continue;
+ }
+
+ switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unexpected ValueType for argument!");
+ case MVT::i32:
+ case MVT::i64:
+ if (GPR_idx != NumGPRs) {
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+ } else {
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ }
+ ArgOffset += PtrByteSize;
+ break;
+ case MVT::f32:
+ case MVT::f64:
+ if (FPR_idx != NumFPRs) {
+ RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+ if (isVarArg) {
+ // A single float or an aggregate containing only a single float
+ // must be passed right-justified in the stack doubleword, and
+ // in the GPR, if one is available.
+ SDValue StoreOff;
+ if (Arg.getValueType().getSimpleVT().SimpleTy == MVT::f32) {
+ SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+ StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+ } else
+ StoreOff = PtrOff;
+
+ SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
+ MachinePointerInfo(), false, false, 0);
+ MemOpChains.push_back(Store);
+
+ // Float varargs are always shadowed in available integer registers
+ if (GPR_idx != NumGPRs) {
+ SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
+ MachinePointerInfo(), false, false,
+ false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ } else if (GPR_idx != NumGPRs)
+ // If we have any FPRs remaining, we may also have GPRs remaining.
+ ++GPR_idx;
+ } else {
+ // Single-precision floating-point values are mapped to the
+ // second (rightmost) word of the stack doubleword.
+ if (Arg.getValueType() == MVT::f32) {
+ SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+ }
+
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, false, MemOpChains,
+ TailCallArguments, dl);
+ }
+ ArgOffset += 8;
+ break;
+ case MVT::v4f32:
+ case MVT::v4i32:
+ case MVT::v8i16:
+ case MVT::v16i8:
+ if (isVarArg) {
+ // These go aligned on the stack, or in the corresponding R registers
+ // when within range. The Darwin PPC ABI doc claims they also go in
+ // V registers; in fact gcc does this only for arguments that are
+ // prototyped, not for those that match the ... We do it for all
+ // arguments, seems to work.
+ while (ArgOffset % 16 !=0) {
+ ArgOffset += PtrByteSize;
+ if (GPR_idx != NumGPRs)
+ GPR_idx++;
+ }
+ // We could elide this store in the case where the object fits
+ // entirely in R registers. Maybe later.
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+ DAG.getConstant(ArgOffset, PtrVT));
+ SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
+ MachinePointerInfo(), false, false, 0);
+ MemOpChains.push_back(Store);
+ if (VR_idx != NumVRs) {
+ SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+ }
+ ArgOffset += 16;
+ for (unsigned i=0; i<16; i+=PtrByteSize) {
+ if (GPR_idx == NumGPRs)
+ break;
+ SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+ DAG.getConstant(i, PtrVT));
+ SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, MachinePointerInfo(),
+ false, false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+ }
+ break;
+ }
+
+ // Non-varargs Altivec params generally go in registers, but have
+ // stack space allocated at the end.
+ if (VR_idx != NumVRs) {
+ // Doesn't have GPR space allocated.
+ RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+ } else {
+ LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+ true, isTailCall, true, MemOpChains,
+ TailCallArguments, dl);
+ ArgOffset += 16;
+ }
+ break;
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Check if this is an indirect call (MTCTR/BCTRL).
+ // See PrepareCall() for more information about calls through function
+ // pointers in the 64-bit SVR4 ABI.
+ if (!isTailCall &&
+ !dyn_cast<GlobalAddressSDNode>(Callee) &&
+ !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+ !isBLACompatibleAddress(Callee, DAG)) {
+ // Load r2 into a virtual register and store it to the TOC save area.
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+ // TOC save area offset.
+ SDValue PtrOff = DAG.getIntPtrConstant(40);
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
+ false, false, 0);
+ // R12 must contain the address of an indirect callee. This does not
+ // mean the MTCTR instruction must use R12; it's easier to model this
+ // as an extra parameter, so do that.
+ RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
// Build a sequence of copy-to-reg nodes chained together with token chain
@@ -3134,8 +3944,8 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
}
if (isTailCall)
- PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
- false, TailCallArguments);
+ PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
+ FPOp, true, TailCallArguments);
return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
@@ -3152,7 +3962,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
- unsigned NumOps = Outs.size();
+ unsigned NumOps = Outs.size();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = PtrVT == MVT::i64;
@@ -3259,11 +4069,13 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
}
// FIXME memcpy is used way more than necessary. Correctness first.
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
if (Flags.isByVal()) {
unsigned Size = Flags.getByValSize();
+ // Very small objects are passed right-justified. Everything else is
+ // passed left-justified.
if (Size==1 || Size==2) {
- // Very small objects are passed right-justified.
- // Everything else is passed left-justified.
EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
@@ -3274,17 +4086,12 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
ArgOffset += PtrByteSize;
} else {
- SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType());
+ SDValue Const = DAG.getConstant(PtrByteSize - Size,
+ PtrOff.getValueType());
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
- SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
- CallSeqStart.getNode()->getOperand(0),
- Flags, DAG, dl);
- // This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1));
- DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
- NewCallSeqStart.getNode());
- Chain = CallSeqStart = NewCallSeqStart;
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
+ CallSeqStart,
+ Flags, DAG, dl);
ArgOffset += PtrByteSize;
}
continue;
@@ -3292,15 +4099,13 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
- SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
- CallSeqStart.getNode()->getOperand(0),
- Flags, DAG, dl);
- // This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1));
- DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode());
- Chain = CallSeqStart = NewCallSeqStart;
- // And copy the pieces of it that fit into registers.
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
+ CallSeqStart,
+ Flags, DAG, dl);
+
+ // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+ // copy the pieces of the object that fit into registers from the
+ // parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
@@ -3369,11 +4174,10 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
!isPPC64) // PPC64 has 64-bit GPR's obviously :)
++GPR_idx;
}
- } else {
+ } else
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
isPPC64, isTailCall, false, MemOpChains,
TailCallArguments, dl);
- }
if (isPPC64)
ArgOffset += 8;
else
@@ -3468,22 +4272,6 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
&MemOpChains[0], MemOpChains.size());
- // Check if this is an indirect call (MTCTR/BCTRL).
- // See PrepareCall() for more information about calls through function
- // pointers in the 64-bit SVR4 ABI.
- if (!isTailCall && isPPC64 && PPCSubTarget.isSVR4ABI() &&
- !dyn_cast<GlobalAddressSDNode>(Callee) &&
- !dyn_cast<ExternalSymbolSDNode>(Callee) &&
- !isBLACompatibleAddress(Callee, DAG)) {
- // Load r2 into a virtual register and store it to the TOC save area.
- SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
- // TOC save area offset.
- SDValue PtrOff = DAG.getIntPtrConstant(40);
- SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
- Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, MachinePointerInfo(),
- false, false, 0);
- }
-
// On Darwin, R12 must contain the address of an indirect callee. This does
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
@@ -3548,8 +4336,24 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- OutVals[i], Flag);
+
+ SDValue Arg = OutVals[i];
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
}
@@ -3781,7 +4585,52 @@ SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op,
return SDValue();
if (Op.getOperand(0).getValueType() == MVT::i64) {
- SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op.getOperand(0));
+ SDValue SINT = Op.getOperand(0);
+ // When converting to single-precision, we actually need to convert
+ // to double-precision first and then round to single-precision.
+ // To avoid double-rounding effects during that operation, we have
+ // to prepare the input operand. Bits that might be truncated when
+ // converting to double-precision are replaced by a bit that won't
+ // be lost at this stage, but is below the single-precision rounding
+ // position.
+ //
+ // However, if -enable-unsafe-fp-math is in effect, accept double
+ // rounding to avoid the extra overhead.
+ if (Op.getValueType() == MVT::f32 &&
+ !DAG.getTarget().Options.UnsafeFPMath) {
+
+ // Twiddle input to make sure the low 11 bits are zero. (If this
+ // is the case, we are guaranteed the value will fit into the 53 bit
+ // mantissa of an IEEE double-precision value without rounding.)
+ // If any of those low 11 bits were not zero originally, make sure
+ // bit 12 (value 2048) is set instead, so that the final rounding
+ // to single-precision gets the correct result.
+ SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+ SINT, DAG.getConstant(2047, MVT::i64));
+ Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
+ Round, DAG.getConstant(2047, MVT::i64));
+ Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
+ Round = DAG.getNode(ISD::AND, dl, MVT::i64,
+ Round, DAG.getConstant(-2048, MVT::i64));
+
+ // However, we cannot use that value unconditionally: if the magnitude
+ // of the input value is small, the bit-twiddling we did above might
+ // end up visibly changing the output. Fortunately, in that case, we
+ // don't need to twiddle bits since the original input will convert
+ // exactly to double-precision floating-point already. Therefore,
+ // construct a conditional to use the original value if the top 11
+ // bits are all sign-bit copies, and use the rounded value computed
+ // above otherwise.
+ SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
+ SINT, DAG.getConstant(53, MVT::i32));
+ Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
+ Cond, DAG.getConstant(1, MVT::i64));
+ Cond = DAG.getSetCC(dl, MVT::i32,
+ Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT);
+
+ SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
+ }
+ SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits);
if (Op.getValueType() == MVT::f32)
FP = DAG.getNode(ISD::FP_ROUND, dl,
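The comment block in this hunk is easier to follow as plain scalar code. Below is a minimal C++ sketch of the same sequence of operations, written with ordinary integer arithmetic rather than SelectionDAG calls; the helper name is illustrative and a two's-complement arithmetic right shift for signed 64-bit values is assumed, so treat it as a model of the patch, not part of it.

    #include <cstdint>

    // Convert an int64_t to float by way of double without double rounding,
    // mirroring the DAG nodes built in the hunk above.
    static float SIntToF32(int64_t X) {
      uint64_t U = (uint64_t)X;
      // Fold the low 11 bits into a single sticky bit at position 11, then
      // clear bits 0..10 so the value fits exactly in a double's 53-bit
      // mantissa while still rounding correctly when narrowed to float.
      uint64_t Round = (((U & 2047) + 2047) | U) & ~(uint64_t)2047;
      // If the top 11 bits are all copies of the sign bit, the original
      // value already converts to double exactly, so use it unchanged.
      int64_t Top = X >> 53;                  // arithmetic shift assumed
      int64_t Src = ((uint64_t)(Top + 1) > 1) ? (int64_t)Round : X;
      return (float)(double)Src;
    }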
@@ -4126,7 +4975,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
unsigned TypeShiftAmt = i & (SplatBitSize-1);
// vsplti + shl self.
- if (SextVal == (i << (int)TypeShiftAmt)) {
+ if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
@@ -4171,17 +5020,17 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
// t = vsplti c, result = vsldoi t, t, 1
- if (SextVal == ((i << 8) | (i < 0 ? 0xFF : 0))) {
+ if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
- if (SextVal == ((i << 16) | (i < 0 ? 0xFFFF : 0))) {
+ if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
- if (SextVal == ((i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
+ if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
}
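The casts introduced in this hunk do not change the values being compared; they only move the shift into unsigned arithmetic, where overflow is well defined. A one-function C++ restatement of the rewritten comparison (illustrative only, not LLVM code):

    // Left-shifting a negative int is undefined behaviour in C++, so the
    // splat checks shift the unsigned representation and cast back to int
    // only for the final comparison.
    static bool MatchesShiftedSplat(int SextVal, int i, unsigned Amt) {
      return SextVal == (int)((unsigned)i << Amt);
    }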
@@ -5630,6 +6479,14 @@ PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
case 'v':
case 'y':
return C_RegisterClass;
+ case 'Z':
+ // FIXME: While Z does indicate a memory constraint, it specifically
+ // indicates an r+r address (used in conjunction with the 'y' modifier
+ // in the replacement string). Currently, we're forcing the base
+ // register to be r0 in the asm printer (which is interpreted as zero)
+ // and forming the complete address in the second register. This is
+ // suboptimal.
+ return C_Memory;
}
}
return TargetLowering::getConstraintType(Constraint);
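For context, the 'Z' constraint classified above is normally written at the source level as a GCC-style inline-asm memory operand used together with the '%y' operand modifier, which prints an indexed (r+r) address, as for the byte-reversed load/store instructions. A small hedged example (the helper name is illustrative):

    // Load a 32-bit value with byte reversal through an indexed address.
    static inline unsigned LoadWordByteReversed(const unsigned *Ptr) {
      unsigned Val;
      __asm__("lwbrx %0, %y1" : "=r"(Val) : "Z"(*Ptr));
      return Val;
    }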
@@ -5672,6 +6529,9 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
case 'y':
weight = CW_Register;
break;
+ case 'Z':
+ weight = CW_Memory;
+ break;
}
return weight;
}
@@ -5688,9 +6548,9 @@ PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return std::make_pair(0U, &PPC::G8RCRegClass);
return std::make_pair(0U, &PPC::GPRCRegClass);
case 'f':
- if (VT == MVT::f32)
+ if (VT == MVT::f32 || VT == MVT::i32)
return std::make_pair(0U, &PPC::F4RCRegClass);
- if (VT == MVT::f64)
+ if (VT == MVT::f64 || VT == MVT::i64)
return std::make_pair(0U, &PPC::F8RCRegClass);
break;
case 'v':
@@ -5870,7 +6730,8 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
bool is31 = (getTargetMachine().Options.DisableFramePointerElim(MF) ||
MFI->hasVarSizedObjects()) &&
MFI->getStackSize() &&
- !MF.getFunction()->hasFnAttr(Attribute::Naked);
+ !MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::Naked);
unsigned FrameReg = isPPC64 ? (is31 ? PPC::X31 : PPC::X1) :
(is31 ? PPC::R31 : PPC::R1);
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index b0a013b4b4cf..b3c7f9c28d40 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -174,6 +174,10 @@ namespace llvm {
/// operand #3 optional in flag
TC_RETURN,
+ /// ch, gl = CR6[UN]SET ch, inglue - Toggle CR bit 6 for SVR4 vararg calls
+ CR6SET,
+ CR6UNSET,
+
/// STD_32 - This is the STD instruction for use with "32-bit" registers.
STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -463,20 +467,41 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG) const;
SDValue
+ extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
+ SDValue ArgVal, DebugLoc dl) const;
+
+ void
+ setMinReservedArea(MachineFunction &MF, SelectionDAG &DAG,
+ unsigned nAltivecParamsAtEnd,
+ unsigned MinReservedArea, bool isPPC64) const;
+
+ SDValue
LowerFormalArguments_Darwin(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
- LowerFormalArguments_SVR4(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ LowerFormalArguments_64SVR4(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue
+ LowerFormalArguments_32SVR4(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+
+ SDValue
+ createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
+ SDValue CallSeqStart, ISD::ArgFlagsTy Flags,
+ SelectionDAG &DAG, DebugLoc dl) const;
SDValue
- LowerCall_Darwin(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+ LowerCall_Darwin(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv,
bool isVarArg, bool isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
@@ -484,13 +509,22 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
- LowerCall_SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ LowerCall_64SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv,
+ bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
+ SDValue
+ LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+ bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
};
}
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 39778a5dc1e1..9711452ec46a 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -29,6 +29,9 @@ def symbolLo64 : Operand<i64> {
let PrintMethod = "printSymbolLo";
let EncoderMethod = "getLO16Encoding";
}
+def tocentry : Operand<iPTR> {
+ let MIOperandInfo = (ops i32imm:$imm);
+}
//===----------------------------------------------------------------------===//
// 64-bit transformation functions.
@@ -60,7 +63,7 @@ def HI48_64 : SDNodeXForm<imm, [{
//
let Defs = [LR8] in
- def MovePCtoLR8 : Pseudo<(outs), (ins), "", []>,
+ def MovePCtoLR8 : Pseudo<(outs), (ins), "#MovePCtoLR8", []>,
PPC970_Unit_BRU;
// Darwin ABI Calls.
@@ -138,31 +141,31 @@ def : Pat<(PPCnop),
let usesCustomInserter = 1 in {
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_ADD_I64",
[(set G8RC:$dst, (atomic_load_add_64 xoaddr:$ptr, G8RC:$incr))]>;
def ATOMIC_LOAD_SUB_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_SUB_I64",
[(set G8RC:$dst, (atomic_load_sub_64 xoaddr:$ptr, G8RC:$incr))]>;
def ATOMIC_LOAD_OR_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_OR_I64",
[(set G8RC:$dst, (atomic_load_or_64 xoaddr:$ptr, G8RC:$incr))]>;
def ATOMIC_LOAD_XOR_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_XOR_I64",
[(set G8RC:$dst, (atomic_load_xor_64 xoaddr:$ptr, G8RC:$incr))]>;
def ATOMIC_LOAD_AND_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_AND_i64",
[(set G8RC:$dst, (atomic_load_and_64 xoaddr:$ptr, G8RC:$incr))]>;
def ATOMIC_LOAD_NAND_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr), "#ATOMIC_LOAD_NAND_I64",
[(set G8RC:$dst, (atomic_load_nand_64 xoaddr:$ptr, G8RC:$incr))]>;
def ATOMIC_CMP_SWAP_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new), "#ATOMIC_CMP_SWAP_I64",
[(set G8RC:$dst,
(atomic_cmp_swap_64 xoaddr:$ptr, G8RC:$old, G8RC:$new))]>;
def ATOMIC_SWAP_I64 : Pseudo<
- (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new), "",
+ (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new), "#ATOMIC_SWAP_I64",
[(set G8RC:$dst, (atomic_swap_64 xoaddr:$ptr, G8RC:$new))]>;
}
}
@@ -231,10 +234,10 @@ def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
let Defs = [CTR8], Uses = [CTR8] in {
- def BDZ8 : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
- "bdz $dst", BrB, []>;
- def BDNZ8 : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
- "bdnz $dst", BrB, []>;
+ def BDZ8 : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst">;
+ def BDNZ8 : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst">;
}
}
@@ -244,7 +247,7 @@ def MTCRF8 : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins G8RC:$rS),
PPC970_MicroCode, PPC970_Unit_CRU;
def MFCR8pseud: XFXForm_3<31, 19, (outs G8RC:$rT), (ins crbitm:$FXM),
- "", SprMFCR>,
+ "#MFCR8pseud", SprMFCR>,
PPC970_MicroCode, PPC970_Unit_CRU;
def MFCR8 : XFXForm_3<31, 19, (outs G8RC:$rT), (ins),
@@ -275,7 +278,7 @@ def MFTB8 : XFXForm_1_ext<31, 339, 268, (outs G8RC:$rT), (ins),
// the POWER3.
let Defs = [X1], Uses = [X1] in
-def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),"",
+def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),"#DYNALLOC8",
[(set G8RC:$result,
(PPCdynalloc G8RC:$negsize, iaddr:$fpsi))]>;
@@ -296,12 +299,14 @@ def MFLR8 : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
let PPC970_Unit = 1 in { // FXU Operations.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def LI8 : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
"li $rD, $imm", IntSimple,
[(set G8RC:$rD, immSExt16:$imm)]>;
def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
"lis $rD, $imm", IntSimple,
[(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+}
// Logical ops.
def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
@@ -459,7 +464,7 @@ def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
let Defs = [CARRY] in {
def SRADI : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
- "sradi $rA, $rS, $SH", IntRotateD,
+ "sradi $rA, $rS, $SH", IntRotateDI,
[(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
}
def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
@@ -482,23 +487,23 @@ def MULLD : XOForm_1<31, 233, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
let isCommutable = 1 in {
def RLDIMI : MDForm_1<30, 3,
(outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
- "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+ "rldimi $rA, $rS, $SH, $MB", IntRotateDI,
[]>, isPPC64, RegConstraint<"$rSi = $rA">,
NoEncode<"$rSi">;
}
// Rotate instructions.
def RLDCL : MDForm_1<30, 0,
- (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MB),
- "rldcl $rA, $rS, $rB, $MB", IntRotateD,
+ (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MBE),
+ "rldcl $rA, $rS, $rB, $MBE", IntRotateD,
[]>, isPPC64;
def RLDICL : MDForm_1<30, 0,
- (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB),
- "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+ (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicl $rA, $rS, $SH, $MBE", IntRotateDI,
[]>, isPPC64;
def RLDICR : MDForm_1<30, 1,
- (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME),
- "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+ (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MBE),
+ "rldicr $rA, $rS, $SH, $MBE", IntRotateDI,
[]>, isPPC64;
def RLWINM8 : MForm_2<21,
@@ -506,7 +511,7 @@ def RLWINM8 : MForm_2<21,
"rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
[]>;
-def ISEL8 : AForm_1<31, 15,
+def ISEL8 : AForm_4<31, 15,
(outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB, pred:$cond),
"isel $rT, $rA, $rB, $cond", IntGeneral,
[]>;
@@ -541,19 +546,19 @@ def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src),
let mayLoad = 1 in
def LHAU8 : DForm_1a<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp,
ptr_rc:$rA),
- "lhau $rD, $disp($rA)", LdStLoad,
+ "lhau $rD, $disp($rA)", LdStLHAU,
[]>, RegConstraint<"$rA = $ea_result">,
NoEncode<"$ea_result">;
// NO LWAU!
def LHAUX8 : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhaux $rD, $addr", LdStLoad,
+ "lhaux $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
-def LWAUX : XForm_1<31, 375, (outs G8RC:$rD, ptr_rc:$ea_result),
+def LWAUX : XForm_1<31, 373, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lwaux $rD, $addr", LdStLoad,
+ "lwaux $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
}
@@ -584,31 +589,31 @@ def LWZX8 : XForm_1<31, 23, (outs G8RC:$rD), (ins memrr:$src),
// Update forms.
let mayLoad = 1 in {
def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lbzu $rD, $addr", LdStLoad,
+ "lbzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lhzu $rD, $addr", LdStLoad,
+ "lhzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lwzu $rD, $addr", LdStLoad,
+ "lwzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LBZUX8 : XForm_1<31, 119, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lbzux $rD, $addr", LdStLoad,
+ "lbzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
-def LHZUX8 : XForm_1<31, 331, (outs G8RC:$rD, ptr_rc:$ea_result),
+def LHZUX8 : XForm_1<31, 311, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhzux $rD, $addr", LdStLoad,
+ "lhzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LWZUX8 : XForm_1<31, 55, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lwzux $rD, $addr", LdStLoad,
+ "lwzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
}
@@ -621,18 +626,26 @@ def LD : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrix:$src),
"ld $rD, $src", LdStLD,
[(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
def LDtoc: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
- "",
+ "#LDtoc",
[(set G8RC:$rD,
(PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocJTI: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+ "#LDtocJTI",
+ [(set G8RC:$rD,
+ (PPCtoc_entry tjumptable:$disp, G8RC:$reg))]>, isPPC64;
+def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+ "#LDtocCPT",
+ [(set G8RC:$rD,
+ (PPCtoc_entry tconstpool:$disp, G8RC:$reg))]>, isPPC64;
let hasSideEffects = 1 in {
-let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo.
-def LDinto_toc: DSForm_1<58, 0, (outs), (ins G8RC:$reg),
+let RST = 2, DS = 2 in
+def LDinto_toc: DSForm_1a<58, 0, (outs), (ins G8RC:$reg),
"ld 2, 8($reg)", LdStLD,
[(PPCload_toc G8RC:$reg)]>, isPPC64;
-let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo.
-def LDtoc_restore : DSForm_1<58, 0, (outs), (ins),
+let RST = 2, DS = 10, RA = 1 in
+def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins),
"ld 2, 40(1)", LdStLD,
[(PPCtoc_restore)]>, isPPC64;
}
@@ -642,13 +655,13 @@ def LDX : XForm_1<31, 21, (outs G8RC:$rD), (ins memrr:$src),
let mayLoad = 1 in
def LDU : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
- "ldu $rD, $addr", LdStLD,
+ "ldu $rD, $addr", LdStLDU,
[]>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
NoEncode<"$ea_result">;
def LDUX : XForm_1<31, 53, (outs G8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "ldux $rD, $addr", LdStLoad,
+ "ldux $rD, $addr", LdStLDU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">, isPPC64;
}
@@ -693,16 +706,16 @@ def STDX : XForm_8<31, 149, (outs), (ins G8RC:$rS, memrr:$dst),
let PPC970_Unit = 2 in {
-def STBU8 : DForm_1a<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+def STBU8 : DForm_1a<39, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stbu $rS, $ptroff($ptrreg)", LdStStore,
+ "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "sthu $rS, $ptroff($ptrreg)", LdStStore,
+ "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
@@ -710,7 +723,7 @@ def STHU8 : DForm_1a<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stwu $rS, $ptroff($ptrreg)", LdStStore,
+ "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti32 G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
@@ -718,7 +731,7 @@ def STWU8 : DForm_1a<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
s16immX4:$ptroff, ptr_rc:$ptrreg),
- "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+ "stdu $rS, $ptroff($ptrreg)", LdStSTDU,
[(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
@@ -727,7 +740,7 @@ def STDU : DSForm_1a<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stbux $rS, $ptroff, $ptrreg", LdStStore,
+ "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 G8RC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -736,7 +749,7 @@ def STBUX8 : XForm_8<31, 247, (outs ptr_rc:$ea_res),
def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "sthux $rS, $ptroff, $ptrreg", LdStStore,
+ "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 G8RC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -745,7 +758,7 @@ def STHUX8 : XForm_8<31, 439, (outs ptr_rc:$ea_res),
def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stwux $rS, $ptroff, $ptrreg", LdStStore,
+ "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti32 G8RC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -754,7 +767,7 @@ def STWUX8 : XForm_8<31, 183, (outs ptr_rc:$ea_res),
def STDUX : XForm_8<31, 181, (outs ptr_rc:$ea_res),
(ins G8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stdux $rS, $ptroff, $ptrreg", LdStStore,
+ "stdux $rS, $ptroff, $ptrreg", LdStSTDU,
[(set ptr_rc:$ea_res,
(pre_store G8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index b0b842328196..ba58c3e4ac88 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -340,6 +340,28 @@ def VCTUXS : VXForm_1<906, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
"vctuxs $vD, $vB, $UIMM", VecFP,
[(set VRRC:$vD,
(int_ppc_altivec_vctuxs VRRC:$vB, imm:$UIMM))]>;
+
+// Defines with the UIM field set to 0 for floating-point
+// to integer (fp_to_sint/fp_to_uint) conversions and integer
+// to floating-point (sint_to_fp/uint_to_fp) conversions.
+let VA = 0 in {
+def VCFSX_0 : VXForm_1<842, (outs VRRC:$vD), (ins VRRC:$vB),
+ "vcfsx $vD, $vB, 0", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vcfsx VRRC:$vB, 0))]>;
+def VCTUXS_0 : VXForm_1<906, (outs VRRC:$vD), (ins VRRC:$vB),
+ "vctuxs $vD, $vB, 0", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vctuxs VRRC:$vB, 0))]>;
+def VCFUX_0 : VXForm_1<778, (outs VRRC:$vD), (ins VRRC:$vB),
+ "vcfux $vD, $vB, 0", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vcfux VRRC:$vB, 0))]>;
+def VCTSXS_0 : VXForm_1<970, (outs VRRC:$vD), (ins VRRC:$vB),
+ "vctsxs $vD, $vB, 0", VecFP,
+ [(set VRRC:$vD,
+ (int_ppc_altivec_vctsxs VRRC:$vB, 0))]>;
+}
def VEXPTEFP : VX2_Int<394, "vexptefp", int_ppc_altivec_vexptefp>;
def VLOGEFP : VX2_Int<458, "vlogefp", int_ppc_altivec_vlogefp>;
@@ -689,3 +711,13 @@ def : Pat<(v8i16 (sra (v8i16 VRRC:$vA), (v8i16 VRRC:$vB))),
(v8i16 (VSRAH VRRC:$vA, VRRC:$vB))>;
def : Pat<(v4i32 (sra (v4i32 VRRC:$vA), (v4i32 VRRC:$vB))),
(v4i32 (VSRAW VRRC:$vA, VRRC:$vB))>;
+
+// Float to integer and integer to float conversions
+def : Pat<(v4i32 (fp_to_sint (v4f32 VRRC:$vA))),
+ (VCTSXS_0 VRRC:$vA)>;
+def : Pat<(v4i32 (fp_to_uint (v4f32 VRRC:$vA))),
+ (VCTUXS_0 VRRC:$vA)>;
+def : Pat<(v4f32 (sint_to_fp (v4i32 VRRC:$vA))),
+ (VCFSX_0 VRRC:$vA)>;
+def : Pat<(v4f32 (uint_to_fp (v4i32 VRRC:$vA))),
+ (VCFUX_0 VRRC:$vA)>;
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
index a41a0279d215..c3c171cd21fc 100644
--- a/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -94,12 +94,6 @@ class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
let Inst{31} = lk;
}
-class IForm_ext<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
- string asmstr, InstrItinClass itin, list<dag> pattern>
- : IForm<opcode, aa, lk, OOL, IOL, asmstr, itin, pattern> {
- let LI{0-4} = bo;
-}
-
// 1.7.2 B-Form
class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
: I<opcode, OOL, IOL, asmstr, BrB> {
@@ -118,6 +112,13 @@ class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
let Inst{31} = lk;
}
+class BForm_1<bits<6> opcode, bits<5> bo, bit aa, bit lk, dag OOL, dag IOL,
+ string asmstr>
+ : BForm<opcode, aa, lk, OOL, IOL, asmstr> {
+ let BIBO{4-0} = bo;
+ let BIBO{6-5} = 0;
+ let CR = 0;
+}
// 1.7.4 D-Form
class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
@@ -625,9 +626,9 @@ class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
InstrItinClass itin>
: I<opcode, OOL, IOL, asmstr, itin> {
bits<8> FXM;
- bits<5> ST;
+ bits<5> rS;
- let Inst{6-10} = ST;
+ let Inst{6-10} = rS;
let Inst{11} = 0;
let Inst{12-19} = FXM;
let Inst{20} = 0;
@@ -666,7 +667,7 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
string cstr, InstrItinClass itin, list<dag>pattern>
: I<opcode, OOL, IOL, asmstr, itin> {
bits<8> FM;
- bits<5> RT;
+ bits<5> rT;
bit RC = 0; // set by isDOT
let Pattern = pattern;
@@ -675,7 +676,7 @@ class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
let Inst{6} = 0;
let Inst{7-14} = FM;
let Inst{15} = 0;
- let Inst{16-20} = RT;
+ let Inst{16-20} = rT;
let Inst{21-30} = xo;
let Inst{31} = RC;
}
@@ -758,6 +759,26 @@ class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
let FRB = 0;
}
+class AForm_4<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<5> RB;
+ bits<7> BIBO; // 2 bits of BI and 5 bits of BO (must be 12).
+ bits<3> CR;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RT;
+ let Inst{11-15} = RA;
+ let Inst{16-20} = RB;
+ let Inst{21-23} = CR;
+ let Inst{24-25} = BIBO{6-5};
+ let Inst{26-30} = xo;
+ let Inst{31} = 0;
+}
+
// 1.7.13 M-Form
class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
InstrItinClass itin, list<dag> pattern>
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 47f09dca77d3..d9d68446f536 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -54,7 +54,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer(
const TargetMachine *TM,
const ScheduleDAG *DAG) const {
unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective();
- if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2) {
+ if (Directive == PPC::DIR_440 || Directive == PPC::DIR_A2 ||
+ Directive == PPC::DIR_E500mc || Directive == PPC::DIR_E5500) {
const InstrItineraryData *II = TM->getInstrItineraryData();
return new PPCScoreboardHazardRecognizer(II, DAG);
}
@@ -70,7 +71,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetPostRAHazardRecognizer(
unsigned Directive = TM.getSubtarget<PPCSubtarget>().getDarwinDirective();
// Most subtargets use a PPC970 recognizer.
- if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2) {
+ if (Directive != PPC::DIR_440 && Directive != PPC::DIR_A2 &&
+ Directive != PPC::DIR_E500mc && Directive != PPC::DIR_E5500) {
const TargetInstrInfo *TII = TM.getInstrInfo();
assert(TII && "No InstrInfo?");
@@ -568,12 +570,15 @@ PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
// STVX VAL, 0, R0
//
// FIXME: We use R0 here, because it isn't available for RA.
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+ bool Is64Bit = TM.getSubtargetImpl()->isPPC64();
+ unsigned Instr = Is64Bit ? PPC::ADDI8 : PPC::ADDI;
+ unsigned GPR0 = Is64Bit ? PPC::X0 : PPC::R0;
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Instr), GPR0),
FrameIdx, 0, 0));
NewMIs.push_back(BuildMI(MF, DL, get(PPC::STVX))
.addReg(SrcReg, getKillRegState(isKill))
- .addReg(PPC::R0)
- .addReg(PPC::R0));
+ .addReg(GPR0)
+ .addReg(GPR0));
} else {
llvm_unreachable("Unknown regclass!");
}
@@ -705,10 +710,13 @@ PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
// Dest = LVX 0, R0
//
// FIXME: We use R0 here, because it isn't available for RA.
- NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+ bool Is64Bit = TM.getSubtargetImpl()->isPPC64();
+ unsigned Instr = Is64Bit ? PPC::ADDI8 : PPC::ADDI;
+ unsigned GPR0 = Is64Bit ? PPC::X0 : PPC::R0;
+ NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Instr), GPR0),
FrameIdx, 0, 0));
- NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(PPC::R0)
- .addReg(PPC::R0));
+ NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(GPR0)
+ .addReg(GPR0));
} else {
llvm_unreachable("Unknown regclass!");
}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index f57f0c975ad6..6ee045a2c7c9 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -123,9 +123,11 @@ def PPCnop : SDNode<"PPCISD::NOP", SDT_PPCnop, [SDNPInGlue, SDNPOutGlue]>;
def PPCload : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPInGlue, SDNPOutGlue]>;
def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
- [SDNPHasChain, SDNPInGlue, SDNPOutGlue]>;
+ [SDNPHasChain, SDNPSideEffect,
+ SDNPInGlue, SDNPOutGlue]>;
def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCbctrl_Darwin : SDNode<"PPCISD::BCTRL_Darwin", SDTNone,
@@ -153,6 +155,12 @@ def PPClbrx : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
def PPCstbrx : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
[SDNPHasChain, SDNPMayStore]>;
+// Instructions to set/unset CR bit 6 for SVR4 vararg calls
+def PPCcr6set : SDNode<"PPCISD::CR6SET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def PPCcr6unset : SDNode<"PPCISD::CR6UNSET", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+
// Instructions to support atomic operations
def PPClarx : SDNode<"PPCISD::LARX", SDT_PPClarx,
[SDNPHasChain, SDNPMayLoad]>;
@@ -330,9 +338,6 @@ def memrix : Operand<iPTR> { // memri where the imm is shifted 2 bits.
let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
let EncoderMethod = "getMemRIXEncoding";
}
-def tocentry : Operand<iPTR> {
- let MIOperandInfo = (ops i32imm:$imm);
-}
// PowerPC Predicate operand. 20 = (0<<5)|20 = always, CR0 is a dummy reg
// that doesn't matter.
@@ -364,9 +369,9 @@ def IsBookE : Predicate<"PPCSubTarget.isBookE()">;
let hasCtrlDep = 1 in {
let Defs = [R1], Uses = [R1] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "",
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt",
[(callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "",
+def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2",
[(callseq_end timm:$amt1, timm:$amt2)]>;
}
@@ -375,7 +380,7 @@ def UPDATE_VRSAVE : Pseudo<(outs GPRC:$rD), (ins GPRC:$rS),
}
let Defs = [R1], Uses = [R1] in
-def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi), "",
+def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi), "#DYNALLOC",
[(set GPRC:$result,
(PPCdynalloc GPRC:$negsize, iaddr:$fpsi))]>;
@@ -384,19 +389,19 @@ def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi), "",
let usesCustomInserter = 1, // Expanded after instruction selection.
PPC970_Single = 1 in {
def SELECT_CC_I4 : Pseudo<(outs GPRC:$dst), (ins CRRC:$cond, GPRC:$T, GPRC:$F,
- i32imm:$BROPC), "",
+ i32imm:$BROPC), "#SELECT_CC_I4",
[]>;
def SELECT_CC_I8 : Pseudo<(outs G8RC:$dst), (ins CRRC:$cond, G8RC:$T, G8RC:$F,
- i32imm:$BROPC), "",
+ i32imm:$BROPC), "#SELECT_CC_I8",
[]>;
def SELECT_CC_F4 : Pseudo<(outs F4RC:$dst), (ins CRRC:$cond, F4RC:$T, F4RC:$F,
- i32imm:$BROPC), "",
+ i32imm:$BROPC), "#SELECT_CC_F4",
[]>;
def SELECT_CC_F8 : Pseudo<(outs F8RC:$dst), (ins CRRC:$cond, F8RC:$T, F8RC:$F,
- i32imm:$BROPC), "",
+ i32imm:$BROPC), "#SELECT_CC_F8",
[]>;
def SELECT_CC_VRRC: Pseudo<(outs VRRC:$dst), (ins CRRC:$cond, VRRC:$T, VRRC:$F,
- i32imm:$BROPC), "",
+ i32imm:$BROPC), "#SELECT_CC_VRRC",
[]>;
}
@@ -404,16 +409,16 @@ let usesCustomInserter = 1, // Expanded after instruction selection.
// scavenge a register for it.
let mayStore = 1 in
def SPILL_CR : Pseudo<(outs), (ins CRRC:$cond, memri:$F),
- "", []>;
+ "#SPILL_CR", []>;
// RESTORE_CR - Indicate that we're restoring the CR register (previously
// spilled), so we'll need to scavenge a register for it.
let mayLoad = 1 in
def RESTORE_CR : Pseudo<(outs CRRC:$cond), (ins memri:$F),
- "", []>;
+ "#RESTORE_CR", []>;
let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
- let isReturn = 1, Uses = [LR, RM] in
+ let isCodeGenOnly = 1, isReturn = 1, Uses = [LR, RM] in
def BLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$p),
"b${p:cc}lr ${p:reg}", BrB,
[(retflag)]>;
@@ -422,7 +427,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
}
let Defs = [LR] in
- def MovePCtoLR : Pseudo<(outs), (ins), "", []>,
+ def MovePCtoLR : Pseudo<(outs), (ins), "#MovePCtoLR", []>,
PPC970_Unit_BRU;
let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
@@ -434,16 +439,17 @@ let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
// BCC represents an arbitrary conditional branch on a predicate.
// FIXME: should be able to write a pattern for PPCcondbranch, but can't use
- // a two-value operand where a dag node expects two operands. :(
- def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
- "b${cond:cc} ${cond:reg}, $dst"
- /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+ // a two-value operand where a dag node expects two operands. :(
+ let isCodeGenOnly = 1 in
+ def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, condbrtarget:$dst),
+ "b${cond:cc} ${cond:reg}, $dst"
+ /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
let Defs = [CTR], Uses = [CTR] in {
- def BDZ : IForm_ext<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
- "bdz $dst", BrB, []>;
- def BDNZ : IForm_ext<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
- "bdnz $dst", BrB, []>;
+ def BDZ : BForm_1<16, 18, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdz $dst">;
+ def BDNZ : BForm_1<16, 16, 0, 0, (outs), (ins condbrtarget:$dst),
+ "bdnz $dst">;
}
}
@@ -559,81 +565,81 @@ def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 1)),
let usesCustomInserter = 1 in {
let Defs = [CR0] in {
def ATOMIC_LOAD_ADD_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I8",
[(set GPRC:$dst, (atomic_load_add_8 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_SUB_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I8",
[(set GPRC:$dst, (atomic_load_sub_8 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_AND_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I8",
[(set GPRC:$dst, (atomic_load_and_8 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_OR_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I8",
[(set GPRC:$dst, (atomic_load_or_8 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_XOR_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "ATOMIC_LOAD_XOR_I8",
[(set GPRC:$dst, (atomic_load_xor_8 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_NAND_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I8",
[(set GPRC:$dst, (atomic_load_nand_8 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_ADD_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I16",
[(set GPRC:$dst, (atomic_load_add_16 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_SUB_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I16",
[(set GPRC:$dst, (atomic_load_sub_16 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_AND_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I16",
[(set GPRC:$dst, (atomic_load_and_16 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_OR_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I16",
[(set GPRC:$dst, (atomic_load_or_16 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_XOR_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I16",
[(set GPRC:$dst, (atomic_load_xor_16 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_NAND_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I16",
[(set GPRC:$dst, (atomic_load_nand_16 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_ADD_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_ADD_I32",
[(set GPRC:$dst, (atomic_load_add_32 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_SUB_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_SUB_I32",
[(set GPRC:$dst, (atomic_load_sub_32 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_AND_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_AND_I32",
[(set GPRC:$dst, (atomic_load_and_32 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_OR_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_OR_I32",
[(set GPRC:$dst, (atomic_load_or_32 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_XOR_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_XOR_I32",
[(set GPRC:$dst, (atomic_load_xor_32 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_LOAD_NAND_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr), "#ATOMIC_LOAD_NAND_I32",
[(set GPRC:$dst, (atomic_load_nand_32 xoaddr:$ptr, GPRC:$incr))]>;
def ATOMIC_CMP_SWAP_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I8",
[(set GPRC:$dst,
(atomic_cmp_swap_8 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
def ATOMIC_CMP_SWAP_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I16 $dst $ptr $old $new",
[(set GPRC:$dst,
(atomic_cmp_swap_16 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
def ATOMIC_CMP_SWAP_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new), "#ATOMIC_CMP_SWAP_I32 $dst $ptr $old $new",
[(set GPRC:$dst,
(atomic_cmp_swap_32 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
def ATOMIC_SWAP_I8 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_i8",
[(set GPRC:$dst, (atomic_swap_8 xoaddr:$ptr, GPRC:$new))]>;
def ATOMIC_SWAP_I16 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I16",
[(set GPRC:$dst, (atomic_swap_16 xoaddr:$ptr, GPRC:$new))]>;
def ATOMIC_SWAP_I32 : Pseudo<
- (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "",
+ (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new), "#ATOMIC_SWAP_I32",
[(set GPRC:$dst, (atomic_swap_32 xoaddr:$ptr, GPRC:$new))]>;
}
}
@@ -673,7 +679,7 @@ def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src),
[(set GPRC:$rD, (load iaddr:$src))]>;
def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
- "lfs $rD, $src", LdStLFDU,
+ "lfs $rD, $src", LdStLFD,
[(set F4RC:$rD, (load iaddr:$src))]>;
def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
"lfd $rD, $src", LdStLFD,
@@ -683,32 +689,32 @@ def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
// Unindexed (r+i) Loads with Update (preinc).
let mayLoad = 1 in {
def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lbzu $rD, $addr", LdStLoad,
+ "lbzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lhau $rD, $addr", LdStLoad,
+ "lhau $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lhzu $rD, $addr", LdStLoad,
+ "lhzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lwzu $rD, $addr", LdStLoad,
+ "lwzu $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lfs $rD, $addr", LdStLFDU,
+ "lfsu $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
- "lfd $rD, $addr", LdStLFD,
+ "lfdu $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.reg = $ea_result">,
NoEncode<"$ea_result">;
@@ -716,37 +722,37 @@ def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
// Indexed (r+r) Loads with Update (preinc).
def LBZUX : XForm_1<31, 119, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lbzux $rD, $addr", LdStLoad,
+ "lbzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LHAUX : XForm_1<31, 375, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhaux $rD, $addr", LdStLoad,
+ "lhaux $rD, $addr", LdStLHAU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
-def LHZUX : XForm_1<31, 331, (outs GPRC:$rD, ptr_rc:$ea_result),
+def LHZUX : XForm_1<31, 311, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lhzux $rD, $addr", LdStLoad,
+ "lhzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LWZUX : XForm_1<31, 55, (outs GPRC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lwzux $rD, $addr", LdStLoad,
+ "lwzux $rD, $addr", LdStLoadUpd,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LFSUX : XForm_1<31, 567, (outs F4RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lfsux $rD, $addr", LdStLoad,
+ "lfsux $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
def LFDUX : XForm_1<31, 631, (outs F8RC:$rD, ptr_rc:$ea_result),
(ins memrr:$addr),
- "lfdux $rD, $addr", LdStLoad,
+ "lfdux $rD, $addr", LdStLFDU,
[]>, RegConstraint<"$addr.offreg = $ea_result">,
NoEncode<"$ea_result">;
}
@@ -778,10 +784,10 @@ def LWBRX : XForm_1<31, 534, (outs GPRC:$rD), (ins memrr:$src),
[(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>;
def LFSX : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
- "lfsx $frD, $src", LdStLFDU,
+ "lfsx $frD, $src", LdStLFD,
[(set F4RC:$frD, (load xaddr:$src))]>;
def LFDX : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
- "lfdx $frD, $src", LdStLFDU,
+ "lfdx $frD, $src", LdStLFD,
[(set F8RC:$frD, (load xaddr:$src))]>;
}
@@ -801,10 +807,10 @@ def STW : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src),
"stw $rS, $src", LdStStore,
[(store GPRC:$rS, iaddr:$src)]>;
def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
- "stfs $rS, $dst", LdStUX,
+ "stfs $rS, $dst", LdStSTFD,
[(store F4RC:$rS, iaddr:$dst)]>;
def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
- "stfd $rS, $dst", LdStUX,
+ "stfd $rS, $dst", LdStSTFD,
[(store F8RC:$rS, iaddr:$dst)]>;
}
@@ -812,33 +818,33 @@ def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
let PPC970_Unit = 2 in {
def STBU : DForm_1a<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stbu $rS, $ptroff($ptrreg)", LdStStore,
+ "stbu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STHU : DForm_1a<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "sthu $rS, $ptroff($ptrreg)", LdStStore,
+ "sthu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STWU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stwu $rS, $ptroff($ptrreg)", LdStStore,
+ "stwu $rS, $ptroff($ptrreg)", LdStStoreUpd,
[(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STFSU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F4RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stfsu $rS, $ptroff($ptrreg)", LdStStore,
+ "stfsu $rS, $ptroff($ptrreg)", LdStSTFDU,
[(set ptr_rc:$ea_res, (pre_store F4RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
def STFDU : DForm_1a<37, (outs ptr_rc:$ea_res), (ins F8RC:$rS,
symbolLo:$ptroff, ptr_rc:$ptrreg),
- "stfdu $rS, $ptroff($ptrreg)", LdStStore,
+ "stfdu $rS, $ptroff($ptrreg)", LdStSTFDU,
[(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg,
iaddroff:$ptroff))]>,
RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
@@ -863,7 +869,7 @@ def STWX : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst),
def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
(ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stbux $rS, $ptroff, $ptrreg", LdStStore,
+ "stbux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti8 GPRC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -872,7 +878,7 @@ def STBUX : XForm_8<31, 247, (outs ptr_rc:$ea_res),
def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
(ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "sthux $rS, $ptroff, $ptrreg", LdStStore,
+ "sthux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_truncsti16 GPRC:$rS,
ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
@@ -881,7 +887,7 @@ def STHUX : XForm_8<31, 439, (outs ptr_rc:$ea_res),
def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
(ins GPRC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stwux $rS, $ptroff, $ptrreg", LdStStore,
+ "stwux $rS, $ptroff, $ptrreg", LdStStoreUpd,
[(set ptr_rc:$ea_res,
(pre_store GPRC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -889,7 +895,7 @@ def STWUX : XForm_8<31, 183, (outs ptr_rc:$ea_res),
def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
(ins F4RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stfsux $rS, $ptroff, $ptrreg", LdStStore,
+ "stfsux $rS, $ptroff, $ptrreg", LdStSTFDU,
[(set ptr_rc:$ea_res,
(pre_store F4RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -897,7 +903,7 @@ def STFSUX : XForm_8<31, 695, (outs ptr_rc:$ea_res),
def STFDUX : XForm_8<31, 759, (outs ptr_rc:$ea_res),
(ins F8RC:$rS, ptr_rc:$ptroff, ptr_rc:$ptrreg),
- "stfdux $rS, $ptroff, $ptrreg", LdStStore,
+ "stfdux $rS, $ptroff, $ptrreg", LdStSTFDU,
[(set ptr_rc:$ea_res,
(pre_store F8RC:$rS, ptr_rc:$ptrreg, xaddroff:$ptroff))]>,
RegConstraint<"$ptroff = $ea_res">, NoEncode<"$ea_res">,
@@ -913,14 +919,14 @@ def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst),
PPC970_DGroup_Cracked;
def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
- "stfiwx $frS, $dst", LdStUX,
+ "stfiwx $frS, $dst", LdStSTFD,
[(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
- "stfsx $frS, $dst", LdStUX,
+ "stfsx $frS, $dst", LdStSTFD,
[(store F4RC:$frS, xaddr:$dst)]>;
def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
- "stfdx $frS, $dst", LdStUX,
+ "stfdx $frS, $dst", LdStSTFD,
[(store F8RC:$frS, xaddr:$dst)]>;
}
@@ -964,7 +970,7 @@ def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
[(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
}
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
def LI : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
"li $rD, $imm", IntSimple,
[(set GPRC:$rD, immSExt16:$imm)]>;
@@ -1143,6 +1149,16 @@ def CRUNSET: XLForm_1_ext<19, 193, (outs CRBITRC:$dst), (ins),
"crxor $dst, $dst, $dst", BrCR,
[]>;
+let Defs = [CR1EQ], CRD = 6 in {
+def CR6SET : XLForm_1_ext<19, 289, (outs), (ins),
+ "creqv 6, 6, 6", BrCR,
+ [(PPCcr6set)]>;
+
+def CR6UNSET: XLForm_1_ext<19, 193, (outs), (ins),
+ "crxor 6, 6, 6", BrCR,
+ [(PPCcr6unset)]>;
+}
+
// XFX-Form instructions. Instructions that deal with SPRs.
//
let Uses = [CTR] in {
@@ -1192,7 +1208,7 @@ def MTCRF : XFXForm_5<31, 144, (outs crbitm:$FXM), (ins GPRC:$rS),
//
// FIXME: Make this a real Pseudo instruction when the JIT switches to MC.
def MFCRpseud: XFXForm_3<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
- "", SprMFCR>,
+ "#MFCRpseud", SprMFCR>,
PPC970_MicroCode, PPC970_Unit_CRU;
def MFCR : XFXForm_3<31, 19, (outs GPRC:$rT), (ins),
@@ -1233,7 +1249,7 @@ let Uses = [RM] in {
PPC970_DGroup_Single, PPC970_Unit_FPU;
def FADDrtz: AForm_2<63, 21,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fadd $FRT, $FRA, $FRB", FPGeneral,
+ "fadd $FRT, $FRA, $FRB", FPAddSub,
[(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
}
@@ -1364,7 +1380,7 @@ def FSELS : AForm_1<63, 23,
let Uses = [RM] in {
def FADD : AForm_2<63, 21,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fadd $FRT, $FRA, $FRB", FPGeneral,
+ "fadd $FRT, $FRA, $FRB", FPAddSub,
[(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
def FADDS : AForm_2<59, 21,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
@@ -1379,16 +1395,16 @@ let Uses = [RM] in {
"fdivs $FRT, $FRA, $FRB", FPDivS,
[(set F4RC:$FRT, (fdiv F4RC:$FRA, F4RC:$FRB))]>;
def FMUL : AForm_3<63, 25,
- (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fmul $FRT, $FRA, $FRB", FPFused,
- [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRB))]>;
+ (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC),
+ "fmul $FRT, $FRA, $FRC", FPFused,
+ [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRC))]>;
def FMULS : AForm_3<59, 25,
- (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
- "fmuls $FRT, $FRA, $FRB", FPGeneral,
- [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
+ (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC),
+ "fmuls $FRT, $FRA, $FRC", FPGeneral,
+ [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRC))]>;
def FSUB : AForm_2<63, 20,
(outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
- "fsub $FRT, $FRA, $FRB", FPGeneral,
+ "fsub $FRT, $FRA, $FRB", FPAddSub,
[(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
def FSUBS : AForm_2<59, 20,
(outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
@@ -1398,7 +1414,7 @@ let Uses = [RM] in {
}
let PPC970_Unit = 1 in { // FXU Operations.
- def ISEL : AForm_1<31, 15,
+ def ISEL : AForm_4<31, 15,
(outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB, pred:$cond),
"isel $rT, $rA, $rB, $cond", IntGeneral,
[]>;
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index ab8bf1f93a37..459c3589d3f6 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -71,7 +71,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
: PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
ST.isPPC64() ? 0 : 1,
ST.isPPC64() ? 0 : 1),
- Subtarget(ST), TII(tii) {
+ Subtarget(ST), TII(tii), CRSpillFrameIdx(0) {
ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX;
ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
@@ -111,10 +111,15 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return Subtarget.isPPC64() ? CSR_Darwin64_SaveList :
CSR_Darwin32_SaveList;
+ // For 32-bit SVR4, also initialize the frame index associated with
+ // the CR spill slot.
+ if (!Subtarget.isPPC64())
+ CRSpillFrameIdx = 0;
+
return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList;
}
-const unsigned*
+const uint32_t*
PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
if (Subtarget.isDarwinABI())
return Subtarget.isPPC64() ? CSR_Darwin64_RegMask :
@@ -477,6 +482,31 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
MBB.erase(II);
}
+bool
+PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ unsigned Reg, int &FrameIdx) const {
+
+ // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+ // ABI, return true to prevent allocating an additional frame slot.
+ // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+ // is arbitrary and will be subsequently ignored. For 32-bit, we must
+ // create exactly one stack slot and return its FrameIdx for all
+ // nonvolatiles.
+ if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+ if (Subtarget.isPPC64()) {
+ FrameIdx = 0;
+ } else if (CRSpillFrameIdx) {
+ FrameIdx = CRSpillFrameIdx;
+ } else {
+ MachineFrameInfo *MFI = ((MachineFunction &)MF).getFrameInfo();
+ FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+ CRSpillFrameIdx = FrameIdx;
+ }
+ return true;
+ }
+ return false;
+}
+
void
PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, RegScavenger *RS) const {
@@ -566,7 +596,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// to Offset to get the correct offset.
// Naked functions have stack size 0, although getStackSize may not reflect that
// because we didn't call all the pieces that compute it for naked functions.
- if (!MF.getFunction()->hasFnAttr(Attribute::Naked))
+ if (!MF.getFunction()->getFnAttributes().hasAttribute(Attributes::Naked))
Offset += MFI->getStackSize();
// If we can, encode the offset directly into the instruction. If this is a
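hasReservedSpillSlot() is queried by the generic callee-saved spilling code, which skips creating its own stack object when the target reports a reserved slot. A simplified sketch of that consumer pattern, assuming the 3.2-era TargetRegisterInfo/MachineFunction API (assignCSRSlots is a made-up name, and the loop omits the bookkeeping PrologEpilogInserter really does):

    #include <stdint.h>
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    static void assignCSRSlots(llvm::MachineFunction &MF,
                               const llvm::TargetRegisterInfo *TRI) {
      const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF);
      for (unsigned i = 0; CSRegs[i]; ++i) {
        int FrameIdx;
        if (TRI->hasReservedSpillSlot(MF, CSRegs[i], FrameIdx))
          continue; // target supplied (or suppressed) the slot, e.g. CR2-CR4 here
        // ...otherwise a spill stack object would be created for CSRegs[i]...
      }
    }

With the change above, 32-bit SVR4 funnels CR2, CR3 and CR4 into the single fixed 4-byte slot created on first use instead of three separate objects.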
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 152c36d699ec..a8fd796d9e97 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -30,6 +30,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
std::map<unsigned, unsigned> ImmToIdxMap;
const PPCSubtarget &Subtarget;
const TargetInstrInfo &TII;
+ mutable int CRSpillFrameIdx;
public:
PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
@@ -43,7 +44,7 @@ public:
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
- const unsigned *getCallPreservedMask(CallingConv::ID CC) const;
+ const uint32_t *getCallPreservedMask(CallingConv::ID CC) const;
BitVector getReservedRegs(const MachineFunction &MF) const;
@@ -65,6 +66,8 @@ public:
int SPAdj, RegScavenger *RS) const;
void lowerCRRestore(MachineBasicBlock::iterator II, unsigned FrameIndex,
int SPAdj, RegScavenger *RS) const;
+ bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, RegScavenger *RS = NULL) const;
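getCallPreservedMask() now hands back const uint32_t*, the packed one-bit-per-register form used by call-site register-mask operands. A small sketch of how such a mask is read (a set bit means the register is preserved across the call); this mirrors what MachineOperand::clobbersPhysReg does and is written out only for illustration:

    #include <stdint.h>

    static bool isPreservedAcrossCall(const uint32_t *Mask, unsigned PhysReg) {
      return (Mask[PhysReg / 32] >> (PhysReg % 32)) & 1u;
    }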
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
index 6a6ccb9d9852..660c0c3b6359 100644
--- a/lib/Target/PowerPC/PPCSchedule.td
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -40,6 +40,7 @@ def IntMulHWU : InstrItinClass;
def IntMulLI : InstrItinClass;
def IntRFID : InstrItinClass;
def IntRotateD : InstrItinClass;
+def IntRotateDI : InstrItinClass;
def IntRotate : InstrItinClass;
def IntShift : InstrItinClass;
def IntTrapD : InstrItinClass;
@@ -52,15 +53,18 @@ def LdStDCBA : InstrItinClass;
def LdStDCBF : InstrItinClass;
def LdStDCBI : InstrItinClass;
def LdStLoad : InstrItinClass;
+def LdStLoadUpd : InstrItinClass;
def LdStStore : InstrItinClass;
+def LdStStoreUpd : InstrItinClass;
def LdStDSS : InstrItinClass;
def LdStICBI : InstrItinClass;
-def LdStUX : InstrItinClass;
def LdStLD : InstrItinClass;
+def LdStLDU : InstrItinClass;
def LdStLDARX : InstrItinClass;
def LdStLFD : InstrItinClass;
def LdStLFDU : InstrItinClass;
def LdStLHA : InstrItinClass;
+def LdStLHAU : InstrItinClass;
def LdStLMW : InstrItinClass;
def LdStLVecX : InstrItinClass;
def LdStLWA : InstrItinClass;
@@ -69,6 +73,9 @@ def LdStSLBIA : InstrItinClass;
def LdStSLBIE : InstrItinClass;
def LdStSTD : InstrItinClass;
def LdStSTDCX : InstrItinClass;
+def LdStSTDU : InstrItinClass;
+def LdStSTFD : InstrItinClass;
+def LdStSTFDU : InstrItinClass;
def LdStSTVEBX : InstrItinClass;
def LdStSTWCX : InstrItinClass;
def LdStSync : InstrItinClass;
@@ -86,6 +93,7 @@ def SprMTSRIN : InstrItinClass;
def SprRFI : InstrItinClass;
def SprSC : InstrItinClass;
def FPGeneral : InstrItinClass;
+def FPAddSub : InstrItinClass;
def FPCompare : InstrItinClass;
def FPDivD : InstrItinClass;
def FPDivS : InstrItinClass;
@@ -110,6 +118,8 @@ include "PPCScheduleG4.td"
include "PPCScheduleG4Plus.td"
include "PPCScheduleG5.td"
include "PPCScheduleA2.td"
+include "PPCScheduleE500mc.td"
+include "PPCScheduleE5500.td"
//===----------------------------------------------------------------------===//
// Instruction to itinerary class map - When adding new opcodes to the supported
@@ -171,7 +181,7 @@ include "PPCScheduleA2.td"
// extsh IntSimple
// extsw IntSimple
// fabs FPGeneral
-// fadd FPGeneral
+// fadd FPAddSub
// fadds FPGeneral
// fcfid FPGeneral
// fcmpo FPCompare
@@ -201,35 +211,35 @@ include "PPCScheduleA2.td"
// fsel FPGeneral
// fsqrt FPSqrt
// fsqrts FPSqrt
-// fsub FPGeneral
+// fsub FPAddSub
// fsubs FPGeneral
// icbi LdStICBI
// isync SprISYNC
// lbz LdStLoad
-// lbzu LdStLoad
-// lbzux LdStUX
+// lbzu LdStLoadUpd
+// lbzux LdStLoadUpd
// lbzx LdStLoad
// ld LdStLD
// ldarx LdStLDARX
-// ldu LdStLD
-// ldux LdStLD
+// ldu LdStLDU
+// ldux LdStLDU
// ldx LdStLD
// lfd LdStLFD
// lfdu LdStLFDU
// lfdux LdStLFDU
-// lfdx LdStLFDU
-// lfs LdStLFDU
+// lfdx LdStLFD
+// lfs LdStLFD
// lfsu LdStLFDU
// lfsux LdStLFDU
-// lfsx LdStLFDU
+// lfsx LdStLFD
// lha LdStLHA
-// lhau LdStLHA
-// lhaux LdStLHA
+// lhau LdStLHAU
+// lhaux LdStLHAU
// lhax LdStLHA
// lhbrx LdStLoad
// lhz LdStLoad
-// lhzu LdStLoad
-// lhzux LdStUX
+// lhzu LdStLoadUpd
+// lhzux LdStLoadUpd
// lhzx LdStLoad
// lmw LdStLMW
// lswi LdStLMW
@@ -243,12 +253,12 @@ include "PPCScheduleA2.td"
// lvxl LdStLVecX
// lwa LdStLWA
// lwarx LdStLWARX
-// lwaux LdStLHA
+// lwaux LdStLHAU
// lwax LdStLHA
// lwbrx LdStLoad
// lwz LdStLoad
-// lwzu LdStLoad
-// lwzux LdStUX
+// lwzu LdStLoadUpd
+// lwzux LdStLoadUpd
// lwzx LdStLoad
// mcrf BrMCR
// mcrfs FPGeneral
@@ -292,10 +302,10 @@ include "PPCScheduleA2.td"
// rfid IntRFID
// rldcl IntRotateD
// rldcr IntRotateD
-// rldic IntRotateD
-// rldicl IntRotateD
-// rldicr IntRotateD
-// rldimi IntRotateD
+// rldic IntRotateDI
+// rldicl IntRotateDI
+// rldicr IntRotateDI
+// rldimi IntRotateDI
// rlwimi IntRotate
// rlwinm IntGeneral
// rlwnm IntGeneral
@@ -305,33 +315,33 @@ include "PPCScheduleA2.td"
// sld IntRotateD
// slw IntGeneral
// srad IntRotateD
-// sradi IntRotateD
+// sradi IntRotateDI
// sraw IntShift
// srawi IntShift
// srd IntRotateD
// srw IntGeneral
// stb LdStStore
-// stbu LdStStore
-// stbux LdStStore
+// stbu LdStStoreUpd
+// stbux LdStStoreUpd
// stbx LdStStore
// std LdStSTD
// stdcx. LdStSTDCX
-// stdu LdStSTD
-// stdux LdStSTD
+// stdu LdStSTDU
+// stdux LdStSTDU
// stdx LdStSTD
-// stfd LdStUX
-// stfdu LdStUX
-// stfdux LdStUX
-// stfdx LdStUX
-// stfiwx LdStUX
-// stfs LdStUX
-// stfsu LdStUX
-// stfsux LdStUX
-// stfsx LdStUX
+// stfd LdStSTFD
+// stfdu LdStSTFDU
+// stfdux LdStSTFDU
+// stfdx LdStSTFD
+// stfiwx LdStSTFD
+// stfs LdStSTFD
+// stfsu LdStSTFDU
+// stfsux LdStSTFDU
+// stfsx LdStSTFD
// sth LdStStore
// sthbrx LdStStore
-// sthu LdStStore
-// sthux LdStStore
+// sthu LdStStoreUpd
+// sthux LdStStoreUpd
// sthx LdStStore
// stmw LdStLMW
// stswi LdStLMW
@@ -344,8 +354,8 @@ include "PPCScheduleA2.td"
// stw LdStStore
// stwbrx LdStStore
// stwcx. LdStSTWCX
-// stwu LdStStore
-// stwux LdStStore
+// stwu LdStStoreUpd
+// stwux LdStStoreUpd
// stwx LdStStore
// subf IntGeneral
// subfc IntGeneral
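The opcode table above is commentary; what matters at compile time is whether the CPU model in use actually supplies InstrItinData for each new class (LdStLoadUpd, FPAddSub, and so on), which is why every PPCSchedule*.td below gains matching entries. A hedged sketch of that check, assuming the 3.2-era InstrItineraryData accessors (hasStageData is a hypothetical helper):

    #include "llvm/MC/MCInstrItineraries.h"

    // Sketch: a new class such as LdStLoadUpd only affects scheduling on a CPU
    // whose itinerary table provides pipeline stages for it.
    static bool hasStageData(const llvm::InstrItineraryData *Itins,
                             unsigned ItinClass) {
      return !Itins->isEmpty() &&
             Itins->beginStage(ItinClass) != Itins->endStage(ItinClass);
    }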
diff --git a/lib/Target/PowerPC/PPCSchedule440.td b/lib/Target/PowerPC/PPCSchedule440.td
index cd0fb70a24bd..37b6eac10cfe 100644
--- a/lib/Target/PowerPC/PPCSchedule440.td
+++ b/lib/Target/PowerPC/PPCSchedule440.td
@@ -288,6 +288,15 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<2, [LWB]>],
[9, 5],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<2, [LWB]>],
+ [9, 5],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStStore , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -297,6 +306,15 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<2, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<2, [LWB]>],
+ [8, 5],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStICBI , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -306,7 +324,7 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<1, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
- InstrItinData<LdStUX , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
InstrStage<1, [LRACC]>,
@@ -315,6 +333,15 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<1, [LWB]>],
[8, 5, 5],
[NoBypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<1, [LWB]>],
+ [8, 5, 5],
+ [NoBypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStLFD , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -342,6 +369,15 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<1, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<1, [LWB]>],
+ [8, 5],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStLMW , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
@@ -371,6 +407,15 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<2, [LWB]>],
[8, 5],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDU , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [LRACC]>,
+ InstrStage<1, [AGEN]>,
+ InstrStage<1, [CRD]>,
+ InstrStage<2, [LWB]>],
+ [8, 5],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStSTDCX , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1]>,
@@ -537,6 +582,19 @@ def PPC440Itineraries : ProcessorItineraries<
InstrStage<1, [FWB]>],
[10, 4, 4],
[FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [IFTH1, IFTH2]>,
+ InstrStage<1, [PDCD1, PDCD2]>,
+ InstrStage<1, [DISS1, DISS2]>,
+ InstrStage<1, [FRACC]>,
+ InstrStage<1, [FEXE1]>,
+ InstrStage<1, [FEXE2]>,
+ InstrStage<1, [FEXE3]>,
+ InstrStage<1, [FEXE4]>,
+ InstrStage<1, [FEXE5]>,
+ InstrStage<1, [FEXE6]>,
+ InstrStage<1, [FWB]>],
+ [10, 4, 4],
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
InstrItinData<FPCompare , [InstrStage<1, [IFTH1, IFTH2]>,
InstrStage<1, [PDCD1, PDCD2]>,
InstrStage<1, [DISS1, DISS2]>,
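Each InstrItinData added above is just a list of InstrStage entries plus operand read/write cycles; the pipeline occupancy it implies is the sum of the stage cycle counts. A small sketch of that accounting, assuming the 3.2-era MC itinerary structures (stageCycles is a hypothetical helper):

    #include "llvm/MC/MCInstrItineraries.h"

    static unsigned stageCycles(const llvm::InstrItineraryData *Itins,
                                unsigned ItinClass) {
      unsigned Cycles = 0;
      for (const llvm::InstrStage *IS = Itins->beginStage(ItinClass),
                                  *E = Itins->endStage(ItinClass); IS != E; ++IS)
        Cycles += IS->getCycles();
      return Cycles;
    }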
diff --git a/lib/Target/PowerPC/PPCScheduleA2.td b/lib/Target/PowerPC/PPCScheduleA2.td
index 4d4a5d0e1b2f..ba63b5cd8faf 100644
--- a/lib/Target/PowerPC/PPCScheduleA2.td
+++ b/lib/Target/PowerPC/PPCScheduleA2.td
@@ -181,6 +181,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[10, 7, 7],
[GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateDI , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [10, 7, 7],
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
InstrItinData<IntShift , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -302,7 +313,18 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[GPR_Bypass, GPR_Bypass]>,
- InstrItinData<LdStLD , [InstrStage<4,
+ InstrItinData<LdStLoadUpd , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7],
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLDU , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -324,6 +346,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[13, 7],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [13, 7],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStICBI , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -335,7 +368,7 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[NoBypass, GPR_Bypass]>,
- InstrItinData<LdStUX , [InstrStage<4,
+ InstrItinData<LdStSTFD , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
IU4_4, IU4_5, IU4_6, IU4_7]>,
@@ -346,6 +379,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7, 7],
[NoBypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7, 7],
+ [NoBypass, FPR_Bypass, FPR_Bypass]>,
InstrItinData<LdStLFD , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -379,6 +423,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[14, 7],
[NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [14, 7],
+ [NoBypass, GPR_Bypass]>,
InstrItinData<LdStLMW , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -412,6 +467,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
[13, 7],
[GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDU , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [XRF1]>,
+ InstrStage<1, [XEX1]>, InstrStage<1, [XEX2]>,
+ InstrStage<1, [XEX3]>, InstrStage<1, [XEX4]>,
+ InstrStage<1, [XEX5]>, InstrStage<1, [XEX6]>],
+ [13, 7],
+ [GPR_Bypass, GPR_Bypass]>,
InstrItinData<LdStSTDCX , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
@@ -593,6 +659,17 @@ def PPCA2Itineraries : ProcessorItineraries<
InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
[15, 7, 7],
[FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<4,
+ [IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
+ InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
+ IU4_4, IU4_5, IU4_6, IU4_7]>,
+ InstrStage<1, [IU5]>, InstrStage<1, [IU6]>,
+ InstrStage<1, [RF0]>, InstrStage<1, [FRF1]>,
+ InstrStage<1, [FEX1]>, InstrStage<1, [FEX2]>,
+ InstrStage<1, [FEX3]>, InstrStage<1, [FEX4]>,
+ InstrStage<1, [FEX5]>, InstrStage<1, [FEX6]>],
+ [15, 7, 7],
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
InstrItinData<FPCompare , [InstrStage<4,
[IU0to3_0, IU0to3_1, IU0to3_2, IU0to3_3]>,
InstrStage<1, [IU4_0, IU4_1, IU4_2, IU4_3,
diff --git a/lib/Target/PowerPC/PPCScheduleE500mc.td b/lib/Target/PowerPC/PPCScheduleE500mc.td
new file mode 100644
index 000000000000..9bb779a0e62b
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE500mc.td
@@ -0,0 +1,265 @@
+//===-- PPCScheduleE500mc.td - e500mc Scheduling Defs ------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e500mc 32-bit
+// Power processor.
+//
+// All information is derived from the "e500mc Core Reference Manual",
+// Freescale Document Number E500MCRM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e500mc core:
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+def DIS0 : FuncUnit; // Dispatch stage - insn 1
+def DIS1 : FuncUnit; // Dispatch stage - insn 2
+
+// * Execute
+// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+// Some instructions can only execute in SFX0 but not SFX1.
+// The CFX has a bypass path, allowing non-divide instructions to execute
+// while a divide instruction is executed.
+def SFX0 : FuncUnit; // Simple unit 0
+def SFX1 : FuncUnit; // Simple unit 1
+def BU : FuncUnit; // Branch unit
+def CFX_DivBypass
+ : FuncUnit; // CFX divide bypass path
+def CFX_0 : FuncUnit; // CFX pipeline
+def LSU_0 : FuncUnit; // LSU pipeline
+def FPU_0 : FuncUnit; // FPU pipeline
+
+def PPCE500mcItineraries : ProcessorItineraries<
+ [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, LSU_0, FPU_0],
+ [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+ InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 1, 1], // Latency = 1 or 2
+ [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<14, [CFX_DivBypass]>],
+ [17, 1, 1], // Latency=4..35, Repeat= 4..35
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<8, [FPU_0]>],
+ [11], // Latency = 8
+ [FPR_Bypass]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<8, [FPU_0]>],
+ [11, 1, 1], // Latency = 8
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [7, 1, 1], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0]>],
+ [5, 1], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [4, 1], // Latency = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [4, 1, 1], // Latency = 1
+ [CR_Bypass, CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [4, 1], // Latency = 1
+ [CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1, 1], // Latency = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 1, 1], // Latency = 4
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 1], // Latency = r+3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<3, [LSU_0]>],
+ [6, 1, 1], // Latency = 3, Repeat rate = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [6, 1], // Latency = 3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>]>,
+ InstrItinData<SprMFSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1],
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [5, 1], // Latency = 2, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0]>],
+ [5, 1],
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0], 0>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<5, [SFX0]>],
+ [8, 1],
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [7, 1], // Latency = 4, Repeat rate = 4
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [4, 1], // Latency = 1, Repeat rate = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMTSRIN , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0]>],
+ [4, 1],
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [FPU_0]>],
+ [13, 1, 1], // Latency = 10, Repeat rate = 4
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [11, 1, 1], // Latency = 8, Repeat rate = 2
+ [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<68, [FPU_0]>],
+ [71, 1, 1], // Latency = 68, Repeat rate = 68
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<38, [FPU_0]>],
+ [41, 1, 1], // Latency = 38, Repeat rate = 38
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [FPU_0]>],
+ [13, 1, 1, 1], // Latency = 10, Repeat rate = 4
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<38, [FPU_0]>],
+ [41, 1], // Latency = 38, Repeat rate = 38
+ [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e500mc machine model for scheduling and other instruction cost heuristics.
+
+def PPCE500mcModel : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 5; // Optimistic load latency assuming bypass.
+                        // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let Itineraries = PPCE500mcItineraries;
+}
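The trailing "2" on the update-form entries is a micro-op count, and it interacts with IssueWidth = 2: an updating load or store occupies a whole e500mc dispatch group by itself. A hedged sketch of reading that count back, assuming TargetInstrInfo::getNumMicroOps from the 3.2-era CodeGen API (fillsDispatchGroup is a hypothetical helper):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetInstrInfo.h"

    // Sketch: LdStLoadUpd instructions report 2 micro-ops, so with IssueWidth = 2
    // they leave no room for a second instruction in the same dispatch group.
    static bool fillsDispatchGroup(const llvm::TargetInstrInfo *TII,
                                   const llvm::InstrItineraryData *Itins,
                                   const llvm::MachineInstr *MI,
                                   unsigned IssueWidth) {
      return TII->getNumMicroOps(Itins, MI) >= IssueWidth;
    }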
diff --git a/lib/Target/PowerPC/PPCScheduleE5500.td b/lib/Target/PowerPC/PPCScheduleE5500.td
new file mode 100644
index 000000000000..d7e11acd9fc7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleE5500.td
@@ -0,0 +1,309 @@
+//===-- PPCScheduleE5500.td - e5500 Scheduling Defs --------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the Freescale e5500 64-bit
+// Power processor.
+//
+// All information is derived from the "e5500 Core Reference Manual",
+// Freescale Document Number e5500RM, Rev. 1, 03/2012.
+//
+//===----------------------------------------------------------------------===//
+// Relevant functional units in the Freescale e5500 core
+// (These are the same as for the e500mc)
+//
+// * Decode & Dispatch
+// Can dispatch up to 2 instructions per clock cycle to either the GPR Issue
+// queues (GIQx), FP Issue Queue (FIQ), or Branch issue queue (BIQ).
+// def DIS0 : FuncUnit;
+// def DIS1 : FuncUnit;
+
+// * Execute
+// 6 pipelined execution units: SFX0, SFX1, BU, FPU, LSU, CFX.
+// The CFX has a bypass path, allowing non-divide instructions to execute
+// while a divide instruction is being executed.
+// def SFX0 : FuncUnit; // Simple unit 0
+// def SFX1 : FuncUnit; // Simple unit 1
+// def BU : FuncUnit; // Branch unit
+// def CFX_DivBypass
+// : FuncUnit; // CFX divide bypass path
+// def CFX_0 : FuncUnit; // CFX pipeline stage 0
+
+def CFX_1 : FuncUnit; // CFX pipeline stage 1
+
+// def LSU_0 : FuncUnit; // LSU pipeline
+// def FPU_0 : FuncUnit; // FPU pipeline
+
+
+def PPCE5500Itineraries : ProcessorItineraries<
+ [DIS0, DIS1, SFX0, SFX1, BU, CFX_DivBypass, CFX_0, CFX_1,
+ LSU_0, FPU_0],
+ [CR_Bypass, GPR_Bypass, FPR_Bypass], [
+ InstrItinData<IntSimple , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [6, 2, 2], // Latency = 1 or 2
+ [CR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<26, [CFX_DivBypass]>],
+ [30, 2, 2], // Latency= 4..26, Repeat rate= 4..26
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntDivW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<16, [CFX_DivBypass]>],
+ [20, 2, 2], // Latency= 4..16, Repeat rate= 4..16
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMFFS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass]>,
+ InstrItinData<IntMTFSB0 , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<7, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 7
+ [NoBypass, NoBypass, NoBypass]>,
+ InstrItinData<IntMulHD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<2, [CFX_1]>],
+ [9, 2, 2], // Latency = 4..7, Repeat rate = 2..4
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulHW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<1, [CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulHWU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<1, [CFX_1]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntMulLI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0], 0>,
+ InstrStage<2, [CFX_1]>],
+ [8, 2, 2], // Latency = 4 or 5, Repeat = 2
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotate , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntRotateDI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5, 2, 2], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntShift , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0, SFX1]>],
+ [6, 2, 2], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<IntTrapW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [SFX0]>],
+ [6, 2], // Latency = 2, Repeat rate = 2
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<BrB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [5, 2], // Latency = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<BrCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [5, 2, 2], // Latency = 1
+ [CR_Bypass, CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [BU]>],
+ [5, 2], // Latency = 1
+ [CR_Bypass, CR_Bypass]>,
+ InstrItinData<BrMCRX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [5, 2, 2], // Latency = 1
+ [CR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBF , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStDCBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoad , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLDARX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<3, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStStore , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStICBI , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLFD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLFDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [8, 2, 2], // Latency = 4, Repeat rate = 1
+ [FPR_Bypass, GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLHA , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStLHAU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [GPR_Bypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStLMW , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [LSU_0]>],
+ [8, 2], // Latency = r+3, Repeat rate = r+3
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStLWARX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<3, [LSU_0]>],
+ [7, 2, 2], // Latency = 3, Repeat rate = 3
+ [GPR_Bypass, GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<LdStSTD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDCX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSTDU , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass],
+ 2>, // 2 micro-ops
+ InstrItinData<LdStSTWCX , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>],
+ [7, 2], // Latency = 3, Repeat rate = 1
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<LdStSync , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0]>]>,
+ InstrItinData<SprMTMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [CFX_0]>],
+ [6, 2], // Latency = 2, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprTLBSYNC , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [LSU_0], 0>]>,
+ InstrItinData<SprMFCR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<5, [CFX_0]>],
+ [9, 2], // Latency = 5, Repeat rate = 5
+ [GPR_Bypass, CR_Bypass]>,
+ InstrItinData<SprMFMSR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [SFX0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [GPR_Bypass, GPR_Bypass]>,
+ InstrItinData<SprMFSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [CFX_0]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass]>,
+ InstrItinData<SprMFTB , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<4, [CFX_0]>],
+ [8, 2], // Latency = 4, Repeat rate = 4
+ [NoBypass, GPR_Bypass]>,
+ InstrItinData<SprMTSPR , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [SFX0, SFX1]>],
+ [5], // Latency = 1, Repeat rate = 1
+ [GPR_Bypass]>,
+ InstrItinData<FPGeneral , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPCompare , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2], // Latency = 7, Repeat rate = 1
+ [CR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivD , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<31, [FPU_0]>],
+ [39, 2, 2], // Latency = 35, Repeat rate = 31
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPDivS , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<16, [FPU_0]>],
+ [24, 2, 2], // Latency = 20, Repeat rate = 16
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPFused , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<1, [FPU_0]>],
+ [11, 2, 2, 2], // Latency = 7, Repeat rate = 1
+ [FPR_Bypass, FPR_Bypass, FPR_Bypass, FPR_Bypass]>,
+ InstrItinData<FPRes , [InstrStage<1, [DIS0, DIS1], 0>,
+ InstrStage<2, [FPU_0]>],
+ [12, 2], // Latency = 8, Repeat rate = 2
+ [FPR_Bypass, FPR_Bypass]>
+]>;
+
+// ===---------------------------------------------------------------------===//
+// e5500 machine model for scheduling and other instruction cost heuristics.
+
+def PPCE5500Model : SchedMachineModel {
+ let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+ let MinLatency = -1; // OperandCycles are interpreted as MinLatency.
+ let LoadLatency = 6; // Optimistic load latency assuming bypass.
+                        // This is overridden by OperandCycles if the
+ // Itineraries are queried instead.
+
+ let Itineraries = PPCE5500Itineraries;
+}
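Once TableGen has processed a SchedMachineModel such as PPCE5500Model, its fields surface as plain members of MCSchedModel. A minimal sketch of reading them, assuming the 3.2-era llvm/MC/MCSchedule.h layout (expectedLoadLatency is a hypothetical helper):

    #include "llvm/MC/MCSchedule.h"

    static unsigned expectedLoadLatency(const llvm::MCSchedModel *SM) {
      // For PPCE5500Model this is 6; MinLatency = -1 above means per-operand
      // cycles from the itineraries are used instead of a flat minimum.
      return SM->LoadLatency;
    }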
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
index 61e89ed32c20..72a0a392631a 100644
--- a/lib/Target/PowerPC/PPCScheduleG3.td
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -34,12 +34,16 @@ def G3Itineraries : ProcessorItineraries<
InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<3, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTWCX , [InstrStage<8, [SLU]>]>,
@@ -58,6 +62,7 @@ def G3Itineraries : ProcessorItineraries<
InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
index e19ddfa80ea3..fc9120dfa290 100644
--- a/lib/Target/PowerPC/PPCScheduleG4.td
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -33,13 +33,17 @@ def G4Itineraries : ProcessorItineraries<
InstrItinData<LdStDCBF , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStDCBI , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<2, [SLU]>]>,
InstrItinData<LdStDSS , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<2, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<2, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<34, [SLU]>]>,
InstrItinData<LdStLVecX , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
@@ -60,6 +64,7 @@ def G4Itineraries : ProcessorItineraries<
InstrItinData<SprRFI , [InstrStage<2, [SRU]>]>,
InstrItinData<SprSC , [InstrStage<2, [SRU]>]>,
InstrItinData<FPGeneral , [InstrStage<1, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPCompare , [InstrStage<1, [FPU1]>]>,
InstrItinData<FPDivD , [InstrStage<31, [FPU1]>]>,
InstrItinData<FPDivS , [InstrStage<17, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
index e7446cb028a3..a4e82ce23e6f 100644
--- a/lib/Target/PowerPC/PPCScheduleG4Plus.td
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -36,19 +36,24 @@ def G4PlusItineraries : ProcessorItineraries<
InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDCBI , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDSS , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<3, [IU2]>]>,
- InstrItinData<LdStUX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<4, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<4, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<37, [SLU]>]>,
InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLWA , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLWARX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTDCX , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTVEBX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTWCX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSync , [InstrStage<35, [SLU]>]>,
@@ -66,6 +71,7 @@ def G4PlusItineraries : ProcessorItineraries<
InstrItinData<SprRFI , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<SprSC , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
InstrItinData<FPGeneral , [InstrStage<5, [FPU1]>]>,
+ InstrItinData<FPAddSub , [InstrStage<5, [FPU1]>]>,
InstrItinData<FPCompare , [InstrStage<5, [FPU1]>]>,
InstrItinData<FPDivD , [InstrStage<35, [FPU1]>]>,
InstrItinData<FPDivS , [InstrStage<21, [FPU1]>]>,
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
index 137149972680..7c02ea099c14 100644
--- a/lib/Target/PowerPC/PPCScheduleG5.td
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -27,6 +27,7 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<IntMulLI , [InstrStage<4, [IU1, IU2]>]>,
InstrItinData<IntRFID , [InstrStage<1, [IU2]>]>,
InstrItinData<IntRotateD , [InstrStage<2, [IU1, IU2]>]>,
+ InstrItinData<IntRotateDI , [InstrStage<2, [IU1, IU2]>]>,
InstrItinData<IntRotate , [InstrStage<4, [IU1, IU2]>]>,
InstrItinData<IntShift , [InstrStage<2, [IU1, IU2]>]>,
InstrItinData<IntTrapD , [InstrStage<1, [IU1, IU2]>]>,
@@ -37,15 +38,20 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<BrMCRX , [InstrStage<3, [BPU]>]>,
InstrItinData<LdStDCBF , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLoad , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLoadUpd , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStStore , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStStoreUpd, [InstrStage<3, [SLU]>]>,
InstrItinData<LdStDSS , [InstrStage<10, [SLU]>]>,
InstrItinData<LdStICBI , [InstrStage<40, [SLU]>]>,
- InstrItinData<LdStUX , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStSTFD , [InstrStage<4, [SLU]>]>,
+ InstrItinData<LdStSTFDU , [InstrStage<4, [SLU]>]>,
InstrItinData<LdStLD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStLDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLDARX , [InstrStage<11, [SLU]>]>,
InstrItinData<LdStLFD , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLFDU , [InstrStage<5, [SLU]>]>,
InstrItinData<LdStLHA , [InstrStage<5, [SLU]>]>,
+ InstrItinData<LdStLHAU , [InstrStage<5, [SLU]>]>,
InstrItinData<LdStLMW , [InstrStage<64, [SLU]>]>,
InstrItinData<LdStLVecX , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStLWA , [InstrStage<5, [SLU]>]>,
@@ -53,6 +59,7 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<LdStSLBIA , [InstrStage<40, [SLU]>]>, // needs work
InstrItinData<LdStSLBIE , [InstrStage<2, [SLU]>]>,
InstrItinData<LdStSTD , [InstrStage<3, [SLU]>]>,
+ InstrItinData<LdStSTDU , [InstrStage<3, [SLU]>]>,
InstrItinData<LdStSTDCX , [InstrStage<11, [SLU]>]>,
InstrItinData<LdStSTVEBX , [InstrStage<5, [SLU]>]>,
InstrItinData<LdStSTWCX , [InstrStage<11, [SLU]>]>,
@@ -69,6 +76,7 @@ def G5Itineraries : ProcessorItineraries<
InstrItinData<SprMTSPR , [InstrStage<8, [IU2]>]>,
InstrItinData<SprSC , [InstrStage<1, [IU2]>]>,
InstrItinData<FPGeneral , [InstrStage<6, [FPU1, FPU2]>]>,
+ InstrItinData<FPAddSub , [InstrStage<6, [FPU1, FPU2]>]>,
InstrItinData<FPCompare , [InstrStage<8, [FPU1, FPU2]>]>,
InstrItinData<FPDivD , [InstrStage<33, [FPU1, FPU2]>]>,
InstrItinData<FPDivS , [InstrStage<33, [FPU1, FPU2]>]>,
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index bb193ac3d9ef..9c8cb92cc7ea 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -54,19 +54,26 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
CPUName = sys::getHostCPUName();
#endif
- // Parse features string.
- ParseSubtargetFeatures(CPUName, FS);
-
// Initialize scheduling itinerary for the specified CPU.
InstrItins = getInstrItineraryForCPU(CPUName);
+  // Make sure 64-bit features are available when the CPU name is generic.
+ std::string FullFS = FS;
+
// If we are generating code for ppc64, verify that options make sense.
if (is64Bit) {
Has64BitSupport = true;
// Silently force 64-bit register use on ppc64.
Use64BitRegs = true;
+ if (!FullFS.empty())
+ FullFS = "+64bit," + FullFS;
+ else
+ FullFS = "+64bit";
}
-
+
+ // Parse features string.
+ ParseSubtargetFeatures(CPUName, FullFS);
+
// If the user requested use of 64-bit regs, but the cpu selected doesn't
// support it, ignore.
if (use64BitRegs() && !has64BitSupport())
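The reordering above matters because the itinerary lookup still keys off the CPU name, while feature parsing now sees "+64bit" forced to the front of whatever -mattr string the user supplied. The composition itself is a one-line string edit, sketched here in plain C++ (makePPC64FeatureString is a hypothetical helper, not part of the patch):

    #include <string>

    static std::string makePPC64FeatureString(const std::string &FS) {
      if (FS.empty())
        return "+64bit";
      return "+64bit," + FS;
    }

For example, makePPC64FeatureString("+altivec") yields "+64bit,+altivec", which is what ParseSubtargetFeatures now receives on ppc64.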
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 0207c833938b..b9e22f43c39e 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -33,32 +33,34 @@ namespace PPC {
enum {
DIR_NONE,
DIR_32,
- DIR_440,
- DIR_601,
- DIR_602,
- DIR_603,
+ DIR_440,
+ DIR_601,
+ DIR_602,
+ DIR_603,
DIR_7400,
- DIR_750,
- DIR_970,
+ DIR_750,
+ DIR_970,
DIR_A2,
+ DIR_E500mc,
+ DIR_E5500,
DIR_PWR6,
DIR_PWR7,
- DIR_64
+ DIR_64
};
}
class GlobalValue;
class TargetMachine;
-
+
class PPCSubtarget : public PPCGenSubtargetInfo {
protected:
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned StackAlignment;
-
+
/// Selected instruction itineraries (one entry per itinerary class.)
InstrItineraryData InstrItins;
-
+
/// Which cpu directive was used.
unsigned DarwinDirective;
@@ -74,7 +76,7 @@ protected:
bool IsBookE;
bool HasLazyResolverStubs;
bool IsJITCodeModel;
-
+
/// TargetTriple - What processor and OS we're targeting.
Triple TargetTriple;
@@ -84,11 +86,11 @@ public:
///
PPCSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64Bit);
-
- /// ParseSubtargetFeatures - Parses features string setting specified
+
+ /// ParseSubtargetFeatures - Parses features string setting specified
/// subtarget options. Definition of function is auto generated by tblgen.
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
+
/// SetJITMode - This is called to inform the subtarget info that we are
/// producing code for the JIT.
void SetJITMode();
@@ -97,20 +99,27 @@ public:
/// stack frame on entry to the function and which must be maintained by every
/// function for this subtarget.
unsigned getStackAlignment() const { return StackAlignment; }
-
+
/// getDarwinDirective - Returns the -m directive specified for the cpu.
///
unsigned getDarwinDirective() const { return DarwinDirective; }
-
- /// getInstrItins - Return the instruction itineraies based on subtarget
+
+  /// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
- /// getTargetDataString - Return the pointer size and type alignment
+ /// getDataLayoutString - Return the pointer size and type alignment
/// properties of this subtarget.
- const char *getTargetDataString() const {
+ const char *getDataLayoutString() const {
// Note, the alignment values for f64 and i64 on ppc64 in Darwin
// documentation are wrong; these are correct (i.e. "what gcc does").
+ if (isPPC64() && isSVR4ABI()) {
+ if (TargetTriple.getOS() == llvm::Triple::FreeBSD)
+ return "E-p:64:64-f64:64:64-i64:64:64-f128:64:64-v128:128:128-n32:64";
+ else
+ return "E-p:64:64-f64:64:64-i64:64:64-f128:128:128-v128:128:128-n32:64";
+ }
+
return isPPC64() ? "E-p:64:64-f64:64:64-i64:64:64-f128:64:128-n32:64"
: "E-p:32:32-f64:64:64-i64:64:64-f128:64:128-n32";
}
@@ -118,22 +127,22 @@ public:
/// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
///
bool isPPC64() const { return IsPPC64; }
-
+
/// has64BitSupport - Return true if the selected CPU supports 64-bit
/// instructions, regardless of whether we are in 32-bit or 64-bit mode.
bool has64BitSupport() const { return Has64BitSupport; }
-
+
/// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
  /// registers in 32-bit mode when possible. This can only be true if
/// has64BitSupport() returns true.
bool use64BitRegs() const { return Use64BitRegs; }
-
+
/// hasLazyResolverStub - Return true if accesses to the specified global have
/// to go through a dyld lazy resolution stub. This means that an extra load
/// is required to get the address of the global.
- bool hasLazyResolverStub(const GlobalValue *GV,
+ bool hasLazyResolverStub(const GlobalValue *GV,
const TargetMachine &TM) const;
-
+
// isJITCodeModel - True if we're generating code for the JIT
bool isJITCodeModel() const { return IsJITCodeModel; }
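The FreeBSD/ppc64 special case only changes the f128 ABI alignment in the layout string (64 versus 128 bits). A hedged sketch of how such a string is consumed, assuming the 3.2-era llvm/DataLayout.h constructor that takes a layout description (inspectPPC64Layout is a hypothetical helper):

    #include "llvm/DataLayout.h"

    static void inspectPPC64Layout() {
      llvm::DataLayout DL("E-p:64:64-f64:64:64-i64:64:64-f128:128:128"
                          "-v128:128:128-n32:64");
      (void)DL.isBigEndian();    // 'E' marks the target big-endian
      (void)DL.getPointerSize(); // 'p:64:64' gives 8-byte pointers
    }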
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 980511268a31..3fc977ee2b41 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -40,10 +40,11 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT,
bool is64Bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, is64Bit),
- DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
+ DL(Subtarget.getDataLayoutString()), InstrInfo(*this),
FrameLowering(Subtarget), JITInfo(*this, is64Bit),
TLInfo(*this), TSInfo(*this),
- InstrItins(Subtarget.getInstrItineraryData()) {
+ InstrItins(Subtarget.getInstrItineraryData()),
+ STTI(&TLInfo), VTTI(&TLInfo) {
// The binutils for the BG/P are too old for CFI.
if (Subtarget.isBGP())
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 7da2b0cb10c1..c168433a71b3 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -21,7 +21,8 @@
#include "PPCISelLowering.h"
#include "PPCSelectionDAGInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetTransformImpl.h"
+#include "llvm/DataLayout.h"
namespace llvm {
@@ -29,13 +30,15 @@ namespace llvm {
///
class PPCTargetMachine : public LLVMTargetMachine {
PPCSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
PPCInstrInfo InstrInfo;
PPCFrameLowering FrameLowering;
PPCJITInfo JITInfo;
PPCTargetLowering TLInfo;
PPCSelectionDAGInfo TSInfo;
InstrItineraryData InstrItins;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
PPCTargetMachine(const Target &T, StringRef TT,
@@ -58,11 +61,17 @@ public:
return &InstrInfo.getRegisterInfo();
}
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const DataLayout *getDataLayout() const { return &DL; }
virtual const PPCSubtarget *getSubtargetImpl() const { return &Subtarget; }
virtual const InstrItineraryData *getInstrItineraryData() const {
return &InstrItins;
}
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
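The two new accessors mirror the TargetData-to-DataLayout move: cost and legality queries are meant to go through getScalarTargetTransformInfo()/getVectorTargetTransformInfo() rather than the lowering object. A minimal consumer-side sketch against the transitional 3.2-era TargetMachine interface (hasTransformInfo is a hypothetical helper; the base TargetMachine returns null for targets that do not override the hooks):

    #include "llvm/Target/TargetMachine.h"

    static bool hasTransformInfo(const llvm::TargetMachine &TM) {
      return TM.getScalarTargetTransformInfo() != 0 &&
             TM.getVectorTargetTransformInfo() != 0;
    }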
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index cbfa4cf35ba2..8165f5b8cc97 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -152,7 +152,7 @@ stuff too.
//===---------------------------------------------------------------------===//
-For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+For vector types, DataLayout.cpp::getTypeInfo() returns alignment that is equal
to the type size. It works but can be overly conservative as the alignment of
specific vector types are target dependent.
@@ -2367,8 +2367,3 @@ unsigned foo(unsigned x, unsigned y) { return x > y && x != 0; }
should fold to x > y.
//===---------------------------------------------------------------------===//
-
-int f(double x) { return __builtin_fabs(x) < 0.0; }
-should fold to false.
-
-//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 1c5c89e97158..716c79f43a26 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 79f7ebd82dee..8e5619e6bc8d 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -637,7 +637,7 @@ SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
PointerType *Ty = cast<PointerType>(CalleeFn->arg_begin()->getType());
Type *ElementTy = Ty->getElementType();
- return getTargetData()->getTypeAllocSize(ElementTy);
+ return getDataLayout()->getTypeAllocSize(ElementTy);
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
index 15541ef2f837..e64c140e4921 100644
--- a/lib/Target/Sparc/SparcInstrInfo.td
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -129,7 +129,7 @@ def retflag : SDNode<"SPISD::RET_FLAG", SDT_SPRet,
[SDNPHasChain, SDNPOptInGlue]>;
def flushw : SDNode<"SPISD::FLUSHW", SDTNone,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
def getPCX : Operand<i32> {
let PrintMethod = "printGetPCX";
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 9ee12ed7f579..45c962471dda 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -33,10 +33,10 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT,
bool is64bit)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, is64bit),
- DataLayout(Subtarget.getDataLayout()),
+ DL(Subtarget.getDataLayout()),
InstrInfo(Subtarget),
TLInfo(*this), TSInfo(*this),
- FrameLowering(Subtarget) {
+ FrameLowering(Subtarget), STTI(&TLInfo), VTTI(&TLInfo) {
}
namespace {
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index b2cc624e454c..0fbe2d7cda36 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -20,18 +20,21 @@
#include "SparcSelectionDAGInfo.h"
#include "SparcSubtarget.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
class SparcTargetMachine : public LLVMTargetMachine {
SparcSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
SparcInstrInfo InstrInfo;
SparcTargetLowering TLInfo;
SparcSelectionDAGInfo TSInfo;
SparcFrameLowering FrameLowering;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
SparcTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -52,7 +55,13 @@ public:
virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
return &TSInfo;
}
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo() const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo() const {
+ return &VTTI;
+ }
+ virtual const DataLayout *getDataLayout() const { return &DL; }
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index a2b83bcce46e..393178a4692e 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -16,7 +16,7 @@
#include "llvm-c/Initialization.h"
#include "llvm/InitializePasses.h"
#include "llvm/PassManager.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/LLVMContext.h"
#include <cstring>
@@ -24,8 +24,9 @@
using namespace llvm;
void llvm::initializeTarget(PassRegistry &Registry) {
- initializeTargetDataPass(Registry);
+ initializeDataLayoutPass(Registry);
initializeTargetLibraryInfoPass(Registry);
+ initializeTargetTransformInfoPass(Registry);
}
void LLVMInitializeTarget(LLVMPassRegistryRef R) {
@@ -33,11 +34,11 @@ void LLVMInitializeTarget(LLVMPassRegistryRef R) {
}
LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
- return wrap(new TargetData(StringRep));
+ return wrap(new DataLayout(StringRep));
}
void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
- unwrap(PM)->add(new TargetData(*unwrap(TD)));
+ unwrap(PM)->add(new DataLayout(*unwrap(TD)));
}
void LLVMAddTargetLibraryInfo(LLVMTargetLibraryInfoRef TLI,
@@ -55,13 +56,21 @@ LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
}
unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
- return unwrap(TD)->getPointerSize();
+ return unwrap(TD)->getPointerSize(0);
+}
+
+unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) {
+ return unwrap(TD)->getPointerSize(AS);
}
LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
}
+LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) {
+ return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), AS));
+}
+
unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
}
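[Note, not part of the patch: the two new C bindings expose the address-space-aware DataLayout queries. A hedged usage sketch; the datalayout string and the address space number 1 are purely illustrative:]

    LLVMTargetDataRef TD = LLVMCreateTargetData("e-p:64:64:64");
    unsigned DefaultPtrBytes = LLVMPointerSize(TD);          // address space 0
    unsigned AS1PtrBytes     = LLVMPointerSizeForAS(TD, 1);  // explicit address space
    LLVMTypeRef IntPtrInAS1  = LLVMIntPtrTypeForAS(TD, 1);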
diff --git a/lib/Target/TargetELFWriterInfo.cpp b/lib/Target/TargetELFWriterInfo.cpp
deleted file mode 100644
index a661ee9c0c65..000000000000
--- a/lib/Target/TargetELFWriterInfo.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===-- lib/Target/TargetELFWriterInfo.cpp - ELF Writer Info --0-*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the TargetELFWriterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Function.h"
-#include "llvm/Target/TargetELFWriterInfo.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-TargetELFWriterInfo::TargetELFWriterInfo(bool is64Bit_, bool isLittleEndian_) :
- is64Bit(is64Bit_), isLittleEndian(isLittleEndian_) {
-}
-
-TargetELFWriterInfo::~TargetELFWriterInfo() {}
-
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 8e215a763721..6d4eab12045c 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -24,6 +24,16 @@ void TargetLibraryInfo::anchor() { }
const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
{
+ "_ZdaPv",
+ "_ZdlPv",
+ "_Znaj",
+ "_ZnajRKSt9nothrow_t",
+ "_Znam",
+ "_ZnamRKSt9nothrow_t",
+ "_Znwj",
+ "_ZnwjRKSt9nothrow_t",
+ "_Znwm",
+ "_ZnwmRKSt9nothrow_t",
"__cxa_atexit",
"__cxa_guard_abort",
"__cxa_guard_acquire",
@@ -31,16 +41,29 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"__memcpy_chk",
"acos",
"acosf",
+ "acosh",
+ "acoshf",
+ "acoshl",
"acosl",
"asin",
"asinf",
+ "asinh",
+ "asinhf",
+ "asinhl",
"asinl",
"atan",
"atan2",
"atan2f",
"atan2l",
"atanf",
+ "atanh",
+ "atanhf",
+ "atanhl",
"atanl",
+ "calloc",
+ "cbrt",
+ "cbrtf",
+ "cbrtl",
"ceil",
"ceilf",
"ceill",
@@ -54,6 +77,9 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"coshl",
"cosl",
"exp",
+ "exp10",
+ "exp10f",
+ "exp10l",
"exp2",
"exp2f",
"exp2l",
@@ -74,6 +100,7 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"fmodl",
"fputc",
"fputs",
+ "free",
"fwrite",
"iprintf",
"log",
@@ -86,8 +113,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"log2",
"log2f",
"log2l",
+ "logb",
+ "logbf",
+ "logbl",
"logf",
"logl",
+ "malloc",
"memchr",
"memcmp",
"memcpy",
@@ -97,11 +128,14 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"nearbyint",
"nearbyintf",
"nearbyintl",
+ "posix_memalign",
"pow",
"powf",
"powl",
"putchar",
"puts",
+ "realloc",
+ "reallocf",
"rint",
"rintf",
"rintl",
@@ -118,14 +152,30 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"sqrt",
"sqrtf",
"sqrtl",
+ "stpcpy",
"strcat",
"strchr",
+ "strcmp",
"strcpy",
+ "strcspn",
+ "strdup",
"strlen",
"strncat",
"strncmp",
"strncpy",
+ "strndup",
"strnlen",
+ "strpbrk",
+ "strrchr",
+ "strspn",
+ "strstr",
+ "strtod",
+ "strtof",
+ "strtol",
+ "strtold",
+ "strtoll",
+ "strtoul",
+ "strtoull",
"tan",
"tanf",
"tanh",
@@ -134,7 +184,8 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"tanl",
"trunc",
"truncf",
- "truncl"
+ "truncl",
+ "valloc"
};
/// initialize - Initialize the set of available library functions based on the
@@ -205,6 +256,21 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::tanhl);
// Win32 only has C89 math
+ TLI.setUnavailable(LibFunc::acosh);
+ TLI.setUnavailable(LibFunc::acoshf);
+ TLI.setUnavailable(LibFunc::acoshl);
+ TLI.setUnavailable(LibFunc::asinh);
+ TLI.setUnavailable(LibFunc::asinhf);
+ TLI.setUnavailable(LibFunc::asinhl);
+ TLI.setUnavailable(LibFunc::atanh);
+ TLI.setUnavailable(LibFunc::atanhf);
+ TLI.setUnavailable(LibFunc::atanhl);
+ TLI.setUnavailable(LibFunc::cbrt);
+ TLI.setUnavailable(LibFunc::cbrtf);
+ TLI.setUnavailable(LibFunc::cbrtl);
+ TLI.setUnavailable(LibFunc::exp10);
+ TLI.setUnavailable(LibFunc::exp10f);
+ TLI.setUnavailable(LibFunc::exp10l);
TLI.setUnavailable(LibFunc::exp2);
TLI.setUnavailable(LibFunc::exp2f);
TLI.setUnavailable(LibFunc::exp2l);
@@ -217,6 +283,9 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::log1p);
TLI.setUnavailable(LibFunc::log1pf);
TLI.setUnavailable(LibFunc::log1pl);
+ TLI.setUnavailable(LibFunc::logb);
+ TLI.setUnavailable(LibFunc::logbf);
+ TLI.setUnavailable(LibFunc::logbl);
TLI.setUnavailable(LibFunc::nearbyint);
TLI.setUnavailable(LibFunc::nearbyintf);
TLI.setUnavailable(LibFunc::nearbyintl);
@@ -254,6 +323,10 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::tanf);
TLI.setUnavailable(LibFunc::tanhf);
}
+
+ // Win32 does *not* provide stpcpy. It is provided on POSIX systems:
+ // http://pubs.opengroup.org/onlinepubs/9699919799/functions/stpcpy.html
+ TLI.setUnavailable(LibFunc::stpcpy);
}
}
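[Note, not part of the patch: with the expanded function table, transforms that want to introduce calls to these routines should keep gating on TargetLibraryInfo, since availability now varies by platform (e.g. stpcpy is switched off for Win32 above). A minimal sketch, assuming a TargetLibraryInfo pointer TLI supplied by the pass:]

    // Only form the call when the target's C library actually provides it.
    if (TLI->has(LibFunc::stpcpy)) {
      // ... safe to emit a call to stpcpy here ...
    }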
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index b74a0bd25d72..9d7e2b825f41 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -22,7 +22,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/Dwarf.h"
@@ -184,7 +184,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
// Otherwise, just drop it into a mergable constant section. If we have
// a section for this size, use it, otherwise use the arbitrary sized
// mergable section.
- switch (TM.getTargetData()->getTypeAllocSize(C->getType())) {
+ switch (TM.getDataLayout()->getTypeAllocSize(C->getType())) {
case 4: return SectionKind::getMergeableConst4();
case 8: return SectionKind::getMergeableConst8();
case 16: return SectionKind::getMergeableConst16();
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index d6bba8b0dd05..f69c2abd50d2 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -14,7 +14,7 @@
#include "llvm-c/Core.h"
#include "llvm-c/Target.h"
#include "llvm-c/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -146,7 +146,7 @@ char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
}
LLVMTargetDataRef LLVMGetTargetMachineData(LLVMTargetMachineRef T) {
- return wrap(unwrap(T)->getTargetData());
+ return wrap(unwrap(T)->getDataLayout());
}
LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
@@ -158,14 +158,14 @@ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
std::string error;
- const TargetData* td = TM->getTargetData();
+ const DataLayout* td = TM->getDataLayout();
if (!td) {
- error = "No TargetData in TargetMachine";
+ error = "No DataLayout in TargetMachine";
*ErrorMessage = strdup(error.c_str());
return true;
}
- pass.add(new TargetData(*td));
+ pass.add(new DataLayout(*td));
TargetMachine::CodeGenFileType ft;
switch (codegen) {
@@ -184,7 +184,7 @@ LLVMBool LLVMTargetMachineEmitToFile(LLVMTargetMachineRef T, LLVMModuleRef M,
}
if (TM->addPassesToEmitFile(pass, destf, ft)) {
- error = "No TargetData in TargetMachine";
+ error = "No DataLayout in TargetMachine";
*ErrorMessage = strdup(error.c_str());
return true;
}
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
index 2395f2ba12ac..be8b58289039 100644
--- a/lib/Target/TargetRegisterInfo.cpp
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -20,8 +20,10 @@ using namespace llvm;
TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
regclass_iterator RCB, regclass_iterator RCE,
- const char *const *subregindexnames)
- : InfoDesc(ID), SubRegIndexNames(subregindexnames),
+ const char *const *SRINames,
+ const unsigned *SRILaneMasks)
+ : InfoDesc(ID), SubRegIndexNames(SRINames),
+ SubRegIndexLaneMasks(SRILaneMasks),
RegClassBegin(RCB), RegClassEnd(RCE) {
}
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
new file mode 100644
index 000000000000..b36e6f858f72
--- /dev/null
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -0,0 +1,353 @@
+//===-- llvm/Target/TargetTransformImpl.cpp - Target Loop Trans Info -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetTransformImpl.h"
+#include "llvm/Target/TargetLowering.h"
+#include <utility>
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//
+// Calls used by scalar transformations.
+//
+//===----------------------------------------------------------------------===//
+
+bool ScalarTargetTransformImpl::isLegalAddImmediate(int64_t imm) const {
+ return TLI->isLegalAddImmediate(imm);
+}
+
+bool ScalarTargetTransformImpl::isLegalICmpImmediate(int64_t imm) const {
+ return TLI->isLegalICmpImmediate(imm);
+}
+
+bool ScalarTargetTransformImpl::isLegalAddressingMode(const AddrMode &AM,
+ Type *Ty) const {
+ return TLI->isLegalAddressingMode(AM, Ty);
+}
+
+bool ScalarTargetTransformImpl::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ return TLI->isTruncateFree(Ty1, Ty2);
+}
+
+bool ScalarTargetTransformImpl::isTypeLegal(Type *Ty) const {
+ EVT T = TLI->getValueType(Ty);
+ return TLI->isTypeLegal(T);
+}
+
+unsigned ScalarTargetTransformImpl::getJumpBufAlignment() const {
+ return TLI->getJumpBufAlignment();
+}
+
+unsigned ScalarTargetTransformImpl::getJumpBufSize() const {
+ return TLI->getJumpBufSize();
+}
+
+bool ScalarTargetTransformImpl::shouldBuildLookupTables() const {
+ return TLI->supportJumpTables() &&
+ (TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+ TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Calls used by the vectorizers.
+//
+//===----------------------------------------------------------------------===//
+int VectorTargetTransformImpl::InstructionOpcodeToISD(unsigned Opcode) const {
+ enum InstructionOpcodes {
+#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
+#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
+#include "llvm/Instruction.def"
+ };
+ switch (static_cast<InstructionOpcodes>(Opcode)) {
+ case Ret: return 0;
+ case Br: return 0;
+ case Switch: return 0;
+ case IndirectBr: return 0;
+ case Invoke: return 0;
+ case Resume: return 0;
+ case Unreachable: return 0;
+ case Add: return ISD::ADD;
+ case FAdd: return ISD::FADD;
+ case Sub: return ISD::SUB;
+ case FSub: return ISD::FSUB;
+ case Mul: return ISD::MUL;
+ case FMul: return ISD::FMUL;
+ case UDiv: return ISD::UDIV;
+ case SDiv: return ISD::SDIV;
+ case FDiv: return ISD::FDIV;
+ case URem: return ISD::UREM;
+ case SRem: return ISD::SREM;
+ case FRem: return ISD::FREM;
+ case Shl: return ISD::SHL;
+ case LShr: return ISD::SRL;
+ case AShr: return ISD::SRA;
+ case And: return ISD::AND;
+ case Or: return ISD::OR;
+ case Xor: return ISD::XOR;
+ case Alloca: return 0;
+ case Load: return ISD::LOAD;
+ case Store: return ISD::STORE;
+ case GetElementPtr: return 0;
+ case Fence: return 0;
+ case AtomicCmpXchg: return 0;
+ case AtomicRMW: return 0;
+ case Trunc: return ISD::TRUNCATE;
+ case ZExt: return ISD::ZERO_EXTEND;
+ case SExt: return ISD::SIGN_EXTEND;
+ case FPToUI: return ISD::FP_TO_UINT;
+ case FPToSI: return ISD::FP_TO_SINT;
+ case UIToFP: return ISD::UINT_TO_FP;
+ case SIToFP: return ISD::SINT_TO_FP;
+ case FPTrunc: return ISD::FP_ROUND;
+ case FPExt: return ISD::FP_EXTEND;
+ case PtrToInt: return ISD::BITCAST;
+ case IntToPtr: return ISD::BITCAST;
+ case BitCast: return ISD::BITCAST;
+ case ICmp: return ISD::SETCC;
+ case FCmp: return ISD::SETCC;
+ case PHI: return 0;
+ case Call: return 0;
+ case Select: return ISD::SELECT;
+ case UserOp1: return 0;
+ case UserOp2: return 0;
+ case VAArg: return 0;
+ case ExtractElement: return ISD::EXTRACT_VECTOR_ELT;
+ case InsertElement: return ISD::INSERT_VECTOR_ELT;
+ case ShuffleVector: return ISD::VECTOR_SHUFFLE;
+ case ExtractValue: return ISD::MERGE_VALUES;
+ case InsertValue: return ISD::MERGE_VALUES;
+ case LandingPad: return 0;
+ }
+
+ llvm_unreachable("Unknown instruction type encountered!");
+}
+
+std::pair<unsigned, MVT>
+VectorTargetTransformImpl::getTypeLegalizationCost(Type *Ty) const {
+
+ LLVMContext &C = Ty->getContext();
+ EVT MTy = TLI->getValueType(Ty);
+
+ unsigned Cost = 1;
+ // We keep legalizing the type until we find a legal kind. We assume that
+ // the only operation that costs anything is the split. After splitting
+ // we need to handle two types.
+ while (true) {
+ TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, MTy);
+
+ if (LK.first == TargetLowering::TypeLegal)
+ return std::make_pair(Cost, MTy.getSimpleVT());
+
+ if (LK.first == TargetLowering::TypeSplitVector ||
+ LK.first == TargetLowering::TypeExpandInteger)
+ Cost *= 2;
+
+ // Keep legalizing the type.
+ MTy = LK.second;
+ }
+}
+
+unsigned
+VectorTargetTransformImpl::getScalarizationOverhead(Type *Ty,
+ bool Insert,
+ bool Extract) const {
+ assert (Ty->isVectorTy() && "Can only scalarize vectors");
+ unsigned Cost = 0;
+
+ for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+ if (Insert)
+ Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ if (Extract)
+ Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+ }
+
+ return Cost;
+}
+
+unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode,
+ Type *Ty) const {
+ // Map the IR opcode onto the corresponding target-independent ISD opcode.
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
+
+ if (!TLI->isOperationExpand(ISD, LT.second)) {
+ // The operation is legal. Assume it costs 1. Multiply
+ // by the type-legalization overhead.
+ return LT.first * 1;
+ }
+
+ // Else, assume that we need to scalarize this op.
+ if (Ty->isVectorTy()) {
+ unsigned Num = Ty->getVectorNumElements();
+ unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ // Return the cost of multiple scalar invocations plus the cost of inserting
+ // and extracting the values.
+ return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+ }
+
+ // We don't know anything about this scalar instruction.
+ return 1;
+}
+
+unsigned VectorTargetTransformImpl::getBroadcastCost(Type *Tp) const {
+ return 1;
+}
+
+unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const {
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ std::pair<unsigned, MVT> SrcLT = getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> DstLT = getTypeLegalizationCost(Dst);
+
+ // Handle scalar conversions.
+ if (!Src->isVectorTy() && !Dst->isVectorTy()) {
+
+ // Scalar bitcasts are usually free.
+ if (Opcode == Instruction::BitCast)
+ return 0;
+
+ if (Opcode == Instruction::Trunc &&
+ TLI->isTruncateFree(SrcLT.second, DstLT.second))
+ return 0;
+
+ if (Opcode == Instruction::ZExt &&
+ TLI->isZExtFree(SrcLT.second, DstLT.second))
+ return 0;
+
+ // Just check the op cost. If the operation is legal then assume it costs 1.
+ if (!TLI->isOperationExpand(ISD, DstLT.second))
+ return 1;
+
+ // Assume that illegal scalar instructions are expensive.
+ return 4;
+ }
+
+ // Check vector-to-vector casts.
+ if (Dst->isVectorTy() && Src->isVectorTy()) {
+
+ // If the cast is between same-sized registers, then the check is simple.
+ if (SrcLT.first == DstLT.first &&
+ SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+
+ // Bitcasts between types that are legalized to the same type are free.
+ if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
+ return 0;
+
+ // Assume that Zext is done using AND.
+ if (Opcode == Instruction::ZExt)
+ return 1;
+
+ // Assume that sext is done using SHL and SRA.
+ if (Opcode == Instruction::SExt)
+ return 2;
+
+ // Just check the op cost. If the operation is legal then assume it costs
+ // 1 and multiply by the type-legalization overhead.
+ if (!TLI->isOperationExpand(ISD, DstLT.second))
+ return SrcLT.first * 1;
+ }
+
+ // If we are converting vectors and the operation is illegal, or
+ // if the vectors are legalized to different types, estimate the
+ // scalarization costs.
+ unsigned Num = Dst->getVectorNumElements();
+ unsigned Cost = getCastInstrCost(Opcode, Dst->getScalarType(),
+ Src->getScalarType());
+
+ // Return the cost of multiple scalar invocations plus the cost of
+ // inserting and extracting the values.
+ return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+ }
+
+ // We already handled vector-to-vector and scalar-to-scalar conversions. This
+ // is where we handle bitcast between vectors and scalars. We need to assume
+ // that the conversion is scalarized in one way or another.
+ if (Opcode == Instruction::BitCast)
+ // Illegal bitcasts are done by storing and loading from a stack slot.
+ return (Src->isVectorTy()? getScalarizationOverhead(Src, false, true):0) +
+ (Dst->isVectorTy()? getScalarizationOverhead(Dst, true, false):0);
+
+ llvm_unreachable("Unhandled cast");
+ }
+
+unsigned VectorTargetTransformImpl::getCFInstrCost(unsigned Opcode) const {
+ return 1;
+}
+
+unsigned VectorTargetTransformImpl::getCmpSelInstrCost(unsigned Opcode,
+ Type *ValTy,
+ Type *CondTy) const {
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ // Selects on vectors are actually vector selects.
+ if (ISD == ISD::SELECT) {
+ assert(CondTy && "CondTy must exist");
+ if (CondTy->isVectorTy())
+ ISD = ISD::VSELECT;
+ }
+
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
+
+ if (!TLI->isOperationExpand(ISD, LT.second)) {
+ // The operation is legal. Assume it costs 1. Multiply
+ // by the type-legalization overhead.
+ return LT.first * 1;
+ }
+
+ // Otherwise, assume that the compare or select is scalarized.
+ if (ValTy->isVectorTy()) {
+ unsigned Num = ValTy->getVectorNumElements();
+ if (CondTy)
+ CondTy = CondTy->getScalarType();
+ unsigned Cost = getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
+ CondTy);
+
+ // Return the cost of multiple scalar invocations plus the cost of inserting
+ // and extracting the values.
+ return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
+ }
+
+ // Unknown scalar opcode.
+ return 1;
+}
+
+unsigned VectorTargetTransformImpl::getVectorInstrCost(unsigned Opcode,
+ Type *Val,
+ unsigned Index) const {
+ return 1;
+}
+
+unsigned
+VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1,
+ Type *Ty2) const {
+ return 1;
+}
+
+unsigned
+VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const {
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
+
+ // Assume that all loads and stores of legal types cost 1.
+ return LT.first;
+}
+
+unsigned
+VectorTargetTransformImpl::getNumberOfParts(Type *Tp) const {
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Tp);
+ return LT.first;
+}
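[Note, not part of the patch: to make the cost model above concrete, consider a hypothetical target whose widest legal vector register holds <4 x i32>. getTypeLegalizationCost(<16 x i32>) splits the type twice (16 -> 8 -> 4), so the legalization factor is 4, and a legal vector add on <16 x i32> is therefore reported as cost 4 while the same add on <4 x i32> costs 1. As a sketch, with Ctx and VTTI assumed to come from the caller:]

    Type *I32   = Type::getInt32Ty(Ctx);
    Type *Wide  = VectorType::get(I32, 16);  // <16 x i32>
    Type *Legal = VectorType::get(I32, 4);   // <4 x i32>, assumed legal on this target
    unsigned WideCost  = VTTI->getArithmeticInstrCost(Instruction::Add, Wide);  // -> 4
    unsigned LegalCost = VTTI->getArithmeticInstrCost(Instruction::Add, Legal); // -> 1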
diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
index 2794e60df238..66ad35370936 100644
--- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
@@ -18,19 +18,19 @@
using namespace llvm;
namespace {
-
+
class X86AsmLexer : public MCTargetAsmLexer {
const MCAsmInfo &AsmInfo;
-
+
bool tentativeIsValid;
AsmToken tentativeToken;
-
+
const AsmToken &lexTentative() {
tentativeToken = getLexer()->Lex();
tentativeIsValid = true;
return tentativeToken;
}
-
+
const AsmToken &lexDefinite() {
if (tentativeIsValid) {
tentativeIsValid = false;
@@ -38,7 +38,7 @@ class X86AsmLexer : public MCTargetAsmLexer {
}
return getLexer()->Lex();
}
-
+
AsmToken LexTokenATT();
AsmToken LexTokenIntel();
protected:
@@ -47,7 +47,7 @@ protected:
SetError(SMLoc(), "No MCAsmLexer installed");
return AsmToken(AsmToken::Error, "", 0);
}
-
+
switch (AsmInfo.getAssemblerDialect()) {
default:
SetError(SMLoc(), "Unhandled dialect");
@@ -71,33 +71,32 @@ public:
AsmToken X86AsmLexer::LexTokenATT() {
AsmToken lexedToken = lexDefinite();
-
+
switch (lexedToken.getKind()) {
default:
return lexedToken;
case AsmToken::Error:
SetError(Lexer->getErrLoc(), Lexer->getErr());
return lexedToken;
-
+
case AsmToken::Percent: {
const AsmToken &nextToken = lexTentative();
if (nextToken.getKind() != AsmToken::Identifier)
return lexedToken;
-
if (unsigned regID = MatchRegisterName(nextToken.getString())) {
lexDefinite();
-
+
// FIXME: This is completely wrong when there is a space or other
// punctuation between the % and the register name.
StringRef regStr(lexedToken.getString().data(),
- lexedToken.getString().size() +
+ lexedToken.getString().size() +
nextToken.getString().size());
-
- return AsmToken(AsmToken::Register, regStr,
+
+ return AsmToken(AsmToken::Register, regStr,
static_cast<int64_t>(regID));
}
-
+
// Match register name failed. If this is "db[0-7]", match it as an alias
// for dr[0-7].
if (nextToken.getString().size() == 3 &&
@@ -113,29 +112,29 @@ AsmToken X86AsmLexer::LexTokenATT() {
case '6': RegNo = X86::DR6; break;
case '7': RegNo = X86::DR7; break;
}
-
+
if (RegNo != -1) {
lexDefinite();
// FIXME: This is completely wrong when there is a space or other
// punctuation between the % and the register name.
StringRef regStr(lexedToken.getString().data(),
- lexedToken.getString().size() +
+ lexedToken.getString().size() +
nextToken.getString().size());
- return AsmToken(AsmToken::Register, regStr,
+ return AsmToken(AsmToken::Register, regStr,
static_cast<int64_t>(RegNo));
}
}
-
-
+
+
return lexedToken;
- }
+ }
}
}
AsmToken X86AsmLexer::LexTokenIntel() {
const AsmToken &lexedToken = lexDefinite();
-
+
switch(lexedToken.getKind()) {
default:
return lexedToken;
@@ -144,7 +143,7 @@ AsmToken X86AsmLexer::LexTokenIntel() {
return lexedToken;
case AsmToken::Identifier: {
unsigned regID = MatchRegisterName(lexedToken.getString().lower());
-
+
if (regID)
return AsmToken(AsmToken::Register,
lexedToken.getString(),
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index fbbaa9500c99..ce446e75737c 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -11,12 +11,14 @@
#include "llvm/MC/MCTargetAsmParser.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCAsmParser.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringSwitch.h"
@@ -33,13 +35,16 @@ struct X86Operand;
class X86AsmParser : public MCTargetAsmParser {
MCSubtargetInfo &STI;
MCAsmParser &Parser;
+ ParseInstructionInfo *InstInfo;
private:
MCAsmParser &getParser() const { return Parser; }
MCAsmLexer &getLexer() const { return Parser.getLexer(); }
bool Error(SMLoc L, const Twine &Msg,
- ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+ ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
+ bool MatchingInlineAsm = false) {
+ if (MatchingInlineAsm) return true;
return Parser.Error(L, Msg, Ranges);
}
@@ -51,23 +56,25 @@ private:
X86Operand *ParseOperand();
X86Operand *ParseATTOperand();
X86Operand *ParseIntelOperand();
- X86Operand *ParseIntelMemOperand();
+ X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc);
+ X86Operand *ParseIntelTypeOperator(SMLoc StartLoc);
+ X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc);
X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size);
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+ bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr **NewDisp,
+ SmallString<64> &Err);
+
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
bool processInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
- bool MatchAndEmitInstruction(SMLoc IDLoc,
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out);
-
- bool MatchInstruction(SMLoc IDLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- SmallVectorImpl<MCInst> &MCInsts);
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm);
/// isSrcOp - Returns true if operand is either (%rsi) or %ds:(%rsi)
/// in 64bit mode or (%esi) or %es:(%esi) in 32bit mode.
@@ -96,14 +103,15 @@ private:
public:
X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser)
- : MCTargetAsmParser(), STI(sti), Parser(parser) {
+ : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
// Initialize the set of available features.
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
}
virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
- virtual bool ParseInstruction(StringRef Name, SMLoc NameLoc,
+ virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
virtual bool ParseDirective(AsmToken DirectiveID);
@@ -159,6 +167,7 @@ struct X86Operand : public MCParsedAsmOperand {
} Kind;
SMLoc StartLoc, EndLoc;
+ SMLoc OffsetOfLoc;
union {
struct {
@@ -172,6 +181,7 @@ struct X86Operand : public MCParsedAsmOperand {
struct {
const MCExpr *Val;
+ bool NeedAsmRewrite;
} Imm;
struct {
@@ -181,6 +191,7 @@ struct X86Operand : public MCParsedAsmOperand {
unsigned IndexReg;
unsigned Scale;
unsigned Size;
+ bool NeedSizeDir;
} Mem;
};
@@ -191,8 +202,11 @@ struct X86Operand : public MCParsedAsmOperand {
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const { return EndLoc; }
-
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+ /// getOffsetOfLoc - Get the location of the offset operator.
+ SMLoc getOffsetOfLoc() const { return OffsetOfLoc; }
virtual void print(raw_ostream &OS) const {}
@@ -216,6 +230,11 @@ struct X86Operand : public MCParsedAsmOperand {
return Imm.Val;
}
+ bool needAsmRewrite() const {
+ assert(Kind == Immediate && "Invalid access!");
+ return Imm.NeedAsmRewrite;
+ }
+
const MCExpr *getMemDisp() const {
assert(Kind == Memory && "Invalid access!");
return Mem.Disp;
@@ -312,6 +331,20 @@ struct X86Operand : public MCParsedAsmOperand {
return isImmSExti64i32Value(CE->getValue());
}
+ unsigned getMemSize() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.Size;
+ }
+
+ bool isOffsetOf() const {
+ return OffsetOfLoc.getPointer();
+ }
+
+ bool needSizeDirective() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.NeedSizeDir;
+ }
+
bool isMem() const { return Kind == Memory; }
bool isMem8() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 8);
@@ -437,21 +470,25 @@ struct X86Operand : public MCParsedAsmOperand {
return Res;
}
- static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) {
+ static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ SMLoc OffsetOfLoc = SMLoc()) {
X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
+ Res->OffsetOfLoc = OffsetOfLoc;
return Res;
}
- static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+ static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc,
+ bool NeedRewrite = true){
X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
Res->Imm.Val = Val;
+ Res->Imm.NeedAsmRewrite = NeedRewrite;
return Res;
}
/// Create an absolute memory operand.
- static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc,
- SMLoc EndLoc, unsigned Size = 0) {
+ static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
+ unsigned Size = 0, bool NeedSizeDir = false){
X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
@@ -459,6 +496,7 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.IndexReg = 0;
Res->Mem.Scale = 1;
Res->Mem.Size = Size;
+ Res->Mem.NeedSizeDir = NeedSizeDir;
return Res;
}
@@ -466,7 +504,7 @@ struct X86Operand : public MCParsedAsmOperand {
static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0) {
+ unsigned Size = 0, bool NeedSizeDir = false) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
@@ -481,6 +519,7 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.IndexReg = IndexReg;
Res->Mem.Scale = Scale;
Res->Mem.Size = Size;
+ Res->Mem.NeedSizeDir = NeedSizeDir;
return Res;
}
};
@@ -510,12 +549,13 @@ bool X86AsmParser::isDstOp(X86Operand &Op) {
bool X86AsmParser::ParseRegister(unsigned &RegNo,
SMLoc &StartLoc, SMLoc &EndLoc) {
RegNo = 0;
- if (!isParsingIntelSyntax()) {
- const AsmToken &TokPercent = Parser.getTok();
- assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
- StartLoc = TokPercent.getLoc();
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix; unprefixed registers can occur in CFI directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
Parser.Lex(); // Eat percent token.
- }
const AsmToken &Tok = Parser.getTok();
if (Tok.isNot(AsmToken::Identifier)) {
@@ -621,23 +661,25 @@ X86Operand *X86AsmParser::ParseOperand() {
/// getIntelMemOperandSize - Return intel memory operand size.
static unsigned getIntelMemOperandSize(StringRef OpStr) {
- unsigned Size = 0;
- if (OpStr == "BYTE") Size = 8;
- if (OpStr == "WORD") Size = 16;
- if (OpStr == "DWORD") Size = 32;
- if (OpStr == "QWORD") Size = 64;
- if (OpStr == "XWORD") Size = 80;
- if (OpStr == "XMMWORD") Size = 128;
- if (OpStr == "YMMWORD") Size = 256;
+ unsigned Size = StringSwitch<unsigned>(OpStr)
+ .Cases("BYTE", "byte", 8)
+ .Cases("WORD", "word", 16)
+ .Cases("DWORD", "dword", 32)
+ .Cases("QWORD", "qword", 64)
+ .Cases("XWORD", "xword", 80)
+ .Cases("XMMWORD", "xmmword", 128)
+ .Cases("YMMWORD", "ymmword", 256)
+ .Default(0);
return Size;
}
-X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
+X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
unsigned Size) {
unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
- SMLoc Start = Parser.getTok().getLoc(), End;
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc Start = Tok.getLoc(), End;
- const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
+ const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
// Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ]
// Eat '['
@@ -653,15 +695,17 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
if (getLexer().isNot(AsmToken::RBrac))
return ErrorOperand(Start, "Expected ']' token!");
Parser.Lex();
+ End = Tok.getLoc();
return X86Operand::CreateMem(Disp, Start, End, Size);
}
} else if (getLexer().is(AsmToken::Integer)) {
- int64_t Val = Parser.getTok().getIntVal();
+ int64_t Val = Tok.getIntVal();
Parser.Lex();
- SMLoc Loc = Parser.getTok().getLoc();
+ SMLoc Loc = Tok.getLoc();
if (getLexer().is(AsmToken::RBrac)) {
// Handle '[' number ']'
Parser.Lex();
+ End = Tok.getLoc();
const MCExpr *Disp = MCConstantExpr::Create(Val, getContext());
if (SegReg)
return X86Operand::CreateMem(SegReg, Disp, 0, 0, Scale,
@@ -670,7 +714,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
} else if (getLexer().is(AsmToken::Star)) {
// Handle '[' Scale*IndexReg ']'
Parser.Lex();
- SMLoc IdxRegLoc = Parser.getTok().getLoc();
+ SMLoc IdxRegLoc = Tok.getLoc();
if (ParseRegister(IndexReg, IdxRegLoc, End))
return ErrorOperand(IdxRegLoc, "Expected register");
Scale = Val;
@@ -678,16 +722,27 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
return ErrorOperand(Loc, "Unexpected token");
}
- if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) {
- bool isPlus = getLexer().is(AsmToken::Plus);
+ // Parse ][ as a plus.
+ bool ExpectRBrac = true;
+ if (getLexer().is(AsmToken::RBrac)) {
+ ExpectRBrac = false;
Parser.Lex();
- SMLoc PlusLoc = Parser.getTok().getLoc();
+ End = Tok.getLoc();
+ }
+
+ if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus) ||
+ getLexer().is(AsmToken::LBrac)) {
+ ExpectRBrac = true;
+ bool isPlus = getLexer().is(AsmToken::Plus) ||
+ getLexer().is(AsmToken::LBrac);
+ Parser.Lex();
+ SMLoc PlusLoc = Tok.getLoc();
if (getLexer().is(AsmToken::Integer)) {
- int64_t Val = Parser.getTok().getIntVal();
+ int64_t Val = Tok.getIntVal();
Parser.Lex();
if (getLexer().is(AsmToken::Star)) {
Parser.Lex();
- SMLoc IdxRegLoc = Parser.getTok().getLoc();
+ SMLoc IdxRegLoc = Tok.getLoc();
if (ParseRegister(IndexReg, IdxRegLoc, End))
return ErrorOperand(IdxRegLoc, "Expected register");
Scale = Val;
@@ -698,21 +753,48 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
return ErrorOperand(PlusLoc, "unexpected token after +");
} else if (getLexer().is(AsmToken::Identifier)) {
// This could be an index register or a displacement expression.
- End = Parser.getTok().getLoc();
+ End = Tok.getLoc();
if (!IndexReg)
ParseRegister(IndexReg, Start, End);
else if (getParser().ParseExpression(Disp, End)) return 0;
}
}
+
+ // Parse ][ as a plus.
+ if (getLexer().is(AsmToken::RBrac)) {
+ ExpectRBrac = false;
+ Parser.Lex();
+ End = Tok.getLoc();
+ if (getLexer().is(AsmToken::LBrac)) {
+ ExpectRBrac = true;
+ Parser.Lex();
+ if (getParser().ParseExpression(Disp, End))
+ return 0;
+ }
+ } else if (ExpectRBrac) {
+ if (getParser().ParseExpression(Disp, End))
+ return 0;
+ }
- if (getLexer().isNot(AsmToken::RBrac))
- if (getParser().ParseExpression(Disp, End)) return 0;
+ if (ExpectRBrac) {
+ if (getLexer().isNot(AsmToken::RBrac))
+ return ErrorOperand(End, "expected ']' token!");
+ Parser.Lex();
+ End = Tok.getLoc();
+ }
- End = Parser.getTok().getLoc();
- if (getLexer().isNot(AsmToken::RBrac))
- return ErrorOperand(End, "expected ']' token!");
- Parser.Lex();
- End = Parser.getTok().getLoc();
+ // Parse the dot operator (e.g., [ebx].foo.bar).
+ if (Tok.getString().startswith(".")) {
+ SmallString<64> Err;
+ const MCExpr *NewDisp;
+ if (ParseIntelDotOperator(Disp, &NewDisp, Err))
+ return ErrorOperand(Tok.getLoc(), Err);
+
+ Parser.Lex(); // Eat the field.
+ Disp = NewDisp;
+ }
+
+ End = Tok.getLoc();
// handle [-42]
if (!BaseReg && !IndexReg)
@@ -723,15 +805,15 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
}
/// ParseIntelMemOperand - Parse intel style memory operand.
-X86Operand *X86AsmParser::ParseIntelMemOperand() {
+X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) {
const AsmToken &Tok = Parser.getTok();
- SMLoc Start = Parser.getTok().getLoc(), End;
- unsigned SegReg = 0;
+ SMLoc End;
unsigned Size = getIntelMemOperandSize(Tok.getString());
if (Size) {
Parser.Lex();
- assert (Tok.getString() == "PTR" && "Unexpected token!");
+ assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") &&
+ "Unexpected token!");
Parser.Lex();
}
@@ -750,12 +832,164 @@ X86Operand *X86AsmParser::ParseIntelMemOperand() {
const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
if (getParser().ParseExpression(Disp, End)) return 0;
- return X86Operand::CreateMem(Disp, Start, End, Size);
+ End = Parser.getTok().getLoc();
+
+ bool NeedSizeDir = false;
+ if (!Size && isParsingInlineAsm()) {
+ if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) {
+ const MCSymbol &Sym = SymRef->getSymbol();
+ // FIXME: The SemaLookup will fail if the name is anything other than an
+ // identifier.
+ // FIXME: Pass a valid SMLoc.
+ SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size);
+ NeedSizeDir = Size > 0;
+ }
+ }
+ if (!isParsingInlineAsm())
+ return X86Operand::CreateMem(Disp, Start, End, Size);
+ else
+ // When parsing inline assembly we set the base register to a non-zero value
+ // as we don't know the actual value at this time. This is necessary to
+ // get the matching correct in some cases.
+ return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0,
+ /*Scale*/1, Start, End, Size, NeedSizeDir);
+}
+
+/// Parse the '.' operator.
+bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
+ const MCExpr **NewDisp,
+ SmallString<64> &Err) {
+ AsmToken Tok = Parser.getTok();
+ uint64_t OrigDispVal, DotDispVal;
+
+ // FIXME: Handle non-constant expressions.
+ if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) {
+ OrigDispVal = OrigDisp->getValue();
+ } else {
+ Err = "Non-constant offsets are not supported!";
+ return true;
+ }
+
+ // Drop the '.'.
+ StringRef DotDispStr = Tok.getString().drop_front(1);
+
+ // .Imm gets lexed as a real.
+ if (Tok.is(AsmToken::Real)) {
+ APInt DotDisp;
+ DotDispStr.getAsInteger(10, DotDisp);
+ DotDispVal = DotDisp.getZExtValue();
+ } else if (Tok.is(AsmToken::Identifier)) {
+ // We should only see an identifier when parsing the original inline asm.
+ // The front-end should rewrite this in terms of immediates.
+ assert (isParsingInlineAsm() && "Unexpected field name!");
+
+ unsigned DotDisp;
+ std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
+ DotDisp)) {
+ Err = "Unable to lookup field reference!";
+ return true;
+ }
+ DotDispVal = DotDisp;
+ } else {
+ Err = "Unexpected token type!";
+ return true;
+ }
+
+ if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
+ SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
+ unsigned Len = DotDispStr.size();
+ unsigned Val = OrigDispVal + DotDispVal;
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_DotOperator, Loc, Len,
+ Val));
+ }
+
+ *NewDisp = MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext());
+ return false;
+}
+
+/// Parse the 'offset' operator. This operator is used to specify the
+/// location rather than the contents of a variable.
+X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
+ SMLoc OffsetOfLoc = Start;
+ Parser.Lex(); // Eat offset.
+ Start = Parser.getTok().getLoc();
+ assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
+
+ SMLoc End;
+ const MCExpr *Val;
+ if (getParser().ParseExpression(Val, End))
+ return ErrorOperand(Start, "Unable to parse expression!");
+
+ End = Parser.getTok().getLoc();
+
+ // Don't emit the offset operator.
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
+
+ // The offset operator will have an 'r' constraint, thus we need to create a
+ // register operand to ensure proper matching. Just pick a GPR based on
+ // the size of a pointer.
+ unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+ return X86Operand::CreateReg(RegNo, Start, End, OffsetOfLoc);
+}
+
+/// Parse the 'TYPE' operator. The TYPE operator returns the size of a C or
+/// C++ type or variable. If the variable is an array, TYPE returns the size of
+/// a single element of the array.
+X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) {
+ SMLoc TypeLoc = Start;
+ Parser.Lex(); // Eat the TYPE operator.
+ Start = Parser.getTok().getLoc();
+ assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier");
+
+ SMLoc End;
+ const MCExpr *Val;
+ if (getParser().ParseExpression(Val, End))
+ return 0;
+
+ End = Parser.getTok().getLoc();
+
+ unsigned Size = 0;
+ if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) {
+ const MCSymbol &Sym = SymRef->getSymbol();
+ // FIXME: The SemaLookup will fail if the name is anything other than an
+ // identifier.
+ // FIXME: Pass a valid SMLoc.
+ if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size))
+ return ErrorOperand(Start, "Unable to lookup TYPE of expr!");
+
+ Size /= 8; // Size is in bits, but we want bytes here.
+ }
+
+ // Rewrite the type operator and the C or C++ type or variable in terms of an
+ // immediate. E.g. TYPE foo -> $$4
+ unsigned Len = End.getPointer() - TypeLoc.getPointer();
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, Size));
+
+ const MCExpr *Imm = MCConstantExpr::Create(Size, getContext());
+ return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false);
}
X86Operand *X86AsmParser::ParseIntelOperand() {
SMLoc Start = Parser.getTok().getLoc(), End;
+ // offset operator.
+ StringRef AsmTokStr = Parser.getTok().getString();
+ if ((AsmTokStr == "offset" || AsmTokStr == "OFFSET") &&
+ isParsingInlineAsm())
+ return ParseIntelOffsetOfOperator(Start);
+
+ // Type directive.
+ if ((AsmTokStr == "type" || AsmTokStr == "TYPE") &&
+ isParsingInlineAsm())
+ return ParseIntelTypeOperator(Start);
+
+ // Unsupported directives.
+ if (isParsingIntelSyntax() &&
+ (AsmTokStr == "size" || AsmTokStr == "SIZE" ||
+ AsmTokStr == "length" || AsmTokStr == "LENGTH"))
+ return ErrorOperand(Start, "Unsupported directive!");
+
// immediate.
if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) ||
getLexer().is(AsmToken::Minus)) {
@@ -769,12 +1003,17 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
// register
unsigned RegNo = 0;
if (!ParseRegister(RegNo, Start, End)) {
- End = Parser.getTok().getLoc();
- return X86Operand::CreateReg(RegNo, Start, End);
+ // If this is a segment register followed by a ':', then this is the start
+ // of a memory reference; otherwise this is a normal register reference.
+ if (getLexer().isNot(AsmToken::Colon))
+ return X86Operand::CreateReg(RegNo, Start, Parser.getTok().getLoc());
+
+ getParser().Lex(); // Eat the colon.
+ return ParseIntelMemOperand(RegNo, Start);
}
// mem operand
- return ParseIntelMemOperand();
+ return ParseIntelMemOperand(0, Start);
}
X86Operand *X86AsmParser::ParseATTOperand() {
@@ -972,8 +1211,9 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
}
bool X86AsmParser::
-ParseInstruction(StringRef Name, SMLoc NameLoc,
+ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+ InstInfo = &Info;
StringRef PatchedName = Name;
// FIXME: Hack to recognize setneb as setne.
@@ -1509,28 +1749,18 @@ processInstruction(MCInst &Inst,
}
bool X86AsmParser::
-MatchAndEmitInstruction(SMLoc IDLoc,
+MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- MCStreamer &Out) {
- SmallVector<MCInst, 2> Insts;
- bool Error = MatchInstruction(IDLoc, Operands, Insts);
- if (!Error)
- for (unsigned i = 0, e = Insts.size(); i != e; ++i)
- Out.EmitInstruction(Insts[i]);
- return Error;
-}
-
-bool X86AsmParser::
-MatchInstruction(SMLoc IDLoc,
- SmallVectorImpl<MCParsedAsmOperand*> &Operands,
- SmallVectorImpl<MCInst> &MCInsts) {
+ MCStreamer &Out, unsigned &ErrorInfo,
+ bool MatchingInlineAsm) {
assert(!Operands.empty() && "Unexpected empty operand list!");
X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+ ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>();
// First, handle aliases that expand to multiple instructions.
// FIXME: This should be replaced with a real .td file alias mechanism.
- // Also, MatchInstructionImpl should do actually *do* the EmitInstruction
+ // Also, MatchInstructionImpl should actually *do* the EmitInstruction
// call.
if (Op->getToken() == "fstsw" || Op->getToken() == "fstcw" ||
Op->getToken() == "fstsww" || Op->getToken() == "fstcww" ||
@@ -1539,7 +1769,8 @@ MatchInstruction(SMLoc IDLoc,
MCInst Inst;
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
- MCInsts.push_back(Inst);
+ if (!MatchingInlineAsm)
+ Out.EmitInstruction(Inst);
const char *Repl =
StringSwitch<const char*>(Op->getToken())
@@ -1558,28 +1789,30 @@ MatchInstruction(SMLoc IDLoc,
}
bool WasOriginallyInvalidOperand = false;
- unsigned OrigErrorInfo;
MCInst Inst;
// First, try a direct match.
- switch (MatchInstructionImpl(Operands, Inst, OrigErrorInfo,
+ switch (MatchInstructionImpl(Operands, Inst,
+ ErrorInfo, MatchingInlineAsm,
isParsingIntelSyntax())) {
default: break;
case Match_Success:
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
// individual transformations can chain off each other.
- while (processInstruction(Inst, Operands))
- ;
+ if (!MatchingInlineAsm)
+ while (processInstruction(Inst, Operands))
+ ;
Inst.setLoc(IDLoc);
- MCInsts.push_back(Inst);
+ if (!MatchingInlineAsm)
+ Out.EmitInstruction(Inst);
+ Opcode = Inst.getOpcode();
return false;
case Match_MissingFeature:
- Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled",
+ EmptyRanges, MatchingInlineAsm);
return true;
- case Match_ConversionFail:
- return Error(IDLoc, "unable to convert operands to instruction");
case Match_InvalidOperand:
WasOriginallyInvalidOperand = true;
break;
@@ -1612,13 +1845,17 @@ MatchInstruction(SMLoc IDLoc,
unsigned ErrorInfoIgnore;
unsigned Match1, Match2, Match3, Match4;
- Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
Tmp[Base.size()] = Suffixes[1];
- Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match2 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
Tmp[Base.size()] = Suffixes[2];
- Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match3 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
Tmp[Base.size()] = Suffixes[3];
- Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore);
+ Match4 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
+ isParsingIntelSyntax());
// Restore the old token.
Op->setTokenValue(Base);
@@ -1631,7 +1868,9 @@ MatchInstruction(SMLoc IDLoc,
(Match3 == Match_Success) + (Match4 == Match_Success);
if (NumSuccessfulMatches == 1) {
Inst.setLoc(IDLoc);
- MCInsts.push_back(Inst);
+ if (!MatchingInlineAsm)
+ Out.EmitInstruction(Inst);
+ Opcode = Inst.getOpcode();
return false;
}
@@ -1658,7 +1897,7 @@ MatchInstruction(SMLoc IDLoc,
OS << "'" << Base << MatchChars[i] << "'";
}
OS << ")";
- Error(IDLoc, OS.str());
+ Error(IDLoc, OS.str(), EmptyRanges, MatchingInlineAsm);
return true;
}
@@ -1669,31 +1908,36 @@ MatchInstruction(SMLoc IDLoc,
if ((Match1 == Match_MnemonicFail) && (Match2 == Match_MnemonicFail) &&
(Match3 == Match_MnemonicFail) && (Match4 == Match_MnemonicFail)) {
if (!WasOriginallyInvalidOperand) {
+ ArrayRef<SMRange> Ranges = MatchingInlineAsm ? EmptyRanges :
+ Op->getLocRange();
return Error(IDLoc, "invalid instruction mnemonic '" + Base + "'",
- Op->getLocRange());
+ Ranges, MatchingInlineAsm);
}
// Recover location info for the operand if we know which was the problem.
- if (OrigErrorInfo != ~0U) {
- if (OrigErrorInfo >= Operands.size())
- return Error(IDLoc, "too few operands for instruction");
+ if (ErrorInfo != ~0U) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction",
+ EmptyRanges, MatchingInlineAsm);
- X86Operand *Operand = (X86Operand*)Operands[OrigErrorInfo];
+ X86Operand *Operand = (X86Operand*)Operands[ErrorInfo];
if (Operand->getStartLoc().isValid()) {
SMRange OperandRange = Operand->getLocRange();
return Error(Operand->getStartLoc(), "invalid operand for instruction",
- OperandRange);
+ OperandRange, MatchingInlineAsm);
}
}
- return Error(IDLoc, "invalid operand for instruction");
+ return Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
}
// If one instruction matched with a missing feature, report this as a
// missing feature.
if ((Match1 == Match_MissingFeature) + (Match2 == Match_MissingFeature) +
(Match3 == Match_MissingFeature) + (Match4 == Match_MissingFeature) == 1){
- Error(IDLoc, "instruction requires a CPU feature not currently enabled");
+ Error(IDLoc, "instruction requires a CPU feature not currently enabled",
+ EmptyRanges, MatchingInlineAsm);
return true;
}
@@ -1701,12 +1945,14 @@ MatchInstruction(SMLoc IDLoc,
// operand failure.
if ((Match1 == Match_InvalidOperand) + (Match2 == Match_InvalidOperand) +
(Match3 == Match_InvalidOperand) + (Match4 == Match_InvalidOperand) == 1){
- Error(IDLoc, "invalid operand for instruction");
+ Error(IDLoc, "invalid operand for instruction", EmptyRanges,
+ MatchingInlineAsm);
return true;
}
// If all of these were an outright failure, report it in a useless way.
- Error(IDLoc, "unknown use of instruction mnemonic without a size suffix");
+ Error(IDLoc, "unknown use of instruction mnemonic without a size suffix",
+ EmptyRanges, MatchingInlineAsm);
return true;
}
@@ -1717,7 +1963,10 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return ParseDirectiveWord(2, DirectiveID.getLoc());
else if (IDVal.startswith(".code"))
return ParseDirectiveCode(IDVal, DirectiveID.getLoc());
- else if (IDVal.startswith(".intel_syntax")) {
+ else if (IDVal.startswith(".att_syntax")) {
+ getParser().setAssemblerDialect(0);
+ return false;
+ } else if (IDVal.startswith(".intel_syntax")) {
getParser().setAssemblerDialect(1);
if (getLexer().isNot(AsmToken::EndOfStatement)) {
if(Parser.getTok().getString() == "noprefix") {
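[Note, not part of the patch: for reference, the kind of MS-style inline assembly the new Intel operators ('offset', 'TYPE', and the dot operator) are meant to accept looks roughly like the following. This is purely illustrative; the names S, s, n are made up, ebx is assumed to hold the address of s, and a front end is assumed to route the asm blob through this parser.]

    struct S { int a; int b; };              // sizeof(S) == 8 on typical targets
    void sketch() {
      S s; int n;
      __asm {
        mov eax, offset n                    // 'offset': the address of n, not its value
        mov ecx, TYPE s                      // 'TYPE': size of one element, here 8
        mov edx, [ebx].S.b                   // dot operator: field offset folded into the displacement
      }
    }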
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index b886d46501b4..f4d03a602cf5 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -17,7 +17,6 @@ set(sources
X86AsmPrinter.cpp
X86COFFMachineModuleInfo.cpp
X86CodeEmitter.cpp
- X86ELFWriterInfo.cpp
X86FastISel.cpp
X86FloatingPoint.cpp
X86FrameLowering.cpp
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 5039887e1a2e..f13692739a17 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -44,7 +44,7 @@ void x86DisassemblerDebug(const char *file,
dbgs() << file << ":" << line << ": " << s;
}
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii) {
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
return MII->getName(Opcode);
}
@@ -95,8 +95,8 @@ const EDInstInfo *X86GenericDisassembler::getEDInfo() const {
/// be a pointer to a MemoryObject.
/// @param byte - A pointer to the byte to be read.
/// @param address - The address to be read.
-static int regionReader(void* arg, uint8_t* byte, uint64_t address) {
- MemoryObject* region = static_cast<MemoryObject*>(arg);
+static int regionReader(const void* arg, uint8_t* byte, uint64_t address) {
+ const MemoryObject* region = static_cast<const MemoryObject*>(arg);
return region->readByte(address, byte);
}
@@ -135,10 +135,10 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
int ret = decodeInstruction(&internalInstr,
regionReader,
- (void*)&region,
+ (const void*)&region,
loggerFn,
(void*)&vStream,
- (void*)MII,
+ (const void*)MII,
address,
fMode);
@@ -379,6 +379,8 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
}
switch (type) {
+ case TYPE_XMM32:
+ case TYPE_XMM64:
case TYPE_XMM128:
mcInst.addOperand(MCOperand::CreateReg(X86::XMM0 + (immediate >> 4)));
return;
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 0dbfa260014b..981701f52764 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -78,7 +78,7 @@
uint16_t operands;
#define INSTRUCTION_IDS \
- unsigned instructionIDs;
+ uint16_t instructionIDs;
#include "X86DisassemblerDecoderCommon.h"
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index 0c929122aeee..85d8a991dd6e 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -138,6 +138,10 @@ static InstrUID decode(OpcodeType type,
if (modFromModRM(modRM) == 0x3)
return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+ case MODRM_SPLITMISC:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
case MODRM_FULL:
return modRMTable[dec->instructionIDs+modRM];
}
@@ -200,7 +204,7 @@ static void unconsumeByte(struct InternalInstruction* insn) {
insn->readerCursor + offset); \
if (ret) \
return ret; \
- combined = combined | ((type)byte << ((type)offset * 8)); \
+ combined = combined | ((uint64_t)byte << (offset * 8)); \
} \
*ptr = combined; \
insn->readerCursor += sizeof(type); \
@@ -690,7 +694,7 @@ static int getIDWithAttrMask(uint16_t* instructionID,
* @param orig - The instruction that is not 16-bit
* @param equiv - The instruction that is 16-bit
*/
-static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
+static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
off_t i;
for (i = 0;; i++) {
@@ -719,7 +723,7 @@ static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
* @return - 0 if the ModR/M could be read when needed or was not needed;
* nonzero otherwise.
*/
-static int getID(struct InternalInstruction* insn, void *miiArg) {
+static int getID(struct InternalInstruction* insn, const void *miiArg) {
uint8_t attrMask;
uint16_t instructionID;
@@ -856,7 +860,7 @@ static int getID(struct InternalInstruction* insn, void *miiArg) {
specWithOpSizeName =
x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
- if (is16BitEquvalent(specName, specWithOpSizeName)) {
+ if (is16BitEquivalent(specName, specWithOpSizeName)) {
insn->instructionID = instructionIDWithOpsize;
insn->spec = specifierForUID(instructionIDWithOpsize);
} else {
@@ -1621,10 +1625,10 @@ static int readOperands(struct InternalInstruction* insn) {
*/
int decodeInstruction(struct InternalInstruction* insn,
byteReader_t reader,
- void* readerArg,
+ const void* readerArg,
dlog_t logger,
void* loggerArg,
- void* miiArg,
+ const void* miiArg,
uint64_t startLoc,
DisassemblerMode mode) {
memset(insn, 0, sizeof(struct InternalInstruction));
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 797703f80335..407ead3cafa9 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -24,7 +24,7 @@ extern "C" {
uint16_t operands;
#define INSTRUCTION_IDS \
- unsigned instructionIDs;
+ uint16_t instructionIDs;
#include "X86DisassemblerDecoderCommon.h"
@@ -403,7 +403,7 @@ typedef uint8_t BOOL;
* be read from.
* @return - -1 if the byte cannot be read for any reason; 0 otherwise.
*/
-typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address);
+typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
/*
* dlog_t - Type for the logging function that the consumer can provide to
@@ -422,7 +422,7 @@ struct InternalInstruction {
/* Reader interface (C) */
byteReader_t reader;
/* Opaque value passed to the reader */
- void* readerArg;
+ const void* readerArg;
/* The address of the next byte to read via the reader */
uint64_t readerCursor;
@@ -561,10 +561,10 @@ struct InternalInstruction {
*/
int decodeInstruction(struct InternalInstruction* insn,
byteReader_t reader,
- void* readerArg,
+ const void* readerArg,
dlog_t logger,
void* loggerArg,
- void* miiArg,
+ const void* miiArg,
uint64_t startLoc,
DisassemblerMode mode);
@@ -579,7 +579,7 @@ void x86DisassemblerDebug(const char *file,
unsigned line,
const char *s);
-const char *x86DisassemblerGetInstrName(unsigned Opcode, void *mii);
+const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
#ifdef __cplusplus
}
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index b0a0e1e78ef7..23dfe4b5b5f4 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -160,6 +160,10 @@ typedef uint16_t InstrUID;
* MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
* corresponds to one instruction; otherwise, it corresponds to
* a different instruction.
+ * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+ * divided by 8 is used to select instruction; otherwise, each
+ * value of the ModR/M byte could correspond to a different
+ * instruction.
* MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
corresponds to instructions that use reg field as opcode
* MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
@@ -169,6 +173,7 @@ typedef uint16_t InstrUID;
#define MODRMTYPES \
ENUM_ENTRY(MODRM_ONEENTRY) \
ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_SPLITMISC) \
ENUM_ENTRY(MODRM_SPLITREG) \
ENUM_ENTRY(MODRM_FULL)
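
Illustrative sketch (not part of the patch): how a MODRM_SPLITMISC table index could be computed, mirroring the decode() case added above. splitMiscIndex is a hypothetical helper, and the 8-entry-plus-64-entry table layout is an assumption read off that decoder code.

    #include <cstdint>

    // Hypothetical helper mirroring the MODRM_SPLITMISC case in decode():
    // memory forms (mod != 0b11) index entries 0-7 by the reg field, while
    // register forms (mod == 0b11) index entries 8-71 by the low six bits.
    static unsigned splitMiscIndex(uint8_t modRM) {
      const uint8_t mod = modRM >> 6;     // top two bits of ModR/M
      if (mod == 0x3)
        return (modRM & 0x3f) + 8;        // register form
      return (modRM & 0x38) >> 3;         // memory form: reg field only
    }
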
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 5118e4cad4e2..a4bd1147bc51 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "asm-printer"
#include "X86ATTInstPrinter.h"
#include "X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -33,11 +34,19 @@ using namespace llvm;
void X86ATTInstPrinter::printRegName(raw_ostream &OS,
unsigned RegNo) const {
- OS << '%' << getRegisterName(RegNo);
+ OS << markup("<reg:")
+ << '%' << getRegisterName(RegNo)
+ << markup(">");
}
void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::LOCK)
+ OS << "\tlock\n";
+
// Try to print any aliases first.
if (!printAliasInstr(MI, OS))
printInstruction(MI, OS);
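
Not part of the patch, just a sketch of the operand markup this change introduces for registers and immediates via markup(); markupWrap is a hypothetical stand-in, and the tag spelling is taken from the calls above.

    #include <string>

    // Hypothetical stand-in for the instruction-printer markup: when markup
    // output is enabled, operand text is wrapped in tags such as <reg:%rax>
    // or <imm:$1>; otherwise the plain operand text is emitted unchanged.
    static std::string markupWrap(const std::string &Tag,
                                  const std::string &Text, bool Enabled) {
      return Enabled ? "<" + Tag + ":" + Text + ">" : Text;
    }

    // Example: markupWrap("reg", "%rax", true) yields "<reg:%rax>".
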
@@ -52,7 +61,8 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- switch (MI->getOperand(Op).getImm()) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+ switch (Imm) {
default: llvm_unreachable("Invalid ssecc argument!");
case 0: O << "eq"; break;
case 1: O << "lt"; break;
@@ -70,6 +80,30 @@ void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
case 0xd: O << "ge"; break;
case 0xe: O << "gt"; break;
case 0xf: O << "true"; break;
+ }
+}
+
+void X86ATTInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+ switch (Imm) {
+ default: llvm_unreachable("Invalid avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
case 0x10: O << "eq_os"; break;
case 0x11: O << "lt_oq"; break;
case 0x12: O << "le_oq"; break;
@@ -89,12 +123,12 @@ void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
}
}
-/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value (e.g. for jumps and calls). These
/// print slightly differently than normal immediates. For example, a $ is not
/// emitted.
-void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+void X86ATTInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm())
O << Op.getImm();
@@ -119,17 +153,21 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
- O << '%' << getRegisterName(Op.getReg());
+ printRegName(O, Op.getReg());
} else if (Op.isImm()) {
// Print X86 immediates as signed values.
- O << '$' << (int64_t)Op.getImm();
+ O << markup("<imm:")
+ << '$' << (int64_t)Op.getImm()
+ << markup(">");
if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
*CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm());
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
- O << '$' << *Op.getExpr();
+ O << markup("<imm:")
+ << '$' << *Op.getExpr()
+ << markup(">");
}
}
@@ -140,6 +178,8 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
const MCOperand &DispSpec = MI->getOperand(Op+3);
const MCOperand &SegReg = MI->getOperand(Op+4);
+ O << markup("<mem:");
+
// If this has a segment register, print it.
if (SegReg.getReg()) {
printOperand(MI, Op+4, O);
@@ -164,9 +204,15 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
O << ',';
printOperand(MI, Op+2, O);
unsigned ScaleVal = MI->getOperand(Op+1).getImm();
- if (ScaleVal != 1)
- O << ',' << ScaleVal;
+ if (ScaleVal != 1) {
+ O << ','
+ << markup("<imm:")
+ << ScaleVal
+ << markup(">");
+ }
}
O << ')';
}
+
+ O << markup(">");
}
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index 2e00bff1738e..8e09183dccc9 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -40,7 +40,8 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &OS);
- void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
+ void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &OS);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 4ea662cbe0c1..d67aec7f10ef 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- X86IntelInstPrinter.cpp - AT&T assembly instruction printing ------===//
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file includes code for rendering MCInst instances as AT&T-style
+// This file includes code for rendering MCInst instances as Intel-style
// assembly.
//
//===----------------------------------------------------------------------===//
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "asm-printer"
#include "X86IntelInstPrinter.h"
#include "X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCExpr.h"
@@ -32,6 +33,12 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
StringRef Annot) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::LOCK)
+ OS << "\tlock\n";
+
printInstruction(MI, OS);
// Next always print the annotation.
@@ -44,7 +51,8 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
raw_ostream &O) {
- switch (MI->getOperand(Op).getImm()) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0xf;
+ switch (Imm) {
default: llvm_unreachable("Invalid ssecc argument!");
case 0: O << "eq"; break;
case 1: O << "lt"; break;
@@ -62,6 +70,30 @@ void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
case 0xd: O << "ge"; break;
case 0xe: O << "gt"; break;
case 0xf: O << "true"; break;
+ }
+}
+
+void X86IntelInstPrinter::printAVXCC(const MCInst *MI, unsigned Op,
+ raw_ostream &O) {
+ int64_t Imm = MI->getOperand(Op).getImm() & 0x1f;
+ switch (Imm) {
+ default: llvm_unreachable("Invalid avxcc argument!");
+ case 0: O << "eq"; break;
+ case 1: O << "lt"; break;
+ case 2: O << "le"; break;
+ case 3: O << "unord"; break;
+ case 4: O << "neq"; break;
+ case 5: O << "nlt"; break;
+ case 6: O << "nle"; break;
+ case 7: O << "ord"; break;
+ case 8: O << "eq_uq"; break;
+ case 9: O << "nge"; break;
+ case 0xa: O << "ngt"; break;
+ case 0xb: O << "false"; break;
+ case 0xc: O << "neq_oq"; break;
+ case 0xd: O << "ge"; break;
+ case 0xe: O << "gt"; break;
+ case 0xf: O << "true"; break;
case 0x10: O << "eq_os"; break;
case 0x11: O << "lt_oq"; break;
case 0x12: O << "le_oq"; break;
@@ -78,14 +110,13 @@ void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op,
case 0x1d: O << "ge_oq"; break;
case 0x1e: O << "gt_oq"; break;
case 0x1f: O << "true_us"; break;
-
}
}
-/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value.
-void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+void X86IntelInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm())
O << Op.getImm();
@@ -153,8 +184,7 @@ void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op,
printOperand(MI, Op+2, O);
NeedPlus = true;
}
-
-
+
if (!DispSpec.isImm()) {
if (NeedPlus) O << " + ";
assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 4f5938daf4cd..bb769eb52e4f 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This class prints an X86 MCInst to intel style .s file syntax.
+// This class prints an X86 MCInst to Intel style .s file syntax.
//
//===----------------------------------------------------------------------===//
@@ -37,7 +37,8 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemReference(const MCInst *MI, unsigned Op, raw_ostream &O);
void printSSECC(const MCInst *MI, unsigned Op, raw_ostream &O);
- void print_pcrel_imm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printAVXCC(const MCInst *MI, unsigned Op, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "OPAQUE PTR ";
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 32e40febd26a..467edadc7e09 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -66,9 +66,10 @@ public:
};
class X86AsmBackend : public MCAsmBackend {
+ StringRef CPU;
public:
- X86AsmBackend(const Target &T)
- : MCAsmBackend() {}
+ X86AsmBackend(const Target &T, StringRef _CPU)
+ : MCAsmBackend(), CPU(_CPU) {}
unsigned getNumFixupKinds() const {
return X86::NumTargetFixupKinds;
@@ -278,9 +279,9 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
Res.setOpcode(RelaxedOp);
}
-/// writeNopData - Write optimal nops to the output file for the \arg Count
+/// writeNopData - Write optimal nops to the output file for the \p Count
/// bytes. This returns the number of bytes written. It may return 0 if
-/// the \arg Count is more than the maximum optimal nops.
+/// the \p Count is more than the maximum optimal nops.
bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
static const uint8_t Nops[10][10] = {
// nop
@@ -305,6 +306,15 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
{0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};
+ // This CPU doesn't support long nops. If needed, add more.
+ // FIXME: Can we get this from the subtarget somehow?
+ if (CPU == "generic" || CPU == "i386" || CPU == "i486" || CPU == "i586" ||
+ CPU == "pentium" || CPU == "pentium-mmx" || CPU == "geode") {
+ for (uint64_t i = 0; i < Count; ++i)
+ OW->Write8(0x90);
+ return true;
+ }
+
// Write an optimal sequence for the first 15 bytes.
const uint64_t OptimalCount = (Count < 16) ? Count : 15;
const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
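
For illustration only (not in the patch): the fallback described by the comment above pads with single-byte 0x90 nops on CPUs that lack long-nop support. writeShortNops and the byte vector are hypothetical stand-ins for the MCObjectWriter interface.

    #include <cstdint>
    #include <vector>

    // Hypothetical sketch of the short-nop fallback: emit Count one-byte
    // nops (0x90) instead of the optimal multi-byte nop sequences.
    static void writeShortNops(std::vector<uint8_t> &Out, uint64_t Count) {
      for (uint64_t i = 0; i < Count; ++i)
        Out.push_back(0x90); // single-byte nop
    }
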
@@ -327,8 +337,8 @@ namespace {
class ELFX86AsmBackend : public X86AsmBackend {
public:
uint8_t OSABI;
- ELFX86AsmBackend(const Target &T, uint8_t _OSABI)
- : X86AsmBackend(T), OSABI(_OSABI) {
+ ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU)
+ : X86AsmBackend(T, CPU), OSABI(_OSABI) {
HasReliableSymbolDifference = true;
}
@@ -340,21 +350,21 @@ public:
class ELFX86_32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_32AsmBackend(const Target &T, uint8_t OSABI)
- : ELFX86AsmBackend(T, OSABI) {}
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createX86ELFObjectWriter(OS, /*Is64Bit*/ false, OSABI);
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
}
};
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_64AsmBackend(const Target &T, uint8_t OSABI)
- : ELFX86AsmBackend(T, OSABI) {}
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
+ : ELFX86AsmBackend(T, OSABI, CPU) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
- return createX86ELFObjectWriter(OS, /*Is64Bit*/ true, OSABI);
+ return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
}
};
@@ -362,8 +372,8 @@ class WindowsX86AsmBackend : public X86AsmBackend {
bool Is64Bit;
public:
- WindowsX86AsmBackend(const Target &T, bool is64Bit)
- : X86AsmBackend(T)
+ WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+ : X86AsmBackend(T, CPU)
, Is64Bit(is64Bit) {
}
@@ -374,14 +384,14 @@ public:
class DarwinX86AsmBackend : public X86AsmBackend {
public:
- DarwinX86AsmBackend(const Target &T)
- : X86AsmBackend(T) { }
+ DarwinX86AsmBackend(const Target &T, StringRef CPU)
+ : X86AsmBackend(T, CPU) { }
};
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
public:
- DarwinX86_32AsmBackend(const Target &T)
- : DarwinX86AsmBackend(T) {}
+ DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
+ : DarwinX86AsmBackend(T, CPU) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
@@ -392,8 +402,8 @@ public:
class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
public:
- DarwinX86_64AsmBackend(const Target &T)
- : DarwinX86AsmBackend(T) {
+ DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
+ : DarwinX86AsmBackend(T, CPU) {
HasReliableSymbolDifference = true;
}
@@ -439,28 +449,28 @@ public:
} // end anonymous namespace
-MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return new DarwinX86_32AsmBackend(T);
+ return new DarwinX86_32AsmBackend(T, CPU);
- if (TheTriple.isOSWindows())
- return new WindowsX86AsmBackend(T, false);
+ if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ return new WindowsX86AsmBackend(T, false, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ELFX86_32AsmBackend(T, OSABI);
+ return new ELFX86_32AsmBackend(T, OSABI, CPU);
}
-MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return new DarwinX86_64AsmBackend(T);
+ return new DarwinX86_64AsmBackend(T, CPU);
- if (TheTriple.isOSWindows())
- return new WindowsX86AsmBackend(T, true);
+ if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
+ return new WindowsX86AsmBackend(T, true, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ELFX86_64AsmBackend(T, OSABI);
+ return new ELFX86_64AsmBackend(T, OSABI, CPU);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index db597fbfca9f..7ea1961dec90 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -276,9 +276,9 @@ namespace X86II {
MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40,
MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46,
- MRM_D4 = 47, MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50,
- MRM_DB = 51, MRM_DC = 52, MRM_DD = 53, MRM_DE = 54,
- MRM_DF = 55,
+ MRM_D4 = 47, MRM_D5 = 48, MRM_D8 = 49, MRM_D9 = 50,
+ MRM_DA = 51, MRM_DB = 52, MRM_DC = 53, MRM_DD = 54,
+ MRM_DE = 55, MRM_DF = 56,
/// RawFrmImm8 - This is used for the ENTER instruction, which has two
/// immediates, the first of which is a 16-bit immediate (specified by
@@ -580,11 +580,11 @@ namespace X86II {
case X86II::MRM_E8: case X86II::MRM_F0:
case X86II::MRM_F8: case X86II::MRM_F9:
case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D8:
- case X86II::MRM_D9: case X86II::MRM_DA:
- case X86II::MRM_DB: case X86II::MRM_DC:
- case X86II::MRM_DD: case X86II::MRM_DE:
- case X86II::MRM_DF:
+ case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF:
return -1;
}
}
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 5a42a801825d..de80dd835e99 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -20,7 +20,7 @@ using namespace llvm;
namespace {
class X86ELFObjectWriter : public MCELFObjectTargetWriter {
public:
- X86ELFObjectWriter(bool is64Bit, uint8_t OSABI);
+ X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
virtual ~X86ELFObjectWriter();
protected:
@@ -30,10 +30,11 @@ namespace {
};
}
-X86ELFObjectWriter::X86ELFObjectWriter(bool Is64Bit, uint8_t OSABI)
- : MCELFObjectTargetWriter(Is64Bit, OSABI,
- Is64Bit ? ELF::EM_X86_64 : ELF::EM_386,
- /*HasRelocationAddend*/ Is64Bit) {}
+X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
+ uint16_t EMachine)
+ : MCELFObjectTargetWriter(IsELF64, OSABI, EMachine,
+ // Only i386 uses Rel instead of RelA.
+ /*HasRelocationAddend*/ EMachine != ELF::EM_386) {}
X86ELFObjectWriter::~X86ELFObjectWriter()
{}
@@ -48,7 +49,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
unsigned Type;
- if (is64Bit()) {
+ if (getEMachine() == ELF::EM_X86_64) {
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
default: llvm_unreachable("invalid fixup kind!");
@@ -130,7 +131,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
case FK_Data_1: Type = ELF::R_X86_64_8; break;
}
}
- } else {
+ } else if (getEMachine() == ELF::EM_386) {
if (IsPCRel) {
switch ((unsigned)Fixup.getKind()) {
default: llvm_unreachable("invalid fixup kind!");
@@ -210,15 +211,17 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
case FK_Data_1: Type = ELF::R_386_8; break;
}
}
- }
+ } else
+ llvm_unreachable("Unsupported ELF machine type.");
return Type;
}
MCObjectWriter *llvm::createX86ELFObjectWriter(raw_ostream &OS,
- bool Is64Bit,
- uint8_t OSABI) {
+ bool IsELF64,
+ uint8_t OSABI,
+ uint16_t EMachine) {
MCELFObjectTargetWriter *MOTW =
- new X86ELFObjectWriter(Is64Bit, OSABI);
+ new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index b0acd7d5a101..16488eb7ae7e 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -34,6 +34,10 @@ AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
clEnumValEnd));
+static cl::opt<bool>
+MarkedJTDataRegions("mark-data-regions", cl::init(false),
+ cl::desc("Mark code section jump table data regions."),
+ cl::Hidden);
void X86MCAsmInfoDarwin::anchor() { }
@@ -59,6 +63,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
SupportsDebugInformation = true;
DwarfUsesInlineInfoSection = true;
+ UseDataRegionDirectives = MarkedJTDataRegions;
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 4a38324d08e1..122204ae75c8 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -16,6 +16,7 @@
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86FixupKinds.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -28,8 +29,8 @@ using namespace llvm;
namespace {
class X86MCCodeEmitter : public MCCodeEmitter {
- X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+ X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
const MCSubtargetInfo &STI;
MCContext &Ctx;
@@ -51,8 +52,8 @@ public:
return (STI.getFeatureBits() & X86::Mode64Bit) == 0;
}
- static unsigned GetX86RegNum(const MCOperand &MO) {
- return X86_MC::getX86RegNum(MO.getReg());
+ unsigned GetX86RegNum(const MCOperand &MO) const {
+ return Ctx.getRegisterInfo().getEncodingValue(MO.getReg()) & 0x7;
}
// On regular x86, both XMM0-XMM7 and XMM8-XMM15 are encoded in the range
@@ -64,8 +65,8 @@ public:
// VEX.VVVV => XMM9 => ~9
//
// See table 4-35 of Intel AVX Programming Reference for details.
- static unsigned char getVEXRegisterEncoding(const MCInst &MI,
- unsigned OpNum) {
+ unsigned char getVEXRegisterEncoding(const MCInst &MI,
+ unsigned OpNum) const {
unsigned SrcReg = MI.getOperand(OpNum).getReg();
unsigned SrcRegNum = GetX86RegNum(MI.getOperand(OpNum));
if (X86II::isX86_64ExtendedReg(SrcReg))
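
Illustrative only (not part of the patch): as the comment above notes, the VEX.vvvv field carries the one's complement of the source register number, so XMM9 encodes as ~9. vexVVVV is a hypothetical helper over a 0-15 register number.

    #include <cstdint>

    // Hypothetical helper: VEX.vvvv is the inverted 4-bit register number,
    // so XMM9 (register number 9) encodes as (~9) & 0xf == 0x6.
    static uint8_t vexVVVV(unsigned RegNum) {
      return static_cast<uint8_t>(~RegNum & 0xf);
    }
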
@@ -560,15 +561,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
}
- // Set the vector length to 256-bit if YMM0-YMM15 is used
- for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
- if (!MI.getOperand(i).isReg())
- continue;
- unsigned SrcReg = MI.getOperand(i).getReg();
- if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
- VEX_L = 1;
- }
-
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = 0;
@@ -1129,13 +1121,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_C3: case X86II::MRM_C4:
case X86II::MRM_C8: case X86II::MRM_C9:
case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D8:
- case X86II::MRM_D9: case X86II::MRM_DA:
- case X86II::MRM_DB: case X86II::MRM_DC:
- case X86II::MRM_DD: case X86II::MRM_DE:
- case X86II::MRM_DF: case X86II::MRM_E8:
- case X86II::MRM_F0: case X86II::MRM_F8:
- case X86II::MRM_F9:
+ case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF:
+ case X86II::MRM_E8: case X86II::MRM_F0:
+ case X86II::MRM_F8: case X86II::MRM_F9:
EmitByte(BaseOpcode, CurByte, OS);
unsigned char MRM;
@@ -1150,6 +1142,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_D0: MRM = 0xD0; break;
case X86II::MRM_D1: MRM = 0xD1; break;
case X86II::MRM_D4: MRM = 0xD4; break;
+ case X86II::MRM_D5: MRM = 0xD5; break;
case X86II::MRM_D8: MRM = 0xD8; break;
case X86II::MRM_D9: MRM = 0xD9; break;
case X86II::MRM_DA: MRM = 0xDA; break;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 348236316c89..287c9f137a58 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -209,117 +209,10 @@ unsigned X86_MC::getDwarfRegFlavour(StringRef TT, bool isEH) {
return DWARFFlavour::X86_32_Generic;
}
-/// getX86RegNum - This function maps LLVM register identifiers to their X86
-/// specific numbering, which is used in various places encoding instructions.
-unsigned X86_MC::getX86RegNum(unsigned RegNo) {
- switch(RegNo) {
- case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
- case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
- case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
- case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
- case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
- return N86::ESP;
- case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
- return N86::EBP;
- case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
- return N86::ESI;
- case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
- return N86::EDI;
-
- case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
- return N86::EAX;
- case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
- return N86::ECX;
- case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
- return N86::EDX;
- case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
- return N86::EBX;
- case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
- return N86::ESP;
- case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
- return N86::EBP;
- case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
- return N86::ESI;
- case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
- return N86::EDI;
-
- case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
- case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
- return RegNo-X86::ST0;
-
- case X86::XMM0: case X86::XMM8:
- case X86::YMM0: case X86::YMM8: case X86::MM0:
- return 0;
- case X86::XMM1: case X86::XMM9:
- case X86::YMM1: case X86::YMM9: case X86::MM1:
- return 1;
- case X86::XMM2: case X86::XMM10:
- case X86::YMM2: case X86::YMM10: case X86::MM2:
- return 2;
- case X86::XMM3: case X86::XMM11:
- case X86::YMM3: case X86::YMM11: case X86::MM3:
- return 3;
- case X86::XMM4: case X86::XMM12:
- case X86::YMM4: case X86::YMM12: case X86::MM4:
- return 4;
- case X86::XMM5: case X86::XMM13:
- case X86::YMM5: case X86::YMM13: case X86::MM5:
- return 5;
- case X86::XMM6: case X86::XMM14:
- case X86::YMM6: case X86::YMM14: case X86::MM6:
- return 6;
- case X86::XMM7: case X86::XMM15:
- case X86::YMM7: case X86::YMM15: case X86::MM7:
- return 7;
-
- case X86::ES: return 0;
- case X86::CS: return 1;
- case X86::SS: return 2;
- case X86::DS: return 3;
- case X86::FS: return 4;
- case X86::GS: return 5;
-
- case X86::CR0: case X86::CR8 : case X86::DR0: return 0;
- case X86::CR1: case X86::CR9 : case X86::DR1: return 1;
- case X86::CR2: case X86::CR10: case X86::DR2: return 2;
- case X86::CR3: case X86::CR11: case X86::DR3: return 3;
- case X86::CR4: case X86::CR12: case X86::DR4: return 4;
- case X86::CR5: case X86::CR13: case X86::DR5: return 5;
- case X86::CR6: case X86::CR14: case X86::DR6: return 6;
- case X86::CR7: case X86::CR15: case X86::DR7: return 7;
-
- // Pseudo index registers are equivalent to a "none"
- // scaled index (See Intel Manual 2A, table 2-3)
- case X86::EIZ:
- case X86::RIZ:
- return 4;
-
- default:
- assert((int(RegNo) > 0) && "Unknown physical register!");
- return 0;
- }
-}
-
void X86_MC::InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI) {
// FIXME: TableGen these.
for (unsigned Reg = X86::NoRegister+1; Reg < X86::NUM_TARGET_REGS; ++Reg) {
- int SEH = X86_MC::getX86RegNum(Reg);
- switch (Reg) {
- case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
- case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
- case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
- case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
- case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
- case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
- case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
- case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
- case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
- case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
- case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
- case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
- SEH += 8;
- break;
- }
+ unsigned SEH = MRI->getEncodingValue(Reg);
MRI->mapLLVMRegToSEHReg(Reg, SEH);
}
}
@@ -379,11 +272,15 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
MAI = new X86_64MCAsmInfoDarwin(TheTriple);
else
MAI = new X86MCAsmInfoDarwin(TheTriple);
+ } else if (TheTriple.getEnvironment() == Triple::ELF) {
+ // Force the use of an ELF container.
+ MAI = new X86ELFMCAsmInfo(TheTriple);
} else if (TheTriple.getOS() == Triple::Win32) {
MAI = new X86MCAsmInfoMicrosoft(TheTriple);
} else if (TheTriple.getOS() == Triple::MinGW32 || TheTriple.getOS() == Triple::Cygwin) {
MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
} else {
+ // The default is ELF.
MAI = new X86ELFMCAsmInfo(TheTriple);
}
@@ -465,7 +362,7 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
- if (TheTriple.isOSWindows())
+ if (TheTriple.isOSWindows() && TheTriple.getEnvironment() != Triple::ELF)
return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 46500699ebee..981aa1a2b911 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -64,8 +64,6 @@ namespace X86_MC {
unsigned getDwarfRegFlavour(StringRef TT, bool isEH);
- unsigned getX86RegNum(unsigned RegNo);
-
void InitLLVM2SEHRegisterMapping(MCRegisterInfo *MRI);
/// createX86MCSubtargetInfo - Create a X86 MCSubtargetInfo instance.
@@ -80,8 +78,8 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
@@ -91,8 +89,9 @@ MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
/// createX86ELFObjectWriter - Construct an X86 ELF object writer.
MCObjectWriter *createX86ELFObjectWriter(raw_ostream &OS,
- bool Is64Bit,
- uint8_t OSABI);
+ bool IsELF64,
+ uint8_t OSABI,
+ uint16_t EMachine);
/// createX86WinCOFFObjectWriter - Construct an X86 Win COFF object writer.
MCObjectWriter *createX86WinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
} // End llvm namespace
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index f0f1982d57f2..7ff058edbc23 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -11,11 +11,13 @@
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCValue.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
#include "llvm/Object/MachOFormat.h"
using namespace llvm;
@@ -23,7 +25,7 @@ using namespace llvm::object;
namespace {
class X86MachObjectWriter : public MCMachObjectTargetWriter {
- void RecordScatteredRelocation(MachObjectWriter *Writer,
+ bool RecordScatteredRelocation(MachObjectWriter *Writer,
const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
@@ -335,7 +337,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
Writer->addRelocation(Fragment->getParent(), MRE);
}
-void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
+bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
const MCAssembler &Asm,
const MCAsmLayout &Layout,
const MCFragment *Fragment,
@@ -381,6 +383,19 @@ void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
// Relocations are written out in reverse order, so the PAIR comes first.
if (Type == macho::RIT_Difference ||
Type == macho::RIT_Generic_LocalDifference) {
+ // If the offset is too large to fit in a scattered relocation,
+ // we're hosed. It's an unfortunate limitation of the MachO format.
+ if (FixupOffset > 0xffffff) {
+ char Buffer[32];
+ format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
+ Asm.getContext().FatalError(Fixup.getLoc(),
+ Twine("Section too large, can't encode "
+ "r_address (") + Buffer +
+ ") into 24 bits of scattered "
+ "relocation entry.");
+ llvm_unreachable("fatal error returned?!");
+ }
+
macho::RelocationEntry MRE;
MRE.Word0 = ((0 << 0) |
(macho::RIT_Pair << 24) |
@@ -389,6 +404,16 @@ void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
macho::RF_Scattered);
MRE.Word1 = Value2;
Writer->addRelocation(Fragment->getParent(), MRE);
+ } else {
+ // If the offset is more than 24 bits, it won't fit in a scattered
+ // relocation offset field, so we fall back to using a non-scattered
+ // relocation. This is a bit risky, as if the offset reaches out of
+ // the block and the linker is doing scattered loading on this
+ // symbol, things can go badly.
+ //
+ // Required for 'as' compatibility.
+ if (FixupOffset > 0xffffff)
+ return false;
}
macho::RelocationEntry MRE;
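
Sketch for context (not in the patch): the Mach-O scattered relocation r_address field is 24 bits wide, which is why the code above either reports a fatal error or falls back to a non-scattered relocation for larger offsets. fitsScatteredOffset is a hypothetical helper.

    #include <cstdint>

    // Hypothetical check mirroring the limit used above: offsets above
    // 0xffffff cannot be encoded in the 24-bit r_address field of a
    // scattered relocation entry.
    static bool fitsScatteredOffset(uint64_t FixupOffset) {
      return FixupOffset <= 0xffffffu;
    }
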
@@ -399,6 +424,7 @@ void X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
macho::RF_Scattered);
MRE.Word1 = Value;
Writer->addRelocation(Fragment->getParent(), MRE);
+ return true;
}
void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
@@ -469,9 +495,11 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
// If this is a difference or a defined symbol plus an offset, then we need a
// scattered relocation entry. Differences always require scattered
// relocations.
- if (Target.getSymB())
- return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
- Target, Log2Size, FixedValue);
+ if (Target.getSymB()) {
+ RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue);
+ return;
+ }
// Get the symbol data, if any.
MCSymbolData *SD = 0;
@@ -483,9 +511,13 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
uint32_t Offset = Target.getConstant();
if (IsPCRel)
Offset += 1 << Log2Size;
- if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD))
- return RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
- Target, Log2Size, FixedValue);
+ // Try to record the scattered relocation if needed. Fall back to non
+ // scattered if necessary (see comments in RecordScatteredRelocation()
+ // for details).
+ if (Offset && SD && !Writer->doesSymbolRequireExternRelocation(SD) &&
+ RecordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup,
+ Target, Log2Size, FixedValue))
+ return;
// See <reloc.h>.
uint32_t FixupOffset = Layout.getFragmentOffset(Fragment)+Fixup.getOffset();
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 624e56fa0f64..40110353fc62 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -941,3 +941,15 @@ and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.
//===---------------------------------------------------------------------===//
+
+This function should be matched to haddpd when the appropriate CPU is enabled:
+
+#include <x86intrin.h>
+double f (__m128d p) {
+ return p[0] + p[1];
+}
+
+Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
+turn into hsubpd also.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 18e6b7c3d9b6..8ad0bc08ac57 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -118,8 +118,13 @@ def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
"Support BMI instructions">;
def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
+def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
+ "HasSlowDivide", "true",
+ "Use small divide for positive values less than 256">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -159,8 +164,9 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
-def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B,
- FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP]>;
+def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
+ FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
+ FeatureSlowDivide]>;
// "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem,
@@ -188,7 +194,8 @@ def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT,
FeatureAES, FeaturePCLMUL, FeatureRDRAND,
FeatureF16C, FeatureFSGSBase,
FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA]>;
+ FeatureBMI2, FeatureFMA,
+ FeatureRTM]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
@@ -227,6 +234,7 @@ def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+def : Proc<"geode", [Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
def : Proc<"winchip2", [Feature3DNow]>;
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index db71e2751555..fdd712520b44 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "X86AsmPrinter.h"
-#include "X86MCInstLower.h"
#include "X86.h"
#include "X86COFFMachineModuleInfo.h"
#include "X86MachineFunctionInfo.h"
@@ -206,10 +205,10 @@ void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO,
}
}
-/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// printPCRelImm - This is used to print an immediate value that ends up
/// being encoded as a pc-relative value. These print slightly differently, for
/// example, a $ is not emitted.
-void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo,
+void X86AsmPrinter::printPCRelImm(const MachineInstr *MI, unsigned OpNo,
raw_ostream &O) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
@@ -233,15 +232,17 @@ void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo,
void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O, const char *Modifier) {
+ raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant) {
const MachineOperand &MO = MI->getOperand(OpNo);
switch (MO.getType()) {
default: llvm_unreachable("unknown operand type!");
case MachineOperand::MO_Register: {
- O << '%';
+ // FIXME: Enumerating AsmVariant, so we can remove magic number.
+ if (AsmVariant == 0) O << '%';
unsigned Reg = MO.getReg();
if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
- EVT VT = (strcmp(Modifier+6,"64") == 0) ?
+ MVT::SimpleValueType VT = (strcmp(Modifier+6,"64") == 0) ?
MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
Reg = getX86SubSuperRegister(Reg, VT);
@@ -265,46 +266,6 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
}
}
-void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op,
- raw_ostream &O) {
- unsigned char value = MI->getOperand(Op).getImm();
- switch (value) {
- default: llvm_unreachable("Invalid ssecc argument!");
- case 0: O << "eq"; break;
- case 1: O << "lt"; break;
- case 2: O << "le"; break;
- case 3: O << "unord"; break;
- case 4: O << "neq"; break;
- case 5: O << "nlt"; break;
- case 6: O << "nle"; break;
- case 7: O << "ord"; break;
- case 8: O << "eq_uq"; break;
- case 9: O << "nge"; break;
- case 0xa: O << "ngt"; break;
- case 0xb: O << "false"; break;
- case 0xc: O << "neq_oq"; break;
- case 0xd: O << "ge"; break;
- case 0xe: O << "gt"; break;
- case 0xf: O << "true"; break;
- case 0x10: O << "eq_os"; break;
- case 0x11: O << "lt_oq"; break;
- case 0x12: O << "le_oq"; break;
- case 0x13: O << "unord_s"; break;
- case 0x14: O << "neq_us"; break;
- case 0x15: O << "nlt_uq"; break;
- case 0x16: O << "nle_uq"; break;
- case 0x17: O << "ord_s"; break;
- case 0x18: O << "eq_us"; break;
- case 0x19: O << "nge_uq"; break;
- case 0x1a: O << "ngt_uq"; break;
- case 0x1b: O << "false_os"; break;
- case 0x1c: O << "neq_os"; break;
- case 0x1d: O << "ge_oq"; break;
- case 0x1e: O << "gt_oq"; break;
- case 0x1f: O << "true_us"; break;
- }
-}
-
void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
raw_ostream &O, const char *Modifier) {
const MachineOperand &BaseReg = MI->getOperand(Op);
@@ -363,10 +324,51 @@ void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
printLeaMemReference(MI, Op, O, Modifier);
}
-void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op,
- raw_ostream &O) {
- O << *MF->getPICBaseSymbol() << '\n';
- O << *MF->getPICBaseSymbol() << ':';
+void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier,
+ unsigned AsmVariant){
+ const MachineOperand &BaseReg = MI->getOperand(Op);
+ unsigned ScaleVal = MI->getOperand(Op+1).getImm();
+ const MachineOperand &IndexReg = MI->getOperand(Op+2);
+ const MachineOperand &DispSpec = MI->getOperand(Op+3);
+ const MachineOperand &SegReg = MI->getOperand(Op+4);
+
+ // If this has a segment register, print it.
+ if (SegReg.getReg()) {
+ printOperand(MI, Op+4, O, Modifier, AsmVariant);
+ O << ':';
+ }
+
+ O << '[';
+
+ bool NeedPlus = false;
+ if (BaseReg.getReg()) {
+ printOperand(MI, Op, O, Modifier, AsmVariant);
+ NeedPlus = true;
+ }
+
+ if (IndexReg.getReg()) {
+ if (NeedPlus) O << " + ";
+ if (ScaleVal != 1)
+ O << ScaleVal << '*';
+ printOperand(MI, Op+2, O, Modifier, AsmVariant);
+ NeedPlus = true;
+ }
+
+ assert (DispSpec.isImm() && "Displacement is not an immediate!");
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
+ }
+ O << DispVal;
+ }
+ O << ']';
}
bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
@@ -457,7 +459,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
return false;
case 'P': // This is the operand of a call, treat specially.
- print_pcrel_imm(MI, OpNo, O);
+ printPCRelImm(MI, OpNo, O);
return false;
case 'n': // Negate the immediate or print a '-' before the operand.
@@ -471,7 +473,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
}
}
- printOperand(MI, OpNo, O);
+ printOperand(MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
return false;
}
@@ -479,6 +481,11 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
unsigned OpNo, unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
+ if (AsmVariant) {
+ printIntelMemReference(MI, OpNo, O);
+ return false;
+ }
+
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
@@ -680,7 +687,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
if (!Stubs.empty()) {
OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
OutStreamer.EmitLabel(Stubs[i].first);
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 35386cd5803d..61eb14e036d0 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -34,47 +34,48 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
Subtarget = &TM.getSubtarget<X86Subtarget>();
}
- virtual const char *getPassName() const {
+ virtual const char *getPassName() const LLVM_OVERRIDE {
return "X86 AT&T-Style Assembly Printer";
}
const X86Subtarget &getSubtarget() const { return *Subtarget; }
- virtual void EmitStartOfAsmFile(Module &M);
+ virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE;
- virtual void EmitEndOfAsmFile(Module &M);
+ virtual void EmitEndOfAsmFile(Module &M) LLVM_OVERRIDE;
- virtual void EmitInstruction(const MachineInstr *MI);
+ virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE;
void printSymbolOperand(const MachineOperand &MO, raw_ostream &O);
// These methods are used by the tablegen'erated instruction printer.
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
- const char *Modifier = 0);
- void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
+ const char *Modifier = 0, unsigned AsmVariant = 0);
+ void printPCRelImm(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
- bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS);
- bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
- unsigned AsmVariant, const char *ExtraCode,
- raw_ostream &OS);
-
- void printMachineInstruction(const MachineInstr *MI);
- void printSSECC(const MachineInstr *MI, unsigned Op, raw_ostream &O);
+ virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+ virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &OS) LLVM_OVERRIDE;
+
void printMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
const char *Modifier=NULL);
void printLeaMemReference(const MachineInstr *MI, unsigned Op, raw_ostream &O,
const char *Modifier=NULL);
- void printPICLabel(const MachineInstr *MI, unsigned Op, raw_ostream &O);
+ void printIntelMemReference(const MachineInstr *MI, unsigned Op,
+ raw_ostream &O, const char *Modifier=NULL,
+ unsigned AsmVariant = 1);
- bool runOnMachineFunction(MachineFunction &F);
+ virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE;
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+ virtual MachineLocation
+ getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
index 471eb31131ae..a5a8dc18e41d 100644
--- a/lib/Target/X86/X86COFFMachineModuleInfo.h
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -20,7 +20,7 @@
namespace llvm {
class X86MachineFunctionInfo;
- class TargetData;
+ class DataLayout;
/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
/// for X86 COFF targets.
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index a6d2709b372d..6786756c7faf 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -88,6 +88,21 @@ def RetCC_X86_32_Fast : CallingConv<[
CCDelegateTo<RetCC_X86Common>
]>;
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+ // Vector types are returned in XMM0, XMM1, XMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
// X86-64 C return-value convention.
def RetCC_X86_64_C : CallingConv<[
// The X86-64 calling convention always returns FP values in XMM0.
@@ -128,6 +143,10 @@ def RetCC_X86_64 : CallingConv<[
// This is the return-value convention used for the entire X86 backend.
def RetCC_X86 : CallingConv<[
+
+ // Check if this is the Intel OpenCL built-ins calling convention
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
CCDelegateTo<RetCC_X86_32>
]>;
@@ -235,6 +254,29 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[f80], CCAssignToStack<0, 0>>
]>;
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>,
+
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+ CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
+
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+
def CC_X86_64_GHC : CallingConv<[
// Promote i8/i16/i32 arguments to i64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -324,7 +366,7 @@ def CC_X86_32_FastCall : CallingConv<[
CCIfNest<CCAssignToReg<[EAX]>>,
// The first 2 integer arguments are passed in ECX/EDX
- CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+ CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>,
// Otherwise, same as everything else.
CCDelegateTo<CC_X86_32_Common>
@@ -408,6 +450,7 @@ def CC_X86_64 : CallingConv<[
// This is the argument convention used for the entire X86 backend.
def CC_X86 : CallingConv<[
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
CCDelegateTo<CC_X86_32>
]>;
@@ -426,3 +469,17 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
(sequence "XMM%u", 6, 15))>;
+
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+ R13, R14, R15,
+ (sequence "YMM%u", 6, 15))>;
+
+// Standard C + XMM8-15
+def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
+ (sequence "XMM%u", 8, 15))>;
+
+// Standard C + YMM8-15
+def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
+ (sequence "YMM%u", 8, 15))>;
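For reference, a front end opts a function into this convention through the ordinary calling-convention API. A minimal sketch, not part of this patch (3.2-era header paths; the helper name is made up for the example):

#include "llvm/Function.h"
#include "llvm/CallingConv.h"

// Tag a built-in so that calls to it use the Intel OpenCL built-in convention
// defined above (vector arguments in XMM0-3/YMM0-3, extended callee-saved set).
// The declaration and every call site must agree on the convention.
void markAsOCLBuiltin(llvm::Function &F) {
  F.setCallingConv(llvm::CallingConv::Intel_OCL_BI);
}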
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index d7050495f89c..44db563818b1 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/Function.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
@@ -43,7 +42,7 @@ namespace {
template<class CodeEmitter>
class Emitter : public MachineFunctionPass {
const X86InstrInfo *II;
- const TargetData *TD;
+ const DataLayout *TD;
X86TargetMachine &TM;
CodeEmitter &MCE;
MachineModuleInfo *MMI;
@@ -57,7 +56,7 @@ namespace {
MCE(mce), PICBaseOffset(0), Is64BitMode(false),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
Emitter(X86TargetMachine &tm, CodeEmitter &mce,
- const X86InstrInfo &ii, const TargetData &td, bool is64)
+ const X86InstrInfo &ii, const DataLayout &td, bool is64)
: MachineFunctionPass(ID), II(&ii), TD(&td), TM(tm),
MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
@@ -110,6 +109,14 @@ namespace {
void emitMemModRMByte(const MachineInstr &MI,
unsigned Op, unsigned RegOpcodeField,
intptr_t PCAdj = 0);
+
+ unsigned getX86RegNum(unsigned RegNo) const {
+ const TargetRegisterInfo *TRI = TM.getRegisterInfo();
+ return TRI->getEncodingValue(RegNo) & 0x7;
+ }
+
+ unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) const;
};
template<class CodeEmitter>
@@ -129,13 +136,12 @@ bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
MCE.setModuleInfo(MMI);
II = TM.getInstrInfo();
- TD = TM.getTargetData();
+ TD = TM.getDataLayout();
Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
IsPIC = TM.getRelocationModel() == Reloc::PIC_;
do {
- DEBUG(dbgs() << "JITTing function '"
- << MF.getFunction()->getName() << "'\n");
+ DEBUG(dbgs() << "JITTing function '" << MF.getName() << "'\n");
MCE.startFunction(MF);
for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
MBB != E; ++MBB) {
@@ -365,7 +371,7 @@ inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
template<class CodeEmitter>
void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
unsigned RegOpcodeFld){
- MCE.emitByte(ModRMByte(3, RegOpcodeFld, X86_MC::getX86RegNum(ModRMReg)));
+ MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
}
template<class CodeEmitter>
@@ -503,7 +509,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
// 2-7) and absolute references.
unsigned BaseRegNo = -1U;
if (BaseReg != 0 && BaseReg != X86::RIP)
- BaseRegNo = X86_MC::getX86RegNum(BaseReg);
+ BaseRegNo = getX86RegNum(BaseReg);
if (// The SIB byte must be used if there is an index register.
IndexReg.getReg() == 0 &&
@@ -579,15 +585,15 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
// Manual 2A, table 2-7. The displacement has already been output.
unsigned IndexRegNo;
if (IndexReg.getReg())
- IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
IndexRegNo = 4;
emitSIBByte(SS, IndexRegNo, 5);
} else {
- unsigned BaseRegNo = X86_MC::getX86RegNum(BaseReg);
+ unsigned BaseRegNo = getX86RegNum(BaseReg);
unsigned IndexRegNo;
if (IndexReg.getReg())
- IndexRegNo = X86_MC::getX86RegNum(IndexReg.getReg());
+ IndexRegNo = getX86RegNum(IndexReg.getReg());
else
IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
emitSIBByte(SS, IndexRegNo, BaseRegNo);
@@ -758,10 +764,12 @@ void Emitter<CodeEmitter>::emitOpcodePrefix(uint64_t TSFlags,
// VEX.VVVV => XMM9 => ~9
//
// See table 4-35 of Intel AVX Programming Reference for details.
-static unsigned char getVEXRegisterEncoding(const MachineInstr &MI,
- unsigned OpNum) {
+template<class CodeEmitter>
+unsigned char
+Emitter<CodeEmitter>::getVEXRegisterEncoding(const MachineInstr &MI,
+ unsigned OpNum) const {
unsigned SrcReg = MI.getOperand(OpNum).getReg();
- unsigned SrcRegNum = X86_MC::getX86RegNum(MI.getOperand(OpNum).getReg());
+ unsigned SrcRegNum = getX86RegNum(MI.getOperand(OpNum).getReg());
if (X86II::isX86_64ExtendedReg(SrcReg))
SrcRegNum |= 8;
@@ -923,17 +931,6 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
}
- // Set the vector length to 256-bit if YMM0-YMM15 is used
- for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
- if (!MI.getOperand(i).isReg())
- continue;
- if (MI.getOperand(i).isImplicit())
- continue;
- unsigned SrcReg = MI.getOperand(i).getReg();
- if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
- VEX_L = 1;
- }
-
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc->getNumOperands();
unsigned CurOp = 0;
@@ -1248,7 +1245,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::AddRegFrm: {
MCE.emitByte(BaseOpcode +
- X86_MC::getX86RegNum(MI.getOperand(CurOp++).getReg()));
+ getX86RegNum(MI.getOperand(CurOp++).getReg()));
if (CurOp == NumOps)
break;
@@ -1283,7 +1280,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
case X86II::MRMDestReg: {
MCE.emitByte(BaseOpcode);
emitRegModRMByte(MI.getOperand(CurOp).getReg(),
- X86_MC::getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+ getX86RegNum(MI.getOperand(CurOp+1).getReg()));
CurOp += 2;
break;
}
@@ -1294,7 +1291,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
SrcRegNum++;
emitMemModRMByte(MI, CurOp,
- X86_MC::getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
+ getX86RegNum(MI.getOperand(SrcRegNum).getReg()));
CurOp = SrcRegNum + 1;
break;
}
@@ -1310,7 +1307,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
++SrcRegNum;
emitRegModRMByte(MI.getOperand(SrcRegNum).getReg(),
- X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
// 2 operands skipped with HasMemOp4, compensate accordingly
CurOp = HasMemOp4 ? SrcRegNum : SrcRegNum + 1;
if (HasVEX_4VOp3)
@@ -1332,7 +1329,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
X86II::getSizeOfImm(Desc->TSFlags) : 0;
emitMemModRMByte(MI, FirstMemOp,
- X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
+ getX86RegNum(MI.getOperand(CurOp).getReg()),PCAdj);
CurOp += AddrOperands + 1;
if (HasVEX_4VOp3)
++CurOp;
@@ -1422,7 +1419,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
MCE.emitByte(BaseOpcode);
// Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
emitRegModRMByte(MI.getOperand(CurOp).getReg(),
- X86_MC::getX86RegNum(MI.getOperand(CurOp).getReg()));
+ getX86RegNum(MI.getOperand(CurOp).getReg()));
++CurOp;
break;
@@ -1455,7 +1452,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(HasMemOp4 ? MemOp4_I8IMMOperand
: CurOp);
++CurOp;
- unsigned RegNum = X86_MC::getX86RegNum(MO.getReg()) << 4;
+ unsigned RegNum = getX86RegNum(MO.getReg()) << 4;
if (X86II::isX86_64ExtendedReg(MO.getReg()))
RegNum |= 1 << 7;
// If there is an additional 5th operand it must be an immediate, which
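The new per-Emitter getX86RegNum() keeps only the three encoding bits that fit in a ModRM field; the fourth bit of an extended register is carried by a REX or VEX prefix bit instead. A stand-alone sketch of that arithmetic (not taken from the patch):

#include <cassert>
#include <cstdint>
#include <cstdio>

// ModRM = mod (2 bits) | reg/opcode (3 bits) | r/m (3 bits).
static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8);
  return static_cast<uint8_t>((Mod << 6) | (RegOpcode << 3) | RM);
}

int main() {
  unsigned Encoding = 9;               // e.g. R9 or XMM9, an "extended" register
  unsigned LowBits = Encoding & 0x7;   // what getX86RegNum() returns: 1
  bool NeedsPrefixBit = Encoding >= 8; // folded into REX.R/B or inverted VEX.vvvv
  printf("modrm=%#x extended=%d\n", modRMByte(3, 0, LowBits), NeedsPrefixBit);
  return 0;
}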
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
deleted file mode 100644
index c1a49a764614..000000000000
--- a/lib/Target/X86/X86ELFWriterInfo.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the X86 backend.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86ELFWriterInfo.h"
-#include "X86Relocations.h"
-#include "llvm/Function.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Target/TargetMachine.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// Implementation of the X86ELFWriterInfo class
-//===----------------------------------------------------------------------===//
-
-X86ELFWriterInfo::X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_)
- : TargetELFWriterInfo(is64Bit_, isLittleEndian_) {
- EMachine = is64Bit ? EM_X86_64 : EM_386;
- }
-
-X86ELFWriterInfo::~X86ELFWriterInfo() {}
-
-unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
- if (is64Bit) {
- switch(MachineRelTy) {
- case X86::reloc_pcrel_word:
- return ELF::R_X86_64_PC32;
- case X86::reloc_absolute_word:
- return ELF::R_X86_64_32;
- case X86::reloc_absolute_word_sext:
- return ELF::R_X86_64_32S;
- case X86::reloc_absolute_dword:
- return ELF::R_X86_64_64;
- case X86::reloc_picrel_word:
- default:
- llvm_unreachable("unknown x86_64 machine relocation type");
- }
- } else {
- switch(MachineRelTy) {
- case X86::reloc_pcrel_word:
- return ELF::R_386_PC32;
- case X86::reloc_absolute_word:
- return ELF::R_386_32;
- case X86::reloc_absolute_word_sext:
- case X86::reloc_absolute_dword:
- case X86::reloc_picrel_word:
- default:
- llvm_unreachable("unknown x86 machine relocation type");
- }
- }
-}
-
-long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier) const {
- if (is64Bit) {
- switch(RelTy) {
- case ELF::R_X86_64_PC32: return Modifier - 4;
- case ELF::R_X86_64_32:
- case ELF::R_X86_64_32S:
- case ELF::R_X86_64_64:
- return Modifier;
- default:
- llvm_unreachable("unknown x86_64 relocation type");
- }
- } else {
- switch(RelTy) {
- case ELF::R_386_PC32: return Modifier - 4;
- case ELF::R_386_32: return Modifier;
- default:
- llvm_unreachable("unknown x86 relocation type");
- }
- }
-}
-
-unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
- if (is64Bit) {
- switch(RelTy) {
- case ELF::R_X86_64_PC32:
- case ELF::R_X86_64_32:
- case ELF::R_X86_64_32S:
- return 32;
- case ELF::R_X86_64_64:
- return 64;
- default:
- llvm_unreachable("unknown x86_64 relocation type");
- }
- } else {
- switch(RelTy) {
- case ELF::R_386_PC32:
- case ELF::R_386_32:
- return 32;
- default:
- llvm_unreachable("unknown x86 relocation type");
- }
- }
-}
-
-bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
- if (is64Bit) {
- switch(RelTy) {
- case ELF::R_X86_64_PC32:
- return true;
- case ELF::R_X86_64_32:
- case ELF::R_X86_64_32S:
- case ELF::R_X86_64_64:
- return false;
- default:
- llvm_unreachable("unknown x86_64 relocation type");
- }
- } else {
- switch(RelTy) {
- case ELF::R_386_PC32:
- return true;
- case ELF::R_386_32:
- return false;
- default:
- llvm_unreachable("unknown x86 relocation type");
- }
- }
-}
-
-unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
- return is64Bit ?
- X86::reloc_absolute_dword : X86::reloc_absolute_word;
-}
-
-long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
- unsigned RelOffset,
- unsigned RelTy) const {
-
- if (RelTy == ELF::R_X86_64_PC32 || RelTy == ELF::R_386_PC32)
- return SymOffset - (RelOffset + 4);
-
- llvm_unreachable("computeRelocation unknown for this relocation type");
-}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
deleted file mode 100644
index a45b5bb66a07..000000000000
--- a/lib/Target/X86/X86ELFWriterInfo.h
+++ /dev/null
@@ -1,59 +0,0 @@
-//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF writer information for the X86 backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86_ELF_WRITER_INFO_H
-#define X86_ELF_WRITER_INFO_H
-
-#include "llvm/Target/TargetELFWriterInfo.h"
-
-namespace llvm {
-
- class X86ELFWriterInfo : public TargetELFWriterInfo {
-
- public:
- X86ELFWriterInfo(bool is64Bit_, bool isLittleEndian_);
- virtual ~X86ELFWriterInfo();
-
- /// getRelocationType - Returns the target specific ELF Relocation type.
- /// 'MachineRelTy' contains the object code independent relocation type
- virtual unsigned getRelocationType(unsigned MachineRelTy) const;
-
- /// hasRelocationAddend - True if the target uses an addend in the
- /// ELF relocation entry.
- virtual bool hasRelocationAddend() const { return is64Bit ? true : false; }
-
- /// getDefaultAddendForRelTy - Gets the default addend value for a
- /// relocation entry based on the target ELF relocation type.
- virtual long int getDefaultAddendForRelTy(unsigned RelTy,
- long int Modifier = 0) const;
-
- /// getRelTySize - Returns the size of relocatable field in bits
- virtual unsigned getRelocationTySize(unsigned RelTy) const;
-
- /// isPCRelativeRel - True if the relocation type is pc relative
- virtual bool isPCRelativeRel(unsigned RelTy) const;
-
- /// getJumpTableRelocationTy - Returns the machine relocation type used
- /// to reference a jumptable.
- virtual unsigned getAbsoluteLabelMachineRelTy() const;
-
- /// computeRelocation - Some relocatable fields could be relocated
- /// directly, avoiding the relocation symbol emission, compute the
- /// final relocation value for this symbol.
- virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
- unsigned RelTy) const;
- };
-
-} // end llvm namespace
-
-#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index e5952aae16de..d4627c74cb1c 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -45,9 +45,9 @@ class X86FastISel : public FastISel {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// StackPtr - Register used as the stack pointer.
+ /// RegInfo - X86 register info.
///
- unsigned StackPtr;
+ const X86RegisterInfo *RegInfo;
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
@@ -61,9 +61,9 @@ public:
const TargetLibraryInfo *libInfo)
: FastISel(funcInfo, libInfo) {
Subtarget = &TM.getSubtarget<X86Subtarget>();
- StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
+ RegInfo = static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
}
virtual bool TargetSelectInstruction(const Instruction *I);
@@ -710,6 +710,8 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
bool X86FastISel::X86SelectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
+ const X86MachineFunctionInfo *X86MFInfo =
+ FuncInfo.MF->getInfo<X86MachineFunctionInfo>();
if (!FuncInfo.CanLowerReturn)
return false;
@@ -724,8 +726,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
return false;
// Don't handle popping bytes on return for now.
- if (FuncInfo.MF->getInfo<X86MachineFunctionInfo>()
- ->getBytesToPopOnReturn() != 0)
+ if (X86MFInfo->getBytesToPopOnReturn() != 0)
return 0;
// fastcc with -tailcallopt is intended to provide a guaranteed
@@ -809,6 +810,19 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
MRI.addLiveOut(VA.getLocReg());
}
+ // The x86-64 ABI for returning structs by value requires that we copy
+ // the sret argument into %rax for the return. We saved the argument into
+ // a virtual register in the entry block, so now we copy the value out
+ // and into %rax.
+ if (Subtarget->is64Bit() && F.hasStructRetAttr()) {
+ unsigned Reg = X86MFInfo->getSRetReturnReg();
+ assert(Reg &&
+ "SRetReturnReg should have been set in LowerFormalArguments()!");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ X86::RAX).addReg(Reg);
+ MRI.addLiveOut(X86::RAX);
+ }
+
// Now emit the RET.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
return true;
@@ -1527,9 +1541,9 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
CallingConv::ID CC = CS.getCallingConv();
if (CC == CallingConv::Fast || CC == CallingConv::GHC)
return 0;
- if (!CS.paramHasAttr(1, Attribute::StructRet))
+ if (!CS.paramHasAttr(1, Attributes::StructRet))
return 0;
- if (CS.paramHasAttr(1, Attribute::InReg))
+ if (CS.paramHasAttr(1, Attributes::InReg))
return 0;
return 4;
}
@@ -1608,12 +1622,12 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
Value *ArgVal = *i;
ISD::ArgFlagsTy Flags;
unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attribute::SExt))
+ if (CS.paramHasAttr(AttrInd, Attributes::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
+ if (CS.paramHasAttr(AttrInd, Attributes::ZExt))
Flags.setZExt();
- if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
+ if (CS.paramHasAttr(AttrInd, Attributes::ByVal)) {
PointerType *Ty = cast<PointerType>(ArgVal->getType());
Type *ElementTy = Ty->getElementType();
unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
@@ -1627,9 +1641,9 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
return false;
}
- if (CS.paramHasAttr(AttrInd, Attribute::InReg))
+ if (CS.paramHasAttr(AttrInd, Attributes::InReg))
Flags.setInReg();
- if (CS.paramHasAttr(AttrInd, Attribute::Nest))
+ if (CS.paramHasAttr(AttrInd, Attributes::Nest))
Flags.setNest();
// If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
@@ -1771,7 +1785,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
} else {
unsigned LocMemOffset = VA.getLocMemOffset();
X86AddressMode AM;
- AM.Base.Reg = StackPtr;
+ AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
const Value *ArgVal = ArgVals[VA.getValNo()];
ISD::ArgFlagsTy Flags = ArgFlags[VA.getValNo()];
@@ -1897,11 +1911,11 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
ISD::InputArg MyFlags;
MyFlags.VT = RegisterVT.getSimpleVT();
MyFlags.Used = !CS.getInstruction()->use_empty();
- if (CS.paramHasAttr(0, Attribute::SExt))
+ if (CS.paramHasAttr(0, Attributes::SExt))
MyFlags.Flags.setSExt();
- if (CS.paramHasAttr(0, Attribute::ZExt))
+ if (CS.paramHasAttr(0, Attributes::ZExt))
MyFlags.Flags.setZExt();
- if (CS.paramHasAttr(0, Attribute::InReg))
+ if (CS.paramHasAttr(0, Attributes::InReg))
MyFlags.Flags.setInReg();
Ins.push_back(MyFlags);
}
@@ -2014,13 +2028,17 @@ X86FastISel::TargetSelectInstruction(const Instruction *I) {
unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
MVT VT;
if (!isTypeLegal(C->getType(), VT))
- return false;
+ return 0;
+
+ // Can't handle alternate code models yet.
+ if (TM.getCodeModel() != CodeModel::Small)
+ return 0;
// Get opcode and regclass of the output for the given load instruction.
unsigned Opc = 0;
const TargetRegisterClass *RC = NULL;
switch (VT.SimpleTy) {
- default: return false;
+ default: return 0;
case MVT::i8:
Opc = X86::MOV8rm;
RC = &X86::GR8RegClass;
@@ -2058,7 +2076,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
break;
case MVT::f80:
// No f80 support yet.
- return false;
+ return 0;
}
// Materialize addresses with LEA instructions.
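The sret handling added in X86SelectRet() follows the x86-64 ABI rule that a function returning a struct in memory must also hand the hidden sret pointer back in %rax. A rough illustration (the struct and function below are invented for the example, not taken from the patch):

// Too large to come back in registers, so the caller passes a hidden
// pointer as the first argument.
struct Big { long a, b, c, d; };

// Lowered roughly as: void makeBig(Big *sret). The callee fills *sret and,
// per the ABI, returns that same pointer in %rax -- which is exactly the
// copy from the saved SRetReturnReg that FastISel now emits.
Big makeBig() {
  Big B = {1, 2, 3, 4};
  return B;
}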
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 955c75aa563f..791f5982af7c 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -171,6 +171,7 @@ namespace {
// Shuffle live registers to match the expectations of successor blocks.
void finishBlockStack();
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dumpStack() const {
dbgs() << "Stack contents:";
for (unsigned i = 0; i != StackTop; ++i) {
@@ -181,6 +182,7 @@ namespace {
dbgs() << ", ST" << i << " in FP" << unsigned(PendingST[i]);
dbgs() << "\n";
}
+#endif
/// getSlot - Return the stack slot number a particular register number is
/// in.
@@ -575,8 +577,8 @@ namespace {
friend bool operator<(const TableEntry &TE, unsigned V) {
return TE.from < V;
}
- friend bool LLVM_ATTRIBUTE_USED operator<(unsigned V,
- const TableEntry &TE) {
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+ const TableEntry &TE) {
return V < TE.from;
}
};
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 22386885b6be..369589d469a6 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/SmallSet.h"
@@ -313,11 +313,11 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
if (CSI.empty()) return;
std::vector<MachineMove> &Moves = MMI.getFrameMoves();
- const TargetData *TD = TM.getTargetData();
+ const X86RegisterInfo *RegInfo = TM.getRegisterInfo();
bool HasFP = hasFP(MF);
// Calculate amount of bytes used for return address storing.
- int stackGrowth = -TD->getPointerSize();
+ int stackGrowth = -RegInfo->getSlotSize();
// FIXME: This is dirty hack. The code itself is pretty mess right now.
// It should be rewritten from scratch and generalized sometimes.
@@ -674,7 +674,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// function, and use up to 128 bytes of stack space, don't have a frame
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone).
- if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) &&
+ if (Is64Bit && !Fn->getFnAttributes().hasAttribute(Attributes::NoRedZone) &&
!RegInfo->needsStackRealignment(MF) &&
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->adjustsStack() && // No calls.
@@ -715,9 +715,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// ELSE => DW_CFA_offset_extended
std::vector<MachineMove> &Moves = MMI.getFrameMoves();
- const TargetData *TD = MF.getTarget().getTargetData();
uint64_t NumBytes = 0;
- int stackGrowth = -TD->getPointerSize();
+ int stackGrowth = -SlotSize;
if (HasFP) {
// Calculate required stack adjustment.
@@ -836,8 +835,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MI->getOperand(3).setIsDead();
}
- DL = MBB.findDebugLoc(MBBI);
-
// If there is a SUB32ri of ESP immediately before this instruction, merge
// the two. This can be the case when tail call elimination is enabled and
// the callee has more arguments than the caller.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 27195b4522a6..99f557417b7c 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -100,6 +100,7 @@ namespace {
Base_Reg = Reg;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump() {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
@@ -133,6 +134,7 @@ namespace {
dbgs() << "nul";
dbgs() << " JT" << JT << " Align" << Align << '\n';
}
+#endif
};
}
@@ -189,7 +191,6 @@ namespace {
SDNode *Select(SDNode *N);
SDNode *SelectGather(SDNode *N, unsigned Opc);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
- SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
@@ -244,13 +245,15 @@ namespace {
else if (AM.CP)
Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
AM.Align, AM.Disp, AM.SymbolFlags);
- else if (AM.ES)
+ else if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
- else if (AM.JT != -1)
+ } else if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
- else if (AM.BlockAddr)
- Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
- true, AM.SymbolFlags);
+ } else if (AM.BlockAddr)
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
else
Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
@@ -359,7 +362,7 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
/// MoveBelowCallOrigChain - Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
- SDValue Call, SDValue OrigChain) {
+ SDValue Call, SDValue OrigChain) {
SmallVector<SDValue, 8> Ops;
SDValue Chain = OrigChain.getOperand(0);
if (Chain.getNode() == Load.getNode())
@@ -383,11 +386,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
Load.getOperand(1), Load.getOperand(2));
+
+ unsigned NumOps = Call.getNode()->getNumOperands();
Ops.clear();
Ops.push_back(SDValue(Load.getNode(), 1));
- for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
+ for (unsigned i = 1, e = NumOps; i != e; ++i)
Ops.push_back(Call.getOperand(i));
- CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], Ops.size());
+ CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
}
/// isCalleeLoad - Return true if call address is a load and it can be
@@ -396,6 +401,10 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
/// In the case of a tail call, there isn't a callseq node between the call
/// chain and the load.
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
+ // The transformation is somewhat dangerous if the call's chain was glued to
+ // the call. After MoveBelowOrigChain the load is moved between the call and
+ // the chain; this can create a cycle if the load is not folded. So it is
+ // *really* important that we are sure the load will be folded.
if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
return false;
LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
@@ -425,7 +434,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
void X86DAGToDAGISel::PreprocessISelDAG() {
// OptForSize is used in pattern predicates that isel is matching.
- OptForSize = MF->getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+ OptForSize = MF->getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize);
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
@@ -433,7 +443,10 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (OptLevel != CodeGenOpt::None &&
(N->getOpcode() == X86ISD::CALL ||
- N->getOpcode() == X86ISD::TC_RETURN)) {
+ (N->getOpcode() == X86ISD::TC_RETURN &&
+ // Only do this if the load can be folded into TC_RETURN.
+ (Subtarget->is64Bit() ||
+ getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
/// Also try moving call address load from outside callseq_start to just
/// before the call to allow it to be folded.
///
@@ -652,10 +665,16 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
AM.JT = J->getIndex();
AM.SymbolFlags = J->getTargetFlags();
- } else {
- AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
- AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
- }
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
if (N.getOpcode() == X86ISD::WrapperRIP)
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
@@ -684,10 +703,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
AM.JT = J->getIndex();
AM.SymbolFlags = J->getTargetFlags();
- } else {
- AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
- AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
- }
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.Disp += BA->getOffset();
+ AM.SymbolFlags = BA->getTargetFlags();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
return false;
}
@@ -1011,7 +1032,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM.IndexReg = ShVal.getNode()->getOperand(0);
ConstantSDNode *AddVal =
cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
- uint64_t Disp = AddVal->getSExtValue() << Val;
+ uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
if (!FoldOffsetIntoAddress(Disp, AM))
return false;
}
@@ -1281,7 +1302,9 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base,
// that are not a MemSDNode, and thus don't have proper addrspace info.
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme
Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores
- Parent->getOpcode() != X86ISD::TLSCALL) { // Fixme
+ Parent->getOpcode() != X86ISD::TLSCALL && // Fixme
+ Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp
+ Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS.
@@ -1468,6 +1491,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
SDValue In1 = Node->getOperand(1);
SDValue In2L = Node->getOperand(2);
SDValue In2H = Node->getOperand(3);
+
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
return NULL;
@@ -1481,159 +1505,13 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
return ResNode;
}
-// FIXME: Figure out some way to unify this with the 'or' and other code
-// below.
-SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
- if (Node->hasAnyUseOfValue(0))
- return 0;
-
- // Optimize common patterns for __sync_add_and_fetch and
- // __sync_sub_and_fetch where the result is not used. This allows us
- // to use "lock" version of add, sub, inc, dec instructions.
- // FIXME: Do not use special instructions but instead add the "lock"
- // prefix to the target node somehow. The extra information will then be
- // transferred to machine instruction and it denotes the prefix.
- SDValue Chain = Node->getOperand(0);
- SDValue Ptr = Node->getOperand(1);
- SDValue Val = Node->getOperand(2);
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
- return 0;
-
- bool isInc = false, isDec = false, isSub = false, isCN = false;
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
- if (CN && CN->getSExtValue() == (int32_t)CN->getSExtValue()) {
- isCN = true;
- int64_t CNVal = CN->getSExtValue();
- if (CNVal == 1)
- isInc = true;
- else if (CNVal == -1)
- isDec = true;
- else if (CNVal >= 0)
- Val = CurDAG->getTargetConstant(CNVal, NVT);
- else {
- isSub = true;
- Val = CurDAG->getTargetConstant(-CNVal, NVT);
- }
- } else if (Val.hasOneUse() &&
- Val.getOpcode() == ISD::SUB &&
- X86::isZeroNode(Val.getOperand(0))) {
- isSub = true;
- Val = Val.getOperand(1);
- }
-
- DebugLoc dl = Node->getDebugLoc();
- unsigned Opc = 0;
- switch (NVT.getSimpleVT().SimpleTy) {
- default: return 0;
- case MVT::i8:
- if (isInc)
- Opc = X86::LOCK_INC8m;
- else if (isDec)
- Opc = X86::LOCK_DEC8m;
- else if (isSub) {
- if (isCN)
- Opc = X86::LOCK_SUB8mi;
- else
- Opc = X86::LOCK_SUB8mr;
- } else {
- if (isCN)
- Opc = X86::LOCK_ADD8mi;
- else
- Opc = X86::LOCK_ADD8mr;
- }
- break;
- case MVT::i16:
- if (isInc)
- Opc = X86::LOCK_INC16m;
- else if (isDec)
- Opc = X86::LOCK_DEC16m;
- else if (isSub) {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_SUB16mi8;
- else
- Opc = X86::LOCK_SUB16mi;
- } else
- Opc = X86::LOCK_SUB16mr;
- } else {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_ADD16mi8;
- else
- Opc = X86::LOCK_ADD16mi;
- } else
- Opc = X86::LOCK_ADD16mr;
- }
- break;
- case MVT::i32:
- if (isInc)
- Opc = X86::LOCK_INC32m;
- else if (isDec)
- Opc = X86::LOCK_DEC32m;
- else if (isSub) {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_SUB32mi8;
- else
- Opc = X86::LOCK_SUB32mi;
- } else
- Opc = X86::LOCK_SUB32mr;
- } else {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_ADD32mi8;
- else
- Opc = X86::LOCK_ADD32mi;
- } else
- Opc = X86::LOCK_ADD32mr;
- }
- break;
- case MVT::i64:
- if (isInc)
- Opc = X86::LOCK_INC64m;
- else if (isDec)
- Opc = X86::LOCK_DEC64m;
- else if (isSub) {
- Opc = X86::LOCK_SUB64mr;
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_SUB64mi8;
- else if (i64immSExt32(Val.getNode()))
- Opc = X86::LOCK_SUB64mi32;
- }
- } else {
- Opc = X86::LOCK_ADD64mr;
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_ADD64mi8;
- else if (i64immSExt32(Val.getNode()))
- Opc = X86::LOCK_ADD64mi32;
- }
- }
- break;
- }
-
- SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
- dl, NVT), 0);
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- if (isInc || isDec) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
- SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0);
- cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
- SDValue RetVals[] = { Undef, Ret };
- return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
- } else {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
- SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
- cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
- SDValue RetVals[] = { Undef, Ret };
- return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
- }
-}
-
+/// Atomic opcode table
+///
enum AtomicOpc {
+ ADD,
+ SUB,
+ INC,
+ DEC,
OR,
AND,
XOR,
@@ -1657,6 +1535,58 @@ enum AtomicSz {
static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
{
+ X86::LOCK_ADD8mi,
+ X86::LOCK_ADD8mr,
+ X86::LOCK_ADD16mi8,
+ X86::LOCK_ADD16mi,
+ X86::LOCK_ADD16mr,
+ X86::LOCK_ADD32mi8,
+ X86::LOCK_ADD32mi,
+ X86::LOCK_ADD32mr,
+ X86::LOCK_ADD64mi8,
+ X86::LOCK_ADD64mi32,
+ X86::LOCK_ADD64mr,
+ },
+ {
+ X86::LOCK_SUB8mi,
+ X86::LOCK_SUB8mr,
+ X86::LOCK_SUB16mi8,
+ X86::LOCK_SUB16mi,
+ X86::LOCK_SUB16mr,
+ X86::LOCK_SUB32mi8,
+ X86::LOCK_SUB32mi,
+ X86::LOCK_SUB32mr,
+ X86::LOCK_SUB64mi8,
+ X86::LOCK_SUB64mi32,
+ X86::LOCK_SUB64mr,
+ },
+ {
+ 0,
+ X86::LOCK_INC8m,
+ 0,
+ 0,
+ X86::LOCK_INC16m,
+ 0,
+ 0,
+ X86::LOCK_INC32m,
+ 0,
+ 0,
+ X86::LOCK_INC64m,
+ },
+ {
+ 0,
+ X86::LOCK_DEC8m,
+ 0,
+ 0,
+ X86::LOCK_DEC16m,
+ 0,
+ 0,
+ X86::LOCK_DEC32m,
+ 0,
+ 0,
+ X86::LOCK_DEC64m,
+ },
+ {
X86::LOCK_OR8mi,
X86::LOCK_OR8mr,
X86::LOCK_OR16mi8,
@@ -1667,7 +1597,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
X86::LOCK_OR32mr,
X86::LOCK_OR64mi8,
X86::LOCK_OR64mi32,
- X86::LOCK_OR64mr
+ X86::LOCK_OR64mr,
},
{
X86::LOCK_AND8mi,
@@ -1680,7 +1610,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
X86::LOCK_AND32mr,
X86::LOCK_AND64mi8,
X86::LOCK_AND64mi32,
- X86::LOCK_AND64mr
+ X86::LOCK_AND64mr,
},
{
X86::LOCK_XOR8mi,
@@ -1693,18 +1623,74 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
X86::LOCK_XOR32mr,
X86::LOCK_XOR64mi8,
X86::LOCK_XOR64mi32,
- X86::LOCK_XOR64mr
+ X86::LOCK_XOR64mr,
}
};
+// Return the target constant operand for atomic-load-op and do simple
+// translations, such as from atomic-load-add to lock-sub. The return value is
+// one of the following 3 cases:
+// + a target constant, if the operand can be encoded as a target constant.
+// + empty, if the operand is no longer needed once the new op is selected.
+// + non-empty, otherwise (the original or rewritten value operand).
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+ DebugLoc dl,
+ enum AtomicOpc &Op, EVT NVT,
+ SDValue Val) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+ int64_t CNVal = CN->getSExtValue();
+ // Quit if not 32-bit imm.
+ if ((int32_t)CNVal != CNVal)
+ return Val;
+ // For atomic-load-add, we could do some optimizations.
+ if (Op == ADD) {
+ // Translate to INC/DEC if ADD by 1 or -1.
+ if ((CNVal == 1) || (CNVal == -1)) {
+ Op = (CNVal == 1) ? INC : DEC;
+ // No more constant operand after being translated into INC/DEC.
+ return SDValue();
+ }
+ // Translate to SUB if ADD by negative value.
+ if (CNVal < 0) {
+ Op = SUB;
+ CNVal = -CNVal;
+ }
+ }
+ return CurDAG->getTargetConstant(CNVal, NVT);
+ }
+
+ // If the value operand is single-used, try to optimize it.
+ if (Op == ADD && Val.hasOneUse()) {
+ // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+ if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+ Op = SUB;
+ return Val.getOperand(1);
+ }
+ // A special case for i16, which needs truncating as, in most cases, it's
+ // promoted to i32. We will translate
+ // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x))
+ if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+ Val.getOperand(0).getOpcode() == ISD::SUB &&
+ X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+ Op = SUB;
+ Val = Val.getOperand(0);
+ return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+ Val.getOperand(1));
+ }
+ }
+
+ return Val;
+}
+
SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
if (Node->hasAnyUseOfValue(0))
return 0;
+ DebugLoc dl = Node->getDebugLoc();
+
// Optimize common patterns for __sync_or_and_fetch and similar arith
// operations where the result is not used. This allows us to use the "lock"
// version of the arithmetic instruction.
- // FIXME: Same as for 'add' and 'sub', try to merge those down here.
SDValue Chain = Node->getOperand(0);
SDValue Ptr = Node->getOperand(1);
SDValue Val = Node->getOperand(2);
@@ -1715,6 +1701,8 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
// Which index into the table.
enum AtomicOpc Op;
switch (Node->getOpcode()) {
+ default:
+ return 0;
case ISD::ATOMIC_LOAD_OR:
Op = OR;
break;
@@ -1724,16 +1712,14 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
case ISD::ATOMIC_LOAD_XOR:
Op = XOR;
break;
- default:
- return 0;
- }
-
- bool isCN = false;
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
- if (CN && (int32_t)CN->getSExtValue() == CN->getSExtValue()) {
- isCN = true;
- Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT);
+ case ISD::ATOMIC_LOAD_ADD:
+ Op = ADD;
+ break;
}
+
+ Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+ bool isUnOp = !Val.getNode();
+ bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
unsigned Opc = 0;
switch (NVT.getSimpleVT().SimpleTy) {
@@ -1775,13 +1761,20 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
assert(Opc != 0 && "Invalid arith lock transform!");
- DebugLoc dl = Node->getDebugLoc();
+ SDValue Ret;
SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
dl, NVT), 0);
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
- SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+ if (isUnOp) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ } else {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ }
cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
SDValue RetVals[] = { Undef, Ret };
return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
@@ -2059,6 +2052,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case X86ISD::ATOMSUB64_DAG:
case X86ISD::ATOMNAND64_DAG:
case X86ISD::ATOMAND64_DAG:
+ case X86ISD::ATOMMAX64_DAG:
+ case X86ISD::ATOMMIN64_DAG:
+ case X86ISD::ATOMUMAX64_DAG:
+ case X86ISD::ATOMUMIN64_DAG:
case X86ISD::ATOMSWAP64_DAG: {
unsigned Opc;
switch (Opcode) {
@@ -2069,6 +2066,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
case X86ISD::ATOMSUB64_DAG: Opc = X86::ATOMSUB6432; break;
case X86ISD::ATOMNAND64_DAG: Opc = X86::ATOMNAND6432; break;
case X86ISD::ATOMAND64_DAG: Opc = X86::ATOMAND6432; break;
+ case X86ISD::ATOMMAX64_DAG: Opc = X86::ATOMMAX6432; break;
+ case X86ISD::ATOMMIN64_DAG: Opc = X86::ATOMMIN6432; break;
+ case X86ISD::ATOMUMAX64_DAG: Opc = X86::ATOMUMAX6432; break;
+ case X86ISD::ATOMUMIN64_DAG: Opc = X86::ATOMUMIN6432; break;
case X86ISD::ATOMSWAP64_DAG: Opc = X86::ATOMSWAP6432; break;
}
SDNode *RetVal = SelectAtomic64(Node, Opc);
@@ -2077,15 +2078,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
- case ISD::ATOMIC_LOAD_ADD: {
- SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
- if (RetVal)
- return RetVal;
- break;
- }
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_OR: {
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_ADD: {
SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
if (RetVal)
return RetVal;
@@ -2116,7 +2112,8 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Make sure that we don't change the operation by removing bits.
// This only matters for OR and XOR, AND is unaffected.
- if (Opcode != ISD::AND && ((Val >> ShlVal) << ShlVal) != Val)
+ uint64_t RemovedBitsMask = (1ULL << ShlVal) - 1;
+ if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
break;
unsigned ShlOp, Op;
@@ -2199,13 +2196,16 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N1 = Node->getOperand(1);
bool isSigned = Opcode == ISD::SMUL_LOHI;
+ bool hasBMI2 = Subtarget->hasBMI2();
if (!isSigned) {
switch (NVT.getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
case MVT::i8: Opc = X86::MUL8r; MOpc = X86::MUL8m; break;
case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
- case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
- case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+ case MVT::i32: Opc = hasBMI2 ? X86::MULX32rr : X86::MUL32r;
+ MOpc = hasBMI2 ? X86::MULX32rm : X86::MUL32m; break;
+ case MVT::i64: Opc = hasBMI2 ? X86::MULX64rr : X86::MUL64r;
+ MOpc = hasBMI2 ? X86::MULX64rm : X86::MUL64m; break;
}
} else {
switch (NVT.getSimpleVT().SimpleTy) {
@@ -2217,13 +2217,31 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
}
- unsigned LoReg, HiReg;
- switch (NVT.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: LoReg = X86::AL; HiReg = X86::AH; break;
- case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; break;
- case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
- case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+ unsigned SrcReg, LoReg, HiReg;
+ switch (Opc) {
+ default: llvm_unreachable("Unknown MUL opcode!");
+ case X86::IMUL8r:
+ case X86::MUL8r:
+ SrcReg = LoReg = X86::AL; HiReg = X86::AH;
+ break;
+ case X86::IMUL16r:
+ case X86::MUL16r:
+ SrcReg = LoReg = X86::AX; HiReg = X86::DX;
+ break;
+ case X86::IMUL32r:
+ case X86::MUL32r:
+ SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+ break;
+ case X86::IMUL64r:
+ case X86::MUL64r:
+ SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+ break;
+ case X86::MULX32rr:
+ SrcReg = X86::EDX; LoReg = HiReg = 0;
+ break;
+ case X86::MULX64rr:
+ SrcReg = X86::RDX; LoReg = HiReg = 0;
+ break;
}
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
@@ -2235,22 +2253,47 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
if (foldedLoad) {
+ SDValue Chain;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
- SDNode *CNode =
- CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops,
- array_lengthof(Ops));
- InFlag = SDValue(CNode, 1);
+ if (MOpc == X86::MULX32rm || MOpc == X86::MULX64rm) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
+ array_lengthof(Ops));
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ InFlag = SDValue(CNode, 3);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops,
+ array_lengthof(Ops));
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
// Update the chain.
- ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ ReplaceUses(N1.getValue(1), Chain);
} else {
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag);
- InFlag = SDValue(CNode, 0);
+ SDValue Ops[] = { N1, InFlag };
+ if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
+ array_lengthof(Ops));
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ InFlag = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops,
+ array_lengthof(Ops));
+ InFlag = SDValue(CNode, 0);
+ }
}
// Prevent use of AH in a REX instruction by referencing AX instead.
@@ -2275,19 +2318,25 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- LoReg, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ if (ResLo.getNode() == 0) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
+ InFlag);
+ InFlag = ResLo.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 0), ResLo);
+ DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- HiReg, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
+ if (ResHi.getNode() == 0) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
+ InFlag);
+ InFlag = ResHi.getValue(2);
+ }
+ ReplaceUses(SDValue(Node, 1), ResHi);
+ DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
}
return NULL;
@@ -2488,7 +2537,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
MVT::i8, Reg);
// Emit a testb.
- return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm);
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
+ // only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
}
// For example, "testl %eax, $2048" to "testb %ah, $8".
@@ -2519,8 +2574,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
// Emit a testb. The EXTRACT_SUBREG becomes a COPY that can only
// target GR8_NOREX registers, so make sure the register class is
// forced.
- return CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl, MVT::i32,
- Subreg, ShiftedImm);
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST8ri_NOREX, dl,
+ MVT::i32, Subreg, ShiftedImm);
+ // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
+ // only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
}
// For example, "testl %eax, $32776" to "testw %ax, $32776".
@@ -2536,7 +2596,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
MVT::i16, Reg);
// Emit a testw.
- return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm);
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
+ // only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
}
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
@@ -2552,7 +2618,13 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
MVT::i32, Reg);
// Emit a testl.
- return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm);
+ SDNode *NewNode = CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32,
+ Subreg, Imm);
+ // Replace SUB|CMP with TEST; since SUB has two outputs while TEST has
+ // only one, do not call ReplaceAllUsesWith.
+ ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
+ SDValue(NewNode, 0));
+ return NULL;
}
}
break;
@@ -2607,85 +2679,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return Result;
}
-
- // FIXME: Custom handling because TableGen doesn't support multiple implicit
- // defs in an instruction pattern
- case X86ISD::PCMPESTRI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
- SDValue N3 = Node->getOperand(3);
- SDValue N4 = Node->getOperand(4);
-
- // Make sure last argument is a constant
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N4);
- if (!Cst)
- break;
-
- uint64_t Imm = Cst->getZExtValue();
-
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
- X86::EAX, N1, SDValue()).getValue(1);
- InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
- N3, InFlag).getValue(1);
-
- SDValue Ops[] = { N0, N2, getI8Imm(Imm), InFlag };
- unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr :
- X86::PCMPESTRIrr;
- InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
- array_lengthof(Ops)), 0);
-
- if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::ECX, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- }
- if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::EFLAGS, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- }
-
- return NULL;
- }
-
- // FIXME: Custom handling because TableGen doesn't support multiple implicit
- // defs in an instruction pattern
- case X86ISD::PCMPISTRI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
-
- // Make sure last argument is a constant
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N2);
- if (!Cst)
- break;
-
- uint64_t Imm = Cst->getZExtValue();
-
- SDValue Ops[] = { N0, N1, getI8Imm(Imm) };
- unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr :
- X86::PCMPISTRIrr;
- SDValue InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
- array_lengthof(Ops)), 0);
-
- if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::ECX, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- }
- if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::EFLAGS, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- }
-
- return NULL;
- }
}
SDNode *ResNode = SelectCode(Node);
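With ATOMIC_LOAD_ADD now routed through the shared SelectAtomicLoadArith() path, the constant handling boils down to the classification sketched below. This is a distilled stand-alone model, not LLVM code, and it omits the 32-bit-immediate check and the (sub 0, x) rewrite the real helper also performs.

#include <cstdint>
#include <cstdio>

enum AtomicOp { ADD, SUB, INC, DEC };

// Classify an atomic fetch-add immediate the way the backend now does:
// +1/-1 become LOCK INC/DEC, other negative values become LOCK SUB of the
// magnitude, and everything else stays a LOCK ADD of the immediate.
static AtomicOp classifyAtomicAdd(int64_t Imm, int64_t &OutImm) {
  if (Imm == 1) return INC;
  if (Imm == -1) return DEC;
  if (Imm < 0) { OutImm = -Imm; return SUB; }
  OutImm = Imm;
  return ADD;
}

int main() {
  int64_t Out = 0;
  printf("%d\n", classifyAtomicAdd(-1, Out)); // DEC (3)
  printf("%d\n", classifyAtomicAdd(-5, Out)); // SUB (1), Out == 5
  return 0;
}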
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c77355f91796..b35fb514bf94 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -85,7 +85,7 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
* ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
@@ -118,7 +118,7 @@ static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/128)
* ElemsPerChunk);
- SDValue VecIdx = DAG.getConstant(NormalizedIdxVal, MVT::i32);
+ SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
VecIdx);
}
@@ -158,10 +158,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
Subtarget = &TM.getSubtarget<X86Subtarget>();
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
- X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
RegInfo = TM.getRegisterInfo();
- TD = getTargetData();
+ TD = getDataLayout();
// Set up the TargetLowering object.
static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
@@ -180,7 +179,11 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setSchedulingPreference(Sched::ILP);
else
setSchedulingPreference(Sched::RegPressure);
- setStackPointerRegisterToSaveRestore(X86StackPtr);
+ setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
+
+ // Bypass i32 with i8 on Atom when compiling with O2
+ if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+ addBypassSlowDiv(32, 8);
if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
// Setup Windows compiler runtime calls.
@@ -453,6 +456,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SETCC , MVT::i64 , Custom);
}
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
+ // NOTE: the EH_SJLJ_SETJMP/_LONGJMP support added here is NOT intended to
+ // implement SjLj exception handling; it is a light-weight setjmp/longjmp
+ // replacement used for continuations, user-level threading, etc. As a
+ // result, no other SjLj exception interfaces are implemented, so please do
+ // not build your own exception handling on top of them.
+ // LLVM/Clang supports zero-cost DWARF exception handling.
+ setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
+ setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
// Darwin ABI issue.
setOperationAction(ISD::ConstantPool , MVT::i32 , Custom);
@@ -510,6 +521,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i64, Custom);
+ setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i64, Custom);
}
if (Subtarget->hasCmpxchg16b()) {
@@ -541,6 +556,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// VASTART needs to be custom lowered to use the VarArgsFrameIndex
setOperationAction(ISD::VASTART , MVT::Other, Custom);
@@ -643,7 +659,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f32 , Expand);
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f32 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
@@ -735,6 +753,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FFLOOR, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
@@ -824,6 +843,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f32, Custom);
setOperationAction(ISD::LOAD, MVT::v4f32, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
@@ -857,6 +877,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v2f64, Custom);
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
@@ -925,6 +946,18 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
+ // As there is no 64-bit GPR available, we need to build a special custom
+ // sequence to convert from v2i32 to v2f32.
+ if (!Subtarget->is64Bit())
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal);
}
if (Subtarget->hasSSE41()) {
@@ -939,6 +972,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
+
// FIXME: Do we need to handle scalar-to-vector here?
setOperationAction(ISD::MUL, MVT::v4i32, Legal);
@@ -1016,19 +1052,33 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FMUL, MVT::v8f32, Legal);
setOperationAction(ISD::FDIV, MVT::v8f32, Legal);
setOperationAction(ISD::FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal);
setOperationAction(ISD::FNEG, MVT::v8f32, Custom);
+ setOperationAction(ISD::FABS, MVT::v8f32, Custom);
setOperationAction(ISD::FADD, MVT::v4f64, Legal);
setOperationAction(ISD::FSUB, MVT::v4f64, Legal);
setOperationAction(ISD::FMUL, MVT::v4f64, Legal);
setOperationAction(ISD::FDIV, MVT::v4f64, Legal);
setOperationAction(ISD::FSQRT, MVT::v4f64, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal);
setOperationAction(ISD::FNEG, MVT::v4f64, Custom);
+ setOperationAction(ISD::FABS, MVT::v4f64, Custom);
+
+ setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
+
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal);
+
setOperationAction(ISD::SRL, MVT::v16i16, Custom);
setOperationAction(ISD::SRL, MVT::v32i8, Custom);
@@ -1052,7 +1102,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
- if (Subtarget->hasFMA()) {
+ if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Custom);
setOperationAction(ISD::FMA, MVT::v4f64, Custom);
setOperationAction(ISD::FMA, MVT::v4f32, Custom);
@@ -1217,10 +1267,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::TRUNCATE);
- setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
- setTargetDAGCombine(ISD::FP_TO_SINT);
if (Subtarget->is64Bit())
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1318,7 +1366,7 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
// cases like PR2962. This should be removed when PR2962 is fixed.
const Function *F = MF.getFunction();
if (IsZeroVal &&
- !F->hasFnAttr(Attribute::NoImplicitFloat)) {
+ !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
@@ -1986,7 +2034,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
TotalNumIntRegs);
- bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = Fn->getFnAttributes().
+ hasAttribute(Attributes::NoImplicitFloat);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
@@ -2134,16 +2183,14 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
/// optimization is performed and it is required (FPDiff!=0).
static SDValue
EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
- SDValue Chain, SDValue RetAddrFrIdx,
- bool Is64Bit, int FPDiff, DebugLoc dl) {
+ SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
+ unsigned SlotSize, int FPDiff, DebugLoc dl) {
// Store the return address to the appropriate stack slot.
if (!FPDiff) return Chain;
// Calculate the new stack slot for the return address.
- int SlotSize = Is64Bit ? 8 : 4;
int NewReturnAddrFI =
MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false);
- EVT VT = Is64Bit ? MVT::i64 : MVT::i32;
- SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(NewReturnAddrFI),
false, false, 0);
@@ -2178,7 +2225,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
- MF.getFunction()->hasStructRetAttr(),
+ MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
@@ -2218,14 +2265,15 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
int FPDiff = 0;
if (isTailCall && !IsSibcall) {
// Lower arguments at fp - stackoffset + fpdiff.
- unsigned NumBytesCallerPushed =
- MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
+ unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
+
FPDiff = NumBytesCallerPushed - NumBytes;
// Set the delta of movement of the returnaddr stackslot.
// But only set if delta is greater than previous delta.
- if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
- MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+ if (FPDiff < X86Info->getTCReturnAddrDelta())
+ X86Info->setTCReturnAddrDelta(FPDiff);
}
if (!IsSibcall)
@@ -2302,7 +2350,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (!IsSibcall && (!isTailCall || isByVal)) {
assert(VA.isMemLoc());
if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+ StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ getPointerTy());
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
dl, DAG, VA, Flags));
}
@@ -2390,7 +2439,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Copy relative to framepointer.
SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
if (StackPtr.getNode() == 0)
- StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+ StackPtr = DAG.getCopyFromReg(Chain, dl,
+ RegInfo->getStackRegister(),
getPointerTy());
Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
@@ -2412,7 +2462,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
&MemOpChains2[0], MemOpChains2.size());
// Store the return address to the appropriate stack slot.
- Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+ Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
+ getPointerTy(), RegInfo->getSlotSize(),
FPDiff, dl);
}
@@ -2462,7 +2513,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
OpFlags = X86II::MO_DARWIN_STUB;
} else if (Subtarget->isPICStyleRIPRel() &&
isa<Function>(GV) &&
- cast<Function>(GV)->hasFnAttr(Attribute::NonLazyBind)) {
+ cast<Function>(GV)->getFnAttributes().
+ hasAttribute(Attributes::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
@@ -2623,7 +2675,7 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
unsigned StackAlignment = TFI.getStackAlignment();
uint64_t AlignMask = StackAlignment - 1;
int64_t Offset = StackSize;
- uint64_t SlotSize = TD->getPointerSize();
+ unsigned SlotSize = RegInfo->getSlotSize();
if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
// Number smaller than 12 so just add the difference.
Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
@@ -2698,6 +2750,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
bool isVarArg,
bool isCalleeStructRet,
bool isCallerStructRet,
+ Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -2709,6 +2762,13 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// If -tailcallopt is specified, make fastcc functions tail-callable.
const MachineFunction &MF = DAG.getMachineFunction();
const Function *CallerF = DAG.getMachineFunction().getFunction();
+
+ // If the function return type is x86_fp80 and the callee return type is not,
+ // then the FP_EXTEND of the call result is not a nop. It's not safe to
+ // perform a tailcall optimization here.
+ if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ return false;
+
CallingConv::ID CallerCC = CallerF->getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
@@ -2832,7 +2892,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
MachineFrameInfo *MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const X86InstrInfo *TII =
- ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+ ((const X86TargetMachine&)getTargetMachine()).getInstrInfo();
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
SDValue Arg = OutVals[i];
@@ -2983,7 +3043,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
if (ReturnAddrIndex == 0) {
// Set up a frame object for the return address.
- uint64_t SlotSize = TD->getPointerSize();
+ unsigned SlotSize = RegInfo->getSlotSize();
ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
false);
FuncInfo->setRAIndex(ReturnAddrIndex);
@@ -3506,25 +3566,26 @@ SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
MatchOddMask = false;
}
- static const int CompactionMaskEven[] = {0, 2, -1, -1, 4, 6, -1, -1};
- static const int CompactionMaskOdd [] = {1, 3, -1, -1, 5, 7, -1, -1};
- const int *CompactionMask;
- if (MatchEvenMask)
- CompactionMask = CompactionMaskEven;
- else if (MatchOddMask)
- CompactionMask = CompactionMaskOdd;
- else
+ if (!MatchEvenMask && !MatchOddMask)
return SDValue();
SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
- SDValue Op0 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(0),
- UndefNode, CompactionMask);
- SDValue Op1 = DAG.getVectorShuffle(VT, dl, SVOp->getOperand(1),
- UndefNode, CompactionMask);
- static const int UnpackMask[] = {0, 8, 1, 9, 4, 12, 5, 13};
- return DAG.getVectorShuffle(VT, dl, Op0, Op1, UnpackMask);
+ SDValue Op0 = SVOp->getOperand(0);
+ SDValue Op1 = SVOp->getOperand(1);
+
+ if (MatchEvenMask) {
+ // Shift the second operand right by 32 bits.
+ static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
+ Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
+ } else {
+ // Shift the first operand left by 32 bits.
+ static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
+ Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
+ }
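+ // Blend the two shuffled operands: even result lanes come from Op0 and odd
+ // lanes from Op1, producing the compacted vector.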
+ static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
+ return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
}
/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
@@ -4575,7 +4636,6 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
MVT ShufVT = V.getValueType().getSimpleVT();
unsigned NumElems = ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
- SDValue ImmN;
bool IsUnary;
if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
@@ -4977,6 +5037,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
LDBase->getAlignment(),
false/*isVolatile*/, true/*ReadMem*/,
false/*WriteMem*/);
+
+ // Make sure the newly-created LOAD is in the same position as LDBase in
+ // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
+ // update uses of LDBase's output chain to use the TokenFactor.
+ if (LDBase->hasAnyUseOfValue(1)) {
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
+ DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
+ SDValue(ResNode.getNode(), 1));
+ }
+
return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
}
return SDValue();
@@ -4990,7 +5062,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
SDValue
-X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
+X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasAVX())
return SDValue();
@@ -5114,80 +5186,78 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
return SDValue();
}
-// LowerVectorFpExtend - Recognize the scalarized FP_EXTEND from v2f32 to v2f64
-// and convert it into X86ISD::VFPEXT due to the current ISD::FP_EXTEND has the
-// constraint of matching input/output vector elements.
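+// buildFromShuffleMostly - Try to lower a BUILD_VECTOR whose elements are
+// mostly EXTRACT_VECTOR_ELTs from one or two source vectors as a vector
+// shuffle followed by a few INSERT_VECTOR_ELTs for the remaining elements.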
SDValue
-X86TargetLowering::LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const {
- DebugLoc DL = Op.getDebugLoc();
- SDNode *N = Op.getNode();
+X86TargetLowering::buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- unsigned NumElts = Op.getNumOperands();
- // Check supported types and sub-targets.
- //
- // Only v2f32 -> v2f64 needs special handling.
- if (VT != MVT::v2f64 || !Subtarget->hasSSE2())
+ // Skip if insert_vec_elt is not supported.
+ if (!isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
return SDValue();
- SDValue VecIn;
- EVT VecInVT;
- SmallVector<int, 8> Mask;
- EVT SrcVT = MVT::Other;
+ DebugLoc DL = Op.getDebugLoc();
+ unsigned NumElems = Op.getNumOperands();
+
+ SDValue VecIn1;
+ SDValue VecIn2;
+ SmallVector<unsigned, 4> InsertIndices;
+ SmallVector<int, 8> Mask(NumElems, -1);
- // Check the patterns could be translated into X86vfpext.
- for (unsigned i = 0; i < NumElts; ++i) {
- SDValue In = N->getOperand(i);
- unsigned Opcode = In.getOpcode();
+ for (unsigned i = 0; i != NumElems; ++i) {
+ unsigned Opc = Op.getOperand(i).getOpcode();
- // Skip if the element is undefined.
- if (Opcode == ISD::UNDEF) {
- Mask.push_back(-1);
+ if (Opc == ISD::UNDEF)
continue;
- }
- // Quit if one of the elements is not defined from 'fpext'.
- if (Opcode != ISD::FP_EXTEND)
- return SDValue();
+ if (Opc != ISD::EXTRACT_VECTOR_ELT) {
+ // Quit if more than 1 element needs inserting.
+ if (InsertIndices.size() > 1)
+ return SDValue();
+
+ InsertIndices.push_back(i);
+ continue;
+ }
- // Check how the source of 'fpext' is defined.
- SDValue L2In = In.getOperand(0);
- EVT L2InVT = L2In.getValueType();
+ SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
+ SDValue ExtIdx = Op.getOperand(i).getOperand(1);
- // Check the original type
- if (SrcVT == MVT::Other)
- SrcVT = L2InVT;
- else if (SrcVT != L2InVT) // Quit if non-homogenous typed.
+ // Quit if extracted from vector of different type.
+ if (ExtractedFromVec.getValueType() != VT)
return SDValue();
- // Check whether the value being 'fpext'ed is extracted from the same
- // source.
- Opcode = L2In.getOpcode();
-
- // Quit if it's not extracted with a constant index.
- if (Opcode != ISD::EXTRACT_VECTOR_ELT ||
- !isa<ConstantSDNode>(L2In.getOperand(1)))
+ // Quit if non-constant index.
+ if (!isa<ConstantSDNode>(ExtIdx))
return SDValue();
- SDValue ExtractedFromVec = L2In.getOperand(0);
+ if (VecIn1.getNode() == 0)
+ VecIn1 = ExtractedFromVec;
+ else if (VecIn1 != ExtractedFromVec) {
+ if (VecIn2.getNode() == 0)
+ VecIn2 = ExtractedFromVec;
+ else if (VecIn2 != ExtractedFromVec)
+ // Quit if more than 2 vectors to shuffle
+ return SDValue();
+ }
- if (VecIn.getNode() == 0) {
- VecIn = ExtractedFromVec;
- VecInVT = ExtractedFromVec.getValueType();
- } else if (VecIn != ExtractedFromVec) // Quit if built from more than 1 vec.
- return SDValue();
+ unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
- Mask.push_back(cast<ConstantSDNode>(L2In.getOperand(1))->getZExtValue());
+ if (ExtractedFromVec == VecIn1)
+ Mask[i] = Idx;
+ else if (ExtractedFromVec == VecIn2)
+ Mask[i] = Idx + NumElems;
}
- // Fill the remaining mask as undef.
- for (unsigned i = NumElts; i < VecInVT.getVectorNumElements(); ++i)
- Mask.push_back(-1);
+ if (VecIn1.getNode() == 0)
+ return SDValue();
+
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+ SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
+ for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
+ unsigned Idx = InsertIndices[i];
+ NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
+ DAG.getIntPtrConstant(Idx));
+ }
- return DAG.getNode(X86ISD::VFPEXT, DL, VT,
- DAG.getVectorShuffle(VecInVT, DL,
- VecIn, DAG.getUNDEF(VecInVT),
- &Mask[0]));
+ return NV;
}
SDValue
@@ -5222,10 +5292,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (Broadcast.getNode())
return Broadcast;
- SDValue FpExt = LowerVectorFpExtend(Op, DAG);
- if (FpExt.getNode())
- return FpExt;
-
unsigned EVTBits = ExtVT.getSizeInBits();
unsigned NumZero = 0;
@@ -5470,6 +5536,11 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (LD.getNode())
return LD;
+ // Check for a build vector from mostly shuffle plus few inserting.
+ SDValue Sh = buildFromShuffleMostly(Op, DAG);
+ if (Sh.getNode())
+ return Sh;
+
// For SSE 4.1, use insertps to put the high elements into the low element.
if (getSubtarget()->hasSSE41()) {
SDValue Result;
@@ -5536,8 +5607,7 @@ static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
-SDValue
-X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
assert(Op.getNumOperands() == 2);
// 256-bit AVX can use the vinsertf128 instruction to create 256-bit vectors
@@ -5546,9 +5616,9 @@ X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
}
// Try to lower a shuffle node into a simple blend instruction.
-static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
- const X86Subtarget *Subtarget,
- SelectionDAG &DAG) {
+static SDValue
+LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget *Subtarget, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
@@ -5618,9 +5688,9 @@ static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
// 2. [ssse3] 1 x pshufb
// 3. [ssse3] 2 x pshufb + 1 x por
// 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
-SDValue
-X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue
+LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
@@ -5877,8 +5947,6 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
- bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
-
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
@@ -5902,7 +5970,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
DAG.getNode(ISD::BUILD_VECTOR, dl,
MVT::v16i8, &pshufbMask[0], 16));
- if (V2IsUndef)
+
+ // As PSHUFB will zero elements with negative indices, it's safe to ignore
+ // the 2nd operand if it's undefined or zero.
+ if (V2.getOpcode() == ISD::UNDEF ||
+ ISD::isBuildVectorAllZeros(V2.getNode()))
return V1;
// Calculate the shuffle mask for the second input, shuffle it, and
@@ -5988,6 +6060,51 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV);
}
+// v32i8 shuffles - Translate to VPSHUFB if possible.
+static
+SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
+ const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
+ EVT VT = SVOp->getValueType(0);
+ SDValue V1 = SVOp->getOperand(0);
+ SDValue V2 = SVOp->getOperand(1);
+ DebugLoc dl = SVOp->getDebugLoc();
+ SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());
+
+ bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+ bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ // VPSHUFB may be generated if
+ // (1) one of the input vectors is undefined or a zeroinitializer.
+ // The mask value 0x80 puts 0 in the corresponding slot of the vector.
+ // And (2) the mask indices don't cross the 128-bit lane boundary.
+ if (VT != MVT::v32i8 || !Subtarget->hasAVX2() ||
+ (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
+ return SDValue();
+
+ if (V1IsAllZero && !V2IsAllZero) {
+ CommuteVectorShuffleMask(MaskVals, 32);
+ V1 = V2;
+ }
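+ // Build the PSHUFB control mask: 0x80 zeroes the destination byte, and any
+ // other index is reduced modulo 16 within its 128-bit lane.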
+ SmallVector<SDValue, 32> pshufbMask;
+ for (unsigned i = 0; i != 32; i++) {
+ int EltIdx = MaskVals[i];
+ if (EltIdx < 0 || EltIdx >= 32)
+ EltIdx = 0x80;
+ else {
+ if ((EltIdx >= 16 && i < 16) || (EltIdx < 16 && i >= 16))
+ // Cross lane is not allowed.
+ return SDValue();
+ EltIdx &= 0xf;
+ }
+ pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+ }
+ return DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, V1,
+ DAG.getNode(ISD::BUILD_VECTOR, dl,
+ MVT::v32i8, &pshufbMask[0], 32));
+}
+
/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
@@ -6322,17 +6439,17 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
}
static bool MayFoldVectorLoad(SDValue V) {
- if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
+ while (V.hasOneUse() && V.getOpcode() == ISD::BITCAST)
V = V.getOperand(0);
+
if (V.hasOneUse() && V.getOpcode() == ISD::SCALAR_TO_VECTOR)
V = V.getOperand(0);
if (V.hasOneUse() && V.getOpcode() == ISD::BUILD_VECTOR &&
V.getNumOperands() == 2 && V.getOperand(1).getOpcode() == ISD::UNDEF)
// BUILD_VECTOR (load), undef
V = V.getOperand(0);
- if (MayFoldLoad(V))
- return true;
- return false;
+
+ return MayFoldLoad(V);
}
// FIXME: the version above should always be used. Since there's
@@ -6455,6 +6572,81 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
getShuffleSHUFImmediate(SVOp), DAG);
}
+// Reduce a vector shuffle to zext.
+SDValue
+X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+ // PMOVZX is only available from SSE41.
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+
+ // Only AVX2 supports 256-bit vector integer extension.
+ if (!Subtarget->hasAVX2() && VT.is256BitVector())
+ return SDValue();
+
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+ DebugLoc DL = Op.getDebugLoc();
+ SDValue V1 = Op.getOperand(0);
+ SDValue V2 = Op.getOperand(1);
+ unsigned NumElems = VT.getVectorNumElements();
+
+ // Extending is a unary operation and the element type of the source vector
+ // won't be equal to or larger than i64.
+ if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() ||
+ VT.getVectorElementType() == MVT::i64)
+ return SDValue();
+
+ // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4.
+ unsigned Shift = 1; // Start from 2, i.e. 1 << 1.
+ while ((1U << Shift) < NumElems) {
+ if (SVOp->getMaskElt(1U << Shift) == 1)
+ break;
+ Shift += 1;
+ // The maximal ratio is 8, i.e. from i8 to i64.
+ if (Shift > 3)
+ return SDValue();
+ }
+
+ // Check the shuffle mask.
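+ // A matching mask places source element i at position (i << Shift) and
+ // leaves the padding positions in between undef, e.g. <0,-1,1,-1,2,-1,3,-1>
+ // for a ratio of 2.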
+ unsigned Mask = (1U << Shift) - 1;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ int EltIdx = SVOp->getMaskElt(i);
+ if ((i & Mask) != 0 && EltIdx != -1)
+ return SDValue();
+ if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift))
+ return SDValue();
+ }
+
+ unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
+ EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
+
+ if (!isTypeLegal(NVT))
+ return SDValue();
+
+ // Simplify the operand as it is prepared to be fed into the shuffle.
+ unsigned SignificantBits = NVT.getSizeInBits() >> Shift;
+ if (V1.getOpcode() == ISD::BITCAST &&
+ V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ V1.getOperand(0)
+ .getOperand(0).getValueType().getSizeInBits() == SignificantBits) {
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
+ SDValue V = V1.getOperand(0).getOperand(0).getOperand(0);
+ ConstantSDNode *CIdx =
+ dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1));
+ // If it's foldable, i.e. a normal load with a single use, we will let code
+ // selection fold it. Otherwise, we will shorten the conversion sequence.
+ if (CIdx && CIdx->getZExtValue() == 0 &&
+ (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse()))
+ V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+ }
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
+}
+
SDValue
X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
@@ -6485,6 +6677,11 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
return PromoteSplat(SVOp, DAG);
}
+ // Check integer expanding shuffles.
+ SDValue NewOp = lowerVectorIntExtend(Op, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+
// If the shuffle can be profitably rewritten as a narrower shuffle, then
// do it!
if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
@@ -6534,7 +6731,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool HasAVX = Subtarget->hasAVX();
bool HasAVX2 = Subtarget->hasAVX2();
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize);
+ bool OptForSize = MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize);
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
@@ -6803,7 +7001,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// Handle v8i16 specifically since SSE can do byte extraction and insertion.
if (VT == MVT::v8i16) {
- SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, DAG);
+ SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(Op, Subtarget, DAG);
if (NewOp.getNode())
return NewOp;
}
@@ -6814,6 +7012,12 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return NewOp;
}
+ if (VT == MVT::v32i8) {
+ SDValue NewOp = LowerVECTOR_SHUFFLEv32i8(SVOp, Subtarget, DAG);
+ if (NewOp.getNode())
+ return NewOp;
+ }
+
// Handle all 128-bit wide vectors with 4 elements, and match them with
// several different shuffle types.
if (NumElems == 4 && VT.is128BitVector())
@@ -6837,9 +7041,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -6854,9 +7058,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
Op.getOperand(0)),
Op.getOperand(1)));
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -6940,9 +7144,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Transform it so it match pextrw which produces a 32-bit result.
EVT EltVT = MVT::i32;
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -7085,8 +7289,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
EVT OpVT = Op.getValueType();
@@ -7118,8 +7321,8 @@ X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
// a simple subregister reference or explicit instructions to grab
// upper bits of a vector.
-SDValue
-X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
if (Subtarget->hasAVX()) {
DebugLoc dl = Op.getNode()->getDebugLoc();
SDValue Vec = Op.getNode()->getOperand(0);
@@ -7138,8 +7341,8 @@ X86TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
-SDValue
-X86TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
if (Subtarget->hasAVX()) {
DebugLoc dl = Op.getNode()->getDebugLoc();
SDValue Vec = Op.getNode()->getOperand(0);
@@ -7282,9 +7485,10 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget->ClassifyBlockAddressReference();
CodeModel::Model M = getTargetMachine().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
DebugLoc dl = Op.getDebugLoc();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
- /*isTarget=*/true, OpFlags);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+ OpFlags);
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -7393,8 +7597,8 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
SDValue InFlag;
DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), PtrVT), InFlag);
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -7895,11 +8099,29 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
return Sub;
}
+SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue N0 = Op.getOperand(0);
+ EVT SVT = N0.getValueType();
+ DebugLoc dl = Op.getDebugLoc();
+
+ assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 ||
+ SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
+ "Custom UINT_TO_FP is not supported!");
+
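+ // Zero extend the operand to a vector of i32 first; the extended values are
+ // non-negative, so SINT_TO_FP yields the same result as UINT_TO_FP.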
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements());
+ return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
+ DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
+}
+
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
SDValue N0 = Op.getOperand(0);
DebugLoc dl = Op.getDebugLoc();
+ if (Op.getValueType().isVector())
+ return lowerUINT_TO_FP_vec(Op, DAG);
+
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
@@ -8073,10 +8295,66 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co
}
}
+SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ SDValue In = Op.getOperand(0);
+ EVT SVT = In.getValueType();
+
+ if (!VT.is256BitVector() || !SVT.is128BitVector() ||
+ VT.getVectorNumElements() != SVT.getVectorNumElements())
+ return SDValue();
+
+ assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+
+ // AVX2 has better support of integer extending.
+ if (Subtarget->hasAVX2())
+ return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+
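+ // Without AVX2, zero extend the low and high halves of the v8i16 input
+ // separately and concatenate the two v4i32 results.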
+ SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
+ static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
+ SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
+ DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0]));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
+}
+
+SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ EVT SVT = Op.getOperand(0).getValueType();
+
+ if (!VT.is128BitVector() || !SVT.is256BitVector() ||
+ VT.getVectorNumElements() != SVT.getVectorNumElements())
+ return SDValue();
+
+ assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!");
+
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ NumElems * 2);
+
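+ // Bitcast to a vector with twice as many narrow elements, shuffle the low
+ // half of each wide element to the front, then extract the low 128-bit
+ // subvector.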
+ SDValue In = Op.getOperand(0);
+ SmallVector<int, 16> MaskVec(NumElems * 2, -1);
+ // Prepare truncation shuffle mask
+ for (unsigned i = 0; i != NumElems; ++i)
+ MaskVec[i] = i * 2;
+ SDValue V = DAG.getVectorShuffle(NVT, DL,
+ DAG.getNode(ISD::BITCAST, DL, NVT, In),
+ DAG.getUNDEF(NVT), &MaskVec[0]);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
+ DAG.getIntPtrConstant(0));
+}
+
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getValueType().isVector())
+ if (Op.getValueType().isVector()) {
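+ // Lower a v8i16 result by converting to v8i32 first and truncating.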
+ if (Op.getValueType() == MVT::v8i16)
+ return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(),
+ DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
+ MVT::v8i32, Op.getOperand(0)));
return SDValue();
+ }
std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
/*IsSigned=*/ true, /*IsReplace=*/ false);
@@ -8111,26 +8389,49 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
return FIST;
}
-SDValue X86TargetLowering::LowerFABS(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ EVT VT = Op.getValueType();
+ SDValue In = Op.getOperand(0);
+ EVT SVT = In.getValueType();
+
+ assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
+
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
+ In, DAG.getUNDEF(SVT)));
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
EVT EltVT = VT;
- if (VT.isVector())
+ unsigned NumElts = VT == MVT::f64 ? 2 : 4;
+ if (VT.isVector()) {
EltVT = VT.getVectorElementType();
- Constant *C;
- if (EltVT == MVT::f64) {
- C = ConstantVector::getSplat(2,
- ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
- } else {
- C = ConstantVector::getSplat(4,
- ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+ NumElts = VT.getVectorNumElements();
}
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ Constant *C;
+ if (EltVT == MVT::f64)
+ C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+ else
+ C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+ C = ConstantVector::getSplat(NumElts, C);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ false, false, false, Alignment);
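+ // For vector types, perform FABS as a bitwise AND with the sign-clearing
+ // mask, bitcasting through the matching integer vector type.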
+ if (VT.isVector()) {
+ MVT ANDVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getNode(ISD::AND, dl, ANDVT,
+ DAG.getNode(ISD::BITCAST, dl, ANDVT,
+ Op.getOperand(0)),
+ DAG.getNode(ISD::BITCAST, dl, ANDVT, Mask)));
+ }
return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
}
@@ -8150,10 +8451,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
else
C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
C = ConstantVector::getSplat(NumElts, C);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+ SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ false, false, false, Alignment);
if (VT.isVector()) {
MVT XORVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
return DAG.getNode(ISD::BITCAST, dl, VT,
@@ -8239,7 +8541,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
}
-SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
@@ -8250,6 +8552,98 @@ SDValue X86TargetLowering::LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ if (!Op->hasOneUse())
+ return SDValue();
+
+ SDNode *N = Op.getNode();
+ DebugLoc DL = N->getDebugLoc();
+
+ SmallVector<SDValue, 8> Opnds;
+ DenseMap<SDValue, unsigned> VecInMap;
+ EVT VT = MVT::Other;
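+ // VecInMap records, for each source vector, a bitmask of the element
+ // indices extracted from it so far; VT tracks their common type.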
+
+ // Recognize a special case where a vector is cast into a wide integer to
+ // test for all 0s.
+ Opnds.push_back(N->getOperand(0));
+ Opnds.push_back(N->getOperand(1));
+
+ for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+ SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+ // BFS traverse all OR'd operands.
+ if (I->getOpcode() == ISD::OR) {
+ Opnds.push_back(I->getOperand(0));
+ Opnds.push_back(I->getOperand(1));
+ // Re-evaluate the number of nodes to be traversed.
+ e += 2; // 2 more nodes (LHS and RHS) are pushed.
+ continue;
+ }
+
+ // Quit if this is not an EXTRACT_VECTOR_ELT.
+ if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Quit if the index is not a constant.
+ SDValue Idx = I->getOperand(1);
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ SDValue ExtractedFromVec = I->getOperand(0);
+ DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+ if (M == VecInMap.end()) {
+ VT = ExtractedFromVec.getValueType();
+ // Quit if not 128/256-bit vector.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+ // Quit if not the same type.
+ if (VecInMap.begin() != VecInMap.end() &&
+ VT != VecInMap.begin()->first.getValueType())
+ return SDValue();
+ M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+ }
+ M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ }
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Not extracted from 128-/256-bit vector.");
+
+ unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+ SmallVector<SDValue, 8> VecIns;
+
+ for (DenseMap<SDValue, unsigned>::const_iterator
+ I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+ // Quit if not all elements are used.
+ if (I->second != FullMask)
+ return SDValue();
+ VecIns.push_back(I->first);
+ }
+
+ EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+ // Cast all vectors into TestVT for PTEST.
+ for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+ VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+ // If more than one full vector is evaluated, OR them together before the PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is only
+ // 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ }
+
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+ VecIns.back(), VecIns.back());
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8283,7 +8677,33 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
unsigned Opcode = 0;
unsigned NumOperands = 0;
- switch (Op.getNode()->getOpcode()) {
+
+ // Truncate operations may prevent the merge of the SETCC instruction
+ // and the arithmetic instruction before it. Attempt to truncate the operands
+ // of the arithmetic instruction and use a reduced bit-width instruction.
+ bool NeedTruncation = false;
+ SDValue ArithOp = Op;
+ if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
+ SDValue Arith = Op->getOperand(0);
+ // Both the trunc and the arithmetic op need to have one user each.
+ if (Arith->hasOneUse())
+ switch (Arith.getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ NeedTruncation = true;
+ ArithOp = Arith;
+ }
+ }
+ }
+
+ // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
+ // which may be the result of a CAST. We use the variable 'Op', which is the
+ // non-cast value, when we check for possible users.
+ switch (ArithOp.getOpcode()) {
case ISD::ADD:
// Due to an isel shortcoming, be conservative if this add is likely to be
// selected as part of a load-modify-store instruction. When the root node
@@ -8303,7 +8723,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
goto default_case;
if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+ dyn_cast<ConstantSDNode>(ArithOp.getNode()->getOperand(1))) {
// An add of one will be selected as an INC.
if (C->getAPIntValue() == 1) {
Opcode = X86ISD::INC;
@@ -8339,7 +8759,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
if (User->getOpcode() != ISD::BRCOND &&
User->getOpcode() != ISD::SETCC &&
- (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+ !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
NonFlagUse = true;
break;
}
@@ -8360,14 +8780,20 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
goto default_case;
// Otherwise use a regular EFLAGS-setting instruction.
- switch (Op.getNode()->getOpcode()) {
+ switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
- case ISD::SUB:
- Opcode = X86ISD::SUB;
- break;
- case ISD::OR: Opcode = X86ISD::OR; break;
+ case ISD::SUB: Opcode = X86ISD::SUB; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: {
+ if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+ if (EFLAGS.getNode())
+ return EFLAGS;
+ }
+ Opcode = X86ISD::OR;
+ break;
+ }
}
NumOperands = 2;
@@ -8385,19 +8811,40 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
break;
}
+ // If we found that truncation is beneficial, perform the truncation and
+ // update 'Op'.
+ if (NeedTruncation) {
+ EVT VT = Op.getValueType();
+ SDValue WideVal = Op->getOperand(0);
+ EVT WideVT = WideVal.getValueType();
+ unsigned ConvertedOp = 0;
+ // Use a target machine opcode to prevent further DAGCombine
+ // optimizations that may separate the arithmetic operations
+ // from the setcc node.
+ switch (WideVal.getOpcode()) {
+ default: break;
+ case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
+ case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
+ case ISD::AND: ConvertedOp = X86ISD::AND; break;
+ case ISD::OR: ConvertedOp = X86ISD::OR; break;
+ case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
+ }
+
+ if (ConvertedOp) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
+ SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
+ SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
+ Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
+ }
+ }
+ }
+
if (Opcode == 0)
// Emit a CMP with 0, which is the TEST pattern.
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
DAG.getConstant(0, Op.getValueType()));
- if (Opcode == X86ISD::CMP) {
- SDValue New = DAG.getNode(Opcode, dl, MVT::i32, Op.getOperand(0),
- Op.getOperand(1));
- // We can't replace usage of SUB with CMP.
- // The SUB node will be removed later because there is no use of it.
- return SDValue(New.getNode(), 0);
- }
-
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SmallVector<SDValue, 4> Ops;
for (unsigned i = 0; i != NumOperands; ++i)
@@ -8956,6 +9403,21 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
}
+ // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
+ // widen the cmov and push the truncate through. This avoids introducing a new
+ // branch during isel and doesn't add any extensions.
+ if (Op.getValueType() == MVT::i8 &&
+ Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
+ SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
+ if (T1.getValueType() == T2.getValueType() &&
+ // Blacklist CopyFromReg to avoid partial register stalls.
+ T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
+ SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+ return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
+ }
+ }
+
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
@@ -9310,7 +9772,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag);
Flag = Chain.getValue(1);
- Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+ Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
+ SPTy).getValue(1);
SDValue Ops1[2] = { Chain.getValue(0), Chain };
return DAG.getMergeValues(Ops1, 2, dl);
@@ -9393,7 +9856,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
EVT ArgVT = Op.getNode()->getValueType(0);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
- uint32_t ArgSize = getTargetData()->getTypeAllocSize(ArgTy);
+ uint32_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
uint8_t ArgMode;
// Decide which area this value should be read from.
@@ -9413,7 +9876,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!getTargetMachine().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
- .getFunction()->hasFnAttr(Attribute::NoImplicitFloat)) &&
+ .getFunction()->getFnAttributes()
+ .hasAttribute(Attributes::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
@@ -9444,7 +9908,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
false, false, false, 0);
}
-SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
// X86-64 va_list is a struct { i32, i32, i8*, i8* }.
assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
SDValue Chain = Op.getOperand(0);
@@ -9505,8 +9970,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
-SDValue
-X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IntNo) {
@@ -9894,62 +10358,6 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
Op.getOperand(1), Op.getOperand(2), DAG);
}
- // Fix vector shift instructions where the last operand is a non-immediate
- // i32 value.
- case Intrinsic::x86_mmx_pslli_w:
- case Intrinsic::x86_mmx_pslli_d:
- case Intrinsic::x86_mmx_pslli_q:
- case Intrinsic::x86_mmx_psrli_w:
- case Intrinsic::x86_mmx_psrli_d:
- case Intrinsic::x86_mmx_psrli_q:
- case Intrinsic::x86_mmx_psrai_w:
- case Intrinsic::x86_mmx_psrai_d: {
- SDValue ShAmt = Op.getOperand(2);
- if (isa<ConstantSDNode>(ShAmt))
- return SDValue();
-
- unsigned NewIntNo;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_mmx_pslli_w:
- NewIntNo = Intrinsic::x86_mmx_psll_w;
- break;
- case Intrinsic::x86_mmx_pslli_d:
- NewIntNo = Intrinsic::x86_mmx_psll_d;
- break;
- case Intrinsic::x86_mmx_pslli_q:
- NewIntNo = Intrinsic::x86_mmx_psll_q;
- break;
- case Intrinsic::x86_mmx_psrli_w:
- NewIntNo = Intrinsic::x86_mmx_psrl_w;
- break;
- case Intrinsic::x86_mmx_psrli_d:
- NewIntNo = Intrinsic::x86_mmx_psrl_d;
- break;
- case Intrinsic::x86_mmx_psrli_q:
- NewIntNo = Intrinsic::x86_mmx_psrl_q;
- break;
- case Intrinsic::x86_mmx_psrai_w:
- NewIntNo = Intrinsic::x86_mmx_psra_w;
- break;
- case Intrinsic::x86_mmx_psrai_d:
- NewIntNo = Intrinsic::x86_mmx_psra_d;
- break;
- }
-
- // The vector shift intrinsics with scalars uses 32b shift amounts but
- // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
- // to be zero.
- ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, ShAmt,
- DAG.getConstant(0, MVT::i32));
-// FIXME this must be lowered to get rid of the invalid type.
-
- EVT VT = Op.getValueType();
- ShAmt = DAG.getNode(ISD::BITCAST, dl, VT, ShAmt);
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
- DAG.getConstant(NewIntNo, MVT::i32),
- Op.getOperand(1), ShAmt);
- }
case Intrinsic::x86_sse42_pcmpistria128:
case Intrinsic::x86_sse42_pcmpestria128:
case Intrinsic::x86_sse42_pcmpistric128:
@@ -10028,11 +10436,78 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const
SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
}
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256: {
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_fma_vfmadd_ps:
+ case Intrinsic::x86_fma_vfmadd_pd:
+ case Intrinsic::x86_fma_vfmadd_ps_256:
+ case Intrinsic::x86_fma_vfmadd_pd_256:
+ Opc = X86ISD::FMADD;
+ break;
+ case Intrinsic::x86_fma_vfmsub_ps:
+ case Intrinsic::x86_fma_vfmsub_pd:
+ case Intrinsic::x86_fma_vfmsub_ps_256:
+ case Intrinsic::x86_fma_vfmsub_pd_256:
+ Opc = X86ISD::FMSUB;
+ break;
+ case Intrinsic::x86_fma_vfnmadd_ps:
+ case Intrinsic::x86_fma_vfnmadd_pd:
+ case Intrinsic::x86_fma_vfnmadd_ps_256:
+ case Intrinsic::x86_fma_vfnmadd_pd_256:
+ Opc = X86ISD::FNMADD;
+ break;
+ case Intrinsic::x86_fma_vfnmsub_ps:
+ case Intrinsic::x86_fma_vfnmsub_pd:
+ case Intrinsic::x86_fma_vfnmsub_ps_256:
+ case Intrinsic::x86_fma_vfnmsub_pd_256:
+ Opc = X86ISD::FNMSUB;
+ break;
+ case Intrinsic::x86_fma_vfmaddsub_ps:
+ case Intrinsic::x86_fma_vfmaddsub_pd:
+ case Intrinsic::x86_fma_vfmaddsub_ps_256:
+ case Intrinsic::x86_fma_vfmaddsub_pd_256:
+ Opc = X86ISD::FMADDSUB;
+ break;
+ case Intrinsic::x86_fma_vfmsubadd_ps:
+ case Intrinsic::x86_fma_vfmsubadd_pd:
+ case Intrinsic::x86_fma_vfmsubadd_ps_256:
+ case Intrinsic::x86_fma_vfmsubadd_pd_256:
+ Opc = X86ISD::FMSUBADD;
+ break;
+ }
+
+ return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2), Op.getOperand(3));
+ }
}
}
-SDValue
-X86TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
switch (IntNo) {
@@ -10070,21 +10545,21 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
DebugLoc dl = Op.getDebugLoc();
+ EVT PtrVT = getPointerTy();
if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
SDValue Offset =
- DAG.getConstant(TD->getPointerSize(),
- Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
- DAG.getNode(ISD::ADD, dl, getPointerTy(),
+ DAG.getConstant(RegInfo->getSlotSize(), PtrVT);
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+ DAG.getNode(ISD::ADD, dl, PtrVT,
FrameAddr, Offset),
MachinePointerInfo(), false, false, false, 0);
}
// Just load the return address.
SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
- return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+ return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}
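// Editorial note, hedged: for Depth > 0 the code above relies on each saved
// return address sitting exactly one stack slot above the frame pointer it
// was pushed with, hence FrameAddr + SlotSize. Using the register-info slot
// size instead of the data-layout pointer size keeps that offset right when
// the two differ, e.g. (presumably) under the x32 ILP32 ABI, where
// getPointerTy() is i32 (4 bytes) but every slot on the x86-64 stack is
// still 8 bytes wide.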
@@ -10106,7 +10581,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
SelectionDAG &DAG) const {
- return DAG.getIntPtrConstant(2*TD->getPointerSize());
+ return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize());
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
@@ -10121,7 +10596,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame,
- DAG.getIntPtrConstant(TD->getPointerSize()));
+ DAG.getIntPtrConstant(RegInfo->getSlotSize()));
StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(),
false, false, 0);
@@ -10132,8 +10607,22 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
-SDValue X86TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
+ DAG.getVTList(MVT::i32, MVT::Other),
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
+ SelectionDAG &DAG) const {
+ DebugLoc DL = Op.getDebugLoc();
+ return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
+ Op.getOperand(0), Op.getOperand(1));
+}
+
+static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
return Op.getOperand(0);
}
@@ -10146,6 +10635,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
DebugLoc dl = Op.getDebugLoc();
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
if (Subtarget->is64Bit()) {
SDValue OutChains[6];
@@ -10154,8 +10644,8 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
- const unsigned char N86R10 = X86_MC::getX86RegNum(X86::R10);
- const unsigned char N86R11 = X86_MC::getX86RegNum(X86::R11);
+ const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
+ const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
@@ -10228,7 +10718,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
- if (Attrs.paramHasAttr(Idx, Attribute::InReg))
+ if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
@@ -10257,7 +10747,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
// This is storing the opcode for MOV32ri.
const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
- const unsigned char N86Reg = X86_MC::getX86RegNum(NestReg);
+ const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] = DAG.getStore(Root, dl,
DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
Trmp, MachinePointerInfo(TrmpAddr),
@@ -10356,7 +10846,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
-SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -10390,8 +10880,7 @@ SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
return Op;
}
-SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
EVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
@@ -10416,7 +10905,7 @@ SDValue X86TargetLowering::LowerCTLZ_ZERO_UNDEF(SDValue Op,
return Op;
}
-SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
EVT VT = Op.getValueType();
unsigned NumBits = VT.getSizeInBits();
DebugLoc dl = Op.getDebugLoc();
@@ -10465,21 +10954,22 @@ static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
-SDValue X86TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
-SDValue X86TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
assert(Op.getValueType().is256BitVector() &&
Op.getValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
return Lower256IntArith(Op, DAG);
}
-SDValue X86TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
@@ -10754,7 +11244,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
// Lower the "add/sub/mul with overflow" instruction into a regular ins plus
// a "setcc" instruction that checks the overflow flag. The "brcond" lowering
// looks for this combo and may remove the "setcc" instruction if the "setcc"
@@ -10869,7 +11359,7 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
}
// fall through
case MVT::v4i32:
@@ -10882,7 +11372,8 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
}
-SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
+static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
// Go ahead and emit the fence on x86-64 even if we asked for no-sse2.
@@ -10927,8 +11418,8 @@ SDValue X86TargetLowering::LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const{
return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
}
-SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
@@ -10966,7 +11457,8 @@ SDValue X86TargetLowering::LowerATOMIC_FENCE(SDValue Op,
}
-SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
EVT T = Op.getValueType();
DebugLoc DL = Op.getDebugLoc();
unsigned Reg = 0;
@@ -10997,8 +11489,8 @@ SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
return cpOut;
}
-SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
assert(Subtarget->is64Bit() && "Result not type legalized?");
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue TheChain = Op.getOperand(0);
@@ -11016,8 +11508,7 @@ SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
return DAG.getMergeValues(Ops, 2, dl);
}
-SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
EVT SrcVT = Op.getOperand(0).getValueType();
EVT DstVT = Op.getValueType();
assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
@@ -11037,7 +11528,7 @@ SDValue X86TargetLowering::LowerBITCAST(SDValue Op,
return SDValue();
}
-SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
SDNode *Node = Op.getNode();
DebugLoc dl = Node->getDebugLoc();
EVT T = Node->getValueType(0);
@@ -11110,9 +11601,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: llvm_unreachable("Should not custom lower this!");
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG);
- case ISD::MEMBARRIER: return LowerMEMBARRIER(Op,DAG);
- case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op,DAG);
- case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op,DAG);
+ case ISD::MEMBARRIER: return LowerMEMBARRIER(Op, Subtarget, DAG);
+ case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
+ case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG);
case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG);
case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
@@ -11120,8 +11611,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
- case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG);
+ case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
+ case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
@@ -11133,8 +11624,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
+ case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG);
+ case ISD::ZERO_EXTEND: return lowerZERO_EXTEND(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
+ case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG);
case ISD::FABS: return LowerFABS(Op, DAG);
case ISD::FNEG: return LowerFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -11145,7 +11639,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
- case ISD::VACOPY: return LowerVACOPY(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
@@ -11154,13 +11648,15 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
+ case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
+ case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ_ZERO_UNDEF(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
- case ISD::MUL: return LowerMUL(Op, DAG);
+ case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: return LowerShift(Op, DAG);
@@ -11170,7 +11666,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::USUBO:
case ISD::SMULO:
case ISD::UMULO: return LowerXALUO(Op, DAG);
- case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, DAG);
+ case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
case ISD::BITCAST: return LowerBITCAST(Op, DAG);
case ISD::ADDC:
case ISD::ADDE:
@@ -11263,6 +11759,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::UINT_TO_FP: {
+ if (N->getOperand(0).getValueType() != MVT::v2i32 &&
+ N->getValueType(0) != MVT::v2f32)
+ return;
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64,
+ N->getOperand(0));
+ SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+ MVT::f64);
+ SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
+ DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias));
+ Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or);
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ return;
+ }
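      // Editorial aside (not in the patch): this is the standard f64 bias
      // trick. 0x4330000000000000 is the bit pattern of 2^52, so OR-ing a
      // zero-extended u32 into the low mantissa bits and bitcasting to double
      // yields exactly 2^52 + x; subtracting the bias recovers x exactly, and
      // VFPROUND then narrows the v2f64 pair to f32 lanes. For example, x = 7
      // gives bits 0x4330000000000007 == 2^52 + 7.0, and
      // (2^52 + 7.0) - 2^52 == 7.0.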
+ case ISD::FP_ROUND: {
+ SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ Results.push_back(V);
+ return;
+ }
case ISD::READCYCLECOUNTER: {
SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue TheChain = N->getOperand(0);
@@ -11330,6 +11847,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_OR:
case ISD::ATOMIC_LOAD_SUB:
case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD_UMIN:
case ISD::ATOMIC_SWAP: {
unsigned Opc;
switch (N->getOpcode()) {
@@ -11352,6 +11873,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::ATOMIC_LOAD_XOR:
Opc = X86ISD::ATOMXOR64_DAG;
break;
+ case ISD::ATOMIC_LOAD_MAX:
+ Opc = X86ISD::ATOMMAX64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_MIN:
+ Opc = X86ISD::ATOMMIN64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_UMAX:
+ Opc = X86ISD::ATOMUMAX64_DAG;
+ break;
+ case ISD::ATOMIC_LOAD_UMIN:
+ Opc = X86ISD::ATOMUMIN64_DAG;
+ break;
case ISD::ATOMIC_SWAP:
Opc = X86ISD::ATOMSWAP64_DAG;
break;
@@ -11418,11 +11951,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FHSUB: return "X86ISD::FHSUB";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
+ case X86ISD::FMAXC: return "X86ISD::FMAXC";
+ case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
case X86ISD::FRCP: return "X86ISD::FRCP";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
+ case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
+ case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
@@ -11438,7 +11975,10 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
+ case X86ISD::VZEXT: return "X86ISD::VZEXT";
+ case X86ISD::VSEXT: return "X86ISD::VSEXT";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
case X86ISD::VSHL: return "X86ISD::VSHL";
@@ -11505,6 +12045,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
}
}
@@ -11653,430 +12195,724 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
-// private utility function
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
- MachineBasicBlock *MBB,
- unsigned regOpc,
- unsigned immOpc,
- unsigned LoadOpc,
- unsigned CXchgOpc,
- unsigned notOpc,
- unsigned EAXreg,
- const TargetRegisterClass *RC,
- bool Invert) const {
- // For the atomic bitwise operator, we generate
- // thisMBB:
- // newMBB:
- // ld t1 = [bitinstr.addr]
- // op t2 = t1, [bitinstr.val]
- // not t3 = t2 (if Invert)
- // mov EAX = t1
- // lcs dest = [bitinstr.addr], t3 [EAX is implicit]
- // bz newMBB
- // fallthrough -->nextMBB
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+/// Utility function to emit xbegin specifying the start of an RTM region.
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc DL = MI->getDebugLoc();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ // For the v = xbegin(), we generate
+ //
+ // thisMBB:
+ // xbegin sinkMBB
+ //
+ // mainMBB:
+ // eax = -1
+ //
+ // sinkMBB:
+ // v = eax
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(bInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to itself and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- // Insert instructions into newMBB based on incoming instruction
- assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
- "unexpected number of operands");
- DebugLoc dl = bInstr->getDebugLoc();
- MachineOperand& destOper = bInstr->getOperand(0);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- int numArgs = bInstr->getNumOperands() - 1;
- for (int i=0; i < numArgs; ++i)
- argOpers[i] = &bInstr->getOperand(i+1);
-
- // x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
- int valArgIndx = lastAddrIndx + 1;
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
- MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
-
- unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
- MIB.addReg(t1);
- (*MIB).addOperand(*argOpers[valArgIndx]);
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
- unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
- if (Invert) {
- MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
- }
- else
- t3 = t2;
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // xbegin sinkMBB
+ // # fallthrough to mainMBB
+  // # on abort, branch to sinkMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(sinkMBB);
+
+ // mainMBB:
+ // EAX = -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // EAX is live into the sinkMBB
+ sinkMBB->addLiveIn(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
- MIB.addReg(t1);
+ MI->eraseFromParent();
+ return sinkMBB;
+}
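// Editorial note, hedged: the layout above mirrors the RTM _xbegin()
// contract -- on a successful transactional start execution falls through
// past the XBEGIN, so mainMBB loads EAX with -1 (_XBEGIN_STARTED); on a
// hardware abort control arrives at the XBEGIN target (sinkMBB) with the
// abort status already in EAX. Either way the single COPY in sinkMBB yields
// the right value, matching the usual caller-side idiom:
//
//   if ((status = _xbegin()) == _XBEGIN_STARTED) { /* transactional path */ }
//   else                                         { /* fallback, inspect status */ }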
- MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MIB.addReg(t3);
- assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(bInstr->memoperands_begin(),
- bInstr->memoperands_end());
+// Get CMPXCHG opcode for the specified data type.
+static unsigned getCmpXChgOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::LCMPXCHG8;
+ case MVT::i16: return X86::LCMPXCHG16;
+ case MVT::i32: return X86::LCMPXCHG32;
+ case MVT::i64: return X86::LCMPXCHG64;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid operand size!");
+}
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
- MIB.addReg(EAXreg);
+// Get LOAD opcode for the specified data type.
+static unsigned getLoadOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::MOV8rm;
+ case MVT::i16: return X86::MOV16rm;
+ case MVT::i32: return X86::MOV32rm;
+ case MVT::i64: return X86::MOV64rm;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid operand size!");
+}
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+// Get opcode of the non-atomic one from the specified atomic instruction.
+static unsigned getNonAtomicOpcode(unsigned Opc) {
+ switch (Opc) {
+ case X86::ATOMAND8: return X86::AND8rr;
+ case X86::ATOMAND16: return X86::AND16rr;
+ case X86::ATOMAND32: return X86::AND32rr;
+ case X86::ATOMAND64: return X86::AND64rr;
+ case X86::ATOMOR8: return X86::OR8rr;
+ case X86::ATOMOR16: return X86::OR16rr;
+ case X86::ATOMOR32: return X86::OR32rr;
+ case X86::ATOMOR64: return X86::OR64rr;
+ case X86::ATOMXOR8: return X86::XOR8rr;
+ case X86::ATOMXOR16: return X86::XOR16rr;
+ case X86::ATOMXOR32: return X86::XOR32rr;
+ case X86::ATOMXOR64: return X86::XOR64rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction with
+// extra opcode.
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
+ unsigned &ExtraOpc) {
+ switch (Opc) {
+ case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr;
+ case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
+ case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
+ case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
+ case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr;
+ case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
+ case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
+ case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
+ case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr;
+ case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
+ case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
+ case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
+ case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr;
+ case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
+ case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
+ case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
+ case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr;
+ case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
+ case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
+ case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction for
+// 64-bit data type on 32-bit target.
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
+ switch (Opc) {
+ case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr;
+ case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr;
+ case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr;
+ case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr;
+ case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr;
+ case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
+ case X86::ATOMMAX6432: HiOpc = X86::SETLr; return X86::SETLr;
+ case X86::ATOMMIN6432: HiOpc = X86::SETGr; return X86::SETGr;
+ case X86::ATOMUMAX6432: HiOpc = X86::SETBr; return X86::SETBr;
+ case X86::ATOMUMIN6432: HiOpc = X86::SETAr; return X86::SETAr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get opcode of the non-atomic one from the specified atomic instruction for
+// 64-bit data type on 32-bit target with extra opcode.
+static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
+ unsigned &HiOpc,
+ unsigned &ExtraOpc) {
+ switch (Opc) {
+ case X86::ATOMNAND6432:
+ ExtraOpc = X86::NOT32r;
+ HiOpc = X86::AND32rr;
+ return X86::AND32rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
- bInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+// Get pseudo CMOV opcode from the specified data type.
+static unsigned getPseudoCMOVOpc(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::CMOV_GR8;
+ case MVT::i16: return X86::CMOV_GR16;
+ case MVT::i32: return X86::CMOV_GR32;
+ default:
+ break;
+ }
+ llvm_unreachable("Unknown CMOV opcode!");
}
-// private utility function: 64 bit atomics on 32 bit host.
+// EmitAtomicLoadArith - emit the code sequence for pseudo atomic instructions.
+// They will be translated into a spin-loop or compare-exchange loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// EAX = LOAD MI.addr
+// loop:
+// t1 = OP MI.val, EAX
+// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+// JNE loop
+// sink:
+// dst = EAX
+// ...
MachineBasicBlock *
-X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
- MachineBasicBlock *MBB,
- unsigned regOpcL,
- unsigned regOpcH,
- unsigned immOpcL,
- unsigned immOpcH,
- bool Invert) const {
- // For the atomic bitwise operator, we generate
- // thisMBB (instructions are in pairs, except cmpxchg8b)
- // ld t1,t2 = [bitinstr.addr]
- // newMBB:
- // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
- // op t5, t6 <- out1, out2, [bitinstr.val]
- // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
- // neg t7, t8 < t5, t6 (if Invert)
- // mov ECX, EBX <- t5, t6
- // mov EAX, EDX <- t1, t2
- // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
- // mov t3, t4 <- EAX, EDX
- // bz newMBB
- // result in out1, out2
- // fallthrough -->nextMBB
-
- const TargetRegisterClass *RC = &X86::GR32RegClass;
- const unsigned LoadOpc = X86::MOV32rm;
- const unsigned NotOpc = X86::NOT32r;
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg, SrcReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ MVT::SimpleValueType VT = *RC->vt_begin();
+ unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+ unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+ unsigned LOADOpc = getLoadOpcode(VT);
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // EAX = LOAD [MI.addr]
+ // mainMBB:
+ // t1 = OP MI.val, EAX
+ // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+ // JNE mainMBB
+ // sinkMBB:
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(bInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to itself and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- DebugLoc dl = bInstr->getDebugLoc();
- // Insert instructions into newMBB based on incoming instruction
- // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
- assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
- "unexpected number of operands");
- MachineOperand& dest1Oper = bInstr->getOperand(0);
- MachineOperand& dest2Oper = bInstr->getOperand(1);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
- argOpers[i] = &bInstr->getOperand(i+2);
-
- // We use some of the operands multiple times, so conservatively just
- // clear any kill flags that might be present.
- if (argOpers[i]->isReg() && argOpers[i]->isUse())
- argOpers[i]->setIsKill(false);
- }
-
- // x86 address has 5 operands: base, index, scale, displacement, and segment.
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
- MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
- MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
- // add 4 to displacement.
- for (int i=0; i <= lastAddrIndx-2; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MachineOperand newOp3 = *(argOpers[3]);
- if (newOp3.isImm())
- newOp3.setImm(newOp3.getImm()+4);
- else
- newOp3.setOffset(newOp3.getOffset()+4);
- (*MIB).addOperand(newOp3);
- (*MIB).addOperand(*argOpers[lastAddrIndx]);
-
- // t3/4 are defined later, at the bottom of the loop
- unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
- unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
- BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
- BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
- .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
-
- // The subsequent operations should be using the destination registers of
- // the PHI instructions.
- t1 = dest1Oper.getReg();
- t2 = dest2Oper.getReg();
-
- int valArgIndx = lastAddrIndx + 1;
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
- unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
- unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
- if (regOpcL != X86::MOV32rr)
- MIB.addReg(t1);
- (*MIB).addOperand(*argOpers[valArgIndx]);
- assert(argOpers[valArgIndx + 1]->isReg() ==
- argOpers[valArgIndx]->isReg());
- assert(argOpers[valArgIndx + 1]->isImm() ==
- argOpers[valArgIndx]->isImm());
- if (argOpers[valArgIndx + 1]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
- if (regOpcH != X86::MOV32rr)
- MIB.addReg(t2);
- (*MIB).addOperand(*argOpers[valArgIndx + 1]);
-
- unsigned t7, t8;
- if (Invert) {
- t7 = F->getRegInfo().createVirtualRegister(RC);
- t8 = F->getRegInfo().createVirtualRegister(RC);
- MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
- MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
- } else {
- t7 = t5;
- t8 = t6;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+ mainMBB->addLiveIn(AccPhyReg);
+
+ // Copy AccPhyReg as it is used more than once.
+ unsigned AccReg = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
+ .addReg(AccPhyReg);
+
+ unsigned t1 = MRI.createVirtualRegister(RC);
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+ case X86::ATOMAND8:
+ case X86::ATOMAND16:
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ case X86::ATOMOR8:
+ case X86::ATOMOR16:
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ case X86::ATOMXOR8:
+ case X86::ATOMXOR16:
+ case X86::ATOMXOR32:
+ case X86::ATOMXOR64: {
+ unsigned ARITHOpc = getNonAtomicOpcode(Opc);
+ BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
+ .addReg(AccReg);
+ break;
+ }
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
+ case X86::ATOMNAND64: {
+ unsigned t2 = MRI.createVirtualRegister(RC);
+ unsigned NOTOpc;
+ unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
+ BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
+ .addReg(AccReg);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
+ break;
+ }
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
+ case X86::ATOMMAX64:
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
+ case X86::ATOMUMAX64:
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64: {
+ unsigned CMPOpc;
+ unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
+
+ BuildMI(mainMBB, DL, TII->get(CMPOpc))
+ .addReg(SrcReg)
+ .addReg(AccReg);
+
+ if (Subtarget->hasCMov()) {
+ if (VT != MVT::i8) {
+ // Native support
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
+ .addReg(SrcReg)
+ .addReg(AccReg);
+ } else {
+ // Promote i8 to i32 to use CMOV32
+ const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
+ unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
+ unsigned AccReg32 = MRI.createVirtualRegister(RC32);
+ unsigned t2 = MRI.createVirtualRegister(RC32);
+
+ unsigned Undef = MRI.createVirtualRegister(RC32);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
+ .addReg(Undef)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
+ .addReg(Undef)
+ .addReg(AccReg)
+ .addImm(X86::sub_8bit);
+
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+ .addReg(SrcReg32)
+ .addReg(AccReg32);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
+ .addReg(t2, 0, X86::sub_8bit);
+ }
+ } else {
+ // Use pseudo select and lower them.
+ assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+ "Invalid atomic-load-op transformation!");
+ unsigned SelOpc = getPseudoCMOVOpc(VT);
+ X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
+ assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
+ MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
+ .addReg(SrcReg).addReg(AccReg)
+ .addImm(CC);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ break;
+ }
}
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
- MIB.addReg(t1);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
- MIB.addReg(t2);
+ // Copy AccPhyReg back from virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
+ .addReg(AccReg);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
- MIB.addReg(t7);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
- MIB.addReg(t8);
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.addReg(t1);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
- assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(bInstr->memoperands_begin(),
- bInstr->memoperands_end());
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
- MIB.addReg(X86::EAX);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
- MIB.addReg(X86::EDX);
+ // sinkMBB:
+ sinkMBB->addLiveIn(AccPhyReg);
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstReg)
+ .addReg(AccPhyReg);
- bInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+ MI->eraseFromParent();
+ return sinkMBB;
}
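// Editorial sketch (an illustration, not part of the patch): for an IR input
// such as
//   %old = atomicrmw max i32* %p, i32 %v seq_cst
// the routine above produces, roughly,
//   movl  (%p), %eax
// .Lretry:
//   # t1 = max(%v, %eax), computed with CMP + CMOVcc (or a pseudo CMOV
//   #      expanded by EmitLoweredSelect when the target has no CMOV)
//   lock cmpxchgl %t1, (%p)   # succeeds only if (%p) still equals %eax
//   jne   .Lretry             # otherwise EAX now holds the fresh value
//   movl  %eax, %old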
-// private utility function
+// EmitAtomicLoadArith6432 - emit the code sequence for pseudo atomic
+// instructions. They will be translated into a spin-loop or compare-exchange
+// loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// EAX = LOAD [MI.addr + 0]
+// EDX = LOAD [MI.addr + 4]
+// loop:
+// EBX = OP MI.val.lo, EAX
+// ECX = OP MI.val.hi, EDX
+// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+// JNE loop
+// sink:
+// dst = EDX:EAX
+// ...
MachineBasicBlock *
-X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
- MachineBasicBlock *MBB,
- unsigned cmovOpc) const {
- // For the atomic min/max operator, we generate
- // thisMBB:
- // newMBB:
- // ld t1 = [min/max.addr]
- // mov t2 = [min/max.val]
- // cmp t1, t2
- // cmov[cond] t2 = t1
- // mov EAX = t1
- // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
- // bz newMBB
- // fallthrough -->nextMBB
- //
+X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op32 to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstLoReg, DstHiReg;
+ unsigned SrcLoReg, SrcHiReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0;
+
+ DstLoReg = MI->getOperand(CurOp++).getReg();
+ DstHiReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcLoReg = MI->getOperand(CurOp++).getReg();
+ SrcHiReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
+ const TargetRegisterClass *RC8 = &X86::GR8RegClass;
+
+ unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
+ unsigned LOADOpc = X86::MOV32rm;
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // EAX = LOAD [MI.addr + 0]
+ // EDX = LOAD [MI.addr + 4]
+ // mainMBB:
+ // EBX = OP MI.vallo, EAX
+ // ECX = OP MI.valhi, EDX
+ // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+ // JNE mainMBB
+ // sinkMBB:
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(mInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to newMBB and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- DebugLoc dl = mInstr->getDebugLoc();
- // Insert instructions into newMBB based on incoming instruction
- assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
- "unexpected number of operands");
- MachineOperand& destOper = mInstr->getOperand(0);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- int numArgs = mInstr->getNumOperands() - 1;
- for (int i=0; i < numArgs; ++i)
- argOpers[i] = &mInstr->getOperand(i+1);
-
- // x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
- int valArgIndx = lastAddrIndx + 1;
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
-
- // We only support register and immediate values
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
-
- unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
- else
- MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
- (*MIB).addOperand(*argOpers[valArgIndx]);
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
- MIB.addReg(t1);
+ MachineInstrBuilder MIB;
- MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
- MIB.addReg(t1);
- MIB.addReg(t2);
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // Lo
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Hi
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- // Generate movc
- unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
- MIB.addReg(t2);
- MIB.addReg(t1);
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+ mainMBB->addLiveIn(X86::EAX);
+ mainMBB->addLiveIn(X86::EDX);
+
+ // Copy EDX:EAX as they are used more than once.
+ unsigned LoReg = MRI.createVirtualRegister(RC);
+ unsigned HiReg = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
+
+ unsigned t1L = MRI.createVirtualRegister(RC);
+ unsigned t1H = MRI.createVirtualRegister(RC);
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
+ case X86::ATOMAND6432:
+ case X86::ATOMOR6432:
+ case X86::ATOMXOR6432:
+ case X86::ATOMADD6432:
+ case X86::ATOMSUB6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
+ break;
+ }
+ case X86::ATOMNAND6432: {
+ unsigned HiOpc, NOTOpc;
+ unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
+ unsigned t2L = MRI.createVirtualRegister(RC);
+ unsigned t2H = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
+ break;
+ }
+ case X86::ATOMMAX6432:
+ case X86::ATOMMIN6432:
+ case X86::ATOMUMAX6432:
+ case X86::ATOMUMIN6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ unsigned cL = MRI.createVirtualRegister(RC8);
+ unsigned cH = MRI.createVirtualRegister(RC8);
+ unsigned cL32 = MRI.createVirtualRegister(RC);
+ unsigned cH32 = MRI.createVirtualRegister(RC);
+ unsigned cc = MRI.createVirtualRegister(RC);
+ // cl := cmp src_lo, lo
+ BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+ .addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
+ BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
+ // ch := cmp src_hi, hi
+ BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
+ .addReg(SrcHiReg).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
+ BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
+ // cc := if (src_hi == hi) ? cl : ch;
+ if (Subtarget->hasCMov()) {
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVE32rr), cc)
+ .addReg(cH32).addReg(cL32);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), cc)
+ .addReg(cH32).addReg(cL32)
+ .addImm(X86::COND_E);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
+ if (Subtarget->hasCMov()) {
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
+ .addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
+ .addReg(SrcHiReg).addReg(HiReg);
+ } else {
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
+ .addReg(SrcLoReg).addReg(LoReg)
+ .addImm(X86::COND_NE);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
+ .addReg(SrcHiReg).addReg(HiReg)
+ .addImm(X86::COND_NE);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ break;
+ }
+ case X86::ATOMSWAP6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
+ break;
+ }
+ }
+
+ // Copy EDX:EAX back from HiReg:LoReg
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
+ // Copy ECX:EBX from t1H:t1L
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
+
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- // Cmp and exchange if none has modified the memory location
- MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MIB.addReg(t3);
- assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(mInstr->memoperands_begin(),
- mInstr->memoperands_end());
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
- MIB.addReg(X86::EAX);
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ // sinkMBB:
+ sinkMBB->addLiveIn(X86::EAX);
+ sinkMBB->addLiveIn(X86::EDX);
+
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstLoReg)
+ .addReg(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstHiReg)
+ .addReg(X86::EDX);
- mInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+ MI->eraseFromParent();
+ return sinkMBB;
}
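// Editorial note on the 64-bit min/max path above (not part of the patch):
// the double-width ordering is assembled from two 32-bit compares. cL and cH
// capture the SETcc result for the low and high halves respectively, and
//   cc = (SrcHi == Hi) ? cL : cH
// so the low halves only decide the ordering when the high halves tie. The
// following TEST + CMOVNE pair then picks either the incoming value or the
// current memory value for both halves before the LCMPXCHG8B retry.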
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
-MachineBasicBlock *
-X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned numArgs, bool memArg) const {
- assert(Subtarget->hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
+ case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+ case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
+ case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+ case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
+ case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+ case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
+ case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+ }
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
unsigned Opc;
- if (!Subtarget->hasAVX()) {
- if (memArg)
- Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
- } else {
- if (memArg)
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
+ case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+ case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
+ case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+ case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
+ case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+ case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
+ case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
+ DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- for (unsigned i = 0; i < numArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i+1);
+
+ unsigned NumArgs = MI->getNumOperands(); // remove the results
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
BuildMI(*BB, MI, dl,
TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::XMM0);
+ .addReg(X86::ECX);
MI->eraseFromParent();
return BB;
}
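// Editorial note, hedged: the -M string-compare forms deposit their mask in
// XMM0 and the -I forms their index in ECX, which is why these two custom
// inserters end by copying the fixed physical register into the virtual
// result. That mirrors the C intrinsics, e.g. (assuming <nmmintrin.h>):
//   __m128i m = _mm_cmpistrm(a, b, MODE);   // mask, read from XMM0
//   int     i = _mm_cmpistri(a, b, MODE);   // index, read from ECX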
-MachineBasicBlock *
-X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+static MachineBasicBlock * EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII,
+ const X86Subtarget* Subtarget) {
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
@@ -12765,6 +13601,203 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI,
}
MachineBasicBlock *
+X86TargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg;
+ unsigned MemOpndSlot = 0;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ assert(RC->hasType(MVT::i32) && "Invalid destination!");
+ unsigned mainDstReg = MRI.createVirtualRegister(RC);
+ unsigned restoreDstReg = MRI.createVirtualRegister(RC);
+
+ MemOpndSlot = CurOp;
+
+ MVT PVT = getPointerTy();
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ // For v = setjmp(buf), we generate
+ //
+ // thisMBB:
+ // buf[LabelOffset] = restoreMBB
+ // SjLjSetup restoreMBB
+ //
+ // mainMBB:
+ // v_main = 0
+ //
+ // sinkMBB:
+ // v = phi(main, restore)
+ //
+ // restoreMBB:
+ // v_restore = 1
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+ MF->push_back(restoreMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ unsigned PtrStoreOpc = 0;
+ unsigned LabelReg = 0;
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ Reloc::Model RM = getTargetMachine().getRelocationModel();
+ bool UseImmLabel = (getTargetMachine().getCodeModel() == CodeModel::Small) &&
+ (RM == Reloc::Static || RM == Reloc::DynamicNoPIC);
+
+ // Prepare IP either in reg or imm.
+ if (!UseImmLabel) {
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
+ const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
+ LabelReg = MRI.createVirtualRegister(PtrRC);
+ if (Subtarget->is64Bit()) {
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB)
+ .addReg(0);
+ } else {
+ const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
+ .addReg(XII->getGlobalBaseReg(MF))
+ .addImm(0)
+ .addReg(0)
+ .addMBB(restoreMBB, Subtarget->ClassifyBlockAddressReference())
+ .addReg(0);
+ }
+ } else
+ PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
+ // Store IP
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ if (!UseImmLabel)
+ MIB.addReg(LabelReg);
+ else
+ MIB.addMBB(restoreMBB);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Setup
+ MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
+ .addMBB(restoreMBB);
+ MIB.addRegMask(RegInfo->getNoPreservedMask());
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(restoreMBB);
+
+ // mainMBB:
+ // EAX = 0
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(X86::PHI), DstReg)
+ .addReg(mainDstReg).addMBB(mainMBB)
+ .addReg(restoreDstReg).addMBB(restoreMBB);
+
+ // restoreMBB:
+ BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
+ BuildMI(restoreMBB, DL, TII->get(X86::JMP_4)).addMBB(sinkMBB);
+ restoreMBB->addSuccessor(sinkMBB);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ MVT PVT = getPointerTy();
+ assert((PVT == MVT::i64 || PVT == MVT::i32) &&
+ "Invalid Pointer Size!");
+
+ const TargetRegisterClass *RC =
+ (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
+ unsigned Tmp = MRI.createVirtualRegister(RC);
+ // Since FP is only updated here but NOT referenced, it's treated as GPR.
+ unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ unsigned SP = RegInfo->getStackRegister();
+
+ MachineInstrBuilder MIB;
+
+ const int64_t LabelOffset = 1 * PVT.getStoreSize();
+ const int64_t SPOffset = 2 * PVT.getStoreSize();
+
+ unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
+ unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
+
+ // Reload FP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload IP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), LabelOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Reload SP
+ MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(i), SPOffset);
+ else
+ MIB.addOperand(MI->getOperand(i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Jump
+ BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
+
+ MI->eraseFromParent();
+ return MBB;
+}
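Taken together with emitEHSjLjSetJmp above (which stores the address of restoreMBB at LabelOffset), the three reloads imply a simple slot layout for the buffer: frame pointer at slot 0, resume address at slot 1, stack pointer at slot 2. A rough sketch for the 64-bit case, with made-up field names; which code writes slots 0 and 2 is outside this patch:

    #include <cstdint>

    // Illustrative only -- the real buffer is opaque.  Offsets match the
    // LabelOffset/SPOffset arithmetic above (1 and 2 times the pointer size).
    struct SjLjBuffer64 {
      uint64_t SavedFP;   // offset  0: reloaded into RBP by the longjmp path
      uint64_t ResumeIP;  // offset  8: loaded into a temp and jumped through
      uint64_t SavedSP;   // offset 16: reloaded into RSP before the jump
    };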
+
+MachineBasicBlock *
X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
MachineBasicBlock *BB) const {
switch (MI->getOpcode()) {
@@ -12893,198 +13926,101 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM: {
- unsigned NumArgs;
- bool MemArg;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- NumArgs = 3; MemArg = false; break;
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- NumArgs = 3; MemArg = true; break;
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- NumArgs = 5; MemArg = false; break;
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- NumArgs = 5; MemArg = true; break;
- }
- return EmitPCMP(MI, BB, NumArgs, MemArg);
- }
-
- // Thread synchronization.
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+
+ // Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB);
+ return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
- // Atomic Lowering.
- case X86::ATOMAND32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
- X86::AND32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass);
- case X86::ATOMOR32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
- X86::OR32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass);
- case X86::ATOMXOR32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
- X86::XOR32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass);
- case X86::ATOMNAND32:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
- X86::AND32ri, X86::MOV32rm,
- X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass, true);
- case X86::ATOMMIN32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
- case X86::ATOMMAX32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
- case X86::ATOMUMIN32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
- case X86::ATOMUMAX32:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+ // xbegin
+ case X86::XBEGIN:
+ return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
+ // Atomic Lowering.
+ case X86::ATOMAND8:
case X86::ATOMAND16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
- X86::AND16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass);
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ // Fall through
+ case X86::ATOMOR8:
case X86::ATOMOR16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
- X86::OR16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass);
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ // Fall through
case X86::ATOMXOR16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
- X86::XOR16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass);
- case X86::ATOMNAND16:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
- X86::AND16ri, X86::MOV16rm,
- X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass, true);
- case X86::ATOMMIN16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
- case X86::ATOMMAX16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
- case X86::ATOMUMIN16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
- case X86::ATOMUMAX16:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
-
- case X86::ATOMAND8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
- X86::AND8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass);
- case X86::ATOMOR8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
- X86::OR8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass);
case X86::ATOMXOR8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
- X86::XOR8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass);
- case X86::ATOMNAND8:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
- X86::AND8ri, X86::MOV8rm,
- X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass, true);
- // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
- // This group is for 64-bit host.
- case X86::ATOMAND64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
- X86::AND64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass);
- case X86::ATOMOR64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
- X86::OR64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass);
+ case X86::ATOMXOR32:
case X86::ATOMXOR64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
- X86::XOR64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass);
+ // Fall through
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
case X86::ATOMNAND64:
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
- X86::AND64ri32, X86::MOV64rm,
- X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass, true);
- case X86::ATOMMIN64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+ // Fall through
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
case X86::ATOMMAX64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
- case X86::ATOMUMIN64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+ // Fall through
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ // Fall through
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
case X86::ATOMUMAX64:
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+ // Fall through
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64:
+ return EmitAtomicLoadArith(MI, BB);
// This group does 64-bit operations on a 32-bit host.
case X86::ATOMAND6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::AND32rr, X86::AND32rr,
- X86::AND32ri, X86::AND32ri,
- false);
case X86::ATOMOR6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::OR32rr, X86::OR32rr,
- X86::OR32ri, X86::OR32ri,
- false);
case X86::ATOMXOR6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::XOR32rr, X86::XOR32rr,
- X86::XOR32ri, X86::XOR32ri,
- false);
case X86::ATOMNAND6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::AND32rr, X86::AND32rr,
- X86::AND32ri, X86::AND32ri,
- true);
case X86::ATOMADD6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::ADD32rr, X86::ADC32rr,
- X86::ADD32ri, X86::ADC32ri,
- false);
case X86::ATOMSUB6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::SUB32rr, X86::SBB32rr,
- X86::SUB32ri, X86::SBB32ri,
- false);
+ case X86::ATOMMAX6432:
+ case X86::ATOMMIN6432:
+ case X86::ATOMUMAX6432:
+ case X86::ATOMUMIN6432:
case X86::ATOMSWAP6432:
- return EmitAtomicBit6432WithCustomInserter(MI, BB,
- X86::MOV32rr, X86::MOV32rr,
- X86::MOV32ri, X86::MOV32ri,
- false);
+ return EmitAtomicLoadArith6432(MI, BB);
+
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
case X86::VAARG_64:
return EmitVAARG64WithCustomInserter(MI, BB);
+
+ case X86::EH_SjLj_SetJmp32:
+ case X86::EH_SjLj_SetJmp64:
+ return emitEHSjLjSetJmp(MI, BB);
+
+ case X86::EH_SjLj_LongJmp32:
+ case X86::EH_SjLj_LongJmp64:
+ return emitEHSjLjLongJmp(MI, BB);
}
}
@@ -13331,12 +14267,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
}
-/// DCI, PerformTruncateCombine - Converts truncate operation to
+/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
-
-SDValue X86TargetLowering::PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
- DAGCombinerInfo &DCI) const {
+static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
if (!DCI.isBeforeLegalizeOps())
return SDValue();
@@ -13528,7 +14464,7 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
// alignment is valid.
unsigned Align = LN0->getAlignment();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned NewAlign = TLI.getTargetData()->
+ unsigned NewAlign = TLI.getDataLayout()->
getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VT))
@@ -13559,6 +14495,14 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return NewOp;
SDValue InputVector = N->getOperand(0);
+ // Detect whether we are trying to convert from mmx to i32 and the bitcast
+ // from mmx to v2i32 has a single usage.
+ if (InputVector.getNode()->getOpcode() == llvm::ISD::BITCAST &&
+ InputVector.getNode()->getOperand(0).getValueType() == MVT::x86mmx &&
+ InputVector.hasOneUse() && N->getValueType(0) == MVT::i32)
+ return DAG.getNode(X86ISD::MMX_MOVD2W, InputVector.getDebugLoc(),
+ N->getValueType(0),
+ InputVector.getNode()->getOperand(0));
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
@@ -13959,7 +14903,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
//
// where Op could be BRCOND or CMOV.
//
-static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
+static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
// Quit if not CMP and SUB with its value result used.
if (Cmp.getOpcode() != X86ISD::CMP &&
(Cmp.getOpcode() != X86ISD::SUB || Cmp.getNode()->hasAnyUseOfValue(0)))
@@ -13995,40 +14939,55 @@ static SDValue BoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
if (SetCC.getOpcode() == ISD::ZERO_EXTEND)
SetCC = SetCC.getOperand(0);
- // Quit if not SETCC.
- // FIXME: So far we only handle the boolean value generated from SETCC. If
- // there is other ways to generate boolean values, we need handle them here
- // as well.
- if (SetCC.getOpcode() != X86ISD::SETCC)
- return SDValue();
-
- // Set the condition code or opposite one if necessary.
- CC = X86::CondCode(SetCC.getConstantOperandVal(0));
- if (needOppositeCond)
- CC = X86::GetOppositeBranchCondition(CC);
-
- return SetCC.getOperand(1);
-}
-
-static bool IsValidFCMOVCondition(X86::CondCode CC) {
- switch (CC) {
- default:
- return false;
- case X86::COND_B:
- case X86::COND_BE:
- case X86::COND_E:
- case X86::COND_P:
- case X86::COND_AE:
- case X86::COND_A:
- case X86::COND_NE:
- case X86::COND_NP:
- return true;
+ switch (SetCC.getOpcode()) {
+ case X86ISD::SETCC:
+ // Set the condition code or opposite one if necessary.
+ CC = X86::CondCode(SetCC.getConstantOperandVal(0));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(1);
+ case X86ISD::CMOV: {
+ // Check whether the false/true values are the canonical 0/1 pair.
+ ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
+ ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
+ // Quit if true value is not a constant.
+ if (!TVal)
+ return SDValue();
+ // Quit if false value is not a constant.
+ if (!FVal) {
+ // A special case for rdrand, where 0 is produced when the condition is false.
+ SDValue Op = SetCC.getOperand(0);
+ if (Op.getOpcode() != X86ISD::RDRAND)
+ return SDValue();
+ }
+ // Quit if false value is not the constant 0 or 1.
+ bool FValIsFalse = true;
+ if (FVal && FVal->getZExtValue() != 0) {
+ if (FVal->getZExtValue() != 1)
+ return SDValue();
+ // If FVal is 1, opposite cond is needed.
+ needOppositeCond = !needOppositeCond;
+ FValIsFalse = false;
+ }
+ // Quit if TVal is not the constant opposite of FVal.
+ if (FValIsFalse && TVal->getZExtValue() != 1)
+ return SDValue();
+ if (!FValIsFalse && TVal->getZExtValue() != 0)
+ return SDValue();
+ CC = X86::CondCode(SetCC.getConstantOperandVal(2));
+ if (needOppositeCond)
+ CC = X86::GetOppositeBranchCondition(CC);
+ return SetCC.getOperand(3);
+ }
}
+
+ return SDValue();
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
// If the flag operand isn't dead, don't touch this CMOV.
@@ -14053,10 +15012,10 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
SDValue Flags;
- Flags = BoolTestSetCCCombine(Cond, CC);
+ Flags = checkBoolTestSetCCCombine(Cond, CC);
if (Flags.getNode() &&
// Extra check as FCMOV only supports a subset of X86 cond.
- (FalseOp.getValueType() != MVT::f80 || IsValidFCMOVCondition(CC))) {
+ (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
SDValue Ops[] = { FalseOp, TrueOp,
DAG.getConstant(CC, MVT::i8), Flags };
return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
@@ -14073,6 +15032,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
CC = X86::GetOppositeBranchCondition(CC);
std::swap(TrueC, FalseC);
+ std::swap(TrueOp, FalseOp);
}
// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
@@ -14155,6 +15115,46 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
}
}
}
+
+ // Handle these cases:
+ // (select (x != c), e, c) -> (select (x != c), e, x),
+ // (select (x == c), c, e) -> (select (x == c), x, e)
+ // where c is an integer constant, and the "select" is the combination
+ // of CMOV and CMP.
+ //
+ // The rationale for this change is that a conditional move from a constant
+ // needs two instructions, whereas a conditional move from a register needs
+ // only one instruction.
+ //
+ // CAVEAT: By replacing a constant with a symbolic value, it may obscure
+ // some instruction-combining opportunities. This opt needs to be
+ // postponed as late as possible.
+ //
+ if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
+ // the DCI.xxxx conditions are provided to postpone the optimization as
+ // late as possible.
+
+ ConstantSDNode *CmpAgainst = 0;
+ if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
+ (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
+ dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {
+
+ if (CC == X86::COND_NE &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
+ CC = X86::GetOppositeBranchCondition(CC);
+ std::swap(TrueOp, FalseOp);
+ }
+
+ if (CC == X86::COND_E &&
+ CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
+ SDValue Ops[] = { FalseOp, Cond.getOperand(0),
+ DAG.getConstant(CC, MVT::i8), Cond };
+ return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
+ array_lengthof(Ops));
+ }
+ }
+ }
+
return SDValue();
}
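A minimal C-level sketch of the constant-to-register rewrite described in the comment above; the function name and the constant 7 are made up for illustration:

    // Before the combine the constant must be materialized into a register and
    // then selected with CMOV; after the combine the CMOV can read x directly,
    // since x equals the constant exactly on the path where it is chosen.
    int select_eq_const(int x, int e) {
      // return (x == 7) ? 7 : e;   // mov-imm + cmov: two instructions
      return (x == 7) ? x : e;      // cmov only: one instruction
    }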
@@ -14811,11 +15811,11 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
ISD::LoadExtType Ext = Ld->getExtensionType();
// If this is a vector EXT Load then attempt to optimize it using a
- // shuffle. We need SSE4 for the shuffles.
+ // shuffle. We need SSSE3 for the shuffles.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
if (RegVT.isVector() && RegVT.isInteger() &&
- Ext == ISD::EXTLOAD && Subtarget->hasSSE41()) {
+ Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
@@ -15041,7 +16041,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+ bool NoImplicitFloatOps = F->getFnAttributes().
+ hasAttribute(Attributes::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
@@ -15313,6 +16314,29 @@ static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+/// PerformFMinFMaxCombine - Do target-specific dag combines on X86ISD::FMIN and
+/// X86ISD::FMAX nodes.
+static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
+ assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
+
+ // Only perform optimizations if UnsafeMath is used.
+ if (!DAG.getTarget().Options.UnsafeFPMath)
+ return SDValue();
+
+ // If we run in unsafe-math mode, convert the FMIN and FMAX nodes into
+ // FMINC and FMAXC, which are commutative operations.
+ unsigned NewOp = 0;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unknown opcode");
+ case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
+ case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
+ }
+
+ return DAG.getNode(NewOp, N->getDebugLoc(), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1));
+}
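A small sketch of why the plain FMIN/FMAX nodes are not commutative and why the FMINC/FMAXC rewrite is gated on unsafe-fp-math. This models the usual x86 scalar-min behaviour as "a < b ? a : b" rather than quoting the exact instruction definition:

    #include <cmath>

    // Comparisons against NaN are false, so a NaN in the first operand falls
    // through to the second operand.
    static float x86_style_min(float a, float b) { return a < b ? a : b; }

    // x86_style_min(NAN, 1.0f) == 1.0f, but x86_style_min(1.0f, NAN) is NaN:
    // operand order is observable, so the node may only be treated as
    // commutative (FMINC/FMAXC) when unsafe-fp-math allows ignoring NaN/-0.0.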
+
+
/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
@@ -15418,8 +16442,13 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
DebugLoc dl = N->getDebugLoc();
EVT VT = N->getValueType(0);
+ // Let legalize expand this if it isn't a legal type yet.
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
EVT ScalarVT = VT.getScalarType();
- if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) ||
+ (!Subtarget->hasFMA() && !Subtarget->hasFMA4()))
return SDValue();
SDValue A = N->getOperand(0);
@@ -15441,9 +16470,10 @@ static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
unsigned Opcode;
if (!NegMul)
- Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ Opcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
else
- Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ Opcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+
return DAG.getNode(Opcode, dl, VT, A, B, C);
}
@@ -15540,24 +16570,51 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// Helper function for PerformSETCCCombine. It materializes "setb reg" as
+// "sbb reg,reg", since that form can be extended without a zext and produces
+// an all-ones bit which is more useful than 0/1 in some cases.
+static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
+ return DAG.getNode(ISD::AND, DL, MVT::i8,
+ DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
+ DAG.getConstant(X86::COND_B, MVT::i8), EFLAGS),
+ DAG.getConstant(1, MVT::i8));
+}
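A rough model of what the setb-to-sbb materialization buys, with made-up helper names: the carry flag is broadcast into every bit of the register, so a later sign or zero extension is free, and the AND with 1 recovers the plain 0/1 value when that is all a user needs:

    #include <cstdint>

    // What "sbb reg,reg" computes when both operands are the same register:
    // 0 - 0 - CF, i.e. all ones if the carry flag is set, zero otherwise.
    static uint32_t sbb_same_reg(bool carry_flag) {
      return carry_flag ? 0xFFFFFFFFu : 0u;
    }

    // MaterializeSETB then masks the broadcast value back down to 0/1.
    static uint32_t setb_value(bool carry_flag) {
      return sbb_same_reg(carry_flag) & 1u;
    }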
+
// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
DebugLoc DL = N->getDebugLoc();
X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
SDValue EFLAGS = N->getOperand(1);
+ if (CC == X86::COND_A) {
+ // Try to convert COND_A into COND_B in an attempt to facilitate
+ // materializing "setb reg".
+ //
+ // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
+ EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ return MaterializeSETB(DL, NewEFLAGS, DAG);
+ }
+ }
+
// Materialize "setb reg" as "sbb reg,reg", since it can be extended without
// a zext and produces an all-ones bit which is more useful than 0/1 in some
// cases.
if (CC == X86::COND_B)
- return DAG.getNode(ISD::AND, DL, MVT::i8,
- DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
- DAG.getConstant(CC, MVT::i8), EFLAGS),
- DAG.getConstant(1, MVT::i8));
+ return MaterializeSETB(DL, EFLAGS, DAG);
SDValue Flags;
- Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
@@ -15579,7 +16636,7 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
SDValue Flags;
- Flags = BoolTestSetCCCombine(EFLAGS, CC);
+ Flags = checkBoolTestSetCCCombine(EFLAGS, CC);
if (Flags.getNode()) {
SDValue Cond = DAG.getConstant(CC, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
@@ -15589,23 +16646,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) {
- SDValue Op0 = N->getOperand(0);
- EVT InVT = Op0->getValueType(0);
-
- // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32))
- if (InVT == MVT::v8i8 || InVT == MVT::v4i8) {
- DebugLoc dl = N->getDebugLoc();
- MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
- SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
- // Notice that we use SINT_TO_FP because we know that the high bits
- // are zero and SINT_TO_FP is better supported by the hardware.
- return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P);
- }
-
- return SDValue();
-}
-
static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
const X86TargetLowering *XTLI) {
SDValue Op0 = N->getOperand(0);
@@ -15637,20 +16677,6 @@ static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue PerformFP_TO_SINTCombine(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
-
- // v4i8 = FP_TO_SINT() -> v4i8 = TRUNCATE (V4i32 = FP_TO_SINT()
- if (VT == MVT::v8i8 || VT == MVT::v4i8) {
- DebugLoc dl = N->getDebugLoc();
- MVT DstVT = VT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32;
- SDValue I = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, N->getOperand(0));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, I);
- }
-
- return SDValue();
-}
-
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue PerformADCCombine(SDNode *N, SelectionDAG &DAG,
X86TargetLowering::DAGCombinerInfo &DCI) {
@@ -15765,6 +16791,21 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
return OptimizeConditionalInDecrement(N, DAG);
}
+/// performVZEXTCombine - Performs VZEXT combines
+static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ // (vzext (bitcast (vzext x))) -> (vzext x)
+ SDValue In = N->getOperand(0);
+ while (In.getOpcode() == ISD::BITCAST)
+ In = In.getOperand(0);
+
+ if (In.getOpcode() != X86ISD::VZEXT)
+ return SDValue();
+
+ return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0));
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -15774,7 +16815,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return PerformEXTRACT_VECTOR_ELTCombine(N, DAG, DCI);
case ISD::VSELECT:
case ISD::SELECT: return PerformSELECTCombine(N, DAG, DCI, Subtarget);
- case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI);
+ case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI, Subtarget);
case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget);
case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget);
case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI);
@@ -15787,23 +16828,24 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget);
case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget);
case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget);
- case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG);
case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this);
- case ISD::FP_TO_SINT: return PerformFP_TO_SINTCombine(N, DAG);
case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget);
case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget);
case X86ISD::FXOR:
case X86ISD::FOR: return PerformFORCombine(N, DAG);
+ case X86ISD::FMIN:
+ case X86ISD::FMAX: return PerformFMinFMaxCombine(N, DAG);
case X86ISD::FAND: return PerformFANDCombine(N, DAG);
case X86ISD::BT: return PerformBTCombine(N, DAG, DCI);
case X86ISD::VZEXT_MOVL: return PerformVZEXT_MOVLCombine(N, DAG);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
- case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG, DCI);
+ case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
- case X86ISD::SETCC: return PerformSETCCCombine(N, DAG);
+ case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
+ case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::PALIGN:
case X86ISD::UNPCKH:
@@ -16231,7 +17273,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
+ if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
break;
}
@@ -16556,3 +17598,207 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+struct X86CostTblEntry {
+ int ISD;
+ MVT Type;
+ unsigned Cost;
+};
+
+static int
+FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) {
+ for (unsigned int i = 0; i < len; ++i)
+ if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
+ return i;
+
+ // Could not find an entry.
+ return -1;
+}
+
+struct X86TypeConversionCostTblEntry {
+ int ISD;
+ MVT Dst;
+ MVT Src;
+ unsigned Cost;
+};
+
+static int
+FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
+ int ISD, MVT Dst, MVT Src) {
+ for (unsigned int i = 0; i < len; ++i)
+ if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
+ return i;
+
+ // Could not find an entry.
+ return -1;
+}
+
+unsigned
+X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
+ Type *Ty) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
+
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ static const X86CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::MUL, MVT::v4i64, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+ };
+
+ // Look for AVX1 lowering tricks.
+ if (ST.hasAVX()) {
+ int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
+ LT.second);
+ if (Idx != -1)
+ return LT.first * AVX1CostTable[Idx].Cost;
+ }
+ // Fallback to the default implementation.
+ return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
+}
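As a worked instance of the AVX1 table above, assuming a subtarget where v8i32 is already a legal type so the legalization factor LT.first is 1: AVX1 has no single 256-bit integer add, so an 8-wide i32 add is modeled as

    cost(ADD, v8i32 on AVX1) = 2 (half-width adds) + 1 (extract upper half) + 1 (insert upper half) = 4

which is exactly the table entry that FindInTable returns.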
+
+unsigned
+X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+
+ // Floating point scalars are already located in index #0.
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ return 0;
+ }
+
+ return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
+ Type *ValTy,
+ Type *CondTy) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ const X86Subtarget &ST =
+ TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ static const X86CostTblEntry SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+ };
+
+ static const X86CostTblEntry AVX1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f64, 1 },
+ { ISD::SETCC, MVT::v8f32, 1 },
+ // AVX1 does not support 8-wide integer compare.
+ { ISD::SETCC, MVT::v4i64, 4 },
+ { ISD::SETCC, MVT::v8i32, 4 },
+ { ISD::SETCC, MVT::v16i16, 4 },
+ { ISD::SETCC, MVT::v32i8, 4 },
+ };
+
+ static const X86CostTblEntry AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+ };
+
+ if (ST.hasSSE42()) {
+ int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * SSE42CostTbl[Idx].Cost;
+ }
+
+ // Check AVX2 before AVX: hasAVX() is also true on AVX2 targets, and the
+ // AVX2 table carries the cheaper costs for the wide integer compares.
+ if (ST.hasAVX2()) {
+ int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX2CostTbl[Idx].Cost;
+ }
+
+ if (ST.hasAVX()) {
+ int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX1CostTbl[Idx].Cost;
+ }
+
+ return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
+ Type *Dst,
+ Type *Src) const {
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ EVT SrcTy = TLI->getValueType(Src);
+ EVT DstTy = TLI->getValueType(Dst);
+
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+
+ const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ };
+
+ if (ST.hasAVX()) {
+ int Idx = FindInConvertTable(AVXConversionTbl,
+ array_lengthof(AVXConversionTbl),
+ ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return AVXConversionTbl[Idx].Cost;
+ }
+
+ return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+}
+
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 896d067fda75..465c6036ada6 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -19,6 +19,7 @@
#include "X86RegisterInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -142,6 +143,10 @@ namespace llvm {
/// mnemonic, so do I; blame Intel.
MOVDQ2Q,
+ /// MMX_MOVD2W - Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
/// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
/// i32, corresponds to X86::PEXTRB.
PEXTRB,
@@ -195,6 +200,9 @@ namespace llvm {
///
FMAX, FMIN,
+ /// FMAXC, FMINC - Commutative FMIN and FMAX.
+ FMAXC, FMINC,
+
/// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
/// approximation. Note that these typically require refinement
/// in order to obtain suitable precision.
@@ -214,6 +222,12 @@ namespace llvm {
// EH_RETURN - Exception Handling helpers.
EH_RETURN,
+ // EH_SJLJ_SETJMP - SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
/// TC_RETURN - Tail call return.
/// operand #0 chain
/// operand #1 callee (register or absolute)
@@ -227,9 +241,18 @@ namespace llvm {
// VSEXT_MOVL - Vector move low and sign extend.
VSEXT_MOVL,
+ // VZEXT - Vector integer zero-extend.
+ VZEXT,
+
+ // VSEXT - Vector integer signed-extend.
+ VSEXT,
+
// VFPEXT - Vector FP extend.
VFPEXT,
+ // VFPROUND - Vector FP round.
+ VFPROUND,
+
// VSHL, VSRL - 128-bit vector logical left / right shift
VSHLDQ, VSRLDQ,
@@ -345,6 +368,10 @@ namespace llvm {
ATOMXOR64_DAG,
ATOMAND64_DAG,
ATOMNAND64_DAG,
+ ATOMMAX64_DAG,
+ ATOMMIN64_DAG,
+ ATOMUMAX64_DAG,
+ ATOMUMIN64_DAG,
ATOMSWAP64_DAG,
// LCMPXCHG_DAG, LCMPXCHG8_DAG, LCMPXCHG16_DAG - Compare and swap.
@@ -458,10 +485,6 @@ namespace llvm {
getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
unsigned JTI, MCContext &Ctx) const;
- /// getStackPtrReg - Return the stack pointer register we are using: either
- /// ESP or RSP.
- unsigned getStackPtrReg() const { return X86StackPtr; }
-
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest are at 4-byte boundaries.
@@ -694,10 +717,7 @@ namespace llvm {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
const X86RegisterInfo *RegInfo;
- const TargetData *TD;
-
- /// X86StackPtr - X86 physical register used as stack ptr.
- unsigned X86StackPtr;
+ const DataLayout *TD;
/// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87
/// floating point ops.
@@ -741,6 +761,7 @@ namespace llvm {
bool isVarArg,
bool isCalleeStructRet,
bool isCallerStructRet,
+ Type *RetTy,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -760,15 +781,11 @@ namespace llvm {
SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
@@ -782,12 +799,15 @@ namespace llvm {
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC,
DebugLoc dl, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
@@ -799,39 +819,26 @@ namespace llvm {
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTLZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
- SDValue PerformTruncateCombine(SDNode* N, SelectionDAG &DAG, DAGCombinerInfo &DCI) const;
- // Utility functions to help LowerVECTOR_SHUFFLE
- SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const;
+ // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
+ SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
+ SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const;
+ SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
@@ -864,51 +871,17 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const;
- /// Utility function to emit string processing sse4.2 instructions
- /// that return in xmm0.
- /// This takes the instruction to expand, the associated machine basic
- /// block, the number of args, and whether or not the second arg is
- /// in memory or not.
- MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
- unsigned argNum, bool inMem) const;
-
- /// Utility functions to emit monitor and mwait instructions. These
- /// need to make sure that the arguments to the intrinsic are in the
- /// correct registers.
- MachineBasicBlock *EmitMonitor(MachineInstr *MI,
- MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const;
-
- /// Utility function to emit atomic bitwise operations (and, or, xor).
- /// It takes the bitwise instruction to expand, the associated machine basic
- /// block, and the associated X86 opcodes for reg/reg and reg/imm.
- MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
- MachineInstr *BInstr,
- MachineBasicBlock *BB,
- unsigned regOpc,
- unsigned immOpc,
- unsigned loadOpc,
- unsigned cxchgOpc,
- unsigned notOpc,
- unsigned EAXreg,
- const TargetRegisterClass *RC,
- bool Invert = false) const;
-
- MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
- MachineInstr *BInstr,
- MachineBasicBlock *BB,
- unsigned regOpcL,
- unsigned regOpcH,
- unsigned immOpcL,
- unsigned immOpcH,
- bool Invert = false) const;
-
- /// Utility function to emit atomic min and max. It takes the min/max
- /// instruction to expand, the associated basic block, and the associated
- /// cmov opcode for moving the min or max value.
- MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
- MachineBasicBlock *BB,
- unsigned cmovOpc) const;
+ /// Utility function to emit atomic-load-arith operations (and, or, xor,
+ /// nand, max, min, umax, umin). It takes the corresponding instruction to
+ /// expand and the associated machine basic block.
+ MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit atomic-load-arith operations (and, or, xor,
+ /// nand, add, sub, swap) for 64-bit operands on 32-bit target.
+ MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *EmitVAARG64WithCustomInserter(
@@ -936,6 +909,12 @@ namespace llvm {
MachineBasicBlock *emitLoweredTLSAddr(MachineInstr *MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent, for use with the given x86 condition code.
SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
@@ -953,6 +932,23 @@ namespace llvm {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
}
+
+ class X86VectorTargetTransformInfo : public VectorTargetTransformImpl {
+ public:
+ explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
+ VectorTargetTransformImpl(TL) {}
+
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+
+ virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const;
+
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const;
+
+ virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const;
+ };
}
#endif // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 99c2b8f955e4..9e6f27988f71 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -165,6 +165,33 @@ def EH_RETURN64 : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
}
+let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
+ usesCustomInserter = 1 in {
+ def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
+ "#EH_SJLJ_SETJMP32",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In32BitMode]>;
+ def EH_SjLj_SetJmp64 : I<0, Pseudo, (outs GR32:$dst), (ins i64mem:$buf),
+ "#EH_SJLJ_SETJMP64",
+ [(set GR32:$dst, (X86eh_sjlj_setjmp addr:$buf))]>,
+ Requires<[In64BitMode]>;
+ let isTerminator = 1 in {
+ def EH_SjLj_LongJmp32 : I<0, Pseudo, (outs), (ins i32mem:$buf),
+ "#EH_SJLJ_LONGJMP32",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In32BitMode]>;
+ def EH_SjLj_LongJmp64 : I<0, Pseudo, (outs), (ins i64mem:$buf),
+ "#EH_SJLJ_LONGJMP64",
+ [(X86eh_sjlj_longjmp addr:$buf)]>,
+ Requires<[In64BitMode]>;
+ }
+}
+
+let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
+ def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
+ "#EH_SjLj_Setup\t$dst", []>;
+}
+
//===----------------------------------------------------------------------===//
// Pseudo instructions used by segmented stacks.
//
@@ -230,25 +257,18 @@ def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
IIC_ALU_NONMEM>;
// Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isCodeGenOnly = 1 in {
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1 in {
// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
// However, Pat<> can't replicate the destination reg into the inputs of the
// result.
-// FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces
-// X86CodeEmitter.
-def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "",
- [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
- IIC_ALU_NONMEM>;
-def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "",
- [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
- IIC_ALU_NONMEM>,
- OpSize;
-def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
- IIC_ALU_NONMEM>;
-def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "",
- [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))],
- IIC_ALU_NONMEM>;
+def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
+ [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
+ [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
+ [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
+ [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
} // isCodeGenOnly
@@ -453,6 +473,11 @@ def CMOV_GR16 : I<0, Pseudo,
"#CMOV_GR16* PSEUDO!",
[(set GR16:$dst,
(X86cmov GR16:$src1, GR16:$src2, imm:$cond, EFLAGS))]>;
+} // Predicates = [NoCMov]
+
+// fcmov doesn't handle all possible EFLAGS, so provide a fallback if there is
+// no SSE1.
+let Predicates = [FPStackf32] in
def CMOV_RFP32 : I<0, Pseudo,
(outs RFP32:$dst),
(ins RFP32:$src1, RFP32:$src2, i8imm:$cond),
@@ -460,6 +485,9 @@ def CMOV_RFP32 : I<0, Pseudo,
[(set RFP32:$dst,
(X86cmov RFP32:$src1, RFP32:$src2, imm:$cond,
EFLAGS))]>;
+// fcmov doesn't handle all possible EFLAGS, so provide a fallback if there is
+// no SSE2.
+let Predicates = [FPStackf64] in
def CMOV_RFP64 : I<0, Pseudo,
(outs RFP64:$dst),
(ins RFP64:$src1, RFP64:$src2, i8imm:$cond),
@@ -474,7 +502,6 @@ def CMOV_RFP80 : I<0, Pseudo,
[(set RFP80:$dst,
(X86cmov RFP80:$src1, RFP80:$src2, imm:$cond,
EFLAGS))]>;
-} // Predicates = [NoCMov]
} // UsesCustomInserter = 1, Uses = [EFLAGS]
@@ -482,130 +509,74 @@ def CMOV_RFP80 : I<0, Pseudo,
// Atomic Instruction Pseudo Instructions
//===----------------------------------------------------------------------===//
-// Atomic exchange, and, or, xor
-let Constraints = "$val = $dst", Defs = [EFLAGS],
- usesCustomInserter = 1 in {
-
-def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMAND8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
-def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMOR8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
-def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMXOR8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
-def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMNAND8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
-
-def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMAND16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
-def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMOR16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
-def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMXOR16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
-def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMNAND16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
-def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
- "#ATOMMIN16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
-def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMMAX16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
-def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMUMIN16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
-def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMUMAX16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
-
-
-def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMAND32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
-def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMOR32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
-def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMXOR32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
-def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMNAND32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
-def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
- "#ATOMMIN32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
-def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMMAX32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
-def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMUMIN32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
-def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMUMAX32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
-
-
-
-def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMAND64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
-def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMOR64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
-def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMXOR64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
-def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMNAND64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
-def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
- "#ATOMMIN64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
-def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMMAX64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
-def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMUMIN64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
-def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMUMAX64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
+// Pseudo atomic instructions
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
+ let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
+ def #NAME#8 : I<0, Pseudo, (outs GR8:$dst),
+ (ins i8mem:$ptr, GR8:$val),
+ !strconcat(mnemonic, "8 PSEUDO!"), []>;
+ def #NAME#16 : I<0, Pseudo,(outs GR16:$dst),
+ (ins i16mem:$ptr, GR16:$val),
+ !strconcat(mnemonic, "16 PSEUDO!"), []>;
+ def #NAME#32 : I<0, Pseudo, (outs GR32:$dst),
+ (ins i32mem:$ptr, GR32:$val),
+ !strconcat(mnemonic, "32 PSEUDO!"), []>;
+ def #NAME#64 : I<0, Pseudo, (outs GR64:$dst),
+ (ins i64mem:$ptr, GR64:$val),
+ !strconcat(mnemonic, "64 PSEUDO!"), []>;
+ }
+}
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
+ def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
+ (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
+ (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
+ (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
+ (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
}
-let Constraints = "$val1 = $dst1, $val2 = $dst2",
- Defs = [EFLAGS, EAX, EBX, ECX, EDX],
- Uses = [EAX, EBX, ECX, EDX],
- mayLoad = 1, mayStore = 1,
- usesCustomInserter = 1 in {
-def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMAND6432 PSEUDO!", []>;
-def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMOR6432 PSEUDO!", []>;
-def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMXOR6432 PSEUDO!", []>;
-def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMNAND6432 PSEUDO!", []>;
-def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMADD6432 PSEUDO!", []>;
-def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMSUB6432 PSEUDO!", []>;
-def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMSWAP6432 PSEUDO!", []>;
+// Atomic exchange, and, or, xor
+defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
+defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
+defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
+defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
+defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
+
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
+ let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in
+ def #NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ !strconcat(mnemonic, "6432 PSEUDO!"), []>;
}
+defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
+defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
+defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
+defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
+defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
+defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
+defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
+defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
+
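[Editorial sketch, not part of the patch] The ATOM* defs above are usesCustomInserter pseudos, so they only describe an RMW contract; the inserter typically expands them into a compare-exchange retry loop. A minimal C++ illustration of that contract for the NAND case (the helper name atomic_nand32 is mine, purely illustrative):

    #include <atomic>
    #include <cstdint>

    // Illustrative only: the value an ATOMNAND32 pseudo must produce. x86 has
    // no native nand RMW, so the expansion amounts to a cmpxchg retry loop.
    uint32_t atomic_nand32(std::atomic<uint32_t> &mem, uint32_t val) {
      uint32_t old = mem.load();
      while (!mem.compare_exchange_weak(old, ~(old & val)))
        ; // 'old' is refreshed with the current memory value on failure
      return old; // like atomic_load_nand_32, the previous value is returned
    }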
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -617,7 +588,6 @@ def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
// TODO: Get this to fold the constant into the instruction.
let isCodeGenOnly = 1, Defs = [EFLAGS] in
def OR32mrLocked : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$zero),
- "lock\n\t"
"or{l}\t{$zero, $dst|$dst, $zero}",
[], IIC_ALU_MEM>, Requires<[In32BitMode]>, LOCK;
@@ -637,72 +607,72 @@ let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
- !strconcat("lock\n\t", mnemonic, "{b}\t",
+ !strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_NONMEM>, LOCK;
def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- !strconcat("lock\n\t", mnemonic, "{w}\t",
+ !strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_NONMEM>, OpSize, LOCK;
def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- !strconcat("lock\n\t", mnemonic, "{l}\t",
+ !strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_NONMEM>, LOCK;
def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- !strconcat("lock\n\t", mnemonic, "{q}\t",
+ !strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_NONMEM>, LOCK;
def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{b}\t",
+ !strconcat(mnemonic, "{b}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_MEM>, LOCK;
def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{w}\t",
+ !strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [], IIC_ALU_MEM>, OpSize, LOCK;
def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{l}\t",
+ !strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_MEM>, LOCK;
def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{q}\t",
+ !strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_MEM>, LOCK;
def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{w}\t",
+ !strconcat(mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [], IIC_ALU_MEM>, OpSize, LOCK;
def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{l}\t",
+ !strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_MEM>, LOCK;
def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
- !strconcat("lock\n\t", mnemonic, "{q}\t",
+ !strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_MEM>, LOCK;
@@ -717,107 +687,117 @@ defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
// Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
-def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
- "lock\n\t"
- "inc{b}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
- "lock\n\t"
- "inc{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK;
-def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
- "lock\n\t"
- "inc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
- "lock\n\t"
- "inc{q}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-
-def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
- "lock\n\t"
- "dec{b}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
- "lock\n\t"
- "dec{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK;
-def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
- "lock\n\t"
- "dec{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
- "lock\n\t"
- "dec{q}\t$dst", [], IIC_UNARY_MEM>, LOCK;
+def #NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
+ !strconcat(mnemonic, "{b}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def #NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+ !strconcat(mnemonic, "{w}\t$dst"),
+ [], IIC_UNARY_MEM>, OpSize, LOCK;
+def #NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+ !strconcat(mnemonic, "{l}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def #NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+ !strconcat(mnemonic, "{q}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+}
}
-// Atomic compare and swap.
-let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- isCodeGenOnly = 1 in
-def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
- "lock\n\t"
- "cmpxchg8b\t$ptr",
- [(X86cas8 addr:$ptr)], IIC_CMPX_LOCK_8B>, TB, LOCK;
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
-let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- isCodeGenOnly = 1 in
-def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
- "lock\n\t"
- "cmpxchg16b\t$ptr",
- [(X86cas16 addr:$ptr)], IIC_CMPX_LOCK_16B>, TB, LOCK,
- Requires<[HasCmpxchg16b]>;
-
-let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {
-def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
- "lock\n\t"
- "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR8:$swap, 1)], IIC_CMPX_LOCK_8>, TB, LOCK;
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+ SDPatternOperator frag, X86MemOperand x86memop,
+ InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+ def #NAME# : I<Opc, Form, (outs), (ins x86memop:$ptr),
+ !strconcat(mnemonic, "\t$ptr"),
+ [(frag addr:$ptr)], itin>, TB, LOCK;
+}
}
-let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in {
-def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
- "lock\n\t"
- "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR16:$swap, 2)], IIC_CMPX_LOCK>, TB, OpSize, LOCK;
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic, SDPatternOperator frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+ let Defs = [AL, EFLAGS], Uses = [AL] in
+ def #NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ let Defs = [AX, EFLAGS], Uses = [AX] in
+ def #NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+ let Defs = [EAX, EFLAGS], Uses = [EAX] in
+ def #NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+ let Defs = [RAX, EFLAGS], Uses = [RAX] in
+ def #NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
}
-let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in {
-def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
- "lock\n\t"
- "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR32:$swap, 4)], IIC_CMPX_LOCK>, TB, LOCK;
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+ X86cas8, i64mem,
+ IIC_CMPX_LOCK_8B>;
}
-let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in {
-def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
- "lock\n\t"
- "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR64:$swap, 8)], IIC_CMPX_LOCK>, TB, LOCK;
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+ X86cas16, i128mem,
+ IIC_CMPX_LOCK_16B>, REX_W;
}
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+ X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
// Atomic exchange and add
-let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
-def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
- "lock\n\t"
- "xadd{b}\t{$val, $ptr|$ptr, $val}",
- [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))],
- IIC_XADD_LOCK_MEM8>,
- TB, LOCK;
-def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr),
- "lock\n\t"
- "xadd{w}\t{$val, $ptr|$ptr, $val}",
- [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))],
- IIC_XADD_LOCK_MEM>,
- TB, OpSize, LOCK;
-def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
- "lock\n\t"
- "xadd{l}\t{$val, $ptr|$ptr, $val}",
- [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))],
- IIC_XADD_LOCK_MEM>,
- TB, LOCK;
-def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
- "lock\n\t"
- "xadd{q}\t{$val, $ptr|$ptr, $val}",
- [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))],
- IIC_XADD_LOCK_MEM>,
- TB, LOCK;
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
+ def #NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin8>;
+ def #NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def #NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def #NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
}
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+ IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+ TB, LOCK;
+
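[Editorial sketch, not part of the patch] A one-line C++ analogue of what a matched LXADD32 computes (function name is illustrative): memory receives the sum while the register operand is overwritten with the old contents, which is why the multiclass ties $val to $dst.

    #include <atomic>
    #include <cstdint>

    // Illustrative only: lock xadd semantics for the 32-bit case.
    uint32_t lock_xadd32(std::atomic<uint32_t> &mem, uint32_t val) {
      return mem.fetch_add(val); // returns the value held before the addition
    }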
def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
"#ACQUIRE_MOV PSEUDO!",
[(set GR8:$dst, (atomic_load_8 addr:$src))]>;
@@ -1017,7 +997,24 @@ def : Pat<(X86call (i64 tglobaladdr:$dst)),
def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>;
-// tailcall stuff
+// Tailcall stuff. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11. This happens when calling a vararg function with 6 arguments.
+//
+// Match an X86tcret that uses less than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ unsigned NumRegs = 0;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+ return false;
+ return true;
+}]>;
+
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In32BitMode]>;
@@ -1041,7 +1038,9 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In64BitMode]>;
-def : Pat<(X86tcret (load addr:$dst), imm:$off),
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
(TCRETURNmi64 addr:$dst, imm:$off)>,
Requires<[In64BitMode]>;
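[Editorial sketch, not part of the patch] A standalone restatement of the X86tcret_6regs check added above, with an assumed helper name and plain containers instead of SelectionDAG nodes: operands 0..2 are the chain, the target pointer and the stack-adjustment immediate, so argument registers are counted from index 3 onward.

    #include <vector>

    // Illustrative only: mirrors the PatFrag predicate's counting logic.
    bool fitsInSixRegs(const std::vector<bool> &isRegOperand) {
      unsigned NumRegs = 0;
      for (unsigned i = 3, e = isRegOperand.size(); i != e; ++i)
        if (isRegOperand[i] && ++NumRegs > 6)
          return false; // no scratch left for the folded load's base/index
      return true;
    }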
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index b0c27c882710..bfe954114c55 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -16,15 +16,18 @@
//
// Return instructions.
+//
+// The X86retflag return instructions are variadic because we may add ST0 and
+// ST1 arguments when returning values on the x87 stack.
let isTerminator = 1, isReturn = 1, isBarrier = 1,
hasCtrlDep = 1, FPForm = SpecialFP in {
- def RET : I <0xC3, RawFrm, (outs), (ins),
+ def RET : I <0xC3, RawFrm, (outs), (ins variable_ops),
"ret",
[(X86retflag 0)], IIC_RET>;
def RETW : I <0xC3, RawFrm, (outs), (ins),
"ret{w}",
[], IIC_RET>, OpSize;
- def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
+ def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
"ret\t$amt",
[(X86retflag timm:$amt)], IIC_RET_IMM>;
def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt),
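[Editorial sketch, not part of the patch] A hedged example of the situation the new comment describes, with an arbitrary function name: under the 32- and 64-bit SysV ABIs an x87 result is returned in ST(0), and the imaginary half of a complex long double additionally uses ST(1), hence the variable_ops on the return instructions.

    // Illustrative only: a return value produced on the x87 stack in ST(0).
    long double scale(long double x) {
      return x * 2.0L;
    }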
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 265b4bb997f9..959d91a9ab6b 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -16,243 +16,180 @@
//===----------------------------------------------------------------------===//
let Constraints = "$src1 = $dst" in {
-multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
-let neverHasSideEffects = 1 in {
- def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
- let mayLoad = 1 in
- def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
- def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
- let mayLoad = 1 in
- def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-} // neverHasSideEffects = 1
-}
-
-// Intrinsic for 213 pattern
-multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr,
- PatFrag MemFrag128, PatFrag MemFrag256,
- Intrinsic Int128, Intrinsic Int256, SDNode Op213,
- ValueType OpVT128, ValueType OpVT256> {
- def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
- VR128:$src3))]>;
-
+multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
+ PatFrag MemFrag128, PatFrag MemFrag256,
+ ValueType OpVT128, ValueType OpVT256,
+ SDPatternOperator Op = null_frag> {
+ let isCommutable = 1 in
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2,
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
VR128:$src1, VR128:$src3)))]>;
- def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
- (MemFrag128 addr:$src3)))]>;
-
+ let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2, VR128:$src1,
+ [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
(MemFrag128 addr:$src3))))]>;
-
- def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1,
- VR256:$src3))]>;
-
+ let isCommutable = 1 in
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (OpVT256 (Op213 VR256:$src2, VR256:$src1,
- VR256:$src3)))]>;
-
- def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1,
- (MemFrag256 addr:$src3)))]>;
+ [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
+ VR256:$src3)))]>, VEX_L;
+ let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst,
- (OpVT256 (Op213 VR256:$src2, VR256:$src1,
- (MemFrag256 addr:$src3))))]>;
+ (OpVT256 (Op VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3))))]>, VEX_L;
}
} // Constraints = "$src1 = $dst"
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,
- Intrinsic Int128, Intrinsic Int256, SDNode Op,
- ValueType OpTy128, ValueType OpTy256> {
- defm r213 : fma3p_rm_int <opc213, !strconcat(OpcodeStr,
- !strconcat("213", PackTy)), MemFrag128, MemFrag256,
- Int128, Int256, Op, OpTy128, OpTy256>;
- defm r132 : fma3p_rm <opc132,
- !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
- defm r231 : fma3p_rm <opc231,
- !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
+ SDNode Op, ValueType OpTy128, ValueType OpTy256> {
+ defm r213 : fma3p_rm<opc213,
+ !strconcat(OpcodeStr, !strconcat("213", PackTy)),
+ MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+let neverHasSideEffects = 1 in {
+ defm r132 : fma3p_rm<opc132,
+ !strconcat(OpcodeStr, !strconcat("132", PackTy)),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm r231 : fma3p_rm<opc231,
+ !strconcat(OpcodeStr, !strconcat("231", PackTy)),
+ MemFrag128, MemFrag256, OpTy128, OpTy256>;
+} // neverHasSideEffects = 1
}
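[Editorial sketch, not part of the patch] The 132/213/231 suffixes encode which sources feed the multiply and which supplies the addend, with operand 1 doubling as the destination; in this patch only the 213 variants receive selection patterns, matching the (Op src2, src1, src3) dags above. A scalar C++ sketch of the three orderings (function names are illustrative):

    // Illustrative only: FMA3 operand orderings, src1 is also the destination.
    float fmadd132(float src1, float src2, float src3) { return src1 * src3 + src2; }
    float fmadd213(float src1, float src2, float src3) { return src2 * src1 + src3; }
    float fmadd231(float src1, float src2, float src3) { return src2 * src3 + src1; }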
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfmadd_ps,
- int_x86_fma_vfmadd_ps_256, X86Fmadd,
- v4f32, v8f32>;
+ memopv8f32, X86Fmadd, v4f32, v8f32>;
defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfmsub_ps,
- int_x86_fma_vfmsub_ps_256, X86Fmsub,
- v4f32, v8f32>;
+ memopv8f32, X86Fmsub, v4f32, v8f32>;
defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
- memopv4f32, memopv8f32,
- int_x86_fma_vfmaddsub_ps,
- int_x86_fma_vfmaddsub_ps_256, X86Fmaddsub,
+ memopv4f32, memopv8f32, X86Fmaddsub,
v4f32, v8f32>;
defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
- memopv4f32, memopv8f32,
- int_x86_fma_vfmsubadd_ps,
- int_x86_fma_vfmaddsub_ps_256, X86Fmsubadd,
+ memopv4f32, memopv8f32, X86Fmsubadd,
v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfmadd_pd,
- int_x86_fma_vfmadd_pd_256, X86Fmadd, v2f64,
- v4f64>, VEX_W;
+ memopv4f64, X86Fmadd, v2f64, v4f64>, VEX_W;
defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfmsub_pd,
- int_x86_fma_vfmsub_pd_256, X86Fmsub, v2f64,
- v4f64>, VEX_W;
+ memopv4f64, X86Fmsub, v2f64, v4f64>, VEX_W;
defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
- memopv2f64, memopv4f64,
- int_x86_fma_vfmaddsub_pd,
- int_x86_fma_vfmaddsub_pd_256, X86Fmaddsub,
+ memopv2f64, memopv4f64, X86Fmaddsub,
v2f64, v4f64>, VEX_W;
defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
- memopv2f64, memopv4f64,
- int_x86_fma_vfmsubadd_pd,
- int_x86_fma_vfmsubadd_pd_256, X86Fmsubadd,
+ memopv2f64, memopv4f64, X86Fmsubadd,
v2f64, v4f64>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfnmadd_ps,
- int_x86_fma_vfnmadd_ps_256, X86Fnmadd, v4f32,
- v8f32>;
+ memopv8f32, X86Fnmadd, v4f32, v8f32>;
defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfnmsub_ps,
- int_x86_fma_vfnmsub_ps_256, X86Fnmsub, v4f32,
- v8f32>;
+ memopv8f32, X86Fnmsub, v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfnmadd_pd,
- int_x86_fma_vfnmadd_pd_256, X86Fnmadd, v2f64,
- v4f64>, VEX_W;
+ memopv4f64, X86Fnmadd, v2f64, v4f64>, VEX_W;
defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
- memopv2f64,
- memopv4f64, int_x86_fma_vfnmsub_pd,
- int_x86_fma_vfnmsub_pd_256, X86Fnmsub, v2f64,
+ memopv2f64, memopv4f64, X86Fnmsub, v2f64,
v4f64>, VEX_W;
}
let Constraints = "$src1 = $dst" in {
multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
- RegisterClass RC> {
-let neverHasSideEffects = 1 in {
- def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
+ RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+ SDPatternOperator OpNode = null_frag> {
+ let isCommutable = 1 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
let mayLoad = 1 in
- def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, x86memop:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
-} // neverHasSideEffects = 1
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1,
+ (mem_frag addr:$src3))))]>;
}
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic IntId,
- RegisterClass RC, SDNode OpNode, ValueType OpVT> {
+ ComplexPattern mem_cpat, Intrinsic IntId,
+ RegisterClass RC> {
+ let isCommutable = 1 in
def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
VR128:$src3))]>;
def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
- !strconcat(OpcodeStr,
+ !strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst,
(IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
- def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set RC:$dst,
- (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
- let mayLoad = 1 in
- def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, memop:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
}
} // Constraints = "$src1 = $dst"
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, Intrinsic IntF32, Intrinsic IntF64,
- SDNode OpNode> {
- defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>;
- defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>;
- defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>,
- VEX_W;
- defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>,
- VEX_W;
- defm SSr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213ss"), ssmem,
- sse_load_f32, IntF32, FR32, OpNode, f32>;
- defm SDr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213sd"), sdmem,
- sse_load_f64, IntF64, FR64, OpNode, f64>, VEX_W;
+ string OpStr, string PackTy, Intrinsic Int,
+ SDNode OpNode, RegisterClass RC, ValueType OpVT,
+ X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
+ ComplexPattern mem_cpat> {
+let neverHasSideEffects = 1 in {
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, !strconcat("132", PackTy)),
+ x86memop, RC, OpVT, mem_frag>;
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, !strconcat("231", PackTy)),
+ x86memop, RC, OpVT, mem_frag>;
+}
+
+defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
+ x86memop, RC, OpVT, mem_frag, OpNode>,
+ fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
+ memop, mem_cpat, Int, RC>;
+}
+
+multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
+ defm SS : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", IntF32, OpNode,
+ FR32, f32, f32mem, ssmem, loadf32, sse_load_f32>;
+ defm SD : fma3s_forms<opc132, opc213, opc231, OpStr, "sd", IntF64, OpNode,
+ FR64, f64, f64mem, sdmem, loadf64, sse_load_f64>, VEX_W;
}
-defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
- int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
- int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
+ int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
+ int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
-defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
- int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
-defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
- int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
+ int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
+ int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
//===----------------------------------------------------------------------===//
@@ -260,73 +197,102 @@ defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
//===----------------------------------------------------------------------===//
-multiclass fma4s<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic Int> {
- def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
+ PatFrag mem_frag> {
+ let isCommutable = 1 in
+ def rr : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
- def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, memop:$src3),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, MemOp4;
+ def rm : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, mem_cpat:$src3))]>, VEX_W, MemOp4;
- def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, memop:$src2, VR128:$src3),
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2,
+ (mem_frag addr:$src3)))]>, VEX_W, MemOp4;
+ def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst,
- (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+ [(set RC:$dst,
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
// For disassembler
let isCodeGenOnly = 1 in
- def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
}
-multiclass fma4p<bits<8> opc, string OpcodeStr,
- Intrinsic Int128, Intrinsic Int256,
+multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
+ ComplexPattern mem_cpat, Intrinsic Int> {
+ let isCommutable = 1 in
+ def rr_Int : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+ def rm_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
+ mem_cpat:$src3))]>, VEX_W, MemOp4;
+ def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, memop:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set VR128:$dst,
+ (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>;
+}
+
+multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT128, ValueType OpVT256,
PatFrag ld_frag128, PatFrag ld_frag256> {
+ let isCommutable = 1 in
def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, MemOp4;
+ (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
+ VEX_W, MemOp4;
def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2,
+ [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
(ld_frag128 addr:$src3)))]>, VEX_W, MemOp4;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int128 VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ let isCommutable = 1 in
def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>, VEX_W, MemOp4;
+ (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
+ VEX_W, MemOp4, VEX_L;
def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2,
- (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4;
+ [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
+ (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (Int256 VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
// For disassembler
let isCodeGenOnly = 1 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
@@ -336,51 +302,65 @@ let isCodeGenOnly = 1 in {
def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_L;
} // isCodeGenOnly = 1
}
let Predicates = [HasFMA4] in {
-defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfmadd_ss>;
-defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfmadd_sd>;
-defm VFMADDPS4 : fma4p<0x68, "vfmaddps", int_x86_fma_vfmadd_ps,
- int_x86_fma_vfmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", int_x86_fma_vfmadd_pd,
- int_x86_fma_vfmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfmsub_ss>;
-defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfmsub_sd>;
-defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", int_x86_fma_vfmsub_ps,
- int_x86_fma_vfmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", int_x86_fma_vfmsub_pd,
- int_x86_fma_vfmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfnmadd_ss>;
-defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmadd_sd>;
-defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", int_x86_fma_vfnmadd_ps,
- int_x86_fma_vfnmadd_ps_256, memopv4f32, memopv8f32>;
-defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", int_x86_fma_vfnmadd_pd,
- int_x86_fma_vfnmadd_pd_256, memopv2f64, memopv4f64>;
-defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfnmsub_ss>;
-defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmsub_sd>;
-defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", int_x86_fma_vfnmsub_ps,
- int_x86_fma_vfnmsub_ps_256, memopv4f32, memopv8f32>;
-defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", int_x86_fma_vfnmsub_pd,
- int_x86_fma_vfnmsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", int_x86_fma_vfmaddsub_ps,
- int_x86_fma_vfmaddsub_ps_256, memopv4f32, memopv8f32>;
-defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", int_x86_fma_vfmaddsub_pd,
- int_x86_fma_vfmaddsub_pd_256, memopv2f64, memopv4f64>;
-defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", int_x86_fma_vfmsubadd_ps,
- int_x86_fma_vfmsubadd_ps_256, memopv4f32, memopv8f32>;
-defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", int_x86_fma_vfmsubadd_pd,
- int_x86_fma_vfmsubadd_pd_256, memopv2f64, memopv4f64>;
+defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
+ fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfmadd_ss>;
+defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
+ fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmadd_sd>;
+defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
+ fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfmsub_ss>;
+defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
+ fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfmsub_sd>;
+defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
+ X86Fnmadd, loadf32>,
+ fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmadd_ss>;
+defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
+ X86Fnmadd, loadf64>,
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmadd_sd>;
+defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
+ X86Fnmsub, loadf32>,
+ fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
+ int_x86_fma_vfnmsub_ss>;
+defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
+ X86Fnmsub, loadf64>,
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
+ int_x86_fma_vfnmsub_sd>;
+
+defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
+defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps", X86Fmsubadd, v4f32, v8f32,
+ memopv4f32, memopv8f32>;
+defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd", X86Fmsubadd, v2f64, v4f64,
+ memopv2f64, memopv4f64>;
} // HasFMA4
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 81b4f812af66..268e9fc9c017 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -44,14 +44,15 @@ def RawFrmImm16 : Format<44>;
def MRM_D0 : Format<45>;
def MRM_D1 : Format<46>;
def MRM_D4 : Format<47>;
-def MRM_D8 : Format<48>;
-def MRM_D9 : Format<49>;
-def MRM_DA : Format<50>;
-def MRM_DB : Format<51>;
-def MRM_DC : Format<52>;
-def MRM_DD : Format<53>;
-def MRM_DE : Format<54>;
-def MRM_DF : Format<55>;
+def MRM_D5 : Format<48>;
+def MRM_D8 : Format<49>;
+def MRM_D9 : Format<50>;
+def MRM_DA : Format<51>;
+def MRM_DB : Format<52>;
+def MRM_DC : Format<53>;
+def MRM_DD : Format<54>;
+def MRM_DE : Format<55>;
+def MRM_DF : Format<56>;
// ImmType - This specifies the immediate type used by an instruction. This is
// part of the ad-hoc solution used to emit machine instruction encodings by our
@@ -287,12 +288,14 @@ class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm,
let CodeSize = 3;
}
+def __xs : XS;
+
// SI - SSE 1 & 2 scalar instructions
class SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -303,7 +306,7 @@ class SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(!eq(Prefix, 12 /* XS */), [HasSSE1], [HasSSE2]));
+ !if(!eq(Prefix, __xs.Prefix), [UseSSE1], [UseSSE2]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
@@ -314,18 +317,25 @@ class PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
InstrItinClass itin, Domain d>
: I<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEXPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEXPrefix, !strconcat("v", asm), asm);
}
+// MMXPI - SSE 1 & 2 packed instructions with MMX operands
+class MMXPI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern,
+ InstrItinClass itin, Domain d>
+ : I<o, F, outs, ins, asm, pattern, itin, d> {
+ let Predicates = !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]);
+}
+
// PIi8 - SSE 1 & 2 packed instructions with immediate
class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin, Domain d>
: Ii8<o, F, outs, ins, asm, pattern, itin, d> {
let Predicates = !if(hasVEX_4VPrefix /* VEX */, [HasAVX],
- !if(hasOpSizePrefix /* OpSize */, [HasSSE2], [HasSSE1]));
+ !if(hasOpSizePrefix /* OpSize */, [UseSSE2], [UseSSE1]));
// AVX instructions have a 'v' prefix in the mnemonic
let AsmString = !if(hasVEX_4VPrefix, !strconcat("v", asm), asm);
@@ -341,18 +351,18 @@ class PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class SSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE1]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE1]>;
class PSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, TB,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
class VSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XS,
@@ -372,27 +382,31 @@ class VPSI<bits<8> o, Format F, dag outs, dag ins, string asm,
// PDIi8 - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
// VSDI - SSE2 instructions with XD prefix in AVX form.
// VPDI - SSE2 instructions with TB and OpSize prefixes in AVX form.
+// MMXSDIi8 - SSE2 instructions with ImmT == Imm8 and XD prefix as well as
+// MMX operands.
+// MMXSSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix as well as
+// MMX operands.
class SDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[UseSSE2]>;
class S2SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[HasSSE2]>;
+ : I<o, F, outs, ins, asm, pattern, itin>, XS, Requires<[UseSSE2]>;
class S2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
- : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[UseSSE2]>;
class PDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
class VSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin>, XD,
@@ -405,6 +419,12 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, !strconcat("v", asm), pattern, itin, SSEPackedDouble>, TB,
OpSize, Requires<[HasAVX]>;
+class MMXSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, XD, Requires<[HasSSE2]>;
+class MMXS2SIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
// SSE3 Instruction Templates:
//
@@ -415,21 +435,23 @@ class VPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedSingle>, XS,
- Requires<[HasSSE3]>;
+ Requires<[UseSSE3]>;
class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, XD,
- Requires<[HasSSE3]>;
+ Requires<[UseSSE3]>;
class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedDouble>, TB, OpSize,
- Requires<[HasSSE3]>;
+ Requires<[UseSSE3]>;
// SSSE3 Instruction Templates:
//
// SS38I - SSSE3 instructions with T8 prefix.
// SS3AI - SSSE3 instructions with TA prefix.
+// MMXSS38I - SSSE3 instructions with T8 prefix and MMX operands.
+// MMXSS3AI - SSSE3 instructions with TA prefix and MMX operands.
//
// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
// uses the MMX registers. The 64-bit versions are grouped with the MMX
@@ -438,10 +460,18 @@ class S3I<bits<8> o, Format F, dag outs, dag ins, string asm,
class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSSE3]>;
+ Requires<[UseSSSE3]>;
class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
+ Requires<[UseSSSE3]>;
+class MMXSS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
+ Requires<[HasSSSE3]>;
+class MMXSS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
Requires<[HasSSSE3]>;
// SSE4.1 Instruction Templates:
@@ -452,11 +482,11 @@ class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSE41]>;
+ Requires<[UseSSE41]>;
class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- Requires<[HasSSE41]>;
+ Requires<[UseSSE41]>;
// SSE4.2 Instruction Templates:
//
@@ -464,9 +494,10 @@ class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8,
- Requires<[HasSSE42]>;
+ Requires<[UseSSE42]>;
// SS42FI - SSE 4.2 instructions with T8XD prefix.
+// NOTE: 'HasSSE42' is used as SS42FI is only used for CRC32 insns.
class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: I<o, F, outs, ins, asm, pattern, itin>, T8XD, Requires<[HasSSE42]>;
@@ -475,7 +506,7 @@ class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = IIC_DEFAULT>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TA,
- Requires<[HasSSE42]>;
+ Requires<[UseSSE42]>;
// AVX Instruction Templates:
// Instructions introduced in AVX (no SSE equivalent forms)
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 1db68c86b76d..73ba0011df1b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -29,6 +29,13 @@ def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
+
+// Commutative and Associative FMIN and FMAX.
+def X86fminc : SDNode<"X86ISD::FMINC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def X86fmaxc : SDNode<"X86ISD::FMAXC", SDTFPBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+
def X86fand : SDNode<"X86ISD::FAND", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]>;
def X86for : SDNode<"X86ISD::FOR", SDTFPBinOp,
@@ -73,18 +80,30 @@ def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
def X86vzmovly : SDNode<"X86ISD::VZEXT_MOVL",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisOpSmallerThanOp<1, 0> ]>>;
def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL",
- SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
+ SDTypeProfile<1, 1,
+ [SDTCisVec<0>, SDTCisInt<1>, SDTCisInt<0>]>>;
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86vzext : SDNode<"X86ISD::VZEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>]>>;
+
+def X86vsext : SDNode<"X86ISD::VSEXT",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisInt<0>, SDTCisInt<1>]>>;
+
def X86vfpext : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisFP<0>, SDTCisFP<1>]>>;
+def X86vfpround: SDNode<"X86ISD::VFPROUND",
+ SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisFP<0>, SDTCisFP<1>]>>;
def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
@@ -175,8 +194,8 @@ def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
-def X86Fmaddsub : SDNode<"X86ISD::FMSUBADD", SDTFma>;
-def X86Fmsubadd : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFma>;
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -232,6 +251,10 @@ def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (load node:$ptr))>;
def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (load node:$ptr))>;
def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (load node:$ptr))>;
+// 128-/256-bit extload pattern fragments
+def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
+def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
+
// Like 'store', but always requires 128-bit vector alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
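Among the X86InstrFragmentsSIMD.td changes above, the X86Fmaddsub/X86Fmsubadd edit corrects two node definitions that had been bound to each other's ISD opcode. As a reminder of the intended semantics, hedged as my recollection of the usual FMADDSUB convention (even lanes compute a*b - c, odd lanes a*b + c, with FMSUBADD being the mirror image), here is a scalar C++ reference sketch, illustrative only:

#include <array>
#include <cstddef>
#include <cstdio>

// Scalar reference for packed FMADDSUB as I understand the convention:
// even lanes compute a*b - c, odd lanes compute a*b + c. FMSUBADD is the
// mirror image (add in even lanes, subtract in odd lanes).
std::array<double, 4> fmaddsub(const std::array<double, 4> &a,
                               const std::array<double, 4> &b,
                               const std::array<double, 4> &c) {
  std::array<double, 4> r{};
  for (std::size_t i = 0; i < 4; ++i)
    r[i] = (i % 2 == 0) ? a[i] * b[i] - c[i]  // even lane: subtract
                        : a[i] * b[i] + c[i]; // odd lane: add
  return r;
}

int main() {
  std::array<double, 4> a{1, 2, 3, 4}, b{10, 10, 10, 10}, c{1, 1, 1, 1};
  std::array<double, 4> r = fmaddsub(a, b, c);
  std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // 9 21 29 41
  return 0;
}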
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index cca04e5433f5..5a99ff004d48 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -561,6 +561,16 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
+
+ // BMI/BMI2 foldable instructions
+ { X86::RORX32ri, X86::RORX32mi, 0 },
+ { X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -1110,6 +1120,44 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 },
{ X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 },
// FIXME: add AVX 256-bit foldable instructions
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4mr, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4mr, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_16 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_16 },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4mrY, TB_ALIGN_32 },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_16 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_16 },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_16 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_16 },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4mrY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_16 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_16 },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4mrY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 },
+
+ // BMI/BMI2 foldable instructions
+ { X86::MULX32rr, X86::MULX32rm, 0 },
+ { X86::MULX64rr, X86::MULX64rm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
@@ -1145,10 +1193,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 },
{ X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 },
{ X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 },
- { X86::VFMADDPSr213r_Int, X86::VFMADDPSr213m_Int, TB_ALIGN_16 },
- { X86::VFMADDPDr213r_Int, X86::VFMADDPDr213m_Int, TB_ALIGN_16 },
- { X86::VFMADDPSr213rY_Int, X86::VFMADDPSr213mY_Int, TB_ALIGN_32 },
- { X86::VFMADDPDr213rY_Int, X86::VFMADDPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 },
{ X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 },
@@ -1171,10 +1215,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 },
{ X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 },
{ X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 },
- { X86::VFNMADDPSr213r_Int, X86::VFNMADDPSr213m_Int, TB_ALIGN_16 },
- { X86::VFNMADDPDr213r_Int, X86::VFNMADDPDr213m_Int, TB_ALIGN_16 },
- { X86::VFNMADDPSr213rY_Int, X86::VFNMADDPSr213mY_Int, TB_ALIGN_32 },
- { X86::VFNMADDPDr213rY_Int, X86::VFNMADDPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 },
{ X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 },
@@ -1197,10 +1237,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 },
{ X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 },
{ X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 },
- { X86::VFMSUBPSr213r_Int, X86::VFMSUBPSr213m_Int, TB_ALIGN_16 },
- { X86::VFMSUBPDr213r_Int, X86::VFMSUBPDr213m_Int, TB_ALIGN_16 },
- { X86::VFMSUBPSr213rY_Int, X86::VFMSUBPSr213mY_Int, TB_ALIGN_32 },
- { X86::VFMSUBPDr213rY_Int, X86::VFMSUBPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 },
{ X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 },
@@ -1223,10 +1259,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 },
{ X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 },
{ X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 },
- { X86::VFNMSUBPSr213r_Int, X86::VFNMSUBPSr213m_Int, TB_ALIGN_16 },
- { X86::VFNMSUBPDr213r_Int, X86::VFNMSUBPDr213m_Int, TB_ALIGN_16 },
- { X86::VFNMSUBPSr213rY_Int, X86::VFNMSUBPSr213mY_Int, TB_ALIGN_32 },
- { X86::VFNMSUBPDr213rY_Int, X86::VFNMSUBPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 },
{ X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 },
@@ -1240,10 +1272,6 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 },
{ X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 },
{ X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPSr213r_Int, X86::VFMADDSUBPSr213m_Int, TB_ALIGN_16 },
- { X86::VFMADDSUBPDr213r_Int, X86::VFMADDSUBPDr213m_Int, TB_ALIGN_16 },
- { X86::VFMADDSUBPSr213rY_Int, X86::VFMADDSUBPSr213mY_Int, TB_ALIGN_32 },
- { X86::VFMADDSUBPDr213rY_Int, X86::VFMADDSUBPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 },
{ X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 },
@@ -1257,10 +1285,40 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPSr213r_Int, X86::VFMSUBADDPSr213m_Int, TB_ALIGN_16 },
- { X86::VFMSUBADDPDr213r_Int, X86::VFMSUBADDPDr213m_Int, TB_ALIGN_16 },
- { X86::VFMSUBADDPSr213rY_Int, X86::VFMSUBADDPSr213mY_Int, TB_ALIGN_32 },
- { X86::VFMSUBADDPDr213rY_Int, X86::VFMSUBADDPDr213mY_Int, TB_ALIGN_32 },
+
+ // FMA4 foldable patterns
+ { X86::VFMADDSS4rr, X86::VFMADDSS4rm, 0 },
+ { X86::VFMADDSD4rr, X86::VFMADDSD4rm, 0 },
+ { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_16 },
+ { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_16 },
+ { X86::VFMADDPS4rrY, X86::VFMADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFMADDPD4rrY, X86::VFMADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, 0 },
+ { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, 0 },
+ { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_16 },
+ { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_16 },
+ { X86::VFNMADDPS4rrY, X86::VFNMADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFNMADDPD4rrY, X86::VFNMADDPD4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, 0 },
+ { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, 0 },
+ { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBPS4rrY, X86::VFMSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBPD4rrY, X86::VFMSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, 0 },
+ { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, 0 },
+ { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFNMSUBPS4rrY, X86::VFNMSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFNMSUBPD4rrY, X86::VFNMSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_16 },
+ { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_16 },
+ { X86::VFMADDSUBPS4rrY, X86::VFMADDSUBPS4rmY, TB_ALIGN_32 },
+ { X86::VFMADDSUBPD4rrY, X86::VFMADDSUBPD4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_16 },
+ { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_16 },
+ { X86::VFMSUBADDPS4rrY, X86::VFMSUBADDPS4rmY, TB_ALIGN_32 },
+ { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4rmY, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -1318,8 +1376,7 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
SrcReg = MI.getOperand(1).getReg();
DstReg = MI.getOperand(0).getReg();
switch (MI.getOpcode()) {
- default:
- llvm_unreachable(0);
+ default: llvm_unreachable("Unreachable!");
case X86::MOVSX16rr8:
case X86::MOVZX16rr8:
case X86::MOVSX32rr8:
@@ -1483,69 +1540,69 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
AliasAnalysis *AA) const {
switch (MI->getOpcode()) {
default: break;
- case X86::MOV8rm:
- case X86::MOV16rm:
- case X86::MOV32rm:
- case X86::MOV64rm:
- case X86::LD_Fp64m:
- case X86::MOVSSrm:
- case X86::MOVSDrm:
- case X86::MOVAPSrm:
- case X86::MOVUPSrm:
- case X86::MOVAPDrm:
- case X86::MOVDQArm:
- case X86::VMOVSSrm:
- case X86::VMOVSDrm:
- case X86::VMOVAPSrm:
- case X86::VMOVUPSrm:
- case X86::VMOVAPDrm:
- case X86::VMOVDQArm:
- case X86::VMOVAPSYrm:
- case X86::VMOVUPSYrm:
- case X86::VMOVAPDYrm:
- case X86::VMOVDQAYrm:
- case X86::MMX_MOVD64rm:
- case X86::MMX_MOVQ64rm:
- case X86::FsVMOVAPSrm:
- case X86::FsVMOVAPDrm:
- case X86::FsMOVAPSrm:
- case X86::FsMOVAPDrm: {
- // Loads from constant pools are trivially rematerializable.
- if (MI->getOperand(1).isReg() &&
- MI->getOperand(2).isImm() &&
- MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
- MI->isInvariantLoad(AA)) {
- unsigned BaseReg = MI->getOperand(1).getReg();
- if (BaseReg == 0 || BaseReg == X86::RIP)
- return true;
- // Allow re-materialization of PIC load.
- if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
- return false;
- const MachineFunction &MF = *MI->getParent()->getParent();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- return regIsPICBase(BaseReg, MRI);
- }
- return false;
+ case X86::MOV8rm:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::LD_Fp64m:
+ case X86::MOVSSrm:
+ case X86::MOVSDrm:
+ case X86::MOVAPSrm:
+ case X86::MOVUPSrm:
+ case X86::MOVAPDrm:
+ case X86::MOVDQArm:
+ case X86::VMOVSSrm:
+ case X86::VMOVSDrm:
+ case X86::VMOVAPSrm:
+ case X86::VMOVUPSrm:
+ case X86::VMOVAPDrm:
+ case X86::VMOVDQArm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::MMX_MOVD64rm:
+ case X86::MMX_MOVQ64rm:
+ case X86::FsVMOVAPSrm:
+ case X86::FsVMOVAPDrm:
+ case X86::FsMOVAPSrm:
+ case X86::FsMOVAPDrm: {
+ // Loads from constant pools are trivially rematerializable.
+ if (MI->getOperand(1).isReg() &&
+ MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ MI->isInvariantLoad(AA)) {
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0 || BaseReg == X86::RIP)
+ return true;
+ // Allow re-materialization of PIC load.
+ if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+ return false;
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
}
+ return false;
+ }
- case X86::LEA32r:
- case X86::LEA64r: {
- if (MI->getOperand(2).isImm() &&
- MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
- !MI->getOperand(4).isReg()) {
- // lea fi#, lea GV, etc. are all rematerializable.
- if (!MI->getOperand(1).isReg())
- return true;
- unsigned BaseReg = MI->getOperand(1).getReg();
- if (BaseReg == 0)
- return true;
- // Allow re-materialization of lea PICBase + x.
- const MachineFunction &MF = *MI->getParent()->getParent();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- return regIsPICBase(BaseReg, MRI);
- }
- return false;
- }
+ case X86::LEA32r:
+ case X86::LEA64r: {
+ if (MI->getOperand(2).isImm() &&
+ MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+ !MI->getOperand(4).isReg()) {
+ // lea fi#, lea GV, etc. are all rematerializable.
+ if (!MI->getOperand(1).isReg())
+ return true;
+ unsigned BaseReg = MI->getOperand(1).getReg();
+ if (BaseReg == 0)
+ return true;
+ // Allow re-materialization of lea PICBase + x.
+ const MachineFunction &MF = *MI->getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ return regIsPICBase(BaseReg, MRI);
+ }
+ return false;
+ }
}
// All other instructions marked M_REMATERIALIZABLE are always trivially
@@ -1654,7 +1711,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
case X86::MOV64r0: {
if (!isSafeToClobberEFLAGS(MBB, I)) {
switch (Opc) {
- default: break;
+ default: llvm_unreachable("Unreachable!");
case X86::MOV8r0: Opc = X86::MOV8ri; break;
case X86::MOV16r0: Opc = X86::MOV16ri; break;
case X86::MOV32r0: Opc = X86::MOV32ri; break;
@@ -1727,8 +1784,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
get(Opc), leaOutReg);
switch (MIOpc) {
- default:
- llvm_unreachable(0);
+ default: llvm_unreachable("Unreachable!");
case X86::SHL16ri: {
unsigned ShAmt = MI->getOperand(2).getImm();
MIB.addReg(0).addImm(1 << ShAmt)
@@ -1812,10 +1868,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
MachineInstr *MI = MBBI;
MachineFunction &MF = *MI->getParent()->getParent();
// All instructions input are two-addr instructions. Get the known operands.
- unsigned Dest = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
- bool isDead = MI->getOperand(0).isDead();
- bool isKill = MI->getOperand(1).isKill();
+ const MachineOperand &Dest = MI->getOperand(0);
+ const MachineOperand &Src = MI->getOperand(1);
MachineInstr *NewMI = NULL;
// FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's. When
@@ -1833,11 +1887,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
if (B != C) return 0;
- unsigned A = MI->getOperand(0).getReg();
unsigned M = MI->getOperand(3).getImm();
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
- .addReg(A, RegState::Define | getDeadRegState(isDead))
- .addReg(B, getKillRegState(isKill)).addImm(M);
+ .addOperand(Dest).addOperand(Src).addImm(M);
break;
}
case X86::SHUFPDrri: {
@@ -1847,15 +1899,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned B = MI->getOperand(1).getReg();
unsigned C = MI->getOperand(2).getReg();
if (B != C) return 0;
- unsigned A = MI->getOperand(0).getReg();
unsigned M = MI->getOperand(3).getImm();
// Convert to PSHUFD mask.
M = ((M & 1) << 1) | ((M & 1) << 3) | ((M & 2) << 4) | ((M & 2) << 6)| 0x44;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
- .addReg(A, RegState::Define | getDeadRegState(isDead))
- .addReg(B, getKillRegState(isKill)).addImm(M);
+ .addOperand(Dest).addOperand(Src).addImm(M);
break;
}
case X86::SHL64ri: {
@@ -1866,15 +1916,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (ShAmt == 0 || ShAmt >= 4) return 0;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, &X86::GR64_NOSPRegClass))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR64_NOSPRegClass))
return 0;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(0).addImm(1 << ShAmt)
- .addReg(Src, getKillRegState(isKill))
- .addImm(0).addReg(0);
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
break;
}
case X86::SHL32ri: {
@@ -1885,15 +1934,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (ShAmt == 0 || ShAmt >= 4) return 0;
// LEA can't handle ESP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, &X86::GR32_NOSPRegClass))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(),
+ &X86::GR32_NOSPRegClass))
return 0;
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(0).addImm(1 << ShAmt)
- .addReg(Src, getKillRegState(isKill)).addImm(0).addReg(0);
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
break;
}
case X86::SHL16ri: {
@@ -1906,10 +1955,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define | getDeadRegState(isDead))
- .addReg(0).addImm(1 << ShAmt)
- .addReg(Src, getKillRegState(isKill))
- .addImm(0).addReg(0);
+ .addOperand(Dest)
+ .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
break;
}
default: {
@@ -1932,14 +1979,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
(const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, RC))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
return 0;
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, 1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src), 1);
break;
}
case X86::INC16r:
@@ -1947,10 +1992,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, 1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), 1);
break;
case X86::DEC64r:
case X86::DEC32r:
@@ -1962,14 +2005,12 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
(const TargetRegisterClass*)&X86::GR64_NOSPRegClass :
(const TargetRegisterClass*)&X86::GR32_NOSPRegClass;
// LEA can't handle RSP.
- if (TargetRegisterInfo::isVirtualRegister(Src) &&
- !MF.getRegInfo().constrainRegClass(Src, RC))
+ if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
+ !MF.getRegInfo().constrainRegClass(Src.getReg(), RC))
return 0;
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, -1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src), -1);
break;
}
case X86::DEC16r:
@@ -1977,10 +2018,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, -1);
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src), -1);
break;
case X86::ADD64rr:
case X86::ADD64rr_DB:
@@ -2007,9 +2046,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
return 0;
NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, Src2, isKill2);
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
// Preserve undefness of the operands.
bool isUndef = MI->getOperand(1).isUndef();
@@ -2029,9 +2067,15 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
unsigned Src2 = MI->getOperand(2).getReg();
bool isKill2 = MI->getOperand(2).isKill();
NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, Src2, isKill2);
+ .addOperand(Dest),
+ Src.getReg(), Src.isKill(), Src2, isKill2);
+
+ // Preserve undefness of the operands.
+ bool isUndef = MI->getOperand(1).isUndef();
+ bool isUndef2 = MI->getOperand(2).isUndef();
+ NewMI->getOperand(1).setIsUndef(isUndef);
+ NewMI->getOperand(3).setIsUndef(isUndef2);
+
if (LV && isKill2)
LV->replaceKillInstruction(Src2, MI, NewMI);
break;
@@ -2041,10 +2085,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD64ri32_DB:
case X86::ADD64ri8_DB:
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, MI->getOperand(2).getImm());
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
break;
case X86::ADD32ri:
case X86::ADD32ri8:
@@ -2052,10 +2095,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::ADD32ri8_DB: {
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, MI->getOperand(2).getImm());
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
break;
}
case X86::ADD16ri:
@@ -2065,10 +2107,9 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (DisableLEA16)
return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
- NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
- .addReg(Dest, RegState::Define |
- getDeadRegState(isDead)),
- Src, isKill, MI->getOperand(2).getImm());
+ NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
+ .addOperand(Dest).addOperand(Src),
+ MI->getOperand(2).getImm());
break;
}
}
@@ -2077,10 +2118,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
if (!NewMI) return 0;
if (LV) { // Update live variables
- if (isKill)
- LV->replaceKillInstruction(Src, MI, NewMI);
- if (isDead)
- LV->replaceKillInstruction(Dest, MI, NewMI);
+ if (Src.isKill())
+ LV->replaceKillInstruction(Src.getReg(), MI, NewMI);
+ if (Dest.isDead())
+ LV->replaceKillInstruction(Dest.getReg(), MI, NewMI);
}
MFI->insert(MBBI, NewMI); // Insert the new inst
@@ -2120,57 +2161,25 @@ X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
MI->getOperand(3).setImm(Size-Amt);
return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
}
- case X86::CMOVB16rr:
- case X86::CMOVB32rr:
- case X86::CMOVB64rr:
- case X86::CMOVAE16rr:
- case X86::CMOVAE32rr:
- case X86::CMOVAE64rr:
- case X86::CMOVE16rr:
- case X86::CMOVE32rr:
- case X86::CMOVE64rr:
- case X86::CMOVNE16rr:
- case X86::CMOVNE32rr:
- case X86::CMOVNE64rr:
- case X86::CMOVBE16rr:
- case X86::CMOVBE32rr:
- case X86::CMOVBE64rr:
- case X86::CMOVA16rr:
- case X86::CMOVA32rr:
- case X86::CMOVA64rr:
- case X86::CMOVL16rr:
- case X86::CMOVL32rr:
- case X86::CMOVL64rr:
- case X86::CMOVGE16rr:
- case X86::CMOVGE32rr:
- case X86::CMOVGE64rr:
- case X86::CMOVLE16rr:
- case X86::CMOVLE32rr:
- case X86::CMOVLE64rr:
- case X86::CMOVG16rr:
- case X86::CMOVG32rr:
- case X86::CMOVG64rr:
- case X86::CMOVS16rr:
- case X86::CMOVS32rr:
- case X86::CMOVS64rr:
- case X86::CMOVNS16rr:
- case X86::CMOVNS32rr:
- case X86::CMOVNS64rr:
- case X86::CMOVP16rr:
- case X86::CMOVP32rr:
- case X86::CMOVP64rr:
- case X86::CMOVNP16rr:
- case X86::CMOVNP32rr:
- case X86::CMOVNP64rr:
- case X86::CMOVO16rr:
- case X86::CMOVO32rr:
- case X86::CMOVO64rr:
- case X86::CMOVNO16rr:
- case X86::CMOVNO32rr:
- case X86::CMOVNO64rr: {
- unsigned Opc = 0;
+ case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr:
+ case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr:
+ case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
+ case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+ case X86::CMOVBE16rr: case X86::CMOVBE32rr: case X86::CMOVBE64rr:
+ case X86::CMOVA16rr: case X86::CMOVA32rr: case X86::CMOVA64rr:
+ case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
+ case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+ case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+ case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
+ case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
+ case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+ case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
+ case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+ case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
+ case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: {
+ unsigned Opc;
switch (MI->getOpcode()) {
- default: break;
+ default: llvm_unreachable("Unreachable!");
case X86::CMOVB16rr: Opc = X86::CMOVAE16rr; break;
case X86::CMOVB32rr: Opc = X86::CMOVAE32rr; break;
case X86::CMOVB64rr: Opc = X86::CMOVAE64rr; break;
@@ -2279,7 +2288,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) {
}
/// getCondFromCmovOpc - return condition code of a CMov opcode.
-static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
switch (Opc) {
default: return X86::COND_INVALID;
case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
@@ -2402,7 +2411,7 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
/// whether it has memory operand.
static unsigned getSETFromCond(X86::CondCode CC,
bool HasMemoryOperand) {
- static const unsigned Opc[16][2] = {
+ static const uint16_t Opc[16][2] = {
{ X86::SETAr, X86::SETAm },
{ X86::SETAEr, X86::SETAEm },
{ X86::SETBr, X86::SETBm },
@@ -2429,7 +2438,7 @@ static unsigned getSETFromCond(X86::CondCode CC,
/// register size in bytes, and operand type.
static unsigned getCMovFromCond(X86::CondCode CC, unsigned RegBytes,
bool HasMemoryOperand) {
- static const unsigned Opc[32][3] = {
+ static const uint16_t Opc[32][3] = {
{ X86::CMOVA16rr, X86::CMOVA32rr, X86::CMOVA64rr },
{ X86::CMOVAE16rr, X86::CMOVAE32rr, X86::CMOVAE64rr },
{ X86::CMOVB16rr, X86::CMOVB32rr, X86::CMOVB64rr },
@@ -2762,19 +2771,18 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// SrcReg(GR64) -> DestReg(VR64)
if (X86::GR64RegClass.contains(DestReg)) {
- if (X86::VR128RegClass.contains(SrcReg)) {
+ if (X86::VR128RegClass.contains(SrcReg))
// Copy from a VR128 register to a GR64 register.
return HasAVX ? X86::VMOVPQIto64rr : X86::MOVPQIto64rr;
- } else if (X86::VR64RegClass.contains(SrcReg)) {
+ if (X86::VR64RegClass.contains(SrcReg))
// Copy from a VR64 register to a GR64 register.
return X86::MOVSDto64rr;
- }
} else if (X86::GR64RegClass.contains(SrcReg)) {
// Copy from a GR64 register to a VR128 register.
if (X86::VR128RegClass.contains(DestReg))
return HasAVX ? X86::VMOV64toPQIrr : X86::MOV64toPQIrr;
// Copy from a GR64 register to a VR64 register.
- else if (X86::VR64RegClass.contains(DestReg))
+ if (X86::VR64RegClass.contains(DestReg))
return X86::MOV64toSDrr;
}
@@ -2782,12 +2790,12 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
// SrcReg(GR32) -> DestReg(FR32)
if (X86::GR32RegClass.contains(DestReg) && X86::FR32RegClass.contains(SrcReg))
- // Copy from a FR32 register to a GR32 register.
- return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
+ // Copy from a FR32 register to a GR32 register.
+ return HasAVX ? X86::VMOVSS2DIrr : X86::MOVSS2DIrr;
if (X86::FR32RegClass.contains(DestReg) && X86::GR32RegClass.contains(SrcReg))
- // Copy from a GR32 register to a FR32 register.
- return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
+ // Copy from a GR32 register to a FR32 register.
+ return HasAVX ? X86::VMOVDI2SSrr : X86::MOVDI2SSrr;
return 0;
}
@@ -2798,7 +2806,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
- unsigned Opc = 0;
+ unsigned Opc;
if (X86::GR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MOV64rr;
else if (X86::GR32RegClass.contains(DestReg, SrcReg))
@@ -2837,7 +2845,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, MI, DL, get(X86::PUSHF64));
BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
return;
- } else if (X86::GR32RegClass.contains(DestReg)) {
+ }
+ if (X86::GR32RegClass.contains(DestReg)) {
BuildMI(MBB, MI, DL, get(X86::PUSHF32));
BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
return;
@@ -2849,7 +2858,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
BuildMI(MBB, MI, DL, get(X86::POPF64));
return;
- } else if (X86::GR32RegClass.contains(SrcReg)) {
+ }
+ if (X86::GR32RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(X86::PUSH32r))
.addReg(SrcReg, getKillRegState(KillSrc));
BuildMI(MBB, MI, DL, get(X86::POPF32));
@@ -3139,11 +3149,19 @@ inline static bool isDefConvertible(MachineInstr *MI) {
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
+ case X86::DEC64m: case X86::DEC32m: case X86::DEC16m: case X86::DEC8m:
+ case X86::DEC64_32r: case X86::DEC64_16r:
+ case X86::DEC64_32m: case X86::DEC64_16m:
case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
+ case X86::INC64m: case X86::INC32m: case X86::INC16m: case X86::INC8m:
+ case X86::INC64_32r: case X86::INC64_16r:
+ case X86::INC64_32m: case X86::INC64_16m:
case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
@@ -3193,7 +3211,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
return false;
// There is no use of the destination register, we can replace SUB with CMP.
switch (CmpInstr->getOpcode()) {
- default: llvm_unreachable(0);
+ default: llvm_unreachable("Unreachable!");
case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
@@ -3318,7 +3336,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (OldCC != X86::COND_INVALID)
OpcIsSET = true;
else
- OldCC = getCondFromCMovOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
}
if (OldCC == X86::COND_INVALID) return false;
}
@@ -3383,12 +3401,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
}
- // Make sure Sub instruction defines EFLAGS.
+ // Make sure Sub instruction defines EFLAGS and mark the def live.
+ unsigned LastOperand = Sub->getNumOperands() - 1;
assert(Sub->getNumOperands() >= 2 &&
- Sub->getOperand(Sub->getNumOperands()-1).isReg() &&
- Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS &&
+ Sub->getOperand(LastOperand).isReg() &&
+ Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
"EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
- Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true);
+ Sub->getOperand(LastOperand).setIsDef(true);
+ Sub->getOperand(LastOperand).setIsDead(false);
CmpInstr->eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
@@ -3497,10 +3517,25 @@ static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) {
bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
switch (MI->getOpcode()) {
+ case X86::SETB_C8r:
+ return Expand2AddrUndef(MI, get(X86::SBB8rr));
+ case X86::SETB_C16r:
+ return Expand2AddrUndef(MI, get(X86::SBB16rr));
+ case X86::SETB_C32r:
+ return Expand2AddrUndef(MI, get(X86::SBB32rr));
+ case X86::SETB_C64r:
+ return Expand2AddrUndef(MI, get(X86::SBB64rr));
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+ case X86::AVX_SET0:
+ assert(HasAVX && "AVX not supported");
+ return Expand2AddrUndef(MI, get(X86::VXORPSYrr));
+ case X86::V_SETALLONES:
+ return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+ case X86::AVX2_SETALLONES:
+ return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr));
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
@@ -3614,14 +3649,16 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
OpcodeTablePtr = &RegOp2MemOpTable2Addr;
isTwoAddrFold = true;
} else if (i == 0) { // If operand 0
- if (MI->getOpcode() == X86::MOV64r0)
- NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
- else if (MI->getOpcode() == X86::MOV32r0)
- NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
- else if (MI->getOpcode() == X86::MOV16r0)
- NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
- else if (MI->getOpcode() == X86::MOV8r0)
- NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+ unsigned Opc = 0;
+ switch (MI->getOpcode()) {
+ default: break;
+ case X86::MOV64r0: Opc = X86::MOV64mi32; break;
+ case X86::MOV32r0: Opc = X86::MOV32mi; break;
+ case X86::MOV16r0: Opc = X86::MOV16mi; break;
+ case X86::MOV8r0: Opc = X86::MOV8mi; break;
+ }
+ if (Opc)
+ NewMI = MakeM0Inst(*this, Opc, MOs, MI);
if (NewMI)
return NewMI;
@@ -3799,7 +3836,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+ if (!MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize) &&
hasPartialRegUpdate(MI->getOpcode()))
return 0;
@@ -3840,7 +3878,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize) &&
+ if (!MF.getFunction()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize) &&
hasPartialRegUpdate(MI->getOpcode()))
return 0;
@@ -3850,15 +3889,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
Alignment = (*LoadMI->memoperands_begin())->getAlignment();
else
switch (LoadMI->getOpcode()) {
- case X86::AVX_SET0PSY:
- case X86::AVX_SET0PDY:
case X86::AVX2_SETALLONES:
- case X86::AVX2_SET0:
+ case X86::AVX_SET0:
Alignment = 32;
break;
case X86::V_SET0:
case X86::V_SETALLONES:
- case X86::AVX_SETALLONES:
Alignment = 16;
break;
case X86::FsFLD0SD:
@@ -3894,11 +3930,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
switch (LoadMI->getOpcode()) {
case X86::V_SET0:
case X86::V_SETALLONES:
- case X86::AVX_SET0PSY:
- case X86::AVX_SET0PDY:
- case X86::AVX_SETALLONES:
case X86::AVX2_SETALLONES:
- case X86::AVX2_SET0:
+ case X86::AVX_SET0:
case X86::FsFLD0SD:
case X86::FsFLD0SS: {
// Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
@@ -3930,15 +3963,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
Ty = Type::getFloatTy(MF.getFunction()->getContext());
else if (Opc == X86::FsFLD0SD)
Ty = Type::getDoubleTy(MF.getFunction()->getContext());
- else if (Opc == X86::AVX_SET0PSY || Opc == X86::AVX_SET0PDY)
- Ty = VectorType::get(Type::getFloatTy(MF.getFunction()->getContext()), 8);
- else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX2_SET0)
+ else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0)
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
else
Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
- bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES ||
- Opc == X86::AVX2_SETALLONES);
+ bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES);
const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) :
Constant::getNullValue(Ty);
unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
@@ -4013,6 +4043,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
OpcodeTablePtr = &RegOp2MemOpTable1;
} else if (OpNum == 2) {
OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (OpNum == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
}
if (OpcodeTablePtr && OpcodeTablePtr->count(Opc))
@@ -4102,7 +4134,6 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
getUndefRegState(MO.isUndef()));
}
// Change CMP32ri r, 0 back to TEST32rr r, r, etc.
- unsigned NewOpc = 0;
switch (DataMI->getOpcode()) {
default: break;
case X86::CMP64ri32:
@@ -4115,8 +4146,9 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
MachineOperand &MO0 = DataMI->getOperand(0);
MachineOperand &MO1 = DataMI->getOperand(1);
if (MO1.getImm() == 0) {
+ unsigned NewOpc;
switch (DataMI->getOpcode()) {
- default: break;
+ default: llvm_unreachable("Unreachable!");
case X86::CMP64ri8:
case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
case X86::CMP32ri8:
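A large part of the X86InstrInfo.cpp changes above compacts the CMOV handling in commuteInstruction: swapping the two source operands of a conditional move is only legal if the condition code is inverted at the same time (CMOVB becomes CMOVAE, and so on), and the rewritten switch now hits llvm_unreachable on unknown opcodes instead of silently falling through. A minimal stand-alone C++ sketch of that invariant, using a hypothetical condition enum rather than LLVM's X86::CondCode:

#include <cassert>

// Hypothetical condition codes; LLVM's X86::CondCode is richer.
enum class Cond { Below, AboveEqual, Equal, NotEqual };

// Swapping the two sources of a conditional move
//   dst = C ? a : b   ->   dst = invert(C) ? b : a
// is only correct if the condition is inverted at the same time, which is
// what the CMOVxx -> CMOVyy opcode switch above encodes.
Cond invert(Cond C) {
  switch (C) {
  case Cond::Below:      return Cond::AboveEqual;
  case Cond::AboveEqual: return Cond::Below;
  case Cond::Equal:      return Cond::NotEqual;
  case Cond::NotEqual:   return Cond::Equal;
  }
  assert(false && "Unreachable!"); // mirrors the llvm_unreachable style above
  return C;
}

int main() {
  for (Cond C : {Cond::Below, Cond::AboveEqual, Cond::Equal, Cond::NotEqual})
    assert(invert(invert(C)) == C); // inversion is an involution
  return 0;
}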
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index b6f69af037c2..260f054d69cb 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -61,6 +61,9 @@ namespace X86 {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+ // Turn CMov opcode into condition code.
+ CondCode getCondFromCMovOpc(unsigned Opc);
+
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(X86::CondCode CC);
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index d293156c1f71..650fa95d7f23 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -114,7 +114,7 @@ def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
def SDT_X86MEMBARRIER : SDTypeProfile<0, 0, []>;
def X86MemBarrier : SDNode<"X86ISD::MEMBARRIER", SDT_X86MEMBARRIER,
- [SDNPHasChain]>;
+ [SDNPHasChain,SDNPSideEffect]>;
def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
[SDNPHasChain]>;
def X86SFence : SDNode<"X86ISD::SFENCE", SDT_X86MEMBARRIER,
@@ -216,6 +216,14 @@ def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR,
def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
[SDNPHasChain]>;
+def X86eh_sjlj_setjmp : SDNode<"X86ISD::EH_SJLJ_SETJMP",
+ SDTypeProfile<1, 1, [SDTCisInt<0>,
+ SDTCisPtrTy<1>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+def X86eh_sjlj_longjmp : SDNode<"X86ISD::EH_SJLJ_LONGJMP",
+ SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPSideEffect]>;
+
def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
@@ -397,7 +405,7 @@ def i64mem_TC : Operand<i64> {
let OperandType = "OPERAND_PCREL",
ParserMatchClass = X86AbsMemAsmOperand,
- PrintMethod = "print_pcrel_imm" in {
+ PrintMethod = "printPCRelImm" in {
def i32imm_pcrel : Operand<i32>;
def i16imm_pcrel : Operand<i16>;
@@ -418,7 +426,7 @@ def SSECC : Operand<i8> {
}
def AVXCC : Operand<i8> {
- let PrintMethod = "printSSECC";
+ let PrintMethod = "printAVXCC";
let OperandType = "OPERAND_IMMEDIATE";
}
@@ -499,7 +507,7 @@ def i64i32imm : Operand<i64> {
// 64-bits but only 32 bits are significant, and those bits are treated as being
// pc relative.
def i64i32imm_pcrel : Operand<i64> {
- let PrintMethod = "print_pcrel_imm";
+ let PrintMethod = "printPCRelImm";
let ParserMatchClass = X86AbsMemAsmOperand;
let OperandType = "OPERAND_PCREL";
}
@@ -552,14 +560,21 @@ def HasMMX : Predicate<"Subtarget->hasMMX()">;
def Has3DNow : Predicate<"Subtarget->has3DNow()">;
def Has3DNowA : Predicate<"Subtarget->has3DNowA()">;
def HasSSE1 : Predicate<"Subtarget->hasSSE1()">;
+def UseSSE1 : Predicate<"Subtarget->hasSSE1() && !Subtarget->hasAVX()">;
def HasSSE2 : Predicate<"Subtarget->hasSSE2()">;
+def UseSSE2 : Predicate<"Subtarget->hasSSE2() && !Subtarget->hasAVX()">;
def HasSSE3 : Predicate<"Subtarget->hasSSE3()">;
+def UseSSE3 : Predicate<"Subtarget->hasSSE3() && !Subtarget->hasAVX()">;
def HasSSSE3 : Predicate<"Subtarget->hasSSSE3()">;
+def UseSSSE3 : Predicate<"Subtarget->hasSSSE3() && !Subtarget->hasAVX()">;
def HasSSE41 : Predicate<"Subtarget->hasSSE41()">;
+def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
+def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
+def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
@@ -574,6 +589,7 @@ def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM : Predicate<"Subtarget->hasRTM()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -1259,28 +1275,46 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
// Atomic support
//
-
// Atomic swap. These are just normal xchg instructions. But since a memory
// operand is referenced, the atomicity is ensured.
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+ InstrItinClass itin> {
+ let Constraints = "$val = $dst" in {
+ def #NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin>;
+ def #NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def #NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def #NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
+
+// Swap between registers.
let Constraints = "$val = $dst" in {
-def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
- "xchg{b}\t{$val, $ptr|$ptr, $val}",
- [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))],
- IIC_XCHG_MEM>;
-def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr),
- "xchg{w}\t{$val, $ptr|$ptr, $val}",
- [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))],
- IIC_XCHG_MEM>,
- OpSize;
-def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr),
- "xchg{l}\t{$val, $ptr|$ptr, $val}",
- [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))],
- IIC_XCHG_MEM>;
-def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr),
- "xchg{q}\t{$val, $ptr|$ptr, $val}",
- [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))],
- IIC_XCHG_MEM>;
-
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
"xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
@@ -1291,6 +1325,7 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
"xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
}
+// Swap between EAX and other registers.
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
"xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
@@ -1672,6 +1707,8 @@ include "X86Instr3DNow.td"
include "X86InstrVMX.td"
include "X86InstrSVM.td"
+include "X86InstrTSX.td"
+
// System instructions.
include "X86InstrSystem.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index c8f40bbb4905..127af6f7f93a 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -118,11 +118,11 @@ let Constraints = "$src1 = $dst" in {
/// Unary MMX instructions requiring SSSE3.
multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
Intrinsic IntId64, OpndItins itins> {
- def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst, (IntId64 VR64:$src))], itins.rr>;
- def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst,
(IntId64 (bitconvert (memopmmx addr:$src))))],
@@ -134,11 +134,11 @@ let ImmT = NoImm, Constraints = "$src1 = $dst" in {
multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
Intrinsic IntId64, OpndItins itins> {
let isCommutable = 0 in
- def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+ def rr64 : MMXSS38I<opc, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))], itins.rr>;
- def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+ def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
@@ -149,11 +149,11 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
/// PALIGN MMX instructions (require SSSE3).
multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
- def R64irr : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+ def R64irr : MMXSS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
(ins VR64:$src1, VR64:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1, VR64:$src2, (i8 imm:$src3)))]>;
- def R64irm : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+ def R64irm : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, i8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR64:$dst, (IntId VR64:$src1,
@@ -163,12 +163,10 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId> {
multiclass sse12_cvt_pint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag,
string asm, OpndItins itins, Domain d> {
- def irr : PI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
- [(set DstRC:$dst, (Int SrcRC:$src))],
- itins.rr, d>;
- def irm : PI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
- [(set DstRC:$dst, (Int (ld_frag addr:$src)))],
- itins.rm, d>;
+ def irr : MMXPI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr, d>;
+ def irm : MMXPI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
+ [(set DstRC:$dst, (Int (ld_frag addr:$src)))], itins.rm, d>;
}
multiclass sse12_cvt_pint_3addr<bits<8> opc, RegisterClass SrcRC,
@@ -209,8 +207,14 @@ def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
"movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_MM_RM>;
-def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src),
- "movd\t{$src, $dst|$dst, $src}", [], IIC_MMX_MOV_REG_MM>;
+
+// Low word of MMX to GPR.
+def MMX_X86movd2w : SDNode<"X86ISD::MMX_MOVD2W", SDTypeProfile<1, 1,
+ [SDTCisVT<0, i32>, SDTCisVT<1, x86mmx>]>>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src),
+ "movd\t{$src, $dst|$dst, $src}",
+ [(set GR32:$dst,
+ (MMX_X86movd2w (x86mmx VR64:$src)))], IIC_MMX_MOV_REG_MM>;
let neverHasSideEffects = 1 in
def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
@@ -243,29 +247,30 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
[(store (x86mmx VR64:$src), addr:$dst)],
IIC_MMX_MOVQ_RM>;
-def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
- (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
- [(set VR64:$dst,
- (x86mmx (bitconvert
- (i64 (vector_extract (v2i64 VR128:$src),
- (iPTR 0))))))],
- IIC_MMX_MOVQ_RR>;
-
-def MMX_MOVQ2DQrr : S2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
- (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst,
- (v2i64 (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [(set VR64:$dst,
+ (x86mmx (bitconvert
+ (i64 (vector_extract (v2i64 VR128:$src),
+ (iPTR 0))))))],
+ IIC_MMX_MOVQ_RR>;
+
+def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst,
+ (v2i64
+ (scalar_to_vector
+ (i64 (bitconvert (x86mmx VR64:$src))))))],
+ IIC_MMX_MOVQ_RR>;
let neverHasSideEffects = 1 in
-def MMX_MOVQ2FR64rr: S2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
- (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
+ (ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
-def MMX_MOVFR642Qrr: SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
- (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}", [],
- IIC_MMX_MOVQ_RR>;
+def MMX_MOVFR642Qrr: MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
+ (ins FR64:$src), "movdq2q\t{$src, $dst|$dst, $src}",
+ [], IIC_MMX_MOVQ_RR>;
def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movntq\t{$src, $dst|$dst, $src}",
@@ -577,6 +582,7 @@ def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
IIC_MMX_MASKMOV>;
// 64-bit bit convert.
+let Predicates = [HasSSE2] in {
def : Pat<(x86mmx (bitconvert (i64 GR64:$src))),
(MMX_MOVD64to64rr GR64:$src)>;
def : Pat<(i64 (bitconvert (x86mmx VR64:$src))),
@@ -585,5 +591,6 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
(MMX_MOVQ2FR64rr VR64:$src)>;
def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
(MMX_MOVFR642Qrr FR64:$src)>;
+}
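The MMX changes above add an X86ISD::MMX_MOVD2W node so that moving the low 32 bits of an MMX register into a GPR (movd r32, mm) is modeled in the DAG rather than left as a pattern-less definition. At the source level this corresponds, as far as I recall, to the _mm_cvtsi64_si32 MMX intrinsic; a small hedged example, assuming an x86 compiler with MMX support enabled:

#include <mmintrin.h>
#include <cstdio>

int main() {
  // Build an MMX value whose low 32 bits are 0x12345678.
  __m64 v = _mm_cvtsi32_si64(0x12345678);

  // Extract the low 32 bits back into a GPR; this is the operation the
  // new MMX_MOVD2W node models (typically a 'movd %mm0, %eax').
  int lo = _mm_cvtsi64_si32(v);

  _mm_empty(); // clear MMX state before touching x87 / returning
  std::printf("0x%08x\n", lo);
  return 0;
}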
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 20dc81eb4a37..6f48d7ed7fe1 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -251,35 +251,37 @@ def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
// A 128-bit subvector extract from the first 256-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (i32 0))),
+def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (i32 0))),
+def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
-def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (i32 0))),
+def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (i32 0))),
+def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
-def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (i32 0))),
+def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
(v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
-def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (i32 0))),
+def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
(v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
// A 128-bit subvector insert to the first 256-bit vector position
// is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (i32 0)),
+let AddedComplexity = 25 in { // to give priority over vinsertf128rm
+def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (i32 0)),
+def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
(INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
+}
// Implicitly promote a 32-bit scalar to a vector.
def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
@@ -362,7 +364,7 @@ let Predicates = [HasAVX] in {
def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
}
-// Alias instructions that map fld0 to pxor for sse.
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
isPseudo = 1 in {
@@ -382,11 +384,11 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, neverHasSideEffects = 1 in {
-def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "", []>;
+ isPseudo = 1 in {
+def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4f32 immAllZerosV))]>;
}
-def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
def : Pat<(v4i32 immAllZerosV), (V_SET0)>;
def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
@@ -394,35 +396,29 @@ def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
-// The same as done above but for AVX. The 256-bit ISA does not support PI,
+// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
// and doesn't need it because on sandy bridge the register is set to zero
// at the rename stage without using any execution unit, so SET0PSY
// and SET0PDY can be used for vector int instructions without penalty
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementatioan, it does not expand the instructions below like
-// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1 in {
-let Predicates = [HasAVX] in {
-def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v8f32 immAllZerosV))]>, VEX_4V;
-def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
-}
-let Predicates = [HasAVX2], neverHasSideEffects = 1 in
-def AVX2_SET0 : PDI<0xef, MRMInitReg, (outs VR256:$dst), (ins), "",
- []>, VEX_4V;
+ isPseudo = 1, Predicates = [HasAVX] in {
+def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8f32 immAllZerosV))]>;
}
-let Predicates = [HasAVX2], AddedComplexity = 5 in {
- def : Pat<(v4i64 immAllZerosV), (AVX2_SET0)>;
- def : Pat<(v8i32 immAllZerosV), (AVX2_SET0)>;
- def : Pat<(v16i16 immAllZerosV), (AVX2_SET0)>;
- def : Pat<(v32i8 immAllZerosV), (AVX2_SET0)>;
+let Predicates = [HasAVX] in
+ def : Pat<(v4f64 immAllZerosV), (AVX_SET0)>;
+
+let Predicates = [HasAVX2] in {
+ def : Pat<(v4i64 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v8i32 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v16i16 immAllZerosV), (AVX_SET0)>;
+ def : Pat<(v32i8 immAllZerosV), (AVX_SET0)>;
}
-// AVX has no support for 256-bit integer instructions, but since the 128-bit
+// AVX1 has no support for 256-bit integer instructions, but since the 128-bit
// VPXOR instruction writes zero to its upper part, it's safe to build zeros.
+let Predicates = [HasAVX1Only] in {
def : Pat<(v32i8 immAllZerosV), (SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v32i8 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i8 0), (V_SET0), sub_xmm)>;
@@ -438,22 +434,17 @@ def : Pat<(bc_v8i32 (v8f32 immAllZerosV)),
def : Pat<(v4i64 immAllZerosV), (SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
def : Pat<(bc_v4i64 (v8f32 immAllZerosV)),
(SUBREG_TO_REG (i64 0), (V_SET0), sub_xmm)>;
+}
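(Illustrative sketch only, not part of the patch.) The SUBREG_TO_REG patterns above are legal because VEX-encoded 128-bit instructions clear bits 255:128 of the destination YMM register, so a 128-bit xor already yields a full 256-bit zero. In C terms, assuming an AVX1-only target (e.g. -mavx without -mavx2):

#include <immintrin.h>

/* On AVX1-only CPUs there is no 256-bit vpxor, yet a 256-bit integer zero
 * is still one instruction: a 128-bit VEX xor implicitly zeroes the upper
 * half of the YMM register, which is what SUBREG_TO_REG models here. */
static inline __m256i zero256(void) {
  return _mm256_setzero_si256(); /* typically a single 128-bit vxorps */
}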
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-ones value if folding it would be beneficial.
-// FIXME: Change encoding to pseudo! This is blocked right now by the x86
-// JIT implementation, it does not expand the instructions below like
-// X86MCInstLower does.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isCodeGenOnly = 1, ExeDomain = SSEPackedInt in {
- let Predicates = [HasAVX] in
- def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V;
- def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+ isPseudo = 1 in {
+ def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllOnesV))]>;
let Predicates = [HasAVX2] in
- def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "",
- [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V;
+ def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
+ [(set VR256:$dst, (v8i32 immAllOnesV))]>;
}
@@ -605,27 +596,27 @@ let Predicates = [HasAVX] in {
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector (loadi32 addr:$src))), (i32 0)))),
+ (v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector (loadf32 addr:$src))), (i32 0)))),
+ (v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector (loadf64 addr:$src))), (i32 0)))),
+ (v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector FR32:$src)), (i32 0)))),
+ (v4f32 (scalar_to_vector FR32:$src)), (iPTR 0)))),
(SUBREG_TO_REG (i32 0),
(v4f32 (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)),
sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector FR64:$src)), (i32 0)))),
+ (v2f64 (scalar_to_vector FR64:$src)), (iPTR 0)))),
(SUBREG_TO_REG (i64 0),
(v2f64 (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)),
sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector (loadi64 addr:$src))), (i32 0)))),
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVSDrm addr:$src), sub_xmm)>;
// Move low f64 and clear high bits.
@@ -704,7 +695,7 @@ let Predicates = [HasAVX] in {
(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
@@ -738,7 +729,7 @@ let Predicates = [HasSSE1] in {
(MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
let AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSD to the lower bits.
@@ -822,16 +813,16 @@ defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- TB, VEX;
+ TB, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- TB, OpSize, VEX;
+ TB, OpSize, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- TB, VEX;
+ TB, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
- TB, OpSize, VEX;
+ TB, OpSize, VEX, VEX_L;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
TB;
@@ -864,19 +855,19 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
// For disassembler
let isCodeGenOnly = 1 in {
@@ -899,33 +890,33 @@ let isCodeGenOnly = 1 in {
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
let Predicates = [HasAVX] in {
def : Pat<(v8i32 (X86vzmovl
- (insert_subvector undef, (v4i32 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl
- (insert_subvector undef, (v2i64 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v8f32 (X86vzmovl
- (insert_subvector undef, (v4f32 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl
- (insert_subvector undef, (v2f64 VR128:$src), (i32 0)))),
+ (insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
}
@@ -975,10 +966,10 @@ let Predicates = [HasAVX] in {
(VMOVUPDmr addr:$dst, VR128:$src)>;
}
-let Predicates = [HasSSE1] in
+let Predicates = [UseSSE1] in
def : Pat<(int_x86_sse_storeu_ps addr:$dst, VR128:$src),
(MOVUPSmr addr:$dst, VR128:$src)>;
-let Predicates = [HasSSE2] in
+let Predicates = [UseSSE2] in
def : Pat<(int_x86_sse2_storeu_pd addr:$dst, VR128:$src),
(MOVUPDmr addr:$dst, VR128:$src)>;
@@ -1028,12 +1019,52 @@ let Predicates = [HasAVX] in {
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
+
+  // Special patterns for storing subvector extracts of the lower 128 bits.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+ def : Pat<(alignedstore (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(alignedstore (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+ def : Pat<(store (v2f64 (extract_subvector
+ (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4f32 (extract_subvector
+ (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v2i64 (extract_subvector
+ (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v4i32 (extract_subvector
+ (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v8i16 (extract_subvector
+ (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+ def : Pat<(store (v16i8 (extract_subvector
+ (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+ (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
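(For context; an illustrative sketch, not taken from the patch.) The store patterns just added fire for code like the following, where the low half of a YMM register is written to memory: extracting lane 0 explicitly would only add a redundant vextractf128, since the low 128 bits are already addressable as an XMM subregister. Assuming an AVX-enabled build:

#include <immintrin.h>

/* Storing the low 128 bits of a 256-bit vector: the cast is free, so the
 * store becomes a plain (v)movups/(v)movaps of the XMM subregister instead
 * of a vextractf128 to memory. */
static inline void store_low_half(float *dst, __m256 v) {
  _mm_storeu_ps(dst, _mm256_castps256_ps128(v));
}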
// Use movaps / movups for SSE integer load / store (one byte shorter).
// The instructions selected below are then converted to MOVDQA/MOVDQU
// during the SSE domain pass.
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
def : Pat<(alignedloadv2i64 addr:$src),
(MOVAPSrm addr:$src)>;
def : Pat<(loadv2i64 addr:$src),
@@ -1180,7 +1211,7 @@ let Predicates = [HasAVX] in {
(VMOVLPDmr addr:$src1, VR128:$src2)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
def : Pat<(store (i64 (vector_extract (bc_v2i64 (v4f32 VR128:$src2)),
(iPTR 0))), addr:$src1),
@@ -1205,7 +1236,7 @@ let Predicates = [HasSSE1] in {
(MOVLPSmr addr:$src1, VR128:$src2)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// Shuffle with MOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
@@ -1271,7 +1302,7 @@ let Predicates = [HasAVX] in {
(VMOVHPSrm VR128:$src1, addr:$src2)>;
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
- // is during lowering, where it's not possible to recognize the load fold
+ // is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
// and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1279,7 +1310,7 @@ let Predicates = [HasAVX] in {
(VMOVHPDrm VR128:$src1, addr:$src2)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
// MOVHPS patterns
def : Pat<(X86Movlhps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
@@ -1289,9 +1320,9 @@ let Predicates = [HasSSE1] in {
(MOVHPSrm VR128:$src1, addr:$src2)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
- // is during lowering, where it's not possible to recognize the load fold
+ // is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
// and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1346,7 +1377,7 @@ let Predicates = [HasAVX] in {
(VMOVHLPSrr VR128:$src1, VR128:$src2)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(MOVLHPSrr VR128:$src1, VR128:$src2)>;
@@ -1456,7 +1487,7 @@ def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
@@ -1628,12 +1659,12 @@ defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, VEX, Requires<[HasAVX]>;
+ TB, VEX, VEX_L, Requires<[HasAVX]>;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, Requires<[HasSSE2]>;
+ TB, Requires<[UseSSE2]>;
/// SSE 2 Only
@@ -1663,7 +1694,7 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
[(set FR32:$dst, (fround (loadf64 addr:$src)))],
IIC_SSE_CVT_Scalar_RM>,
XD,
- Requires<[HasSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>;
def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1684,13 +1715,13 @@ def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XD, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>;
def Int_CVTSD2SSrm: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
VR128:$src1, sse_load_f64:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XD, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>;
}
// Convert scalar single to scalar double
@@ -1709,30 +1740,28 @@ def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>;
}
-let AddedComplexity = 1 in { // give AVX priority
- def : Pat<(f64 (fextend FR32:$src)),
- (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
- def : Pat<(fextend (loadf32 addr:$src)),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
+def : Pat<(f64 (fextend FR32:$src)),
+ (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[HasAVX]>;
+def : Pat<(fextend (loadf32 addr:$src)),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX]>;
- def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
- Requires<[HasAVX, OptForSize]>;
- def : Pat<(extloadf32 addr:$src),
- (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
- Requires<[HasAVX, OptForSpeed]>;
-} // AddedComplexity = 1
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+ Requires<[HasAVX, OptForSize]>;
+def : Pat<(extloadf32 addr:$src),
+ (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+ Requires<[HasAVX, OptForSpeed]>;
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (fextend FR32:$src))],
IIC_SSE_CVT_Scalar_RR>, XS,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
[(set FR64:$dst, (extloadf32 addr:$src))],
IIC_SSE_CVT_Scalar_RM>, XS,
- Requires<[HasSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>;
// extload f32 -> f64. This matches load+fextend because we have a hack in
// the isel (PreprocessForFPConvert) that can introduce loads after dag
@@ -1740,9 +1769,9 @@ def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
// Since these loads aren't folded into the fextend, we have to match it
// explicitly here.
def : Pat<(fextend (loadf32 addr:$src)),
- (CVTSS2SDrm addr:$src)>, Requires<[HasSSE2]>;
+ (CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
def : Pat<(extloadf32 addr:$src),
- (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+ (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
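(Source-level illustration, not from the commit.) These extload patterns match a float loaded from memory and immediately widened to double. Under OptForSize the load is folded into cvtss2sd; under OptForSpeed a separate movss load is kept, which avoids the false dependency the memory form of cvtss2sd has on the destination register's upper bits. In plain C:

/* Load a float and extend it to double; -Os selects the folded
 * cvtss2sd-from-memory form, otherwise movss + cvtss2sd reg,reg. */
double load_and_extend(const float *p) {
  return (double)*p;
}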
def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1762,13 +1791,13 @@ def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
- IIC_SSE_CVT_Scalar_RR>, XS, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>;
def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
- IIC_SSE_CVT_Scalar_RM>, XS, Requires<[HasSSE2]>;
+ IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>;
}
// Convert packed single/double fp to doubleword
@@ -1785,12 +1814,12 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
@@ -1824,7 +1853,7 @@ def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX;
+ (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1860,12 +1889,12 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1904,7 +1933,7 @@ let Predicates = [HasAVX] in {
(VCVTTPS2DQYrm addr:$src)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
(CVTDQ2PSrr VR128:$src)>;
def : Pat<(v4f32 (sint_to_fp (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -1945,7 +1974,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1978,31 +2007,31 @@ def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB, VEX;
-let neverHasSideEffects = 1, mayLoad = 1 in
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "vcvtps2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>, TB, VEX;
+ "vcvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB, VEX;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, VEX;
+ IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, VEX;
+ IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))],
IIC_SSE_CVT_PD_RR>, TB;
-let neverHasSideEffects = 1, mayLoad = 1 in
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
- "cvtps2pd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_CVT_PD_RM>, TB;
+ "cvtps2pd\t{$src, $dst|$dst, $src}",
+ [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
+ IIC_SSE_CVT_PD_RM>, TB;
}
// Convert Packed DW Integers to Packed Double FP
@@ -2019,11 +2048,11 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtdq2_pd_256
- (bitconvert (memopv2i64 addr:$src))))]>, VEX;
+ (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX;
+ (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L;
}
let neverHasSideEffects = 1, mayLoad = 1 in
@@ -2066,7 +2095,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2096,6 +2125,10 @@ let Predicates = [HasAVX] in {
(VCVTDQ2PSYrm addr:$src)>;
// Match fround and fextend for 128/256-bit conversions
+ def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+ (VCVTPD2PSXrm addr:$src)>;
def : Pat<(v4f32 (fround (v4f64 VR256:$src))),
(VCVTPD2PSYrr VR256:$src)>;
def : Pat<(v4f32 (fround (loadv4f64 addr:$src))),
@@ -2105,12 +2138,17 @@ let Predicates = [HasAVX] in {
(VCVTPS2PDrr VR128:$src)>;
def : Pat<(v4f64 (fextend (v4f32 VR128:$src))),
(VCVTPS2PDYrr VR128:$src)>;
- def : Pat<(v4f64 (fextend (loadv4f32 addr:$src))),
+ def : Pat<(v4f64 (extloadv4f32 addr:$src)),
(VCVTPS2PDYrm addr:$src)>;
}
-let Predicates = [HasSSE2] in {
- // Match fextend for 128 conversions
+let Predicates = [UseSSE2] in {
+ // Match fround and fextend for 128 conversions
+ def : Pat<(v4f32 (X86vfpround (v2f64 VR128:$src))),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(v4f32 (X86vfpround (memopv2f64 addr:$src))),
+ (CVTPD2PSrm addr:$src)>;
+
def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))),
(CVTPS2PDrr VR128:$src)>;
}
@@ -2121,7 +2159,7 @@ let Predicates = [HasSSE2] in {
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, SDNode OpNode, ValueType VT,
+ Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
OpndItins itins> {
def rr : SIi8<0xC2, MRMSrcReg,
@@ -2267,7 +2305,7 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, Intrinsic Int, string asm,
+ Operand CC, Intrinsic Int, string asm,
string asm_alt, Domain d> {
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
@@ -2300,11 +2338,11 @@ defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
@@ -2336,14 +2374,14 @@ def : Pat<(v4i64 (X86cmpp (v4f64 VR256:$src1), (memop addr:$src2), imm:$cc)),
(VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
def : Pat<(v4i32 (X86cmpp (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
(CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
(CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
def : Pat<(v2i64 (X86cmpp (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
@@ -2374,13 +2412,13 @@ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
memopv4f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- memopv8f32, SSEPackedSingle>, TB, VEX_4V;
+ memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
- memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+ memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2420,7 +2458,7 @@ let Predicates = [HasAVX] in {
(VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
}
-let Predicates = [HasSSE1] in {
+let Predicates = [UseSSE1] in {
def : Pat<(v4i32 (X86Shufp VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
(SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
@@ -2428,7 +2466,7 @@ let Predicates = [HasSSE1] in {
(SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// Generic SHUFPD patterns
def : Pat<(v2i64 (X86Shufp VR128:$src1,
(memopv2i64 addr:$src2), (i8 imm:$imm))),
@@ -2474,16 +2512,16 @@ defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2500,7 +2538,27 @@ let Constraints = "$src1 = $dst" in {
SSEPackedDouble>, TB, OpSize;
} // Constraints = "$src1 = $dst"
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX1Only] in {
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
+ (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
+
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
+ (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
+ (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
+ def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
+ (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
+}
+
+let Predicates = [HasAVX] in {
// FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
// problem is during lowering, where it's not possible to recognize the load
// fold cause it has two uses through a bitcast. One use disappears at isel
@@ -2509,7 +2567,7 @@ let Predicates = [HasAVX], AddedComplexity = 1 in {
(VUNPCKLPDrr VR128:$src, VR128:$src)>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
// FIXME: Instead of X86Movddup, there should be a X86Unpckl here, the
// problem is during lowering, where it's not possible to recognize the load
// fold cause it has two uses through a bitcast. One use disappears at isel
@@ -2540,10 +2598,11 @@ let Predicates = [HasAVX] in {
"movmskpd", SSEPackedDouble>, TB,
OpSize, VEX;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
- "movmskps", SSEPackedSingle>, TB, VEX;
+ "movmskps", SSEPackedSingle>, TB,
+ VEX, VEX_L;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
"movmskpd", SSEPackedDouble>, TB,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
def : Pat<(i32 (X86fgetsign FR32:$src)),
(VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -2564,11 +2623,11 @@ let Predicates = [HasAVX] in {
OpSize, VEX;
def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
"movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedSingle>, TB, VEX;
+ SSEPackedSingle>, TB, VEX, VEX_L;
def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
"movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
SSEPackedDouble>, TB,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2578,16 +2637,16 @@ defm MOVMSKPD : sse12_extr_sign_mask<VR128, int_x86_sse2_movmsk_pd, "movmskpd",
def : Pat<(i32 (X86fgetsign FR32:$src)),
(MOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
def : Pat<(i64 (X86fgetsign FR32:$src)),
(MOVMSKPSrr64 (COPY_TO_REGCLASS FR32:$src, VR128))>,
- Requires<[HasSSE1]>;
+ Requires<[UseSSE1]>;
def : Pat<(i32 (X86fgetsign FR64:$src)),
(MOVMSKPDrr32 (COPY_TO_REGCLASS FR64:$src, VR128))>,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
def : Pat<(i64 (X86fgetsign FR64:$src)),
(MOVMSKPDrr64 (COPY_TO_REGCLASS FR64:$src, VR128))>,
- Requires<[HasSSE2]>;
+ Requires<[UseSSE2]>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
@@ -2646,13 +2705,13 @@ defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64,
let Predicates = [HasAVX2] in {
defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
//===----------------------------------------------------------------------===//
@@ -2683,14 +2742,12 @@ multiclass sse12_fp_alias_pack_logical<bits<8> opc, string OpcodeStr,
}
// Alias bitwise logical operations using SSE logical ops on packed FP values.
-let mayLoad = 0 in {
- defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
- SSE_BIT_ITINS_P>;
- defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
- SSE_BIT_ITINS_P>;
- defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
- SSE_BIT_ITINS_P>;
-}
+defm FsAND : sse12_fp_alias_pack_logical<0x54, "and", X86fand,
+ SSE_BIT_ITINS_P>;
+defm FsOR : sse12_fp_alias_pack_logical<0x56, "or", X86for,
+ SSE_BIT_ITINS_P>;
+defm FsXOR : sse12_fp_alias_pack_logical<0x57, "xor", X86fxor,
+ SSE_BIT_ITINS_P>;
let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
defm FsANDN : sse12_fp_alias_pack_logical<0x55, "andn", undef,
@@ -2740,7 +2797,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "ps"), f256mem,
[(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V;
+ (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
@@ -2748,7 +2805,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(memopv4i64 addr:$src2)))], 0>,
- TB, OpSize, VEX_4V;
+ TB, OpSize, VEX_4V, VEX_L;
}
// AVX 256-bit packed logical ops forms
@@ -2794,27 +2851,23 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
SizeItins itins,
bit Is2Addr = 1> {
- let mayLoad = 0 in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
TB;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
TB, OpSize;
- }
}
multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
SDNode OpNode,
SizeItins itins> {
- let mayLoad = 0 in {
- defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
+ defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
- TB;
- defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
+ TB, VEX_L;
+ defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
- TB, OpSize;
- }
+ TB, OpSize, VEX_L;
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
@@ -2846,11 +2899,11 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr,
SizeItins itins> {
defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
!strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
- SSEPackedSingle, itins.s, 0>, TB;
+ SSEPackedSingle, itins.s, 0>, TB, VEX_L;
defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
!strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
- SSEPackedDouble, itins.d, 0>, TB, OpSize;
+ SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_L;
}
// Binary Arithmetic instructions
@@ -2872,7 +2925,8 @@ let isCommutable = 0 in {
basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>, VEX_4V;
+ basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
+ VEX_4V;
defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
VEX_4V, VEX_LIG;
@@ -2923,6 +2977,23 @@ let Constraints = "$src1 = $dst" in {
}
}
+let isCodeGenOnly = 1 in {
+ defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, VEX_4V;
+ defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
+ VEX_4V, VEX_LIG;
+ defm VMINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P, 0>,
+ basic_sse12_fp_binop_p_y<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, VEX_4V;
+ let Constraints = "$src1 = $dst" in {
+ defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
+ defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>,
+ basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+ }
+}
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
@@ -2960,7 +3031,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
- Requires<[HasSSE1, OptForSize]>;
+ Requires<[UseSSE1, OptForSize]>;
def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (F32Int VR128:$src))], itins.rr>;
@@ -2974,7 +3045,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- let mayLoad = 1 in
+ let mayLoad = 1 in {
def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
@@ -2982,6 +3053,7 @@ multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
(ins VR128:$src1, ssmem:$src2),
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ }
}
/// sse1_fp_unop_p - SSE1 unops in packed form.
@@ -3001,11 +3073,11 @@ multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
@@ -3027,11 +3099,11 @@ multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V4F32Int VR256:$src))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
/// sse2_fp_unop_s - SSE2 unops in scalar form.
@@ -3044,7 +3116,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
def SDm : I<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set FR64:$dst, (OpNode (load addr:$src)))], itins.rm>, XD,
- Requires<[HasSSE2, OptForSize]>;
+ Requires<[UseSSE2, OptForSize]>;
def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (F64Int VR128:$src))], itins.rr>;
@@ -3054,20 +3126,20 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
}
/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
+let hasSideEffects = 0 in
multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
- let neverHasSideEffects = 1 in {
def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- let mayLoad = 1 in
+ let mayLoad = 1 in {
def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- }
def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, sdmem:$src2),
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ }
}
/// sse2_fp_unop_p - SSE2 unops in vector forms.
@@ -3087,11 +3159,11 @@ multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
@@ -3113,11 +3185,11 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V2F64Int VR256:$src))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -3158,7 +3230,6 @@ let Predicates = [HasAVX] in {
SSE_RCPP>, VEX;
}
-let AddedComplexity = 1 in {
def : Pat<(f32 (fsqrt FR32:$src)),
(VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
def : Pat<(f32 (fsqrt (load addr:$src))),
@@ -3181,9 +3252,8 @@ def : Pat<(f32 (X86frcp FR32:$src)),
def : Pat<(f32 (X86frcp (load addr:$src))),
(VRCPSSm (f32 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX, OptForSize]>;
-}
-let Predicates = [HasAVX], AddedComplexity = 1 in {
+let Predicates = [HasAVX] in {
def : Pat<(int_x86_sse_sqrt_ss VR128:$src),
(COPY_TO_REGCLASS (VSQRTSSr (f32 (IMPLICIT_DEF)),
(COPY_TO_REGCLASS VR128:$src, FR32)),
@@ -3223,17 +3293,52 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>,
sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>;
+/// sse1_fp_unop_rw - SSE1 unops where the vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ Intrinsic F32Int, OpndItins itins> {
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]>;
+ // For scalar unary operations, fold a load into the operation
+ // only in OptForSize mode. It eliminates an instruction, but it also
+ // eliminates a whole-register clobber (the load), so it introduces a
+ // partial register update condition.
+ def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+ Requires<[UseSSE1, OptForSize]>;
+ let Constraints = "$src1 = $dst" in {
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [], itins.rr>;
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [], itins.rm>;
+ }
+}
+
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
- SSE_SQRTS>,
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
+ SSE_SQRTS>,
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
SSE_SQRTS>;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
- SSE_RCPS>,
+let Predicates = [UseSSE1] in {
+ def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
+ (RSQRTSSr_Int VR128:$src, VR128:$src)>;
+}
+
+defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
+ SSE_RCPS>,
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>,
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>;
+let Predicates = [UseSSE1] in {
+ def : Pat<(int_x86_sse_rcp_ss VR128:$src),
+ (RCPSSr_Int VR128:$src, VR128:$src)>;
+}
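(Side note, illustration only, not part of this change.) The "refinement" the comment above refers to is the usual Newton-Raphson step users wrap around rcpss/rcpps and rsqrtss/rsqrtps, since the hardware estimates only give roughly 12 bits of precision. A minimal sketch in C, assuming SSE1:

#include <xmmintrin.h>

/* One Newton-Raphson refinement of the rcpps estimate: x1 = x0 * (2 - a*x0).
 * This roughly doubles the number of correct bits of the hardware estimate. */
static inline __m128 refined_recip(__m128 a) {
  __m128 x0 = _mm_rcp_ps(a);
  return _mm_mul_ps(x0, _mm_sub_ps(_mm_set1_ps(2.0f), _mm_mul_ps(a, x0)));
}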
// There is no f64 version of the reciprocal approximation instructions.
@@ -3271,20 +3376,20 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_L;
}
let AddedComplexity = 400 in { // Prefer non-temporal versions
@@ -3304,7 +3409,7 @@ def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
IIC_SSE_MOVNT>;
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
- (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+ (MOVNTDQmr addr:$dst, VR128:$src)>, Requires<[UseSSE2]>;
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
@@ -3393,14 +3498,14 @@ def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX;
+ VEX, VEX_L;
}
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX;
+ VEX, VEX_L;
// For Disassembler
let isCodeGenOnly = 1 in {
@@ -3410,16 +3515,14 @@ def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>,
- VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>,
VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>,
- VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3428,14 +3531,14 @@ def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX;
+ VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
XS, VEX;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX;
+ XS, VEX, VEX_L;
}
}
@@ -3447,14 +3550,14 @@ def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX;
+ VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX;
+ XS, VEX, VEX_L;
}
}
@@ -3464,7 +3567,7 @@ def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
// For Disassembler
let isCodeGenOnly = 1 in {
@@ -3474,7 +3577,7 @@ def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVU_P_RR>, XS, Requires<[HasSSE2]>;
+ [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
}
let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3486,7 +3589,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
IIC_SSE_MOVU_P_RM>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
}
let mayStore = 1 in {
@@ -3498,7 +3601,7 @@ def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[/*(store (v2i64 VR128:$src), addr:$dst)*/],
IIC_SSE_MOVU_P_MR>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
}
// Intrinsic forms of MOVDQU load and store
@@ -3512,7 +3615,7 @@ def MOVDQUmr_Int : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)],
IIC_SSE_MOVU_P_MR>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
} // ExeDomain = SSEPackedInt
@@ -3690,82 +3793,82 @@ defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw,
let Predicates = [HasAVX2] in {
defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64,
- i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64,
- i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
- i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
// Intrinsic forms
defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDSWY : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
VR256, memopv4i64, i256mem,
- SSE_PMADD, 1, 0>, VEX_4V;
+ SSE_PMADD, 1, 0>, VEX_4V, VEX_L;
defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPAVGWY : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMINUBY : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMINSWY : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMAXUBY : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMAXSWY : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPSADBWY : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -3901,30 +4004,30 @@ let ExeDomain = SSEPackedInt in {
let Predicates = [HasAVX2] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
VR256, v16i16, v8i16, bc_v8i16,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
VR256, v8i32, v4i32, bc_v4i32,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
VR256, v4i64, v2i64, bc_v2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
VR256, v16i16, v8i16, bc_v8i16,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
VR256, v8i32, v4i32, bc_v4i32,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
VR256, v4i64, v2i64, bc_v2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
VR256, v16i16, v8i16, bc_v8i16,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
VR256, v8i32, v4i32, bc_v4i32,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
let ExeDomain = SSEPackedInt in {
// 256-bit logical shifts.
@@ -3933,13 +4036,13 @@ let ExeDomain = SSEPackedInt in {
"vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR256:$dst,
(int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
def VPSRLDQYri : PDIi8<0x73, MRM3r,
(outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
"vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR256:$dst,
(int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
// PSRADQYri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX2]
@@ -4010,7 +4113,7 @@ let Predicates = [HasAVX2] in {
(VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
(PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>;
def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
@@ -4053,22 +4156,22 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -4111,13 +4214,13 @@ defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128,
let Predicates = [HasAVX2] in {
defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -4187,12 +4290,15 @@ let Predicates = [HasAVX] in {
}
let Predicates = [HasAVX2] in {
- defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX;
- defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX;
- defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX;
+ defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>,
+                                TB, OpSize, VEX, VEX_L;
+ defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>,
+ XS, VEX, VEX_L;
+ defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>,
+ XD, VEX, VEX_L;
}
-let Predicates = [HasSSE2] in {
+let Predicates = [UseSSE2] in {
let AddedComplexity = 5 in
defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;
@@ -4268,22 +4374,22 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
- bc_v32i8>, VEX_4V;
+ bc_v32i8>, VEX_4V, VEX_L;
defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
- bc_v16i16>, VEX_4V;
+ bc_v16i16>, VEX_4V, VEX_L;
defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
- bc_v8i32>, VEX_4V;
+ bc_v8i32>, VEX_4V, VEX_L;
defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
- bc_v4i64>, VEX_4V;
+ bc_v4i64>, VEX_4V, VEX_L;
defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
- bc_v32i8>, VEX_4V;
+ bc_v32i8>, VEX_4V, VEX_L;
defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
- bc_v16i16>, VEX_4V;
+ bc_v16i16>, VEX_4V, VEX_L;
defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
- bc_v8i32>, VEX_4V;
+ bc_v8i32>, VEX_4V, VEX_L;
defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
- bc_v4i64>, VEX_4V;
+ bc_v4i64>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -4307,28 +4413,6 @@ let Constraints = "$src1 = $dst" in {
}
} // ExeDomain = SSEPackedInt
-// Patterns for using AVX1 instructions with integer vectors
-// Here to give AVX2 priority
-let Predicates = [HasAVX] in {
- def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
- (VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
- (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))),
- (VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
- (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;
-
- def : Pat<(v4i64 (X86Unpckl VR256:$src1, (memopv4i64 addr:$src2))),
- (VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
- (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
- def : Pat<(v4i64 (X86Unpckh VR256:$src1, (memopv4i64 addr:$src2))),
- (VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
- def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
- (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
-}
-
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Extract and Insert
//===---------------------------------------------------------------------===//
@@ -4377,7 +4461,7 @@ let Predicates = [HasAVX] in {
}
let Constraints = "$src1 = $dst" in
- defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[HasSSE2]>;
+ defm PINSRW : sse2_pinsrw, TB, OpSize, Requires<[UseSSE2]>;
} // ExeDomain = SSEPackedInt
@@ -4397,9 +4481,9 @@ def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX;
+ [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
+ "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
@@ -4538,7 +4622,7 @@ def MOVPDI2DImr : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
// Move Packed Doubleword Int first element to Doubleword Int
//
def VMOVPQIto64rr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
- "mov{d|q}\t{$src, $dst|$dst, $src}",
+ "vmov{d|q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
(iPTR 0)))],
IIC_SSE_MOVD_ToGP>,
@@ -4654,14 +4738,14 @@ let Predicates = [HasAVX] in {
}
// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
- (v4i32 (scalar_to_vector GR32:$src)),(i32 0)))),
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVZDI2PDIrr GR32:$src), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
- (v2i64 (scalar_to_vector GR64:$src)),(i32 0)))),
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVZQI2PQIrr GR64:$src), sub_xmm)>;
}
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
(MOVZDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
@@ -4701,7 +4785,7 @@ def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
[(set VR128:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))],
IIC_SSE_MOVDQ>, XS,
- Requires<[HasSSE2]>; // SSE2 instruction with XS Prefix
+ Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
//===---------------------------------------------------------------------===//
// Move Packed Quadword Int to Quadword Int
@@ -4744,7 +4828,7 @@ def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
(v2i64 (X86vzmovl (v2i64 (scalar_to_vector
(loadi64 addr:$src))))))],
IIC_SSE_MOVDQ>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
let Predicates = [HasAVX], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
@@ -4755,7 +4839,7 @@ let Predicates = [HasAVX], AddedComplexity = 20 in {
(VMOVZQI2PQIrm addr:$src)>;
}
-let Predicates = [HasSSE2], AddedComplexity = 20 in {
+let Predicates = [UseSSE2], AddedComplexity = 20 in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVZQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
@@ -4785,7 +4869,7 @@ def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
IIC_SSE_MOVQ_RR>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
let AddedComplexity = 20 in
def VMOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
@@ -4800,7 +4884,7 @@ def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
[(set VR128:$dst, (v2i64 (X86vzmovl
(loadv2i64 addr:$src))))],
IIC_SSE_MOVDQ>,
- XS, Requires<[HasSSE2]>;
+ XS, Requires<[UseSSE2]>;
}
let AddedComplexity = 20 in {
@@ -4810,7 +4894,7 @@ let AddedComplexity = 20 in {
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
(VMOVZPQILo2PQIrr VR128:$src)>;
}
- let Predicates = [HasSSE2] in {
+ let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
(MOVZPQILo2PQIrm addr:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
@@ -4862,9 +4946,9 @@ let Predicates = [HasAVX] in {
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
v4f32, VR128, memopv4f32, f128mem>, VEX;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, memopv8f32, f256mem>, VEX;
+ v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, memopv8f32, f256mem>, VEX;
+ v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
memopv4f32, f128mem>;
@@ -4890,7 +4974,7 @@ let Predicates = [HasAVX] in {
(VMOVSLDUPYrm addr:$src)>;
}
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
def : Pat<(v4i32 (X86Movshdup VR128:$src)),
(MOVSHDUPrr VR128:$src)>;
def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
@@ -4932,7 +5016,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
let Predicates = [HasAVX] in {
defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
@@ -4959,7 +5043,7 @@ let Predicates = [HasAVX] in {
(VMOVDDUPYrr VR256:$src)>;
}
-let Predicates = [HasSSE3] in {
+let Predicates = [UseSSE3] in {
def : Pat<(X86Movddup (memopv2f64 addr:$src)),
(MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (bc_v2f64 (memopv4f32 addr:$src))),
@@ -4981,7 +5065,8 @@ let Predicates = [HasAVX] in {
[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX;
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
@@ -5014,16 +5099,16 @@ let Predicates = [HasAVX] in {
defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
+ f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
+ f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
}
}
-let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in {
+let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
f128mem, SSE_ALU_F32P>, TB, XD;
@@ -5075,9 +5160,9 @@ let Predicates = [HasAVX] in {
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
X86fhsub, 0>, VEX_4V;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V, VEX_L;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
@@ -5085,9 +5170,9 @@ let Predicates = [HasAVX] in {
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
X86fhsub, 0>, VEX_4V;
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V, VEX_L;
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V, VEX_L;
}
}
@@ -5153,11 +5238,11 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
- int_x86_avx2_pabs_b>, VEX;
+ int_x86_avx2_pabs_b>, VEX, VEX_L;
defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
- int_x86_avx2_pabs_w>, VEX;
+ int_x86_avx2_pabs_w>, VEX, VEX_L;
defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
- int_x86_avx2_pabs_d>, VEX;
+ int_x86_avx2_pabs_d>, VEX, VEX_L;
}
defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
@@ -5296,37 +5381,37 @@ let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw>, VEX_4V;
+ int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw>, VEX_4V;
+ int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
- int_x86_avx2_pmadd_ub_sw>, VEX_4V;
+ int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
}
defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
- int_x86_avx2_pmul_hr_sw>, VEX_4V;
+ int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
}
// None of these have i8 immediate fields.
@@ -5405,8 +5490,8 @@ multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> {
let Predicates = [HasAVX] in
defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
- defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V;
-let Constraints = "$src1 = $dst", Predicates = [HasSSSE3] in
+ defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L;
+let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGN : ssse3_palign<"palignr">;
let Predicates = [HasAVX2] in {
@@ -5431,7 +5516,7 @@ def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
-let Predicates = [HasSSSE3] in {
+let Predicates = [UseSSSE3] in {
def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
@@ -5512,17 +5597,17 @@ defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
let Predicates = [HasAVX2] in {
defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
- int_x86_avx2_pmovsxbw>, VEX;
+ int_x86_avx2_pmovsxbw>, VEX, VEX_L;
defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
- int_x86_avx2_pmovsxwd>, VEX;
+ int_x86_avx2_pmovsxwd>, VEX, VEX_L;
defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
- int_x86_avx2_pmovsxdq>, VEX;
+ int_x86_avx2_pmovsxdq>, VEX, VEX_L;
defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
- int_x86_avx2_pmovzxbw>, VEX;
+ int_x86_avx2_pmovzxbw>, VEX, VEX_L;
defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
- int_x86_avx2_pmovzxwd>, VEX;
+ int_x86_avx2_pmovzxwd>, VEX, VEX_L;
defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
- int_x86_avx2_pmovzxdq>, VEX;
+ int_x86_avx2_pmovzxdq>, VEX, VEX_L;
}
defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
@@ -5538,64 +5623,88 @@ let Predicates = [HasAVX] in {
(VPMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
(VPMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (VPMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
(VPMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
(VPMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (VPMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
(VPMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
(VPMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (VPMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
(VPMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
(VPMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (VPMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
(VPMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
(VPMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (VPMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
(VPMOVZXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
(VPMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (VPMOVZXDQrm addr:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
// Common patterns involving scalar load.
def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
(PMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
(PMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (PMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
(PMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
(PMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (PMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
(PMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
(PMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (PMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
(PMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
(PMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (PMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
(PMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
(PMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (PMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
(PMOVZXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
(PMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (PMOVZXDQrm addr:$src)>;
}
let Predicates = [HasAVX2] in {
@@ -5615,7 +5724,7 @@ let Predicates = [HasAVX] in {
def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
def : Pat<(v2i64 (X86vsmovl (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
def : Pat<(v4i32 (X86vsmovl (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
}
@@ -5659,13 +5768,13 @@ defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
let Predicates = [HasAVX2] in {
defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
- int_x86_avx2_pmovsxbd>, VEX;
+ int_x86_avx2_pmovsxbd>, VEX, VEX_L;
defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
- int_x86_avx2_pmovsxwq>, VEX;
+ int_x86_avx2_pmovsxwq>, VEX, VEX_L;
defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
- int_x86_avx2_pmovzxbd>, VEX;
+ int_x86_avx2_pmovzxbd>, VEX, VEX_L;
defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
- int_x86_avx2_pmovzxwq>, VEX;
+ int_x86_avx2_pmovzxwq>, VEX, VEX_L;
}
defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
@@ -5686,7 +5795,7 @@ let Predicates = [HasAVX] in {
(VPMOVZXWQrm addr:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
(PMOVSXBDrm addr:$src)>;
@@ -5734,9 +5843,9 @@ defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
- int_x86_avx2_pmovsxbq>, VEX;
+ int_x86_avx2_pmovsxbq>, VEX, VEX_L;
defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
- int_x86_avx2_pmovzxbq>, VEX;
+ int_x86_avx2_pmovzxbq>, VEX, VEX_L;
}
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
@@ -5754,7 +5863,7 @@ let Predicates = [HasAVX] in {
(VPMOVZXBQrm addr:$src)>;
}
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
(bitconvert (v4i32 (X86vzmovl
@@ -5767,6 +5876,100 @@ let Predicates = [HasSSE41] in {
(PMOVZXBQrm addr:$src)>;
}
+let Predicates = [HasAVX2] in {
+ def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>;
+
+ def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>;
+
+ def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>;
+
+ def : Pat<(v16i16 (X86vzext (v32i8 VR256:$src))),
+ (VPMOVZXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v8i32 (X86vzext (v32i8 VR256:$src))),
+ (VPMOVZXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vzext (v32i8 VR256:$src))),
+ (VPMOVZXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (X86vzext (v16i16 VR256:$src))),
+ (VPMOVZXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vzext (v16i16 VR256:$src))),
+ (VPMOVZXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v4i64 (X86vzext (v8i32 VR256:$src))),
+ (VPMOVZXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+}
+
+let Predicates = [HasAVX] in {
+ def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>;
+
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVZXBWrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVZXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+ (VPMOVZXBQrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVZXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVZXWQrm addr:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+ (VPMOVZXDQrm addr:$src)>;
+}
+
+let Predicates = [UseSSE41] in {
+ def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>;
+
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVZXBWrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))),
+ (PMOVZXBQrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVZXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVZXWQrm addr:$src)>;
+
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVZXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
+ (PMOVZXDQrm addr:$src)>;
+}
+
//===----------------------------------------------------------------------===//
// SSE4.1 - Extract Instructions
//===----------------------------------------------------------------------===//
@@ -5900,7 +6103,7 @@ def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
imm:$src2))),
addr:$dst),
(EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
- Requires<[HasSSE41]>;
+ Requires<[UseSSE41]>;
//===----------------------------------------------------------------------===//
// SSE4.1 - Insert Instructions
@@ -6147,7 +6350,7 @@ let Predicates = [HasAVX] in {
defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
memopv8f32, memopv4f64,
int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX;
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
@@ -6172,6 +6375,15 @@ let Predicates = [HasAVX] in {
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
def : Pat<(f64 (ftrunc FR64:$src)),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (VROUNDPSr VR128:$src, (i32 0x1))>;
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (VROUNDPDr VR128:$src, (i32 0x1))>;
+ def : Pat<(v8f32 (ffloor VR256:$src)),
+ (VROUNDYPSr VR256:$src, (i32 0x1))>;
+ def : Pat<(v4f64 (ffloor VR256:$src)),
+ (VROUNDYPDr VR256:$src, (i32 0x1))>;
}
defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128,
@@ -6181,26 +6393,33 @@ let Constraints = "$src1 = $dst" in
defm ROUND : sse41_fp_binop_rm<0x0A, 0x0B, "round",
int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
-def : Pat<(ffloor FR32:$src),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
-def : Pat<(f64 (ffloor FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
-def : Pat<(f32 (fnearbyint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
-def : Pat<(f64 (fnearbyint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
-def : Pat<(f32 (fceil FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
-def : Pat<(f64 (fceil FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
-def : Pat<(f32 (frint FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
-def : Pat<(f64 (frint FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
-def : Pat<(f32 (ftrunc FR32:$src)),
- (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
-def : Pat<(f64 (ftrunc FR64:$src)),
- (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+let Predicates = [UseSSE41] in {
+ def : Pat<(ffloor FR32:$src),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x1))>;
+ def : Pat<(f64 (ffloor FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x1))>;
+ def : Pat<(f32 (fnearbyint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
+ def : Pat<(f64 (fnearbyint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
+ def : Pat<(f32 (fceil FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x2))>;
+ def : Pat<(f64 (fceil FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x2))>;
+ def : Pat<(f32 (frint FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
+ def : Pat<(f64 (frint FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
+ def : Pat<(f32 (ftrunc FR32:$src)),
+ (ROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x3))>;
+ def : Pat<(f64 (ftrunc FR64:$src)),
+ (ROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x3))>;
+
+ def : Pat<(v4f32 (ffloor VR128:$src)),
+ (ROUNDPSr VR128:$src, (i32 0x1))>;
+ def : Pat<(v2f64 (ffloor VR128:$src)),
+ (ROUNDPDr VR128:$src, (i32 0x1))>;
+}
//===----------------------------------------------------------------------===//
// SSE4.1 - Packed Bit Test
@@ -6221,11 +6440,11 @@ def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
}
let Defs = [EFLAGS] in {
@@ -6254,11 +6473,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+ VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+ VEX_L;
}
}
@@ -6338,7 +6559,7 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
(bitconvert (memopv2i64 addr:$src2))))]>, OpSize;
}
-/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+/// SS41I_binop_rm_int_y - Simple SSE 4.1 binary operator
multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
Intrinsic IntId256> {
let isCommutable = 1 in
@@ -6381,25 +6602,25 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
- int_x86_avx2_packusdw>, VEX_4V;
+ int_x86_avx2_packusdw>, VEX_4V, VEX_L;
defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb",
- int_x86_avx2_pmins_b>, VEX_4V;
+ int_x86_avx2_pmins_b>, VEX_4V, VEX_L;
defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd",
- int_x86_avx2_pmins_d>, VEX_4V;
+ int_x86_avx2_pmins_d>, VEX_4V, VEX_L;
defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud",
- int_x86_avx2_pminu_d>, VEX_4V;
+ int_x86_avx2_pminu_d>, VEX_4V, VEX_L;
defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw",
- int_x86_avx2_pminu_w>, VEX_4V;
+ int_x86_avx2_pminu_w>, VEX_4V, VEX_L;
defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb",
- int_x86_avx2_pmaxs_b>, VEX_4V;
+ int_x86_avx2_pmaxs_b>, VEX_4V, VEX_L;
defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd",
- int_x86_avx2_pmaxs_d>, VEX_4V;
+ int_x86_avx2_pmaxs_d>, VEX_4V, VEX_L;
defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud",
- int_x86_avx2_pmaxu_d>, VEX_4V;
+ int_x86_avx2_pmaxu_d>, VEX_4V, VEX_L;
defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw",
- int_x86_avx2_pmaxu_w>, VEX_4V;
+ int_x86_avx2_pmaxu_w>, VEX_4V, VEX_L;
defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
- int_x86_avx2_pmul_dq>, VEX_4V;
+ int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -6445,9 +6666,9 @@ let Predicates = [HasAVX] in {
}
let Predicates = [HasAVX2] in {
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V;
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
- memopv4i64, i256mem, 0>, VEX_4V;
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -6490,13 +6711,15 @@ let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
VR128, memopv4f32, f128mem, 0>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
- int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V;
+ int_x86_avx_blend_ps_256, VR256, memopv8f32,
+ f256mem, 0>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
VR128, memopv2f64, f128mem, 0>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
- int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V;
+                                   int_x86_avx_blend_pd_256, VR256, memopv4f64,
+ f256mem, 0>, VEX_4V, VEX_L;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem, 0>, VEX_4V;
@@ -6511,15 +6734,15 @@ let Predicates = [HasAVX] in {
VR128, memopv2f64, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
- VR256, memopv8f32, i256mem, 0>, VEX_4V;
+ VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
- VR256, memopv4i64, i256mem, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
- VR256, memopv4i64, i256mem, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
}
}
@@ -6570,13 +6793,13 @@ let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
memopv2f64, int_x86_sse41_blendvpd>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- memopv4f64, int_x86_avx_blendv_pd_256>;
+ memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
memopv4f32, int_x86_sse41_blendvps>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- memopv8f32, int_x86_avx_blendv_ps_256>;
+ memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
memopv2i64, int_x86_sse41_pblendvb>;
@@ -6584,7 +6807,7 @@ defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- memopv4i64, int_x86_avx2_pblendvb>;
+ memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -6687,7 +6910,7 @@ def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}",
(PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
-let Predicates = [HasSSE41] in {
+let Predicates = [UseSSE41] in {
def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
(v16i8 VR128:$src2))),
(PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
@@ -6725,7 +6948,7 @@ let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
@@ -6761,7 +6984,7 @@ let Predicates = [HasAVX] in
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- memopv4i64, i256mem, 0>, VEX_4V;
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -6779,34 +7002,31 @@ multiclass pseudo_pcmpistrm<string asm> {
imm:$src3))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128
- VR128:$src1, (load addr:$src2), imm:$src3))]>;
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}
let Defs = [EFLAGS], usesCustomInserter = 1 in {
- let AddedComplexity = 1 in
- defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
- defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[HasSSE42]>;
+ defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128">, Requires<[HasAVX]>;
+ defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}
-let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
- def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
let mayLoad = 1 in
- def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+  def rm : SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
}
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
- def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
- let mayLoad = 1 in
- def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+  defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm">;
}
// Packed Compare Explicit Length Strings, Return Mask
@@ -6817,74 +7037,103 @@ multiclass pseudo_pcmpestrm<string asm> {
VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
- VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>;
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
- let AddedComplexity = 1 in
- defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
- defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[HasSSE42]>;
+ defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128">, Requires<[HasAVX]>;
+ defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}
-let Predicates = [HasAVX],
- Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
let mayLoad = 1 in
- def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
}
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
- let mayLoad = 1 in
- def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
}
// Packed Compare Implicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
- multiclass SS42AI_pcmpistri<string asm> {
- def rr : SS42AI<0x63, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
- let mayLoad = 1 in
- def rm : SS42AI<0x63, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
- }
+multiclass pseudo_pcmpistri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}
-let Predicates = [HasAVX] in
-defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
-defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+ defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
// Packed Compare Explicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- multiclass SS42AI_pcmpestri<string asm> {
- def rr : SS42AI<0x61, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
- let mayLoad = 1 in
- def rm : SS42AI<0x61, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
- }
+multiclass pseudo_pcmpestri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+ imm:$src5))]>;
}
-let Predicates = [HasAVX] in
-defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
-defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+ defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
@@ -7175,27 +7424,27 @@ let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
int_x86_avx_vbroadcast_ss>;
def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
- int_x86_avx_vbroadcast_ss_256>;
+ int_x86_avx_vbroadcast_ss_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
- int_x86_avx_vbroadcast_sd_256>;
+ int_x86_avx_vbroadcast_sd_256>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
- int_x86_avx_vbroadcastf128_pd_256>;
+ int_x86_avx_vbroadcastf128_pd_256>, VEX_L;
let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
int_x86_avx2_vbroadcast_ss_ps>;
def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
- int_x86_avx2_vbroadcast_ss_ps_256>;
+ int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256>;
+ int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
- int_x86_avx2_vbroadcasti128>;
+ int_x86_avx2_vbroadcasti128>, VEX_L;
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
@@ -7209,50 +7458,69 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX] in {
def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (v4f32 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (v2f64 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (memopv4f32 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (memopv2f64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+}
+
+let Predicates = [HasAVX1Only] in {
def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTF128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v8f32 VR256:$src1), (loadv4f32 addr:$src2),
- (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+ (iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4f64 VR256:$src1), (loadv2f64 addr:$src2),
- (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
-def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (loadv2i64 addr:$src2),
- (i32 imm)),
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTF128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
(VINSERTF128rm VR256:$src1, addr:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
}
@@ -7264,64 +7532,69 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
(ins VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, VEX;
+ []>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
(ins f128mem:$dst, VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, VEX;
-}
-
-// Extract and store.
-let Predicates = [HasAVX] in {
- def : Pat<(alignedstore (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(alignedstore (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(alignedstore (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2), addr:$dst),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
-
- def : Pat<(int_x86_sse_storeu_ps addr:$dst, (int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2)),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(int_x86_sse2_storeu_pd addr:$dst, (int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2)),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
- def : Pat<(int_x86_sse2_storeu_dq addr:$dst, (bc_v16i8 (int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2))),
- (VEXTRACTF128mr addr:$dst, VR256:$src1, imm:$src2)>;
+ []>, VEX, VEX_L;
}
// AVX1 patterns
let Predicates = [HasAVX] in {
-def : Pat<(int_x86_avx_vextractf128_pd_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_ps_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-def : Pat<(int_x86_avx_vextractf128_si_256 VR256:$src1, imm:$src2),
- (VEXTRACTF128rr VR256:$src1, imm:$src2)>;
-
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v4f32 (VEXTRACTF128rr
(v8f32 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v2f64 (VEXTRACTF128rr
(v4f64 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+
+def : Pat<(alignedstore (v4f32 (vextractf128_extract:$ext (v8f32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v2f64 (vextractf128_extract:$ext (v4f64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTF128rr
- (v4i64 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v4i64 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTF128rr
- (v8i32 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v8i32 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTF128rr
- (v16i16 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+ (v16i16 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTF128rr
- (v32i8 VR256:$src1),
- (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+ (v32i8 VR256:$src1),
+ (EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTF128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
@@ -7339,7 +7612,7 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7347,7 +7620,7 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedSingle in
@@ -7395,13 +7668,13 @@ let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>;
+ memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>;
+ memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -7429,38 +7702,38 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V;
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V;
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX] in {
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+ (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
+def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
+ (memopv4f64 addr:$src2), (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+}
+
+let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v8f32 (X86VPerm2x128 VR256:$src1,
- (memopv8f32 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
(bc_v8i32 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
(memopv4i64 addr:$src2), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
- (memopv4f64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
(bc_v32i8 (memopv4i64 addr:$src2)), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
@@ -7511,9 +7784,9 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
let Predicates = [HasAVX, HasF16C] in {
defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
}
//===----------------------------------------------------------------------===//
@@ -7545,7 +7818,7 @@ let isCommutable = 0 in {
defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
VR128, memopv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
- VR256, memopv4i64, i256mem>;
+ VR256, memopv4i64, i256mem>, VEX_L;
}
//===----------------------------------------------------------------------===//
@@ -7564,11 +7837,12 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
(Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int256 VR128:$src))]>, VEX;
+ [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
+ (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+ VEX, VEX_L;
}
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
@@ -7647,19 +7921,22 @@ let Predicates = [HasAVX2] in {
}
// AVX1 broadcast patterns
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX1Only] in {
def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
(VBROADCASTSDYrm addr:$src)>;
+def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
+ (VBROADCASTSSrm addr:$src)>;
+}
+
+let Predicates = [HasAVX] in {
def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSYrm addr:$src)>;
def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
(VBROADCASTSDYrm addr:$src)>;
def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
(VBROADCASTSSrm addr:$src)>;
-def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VBROADCASTSSrm addr:$src)>;
// Provide a fallback in case the load node that is used in the patterns above
// has additional uses, which prevent the pattern from being selected.
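
For reference, a minimal standalone sketch (not part of the patch, assuming AVX and the hypothetical names consume and splat_and_use) of the situation these fallback patterns cover: the scalar load feeding the broadcast has a second user, so the load cannot be folded into the broadcast and the value must be broadcast from a register.

  #include <immintrin.h>

  static float Sink;
  static void consume(float x) { Sink = x; }   // extra user of the loaded scalar

  // Because 's' has a second user (consume), the load cannot be folded into
  // the broadcast, so it is expected to be broadcast from a register via the
  // fallback patterns above.
  static __m256 splat_and_use(const float *p) {
    float s = *p;
    consume(s);
    return _mm256_set1_ps(s);
  }
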
@@ -7700,7 +7977,8 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, VEX_4V;
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr,
@@ -7708,7 +7986,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1,
(bitconvert (mem_frag addr:$src2)))))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
}
defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
@@ -7722,14 +8000,15 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, VEX;
+ (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermi (mem_frag addr:$src1),
- (i8 imm:$src2))))]>, VEX;
+ (i8 imm:$src2))))]>, VEX, VEX_L;
}
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
@@ -7739,20 +8018,18 @@ defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, v4f64>, VEX_W;
//===----------------------------------------------------------------------===//
// VPERM2I128 - Permute Integer Values in 128-bit chunks
//
-let AddedComplexity = 1 in {
def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V;
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V;
-}
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L;
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7779,31 +8056,51 @@ let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
}
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
+let Predicates = [HasAVX2] in {
def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (v2i64 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1), (v4i32 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1), (v16i8 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1), (v8i16 VR128:$src2),
- (i32 imm)),
+ (iPTR imm)),
(VINSERTI128rr VR256:$src1, VR128:$src2,
(INSERT_get_vinsertf128_imm VR256:$ins))>;
+
+def : Pat<(vinsertf128_insert:$ins (v4i64 VR256:$src1), (memopv2i64 addr:$src2),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v8i32 VR256:$src1),
+ (bc_v4i32 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v32i8 VR256:$src1),
+ (bc_v16i8 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
+def : Pat<(vinsertf128_insert:$ins (v16i16 VR256:$src1),
+ (bc_v8i16 (memopv2i64 addr:$src2)),
+ (iPTR imm)),
+ (VINSERTI128rm VR256:$src1, addr:$src2,
+ (INSERT_get_vinsertf128_imm VR256:$ins))>;
}
//===----------------------------------------------------------------------===//
@@ -7814,29 +8111,47 @@ def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
- VEX;
+ VEX, VEX_L;
let neverHasSideEffects = 1, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
(ins i128mem:$dst, VR256:$src1, i8imm:$src2),
- "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX;
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX, VEX_L;
-let Predicates = [HasAVX2], AddedComplexity = 1 in {
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+let Predicates = [HasAVX2] in {
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v2i64 (VEXTRACTI128rr
(v4i64 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v4i32 (VEXTRACTI128rr
(v8i32 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v8i16 (VEXTRACTI128rr
(v16i16 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
-def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)),
+def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
(v16i8 (VEXTRACTI128rr
(v32i8 VR256:$src1),
(EXTRACT_get_vextractf128_imm VR128:$ext)))>;
+
+def : Pat<(alignedstore (v2i64 (vextractf128_extract:$ext (v4i64 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v4i32 (vextractf128_extract:$ext (v8i32 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v8i16 (vextractf128_extract:$ext (v16i16 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
+def : Pat<(alignedstore (v16i8 (vextractf128_extract:$ext (v32i8 VR256:$src1),
+ (iPTR imm))), addr:$dst),
+ (VEXTRACTI128mr addr:$dst, VR256:$src1,
+ (EXTRACT_get_vextractf128_imm VR128:$ext))>;
}
//===----------------------------------------------------------------------===//
@@ -7852,7 +8167,8 @@ multiclass avx2_pmovmask<string OpcodeStr,
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, VEX_4V;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7860,7 +8176,7 @@ multiclass avx2_pmovmask<string OpcodeStr,
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -7898,14 +8214,14 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
}
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index bdeb63ffbd69..893488c159ea 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -839,6 +839,16 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
+def ROT32L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
+ return getI8Imm(32 - N->getZExtValue());
+}]>;
+
+def ROT64L2R_imm8 : SDNodeXForm<imm, [{
+ // Convert a ROTL shamt to a ROTR shamt on 64-bit integer.
+ return getI8Imm(64 - N->getZExtValue());
+}]>;
+
multiclass bmi_rotate<string asm, RegisterClass RC, X86MemOperand x86memop> {
let neverHasSideEffects = 1 in {
def ri : Ii8<0xF0, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i8imm:$src2),
@@ -873,4 +883,72 @@ let Predicates = [HasBMI2] in {
defm SHRX64 : bmi_shift<"shrx{q}", GR64, i64mem>, T8XD, VEX_W;
defm SHLX32 : bmi_shift<"shlx{l}", GR32, i32mem>, T8, OpSize;
defm SHLX64 : bmi_shift<"shlx{q}", GR64, i64mem>, T8, OpSize, VEX_W;
+
+ // Prefer RORX which is non-destructive and doesn't update EFLAGS.
+ let AddedComplexity = 10 in {
+ def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+ (RORX32ri GR32:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+ (RORX64ri GR64:$src, (ROT64L2R_imm8 imm:$shamt))>;
+ }
+
+ def : Pat<(rotl (loadi32 addr:$src), (i8 imm:$shamt)),
+ (RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
+ def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
+ (RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
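
For reference, the two SDNodeXForms and RORX patterns above rely on the identity that a left rotate by s equals a right rotate by width - s. A minimal standalone check of that equivalence (not part of the patch; rotl32 and rotr32 are illustrative names only):

  #include <cassert>
  #include <cstdint>

  // rotl(x, s) == rotr(x, 32 - s) for 0 < s < 32; this is what lets an
  // immediate ROTL be selected as RORX with the transformed shift amount.
  static uint32_t rotl32(uint32_t x, unsigned s) { return (x << s) | (x >> (32 - s)); }
  static uint32_t rotr32(uint32_t x, unsigned s) { return (x >> s) | (x << (32 - s)); }

  int main() {
    for (unsigned s = 1; s < 32; ++s)
      assert(rotl32(0x12345678u, s) == rotr32(0x12345678u, 32 - s));
    return 0;
  }
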
+ // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with variable shift BUT not
+ // immediate shift, i.e. the following code is considered better
+ //
+ // mov %edi, %esi
+ // shl $imm, %esi
+ // ... %edi, ...
+ //
+ // than
+ //
+ // movb $imm, %sil
+ // shlx %sil, %edi, %esi
+ // ... %edi, ...
+ //
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, GR8:$src2),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, GR8:$src2),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, GR8:$src2),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, GR8:$src2),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, GR8:$src2),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, GR8:$src2),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+ //
+ // mov (%ecx), %esi
+ // shl $imm, %esi
+ //
+ // over
+ //
+ // movb $imm, %al
+ // shlx %al, (%ecx), %esi
+ //
+ // As SARXrr/SHRXrr/SHLXrr are favored for variable shifts, the peephole
+ // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
}
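
A minimal C++ sketch (not part of the patch; shift_var and shift_imm are illustrative names) of the distinction drawn in the comments above: with BMI2, the variable-count shift is expected to select SHLX, while the immediate-count shift stays a plain SHL because SHLX cannot encode an immediate count.

  #include <cstdint>

  // Variable count: SARX/SHRX/SHLX are preferred (non-destructive, no EFLAGS update).
  uint32_t shift_var(uint32_t x, uint32_t n) { return x << (n & 31); }

  // Immediate count: a plain SHL is better than materializing the count in a
  // register just to use SHLX, as explained in the comments above.
  uint32_t shift_imm(uint32_t x) { return x << 7; }
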
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 000000000000..ad55058ede6c
--- /dev/null
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,32 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+ "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+ Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
+ "xbegin\t$dst", []>;
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+ "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+ "xabort\t$imm",
+ [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
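
For context, the XBEGIN/XEND/XABORT definitions above correspond to the RTM intrinsics. A minimal usage sketch (not part of the patch; assumes a CPU and compiler with RTM support, e.g. built with -mrtm; transactional_increment is an illustrative name):

  #include <immintrin.h>

  // Increments *counter inside an RTM transaction and reports failure so the
  // caller can retry or fall back to a lock.
  int transactional_increment(volatile int *counter) {
    unsigned status = _xbegin();        // lowers to XBEGIN
    if (status == _XBEGIN_STARTED) {
      ++*counter;
      _xend();                          // lowers to XEND
      return 0;
    }
    // Aborted; status holds the abort information. An explicit _xabort(1)
    // inside the transaction would lower to XABORT $1.
    return -1;
  }
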
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 8ec2c688d33f..2aa08fad7836 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -75,10 +75,10 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, VEX;
+ [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
}
let isAsmParserOnly = 1 in {
@@ -238,7 +238,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
- VEX_4V, VEX_I8IMM;
+ VEX_4V, VEX_I8IMM, VEX_L;
def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr,
@@ -246,7 +246,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst,
(Int VR256:$src1, VR256:$src2,
(bitconvert (memopv4i64 addr:$src3))))]>,
- VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
+ VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -254,7 +254,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst,
(Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
VR256:$src3))]>,
- VEX_4V, VEX_I8IMM;
+ VEX_4V, VEX_I8IMM, VEX_L;
}
let isAsmParserOnly = 1 in {
@@ -287,20 +287,21 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>;
+ (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
(Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
- VEX_W, MemOp4;
+ VEX_W, MemOp4, VEX_L;
def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>;
+ (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+ VEX_L;
}
defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 0168d12231f7..764aa5d4f236 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -532,6 +532,15 @@ uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
#endif
}
+template<typename T> static void addUnaligned(void *Pos, T Delta) {
+ T Value;
+ std::memcpy(reinterpret_cast<char*>(&Value), reinterpret_cast<char*>(Pos),
+ sizeof(T));
+ Value += Delta;
+ std::memcpy(reinterpret_cast<char*>(Pos), reinterpret_cast<char*>(&Value),
+ sizeof(T));
+}
+
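
addUnaligned is introduced because relocation targets inside the emitted instruction stream are generally not aligned for the type being patched; memcpy performs the same read-modify-write without requiring alignment. A small standalone sketch of the same idea (not part of the patch; addUnalignedSketch is an illustrative name):

  #include <cstdint>
  #include <cstring>

  // Add Delta to a value of type T stored at a possibly misaligned address.
  template <typename T> static void addUnalignedSketch(void *Pos, T Delta) {
    T Value;
    std::memcpy(&Value, Pos, sizeof(T));
    Value += Delta;
    std::memcpy(Pos, &Value, sizeof(T));
  }

  int main() {
    unsigned char Buf[8] = {};
    addUnalignedSketch<uint32_t>(Buf + 1, 0x1234u);  // Buf + 1 is misaligned
    return 0;
  }
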
/// relocate - Before the JIT can run a block of code that has been emitted,
/// it must rewrite the code to contain the actual addresses of any
/// referenced global symbols.
@@ -545,24 +554,24 @@ void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
// PC relative relocation, add the relocated value to the value already in
// memory, after we adjust it for where the PC is.
ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
- *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ addUnaligned<unsigned>(RelocPos, ResultPtr);
break;
}
case X86::reloc_picrel_word: {
// PIC base relative relocation, add the relocated value to the value
// already in memory, after we adjust it for where the PIC base is.
ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
- *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ addUnaligned<unsigned>(RelocPos, ResultPtr);
break;
}
case X86::reloc_absolute_word:
case X86::reloc_absolute_word_sext:
// Absolute relocation, just add the relocated value to the value already
// in memory.
- *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+ addUnaligned<unsigned>(RelocPos, ResultPtr);
break;
case X86::reloc_absolute_dword:
- *((intptr_t*)RelocPos) += ResultPtr;
+ addUnaligned<intptr_t>(RelocPos, ResultPtr);
break;
}
}
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 9c0ce4ead2fa..cfd68f74b7b2 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -12,7 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#include "X86MCInstLower.h"
#include "X86AsmPrinter.h"
#include "X86COFFMachineModuleInfo.h"
#include "InstPrinter/X86ATTInstPrinter.h"
@@ -29,6 +28,31 @@
#include "llvm/ADT/SmallString.h"
using namespace llvm;
+namespace {
+
+/// X86MCInstLower - This class is used to lower a MachineInstr into an MCInst.
+class X86MCInstLower {
+ MCContext &Ctx;
+ Mangler *Mang;
+ const MachineFunction &MF;
+ const TargetMachine &TM;
+ const MCAsmInfo &MAI;
+ X86AsmPrinter &AsmPrinter;
+public:
+ X86MCInstLower(Mangler *mang, const MachineFunction &MF,
+ X86AsmPrinter &asmprinter);
+
+ void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+ MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+ MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+private:
+ MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+} // end anonymous namespace
+
X86MCInstLower::X86MCInstLower(Mangler *mang, const MachineFunction &mf,
X86AsmPrinter &asmprinter)
: Ctx(mf.getContext()), Mang(mang), MF(mf), TM(mf.getTarget()),
@@ -43,15 +67,11 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::
GetSymbolFromOperand(const MachineOperand &MO) const {
- assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference");
+ assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) && "Isn't a symbol reference");
SmallString<128> Name;
- if (!MO.isGlobal()) {
- assert(MO.isSymbol());
- Name += MAI.getGlobalPrefix();
- Name += MO.getSymbolName();
- } else {
+ if (MO.isGlobal()) {
const GlobalValue *GV = MO.getGlobal();
bool isImplicitlyPrivate = false;
if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
@@ -61,6 +81,11 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
isImplicitlyPrivate = true;
Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+ } else if (MO.isSymbol()) {
+ Name += MAI.getGlobalPrefix();
+ Name += MO.getSymbolName();
+ } else if (MO.isMBB()) {
+ Name += MO.getMBB()->getSymbol()->getName();
}
// If the target flags on the operand changes the name of the symbol, do that
@@ -191,7 +216,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
if (Expr == 0)
Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
- if (!MO.isJTI() && MO.getOffset())
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
Expr = MCBinaryExpr::CreateAdd(Expr,
MCConstantExpr::Create(MO.getOffset(), Ctx),
Ctx);
@@ -324,9 +349,6 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
MCOp = MCOperand::CreateImm(MO.getImm());
break;
case MachineOperand::MO_MachineBasicBlock:
- MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
- MO.getMBB()->getSymbol(), Ctx));
- break;
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
@@ -371,18 +393,8 @@ ReSimplify:
case X86::MOVZX64rm8: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
case X86::MOVZX64rr16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
case X86::MOVZX64rm16: LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
- case X86::SETB_C8r: LowerUnaryToTwoAddr(OutMI, X86::SBB8rr); break;
- case X86::SETB_C16r: LowerUnaryToTwoAddr(OutMI, X86::SBB16rr); break;
- case X86::SETB_C32r: LowerUnaryToTwoAddr(OutMI, X86::SBB32rr); break;
- case X86::SETB_C64r: LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
case X86::MOV8r0: LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
case X86::MOV32r0: LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
- case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
- case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break;
- case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break;
- case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break;
- case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break;
- case X86::AVX2_SET0: LowerUnaryToTwoAddr(OutMI, X86::VPXORYrr); break;
case X86::MOV16r0:
LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0
diff --git a/lib/Target/X86/X86MCInstLower.h b/lib/Target/X86/X86MCInstLower.h
deleted file mode 100644
index b4d4cfd301a5..000000000000
--- a/lib/Target/X86/X86MCInstLower.h
+++ /dev/null
@@ -1,52 +0,0 @@
-//===-- X86MCInstLower.h - Lower MachineInstr to MCInst ---------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86_MCINSTLOWER_H
-#define X86_MCINSTLOWER_H
-
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
- class MCAsmInfo;
- class MCContext;
- class MCInst;
- class MCOperand;
- class MCSymbol;
- class MachineInstr;
- class MachineFunction;
- class MachineModuleInfoMachO;
- class MachineOperand;
- class Mangler;
- class TargetMachine;
- class X86AsmPrinter;
-
-/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
-class LLVM_LIBRARY_VISIBILITY X86MCInstLower {
- MCContext &Ctx;
- Mangler *Mang;
- const MachineFunction &MF;
- const TargetMachine &TM;
- const MCAsmInfo &MAI;
- X86AsmPrinter &AsmPrinter;
-public:
- X86MCInstLower(Mangler *mang, const MachineFunction &MF,
- X86AsmPrinter &asmprinter);
-
- void Lower(const MachineInstr *MI, MCInst &OutMI) const;
-
- MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
- MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
-
-private:
- MachineModuleInfoMachO &getMachOMMI() const;
-};
-
-}
-
-#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 877b8f6bc3d1..73ac7477427f 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -106,23 +106,7 @@ X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
int
X86RegisterInfo::getSEHRegNum(unsigned i) const {
- int reg = X86_MC::getX86RegNum(i);
- switch (i) {
- case X86::R8: case X86::R8D: case X86::R8W: case X86::R8B:
- case X86::R9: case X86::R9D: case X86::R9W: case X86::R9B:
- case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
- case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
- case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
- case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
- case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
- case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
- case X86::XMM8: case X86::XMM9: case X86::XMM10: case X86::XMM11:
- case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
- case X86::YMM8: case X86::YMM9: case X86::YMM10: case X86::YMM11:
- case X86::YMM12: case X86::YMM13: case X86::YMM14: case X86::YMM15:
- reg += 8;
- }
- return reg;
+ return getEncodingValue(i);
}
const TargetRegisterClass *
@@ -245,15 +229,26 @@ const uint16_t *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
bool callsEHReturn = false;
bool ghcCall = false;
+ bool oclBiCall = false;
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
if (MF) {
callsEHReturn = MF->getMMI().callsEHReturn();
const Function *F = MF->getFunction();
ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+ oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false);
}
if (ghcCall)
return CSR_NoRegs_SaveList;
+ if (oclBiCall) {
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ }
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_SaveList;
@@ -268,6 +263,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+ if (CC == CallingConv::Intel_OCL_BI) {
+ if (IsWin64 && HasAVX)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (Is64Bit && HasAVX)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ }
if (CC == CallingConv::GHC)
return CSR_NoRegs_RegMask;
if (!Is64Bit)
@@ -277,6 +282,11 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
return CSR_64_RegMask;
}
+const uint32_t*
+X86RegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
@@ -398,8 +408,9 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *F = MF.getFunction();
unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
- bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
- F->hasFnAttr(Attribute::StackAlignment));
+ bool requiresRealignment =
+ ((MFI->getMaxAlignment() > StackAlign) ||
+ F->getFnAttributes().hasAttribute(Attributes::StackAlignment));
// If we've requested that we force align the stack do so now.
if (ForceStackAlign)
@@ -522,7 +533,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const{
+ int SPAdj, RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
unsigned i = 0;
@@ -590,9 +601,10 @@ unsigned X86RegisterInfo::getEHHandlerRegister() const {
}
namespace llvm {
-unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
- switch (VT.getSimpleVT().SimpleTy) {
- default: return Reg;
+unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
+ bool High) {
+ switch (VT) {
+ default: llvm_unreachable("Unexpected VT");
case MVT::i8:
if (High) {
switch (Reg) {
@@ -608,7 +620,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
}
} else {
switch (Reg) {
- default: return 0;
+ default: llvm_unreachable("Unexpected register");
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AL;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -645,7 +657,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
}
case MVT::i16:
switch (Reg) {
- default: return Reg;
+ default: llvm_unreachable("Unexpected register");
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -681,7 +693,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
}
case MVT::i32:
switch (Reg) {
- default: return Reg;
+ default: llvm_unreachable("Unexpected register");
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::EAX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -733,7 +745,7 @@ unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
}
}
switch (Reg) {
- default: return Reg;
+ default: llvm_unreachable("Unexpected register");
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::RAX;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 1bc32cbb78f4..7932ede8dd65 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -58,10 +58,6 @@ private:
public:
X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
- /// getX86RegNum - Returns the native X86 register number for the given LLVM
- /// register identifier.
- static unsigned getX86RegNum(unsigned RegNo);
-
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
@@ -104,6 +100,7 @@ public:
/// callee-save registers on this target.
const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ const uint32_t *getNoPreservedMask() const;
/// getReservedRegs - Returns a bitset indexed by physical register number
/// indicating if a register is a special register that has particular uses and
@@ -141,8 +138,8 @@ public:
// getX86SubSuperRegister - X86 utility function. It returns the sub or super
// register of a specific X86 register.
-// e.g. getX86SubSuperRegister(X86::EAX, EVT::i16) return X86:AX
-unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false);
+// e.g. getX86SubSuperRegister(X86::EAX, MVT::i16) returns X86::AX
+unsigned getX86SubSuperRegister(unsigned, MVT::SimpleValueType, bool High=false);
} // End llvm namespace
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index edc71845acba..be6282a643bd 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -13,258 +13,264 @@
//
//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Register definitions...
-//
-let Namespace = "X86" in {
+class X86Reg<string n, bits<16> Enc, list<Register> subregs = []> : Register<n> {
+ let Namespace = "X86";
+ let HWEncoding = Enc;
+ let SubRegs = subregs;
+}
- // Subregister indices.
+// Subregister indices.
+let Namespace = "X86" in {
def sub_8bit : SubRegIndex;
def sub_8bit_hi : SubRegIndex;
def sub_16bit : SubRegIndex;
def sub_32bit : SubRegIndex;
- def sub_xmm : SubRegIndex;
-
-
- // In the register alias definitions below, we define which registers alias
- // which others. We only specify which registers the small registers alias,
- // because the register file generator is smart enough to figure out that
- // AL aliases AX if we tell it that AX aliased AL (for example).
-
- // Dwarf numbering is different for 32-bit and 64-bit, and there are
- // variations by target as well. Currently the first entry is for X86-64,
- // second - for EH on X86-32/Darwin and third is 'generic' one (X86-32/Linux
- // and debug information on X86-32/Darwin)
-
- // 8-bit registers
- // Low registers
- def AL : Register<"al">;
- def DL : Register<"dl">;
- def CL : Register<"cl">;
- def BL : Register<"bl">;
-
- // X86-64 only, requires REX.
- let CostPerUse = 1 in {
- def SIL : Register<"sil">;
- def DIL : Register<"dil">;
- def BPL : Register<"bpl">;
- def SPL : Register<"spl">;
- def R8B : Register<"r8b">;
- def R9B : Register<"r9b">;
- def R10B : Register<"r10b">;
- def R11B : Register<"r11b">;
- def R12B : Register<"r12b">;
- def R13B : Register<"r13b">;
- def R14B : Register<"r14b">;
- def R15B : Register<"r15b">;
- }
-
- // High registers. On x86-64, these cannot be used in any instruction
- // with a REX prefix.
- def AH : Register<"ah">;
- def DH : Register<"dh">;
- def CH : Register<"ch">;
- def BH : Register<"bh">;
-
- // 16-bit registers
- let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
- def AX : RegisterWithSubRegs<"ax", [AL,AH]>;
- def DX : RegisterWithSubRegs<"dx", [DL,DH]>;
- def CX : RegisterWithSubRegs<"cx", [CL,CH]>;
- def BX : RegisterWithSubRegs<"bx", [BL,BH]>;
- }
- let SubRegIndices = [sub_8bit] in {
- def SI : RegisterWithSubRegs<"si", [SIL]>;
- def DI : RegisterWithSubRegs<"di", [DIL]>;
- def BP : RegisterWithSubRegs<"bp", [BPL]>;
- def SP : RegisterWithSubRegs<"sp", [SPL]>;
- }
- def IP : Register<"ip">;
-
- // X86-64 only, requires REX.
- let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
- def R8W : RegisterWithSubRegs<"r8w", [R8B]>;
- def R9W : RegisterWithSubRegs<"r9w", [R9B]>;
- def R10W : RegisterWithSubRegs<"r10w", [R10B]>;
- def R11W : RegisterWithSubRegs<"r11w", [R11B]>;
- def R12W : RegisterWithSubRegs<"r12w", [R12B]>;
- def R13W : RegisterWithSubRegs<"r13w", [R13B]>;
- def R14W : RegisterWithSubRegs<"r14w", [R14B]>;
- def R15W : RegisterWithSubRegs<"r15w", [R15B]>;
- }
- // 32-bit registers
- let SubRegIndices = [sub_16bit] in {
- def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[-2, 0, 0]>;
- def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[-2, 2, 2]>;
- def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[-2, 1, 1]>;
- def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[-2, 3, 3]>;
- def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[-2, 6, 6]>;
- def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[-2, 7, 7]>;
- def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[-2, 4, 5]>;
- def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[-2, 5, 4]>;
- def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[-2, 8, 8]>;
-
- // X86-64 only, requires REX
- let CostPerUse = 1 in {
- def R8D : RegisterWithSubRegs<"r8d", [R8W]>;
- def R9D : RegisterWithSubRegs<"r9d", [R9W]>;
- def R10D : RegisterWithSubRegs<"r10d", [R10W]>;
- def R11D : RegisterWithSubRegs<"r11d", [R11W]>;
- def R12D : RegisterWithSubRegs<"r12d", [R12W]>;
- def R13D : RegisterWithSubRegs<"r13d", [R13W]>;
- def R14D : RegisterWithSubRegs<"r14d", [R14W]>;
- def R15D : RegisterWithSubRegs<"r15d", [R15W]>;
- }}
-
- // 64-bit registers, X86-64 only
- let SubRegIndices = [sub_32bit] in {
- def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>;
- def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>;
- def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>;
- def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>;
- def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>;
- def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>;
- def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
- def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
-
- // These also require REX.
- let CostPerUse = 1 in {
- def R8 : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
- def R9 : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
- def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
- def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>;
- def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>;
- def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>;
- def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
- def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
- def RIP : RegisterWithSubRegs<"rip", [EIP]>, DwarfRegNum<[16, -2, -2]>;
- }}
-
- // MMX Registers. These are actually aliased to ST0 .. ST7
- def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
- def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>;
- def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>;
- def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>;
- def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>;
- def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>;
- def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>;
- def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>;
-
- // Pseudo Floating Point registers
- def FP0 : Register<"fp0">;
- def FP1 : Register<"fp1">;
- def FP2 : Register<"fp2">;
- def FP3 : Register<"fp3">;
- def FP4 : Register<"fp4">;
- def FP5 : Register<"fp5">;
- def FP6 : Register<"fp6">;
-
- // XMM Registers, used by the various SSE instruction set extensions.
- def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
- def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
- def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
- def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>;
- def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>;
- def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>;
- def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>;
- def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
-
- // X86-64 only
- let CostPerUse = 1 in {
- def XMM8: Register<"xmm8">, DwarfRegNum<[25, -2, -2]>;
- def XMM9: Register<"xmm9">, DwarfRegNum<[26, -2, -2]>;
- def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
- def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>;
- def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>;
- def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
- def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
- def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
- } // CostPerUse
-
- // YMM Registers, used by AVX instructions
- let SubRegIndices = [sub_xmm] in {
- def YMM0: RegisterWithSubRegs<"ymm0", [XMM0]>, DwarfRegAlias<XMM0>;
- def YMM1: RegisterWithSubRegs<"ymm1", [XMM1]>, DwarfRegAlias<XMM1>;
- def YMM2: RegisterWithSubRegs<"ymm2", [XMM2]>, DwarfRegAlias<XMM2>;
- def YMM3: RegisterWithSubRegs<"ymm3", [XMM3]>, DwarfRegAlias<XMM3>;
- def YMM4: RegisterWithSubRegs<"ymm4", [XMM4]>, DwarfRegAlias<XMM4>;
- def YMM5: RegisterWithSubRegs<"ymm5", [XMM5]>, DwarfRegAlias<XMM5>;
- def YMM6: RegisterWithSubRegs<"ymm6", [XMM6]>, DwarfRegAlias<XMM6>;
- def YMM7: RegisterWithSubRegs<"ymm7", [XMM7]>, DwarfRegAlias<XMM7>;
- def YMM8: RegisterWithSubRegs<"ymm8", [XMM8]>, DwarfRegAlias<XMM8>;
- def YMM9: RegisterWithSubRegs<"ymm9", [XMM9]>, DwarfRegAlias<XMM9>;
- def YMM10: RegisterWithSubRegs<"ymm10", [XMM10]>, DwarfRegAlias<XMM10>;
- def YMM11: RegisterWithSubRegs<"ymm11", [XMM11]>, DwarfRegAlias<XMM11>;
- def YMM12: RegisterWithSubRegs<"ymm12", [XMM12]>, DwarfRegAlias<XMM12>;
- def YMM13: RegisterWithSubRegs<"ymm13", [XMM13]>, DwarfRegAlias<XMM13>;
- def YMM14: RegisterWithSubRegs<"ymm14", [XMM14]>, DwarfRegAlias<XMM14>;
- def YMM15: RegisterWithSubRegs<"ymm15", [XMM15]>, DwarfRegAlias<XMM15>;
- }
-
- class STRegister<string Name, list<Register> A> : Register<Name> {
- let Aliases = A;
- }
-
- // Floating point stack registers. These don't map one-to-one to the FP
- // pseudo registers, but we still mark them as aliasing FP registers. That
- // way both kinds can be live without exceeding the stack depth. ST registers
- // are only live around inline assembly.
- def ST0 : STRegister<"st(0)", []>, DwarfRegNum<[33, 12, 11]>;
- def ST1 : STRegister<"st(1)", [FP6]>, DwarfRegNum<[34, 13, 12]>;
- def ST2 : STRegister<"st(2)", [FP5]>, DwarfRegNum<[35, 14, 13]>;
- def ST3 : STRegister<"st(3)", [FP4]>, DwarfRegNum<[36, 15, 14]>;
- def ST4 : STRegister<"st(4)", [FP3]>, DwarfRegNum<[37, 16, 15]>;
- def ST5 : STRegister<"st(5)", [FP2]>, DwarfRegNum<[38, 17, 16]>;
- def ST6 : STRegister<"st(6)", [FP1]>, DwarfRegNum<[39, 18, 17]>;
- def ST7 : STRegister<"st(7)", [FP0]>, DwarfRegNum<[40, 19, 18]>;
-
- // Floating-point status word
- def FPSW : Register<"fpsw">;
-
- // Status flags register
- def EFLAGS : Register<"flags">;
-
- // Segment registers
- def CS : Register<"cs">;
- def DS : Register<"ds">;
- def SS : Register<"ss">;
- def ES : Register<"es">;
- def FS : Register<"fs">;
- def GS : Register<"gs">;
-
- // Debug registers
- def DR0 : Register<"dr0">;
- def DR1 : Register<"dr1">;
- def DR2 : Register<"dr2">;
- def DR3 : Register<"dr3">;
- def DR4 : Register<"dr4">;
- def DR5 : Register<"dr5">;
- def DR6 : Register<"dr6">;
- def DR7 : Register<"dr7">;
-
- // Control registers
- def CR0 : Register<"cr0">;
- def CR1 : Register<"cr1">;
- def CR2 : Register<"cr2">;
- def CR3 : Register<"cr3">;
- def CR4 : Register<"cr4">;
- def CR5 : Register<"cr5">;
- def CR6 : Register<"cr6">;
- def CR7 : Register<"cr7">;
- def CR8 : Register<"cr8">;
- def CR9 : Register<"cr9">;
- def CR10 : Register<"cr10">;
- def CR11 : Register<"cr11">;
- def CR12 : Register<"cr12">;
- def CR13 : Register<"cr13">;
- def CR14 : Register<"cr14">;
- def CR15 : Register<"cr15">;
-
- // Pseudo index registers
- def EIZ : Register<"eiz">;
- def RIZ : Register<"riz">;
+ def sub_xmm : SubRegIndex;
}
+//===----------------------------------------------------------------------===//
+// Register definitions...
+//
+
+// In the register alias definitions below, we define which registers alias
+// which others. We only specify which registers the small registers alias,
+// because the register file generator is smart enough to figure out that
+// AL aliases AX if we tell it that AX aliases AL (for example).
+
+// Dwarf numbering is different for 32-bit and 64-bit, and there are
+// variations by target as well. Currently the first entry is for X86-64,
+// the second for EH on X86-32/Darwin, and the third is the 'generic' one
+// (X86-32/Linux and debug information on X86-32/Darwin).
+
+// 8-bit registers
+// Low registers
+def AL : X86Reg<"al", 0>;
+def DL : X86Reg<"dl", 2>;
+def CL : X86Reg<"cl", 1>;
+def BL : X86Reg<"bl", 3>;
+
+// High registers. On x86-64, these cannot be used in any instruction
+// with a REX prefix.
+def AH : X86Reg<"ah", 4>;
+def DH : X86Reg<"dh", 6>;
+def CH : X86Reg<"ch", 5>;
+def BH : X86Reg<"bh", 7>;
+
+// X86-64 only, requires REX.
+let CostPerUse = 1 in {
+def SIL : X86Reg<"sil", 6>;
+def DIL : X86Reg<"dil", 7>;
+def BPL : X86Reg<"bpl", 5>;
+def SPL : X86Reg<"spl", 4>;
+def R8B : X86Reg<"r8b", 8>;
+def R9B : X86Reg<"r9b", 9>;
+def R10B : X86Reg<"r10b", 10>;
+def R11B : X86Reg<"r11b", 11>;
+def R12B : X86Reg<"r12b", 12>;
+def R13B : X86Reg<"r13b", 13>;
+def R14B : X86Reg<"r14b", 14>;
+def R15B : X86Reg<"r15b", 15>;
+}
+
+// 16-bit registers
+let SubRegIndices = [sub_8bit, sub_8bit_hi], CoveredBySubRegs = 1 in {
+def AX : X86Reg<"ax", 0, [AL,AH]>;
+def DX : X86Reg<"dx", 2, [DL,DH]>;
+def CX : X86Reg<"cx", 1, [CL,CH]>;
+def BX : X86Reg<"bx", 3, [BL,BH]>;
+}
+let SubRegIndices = [sub_8bit] in {
+def SI : X86Reg<"si", 6, [SIL]>;
+def DI : X86Reg<"di", 7, [DIL]>;
+def BP : X86Reg<"bp", 5, [BPL]>;
+def SP : X86Reg<"sp", 4, [SPL]>;
+}
+def IP : X86Reg<"ip", 0>;
+
+// X86-64 only, requires REX.
+let SubRegIndices = [sub_8bit], CostPerUse = 1 in {
+def R8W : X86Reg<"r8w", 8, [R8B]>;
+def R9W : X86Reg<"r9w", 9, [R9B]>;
+def R10W : X86Reg<"r10w", 10, [R10B]>;
+def R11W : X86Reg<"r11w", 11, [R11B]>;
+def R12W : X86Reg<"r12w", 12, [R12B]>;
+def R13W : X86Reg<"r13w", 13, [R13B]>;
+def R14W : X86Reg<"r14w", 14, [R14B]>;
+def R15W : X86Reg<"r15w", 15, [R15B]>;
+}
+
+// 32-bit registers
+let SubRegIndices = [sub_16bit] in {
+def EAX : X86Reg<"eax", 0, [AX]>, DwarfRegNum<[-2, 0, 0]>;
+def EDX : X86Reg<"edx", 2, [DX]>, DwarfRegNum<[-2, 2, 2]>;
+def ECX : X86Reg<"ecx", 1, [CX]>, DwarfRegNum<[-2, 1, 1]>;
+def EBX : X86Reg<"ebx", 3, [BX]>, DwarfRegNum<[-2, 3, 3]>;
+def ESI : X86Reg<"esi", 6, [SI]>, DwarfRegNum<[-2, 6, 6]>;
+def EDI : X86Reg<"edi", 7, [DI]>, DwarfRegNum<[-2, 7, 7]>;
+def EBP : X86Reg<"ebp", 5, [BP]>, DwarfRegNum<[-2, 4, 5]>;
+def ESP : X86Reg<"esp", 4, [SP]>, DwarfRegNum<[-2, 5, 4]>;
+def EIP : X86Reg<"eip", 0, [IP]>, DwarfRegNum<[-2, 8, 8]>;
+
+// X86-64 only, requires REX
+let CostPerUse = 1 in {
+def R8D : X86Reg<"r8d", 8, [R8W]>;
+def R9D : X86Reg<"r9d", 9, [R9W]>;
+def R10D : X86Reg<"r10d", 10, [R10W]>;
+def R11D : X86Reg<"r11d", 11, [R11W]>;
+def R12D : X86Reg<"r12d", 12, [R12W]>;
+def R13D : X86Reg<"r13d", 13, [R13W]>;
+def R14D : X86Reg<"r14d", 14, [R14W]>;
+def R15D : X86Reg<"r15d", 15, [R15W]>;
+}}
+
+// 64-bit registers, X86-64 only
+let SubRegIndices = [sub_32bit] in {
+def RAX : X86Reg<"rax", 0, [EAX]>, DwarfRegNum<[0, -2, -2]>;
+def RDX : X86Reg<"rdx", 2, [EDX]>, DwarfRegNum<[1, -2, -2]>;
+def RCX : X86Reg<"rcx", 1, [ECX]>, DwarfRegNum<[2, -2, -2]>;
+def RBX : X86Reg<"rbx", 3, [EBX]>, DwarfRegNum<[3, -2, -2]>;
+def RSI : X86Reg<"rsi", 6, [ESI]>, DwarfRegNum<[4, -2, -2]>;
+def RDI : X86Reg<"rdi", 7, [EDI]>, DwarfRegNum<[5, -2, -2]>;
+def RBP : X86Reg<"rbp", 5, [EBP]>, DwarfRegNum<[6, -2, -2]>;
+def RSP : X86Reg<"rsp", 4, [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+// These also require REX.
+let CostPerUse = 1 in {
+def R8 : X86Reg<"r8", 8, [R8D]>, DwarfRegNum<[ 8, -2, -2]>;
+def R9 : X86Reg<"r9", 9, [R9D]>, DwarfRegNum<[ 9, -2, -2]>;
+def R10 : X86Reg<"r10", 10, [R10D]>, DwarfRegNum<[10, -2, -2]>;
+def R11 : X86Reg<"r11", 11, [R11D]>, DwarfRegNum<[11, -2, -2]>;
+def R12 : X86Reg<"r12", 12, [R12D]>, DwarfRegNum<[12, -2, -2]>;
+def R13 : X86Reg<"r13", 13, [R13D]>, DwarfRegNum<[13, -2, -2]>;
+def R14 : X86Reg<"r14", 14, [R14D]>, DwarfRegNum<[14, -2, -2]>;
+def R15 : X86Reg<"r15", 15, [R15D]>, DwarfRegNum<[15, -2, -2]>;
+def RIP : X86Reg<"rip", 0, [EIP]>, DwarfRegNum<[16, -2, -2]>;
+}}
+
+// MMX Registers. These are actually aliased to ST0 .. ST7
+def MM0 : X86Reg<"mm0", 0>, DwarfRegNum<[41, 29, 29]>;
+def MM1 : X86Reg<"mm1", 1>, DwarfRegNum<[42, 30, 30]>;
+def MM2 : X86Reg<"mm2", 2>, DwarfRegNum<[43, 31, 31]>;
+def MM3 : X86Reg<"mm3", 3>, DwarfRegNum<[44, 32, 32]>;
+def MM4 : X86Reg<"mm4", 4>, DwarfRegNum<[45, 33, 33]>;
+def MM5 : X86Reg<"mm5", 5>, DwarfRegNum<[46, 34, 34]>;
+def MM6 : X86Reg<"mm6", 6>, DwarfRegNum<[47, 35, 35]>;
+def MM7 : X86Reg<"mm7", 7>, DwarfRegNum<[48, 36, 36]>;
+
+// Pseudo Floating Point registers
+def FP0 : X86Reg<"fp0", 0>;
+def FP1 : X86Reg<"fp1", 0>;
+def FP2 : X86Reg<"fp2", 0>;
+def FP3 : X86Reg<"fp3", 0>;
+def FP4 : X86Reg<"fp4", 0>;
+def FP5 : X86Reg<"fp5", 0>;
+def FP6 : X86Reg<"fp6", 0>;
+
+// XMM Registers, used by the various SSE instruction set extensions.
+def XMM0: X86Reg<"xmm0", 0>, DwarfRegNum<[17, 21, 21]>;
+def XMM1: X86Reg<"xmm1", 1>, DwarfRegNum<[18, 22, 22]>;
+def XMM2: X86Reg<"xmm2", 2>, DwarfRegNum<[19, 23, 23]>;
+def XMM3: X86Reg<"xmm3", 3>, DwarfRegNum<[20, 24, 24]>;
+def XMM4: X86Reg<"xmm4", 4>, DwarfRegNum<[21, 25, 25]>;
+def XMM5: X86Reg<"xmm5", 5>, DwarfRegNum<[22, 26, 26]>;
+def XMM6: X86Reg<"xmm6", 6>, DwarfRegNum<[23, 27, 27]>;
+def XMM7: X86Reg<"xmm7", 7>, DwarfRegNum<[24, 28, 28]>;
+
+// X86-64 only
+let CostPerUse = 1 in {
+def XMM8: X86Reg<"xmm8", 8>, DwarfRegNum<[25, -2, -2]>;
+def XMM9: X86Reg<"xmm9", 9>, DwarfRegNum<[26, -2, -2]>;
+def XMM10: X86Reg<"xmm10", 10>, DwarfRegNum<[27, -2, -2]>;
+def XMM11: X86Reg<"xmm11", 11>, DwarfRegNum<[28, -2, -2]>;
+def XMM12: X86Reg<"xmm12", 12>, DwarfRegNum<[29, -2, -2]>;
+def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
+def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
+def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
+} // CostPerUse
+
+// YMM Registers, used by AVX instructions
+let SubRegIndices = [sub_xmm] in {
+def YMM0: X86Reg<"ymm0", 0, [XMM0]>, DwarfRegAlias<XMM0>;
+def YMM1: X86Reg<"ymm1", 1, [XMM1]>, DwarfRegAlias<XMM1>;
+def YMM2: X86Reg<"ymm2", 2, [XMM2]>, DwarfRegAlias<XMM2>;
+def YMM3: X86Reg<"ymm3", 3, [XMM3]>, DwarfRegAlias<XMM3>;
+def YMM4: X86Reg<"ymm4", 4, [XMM4]>, DwarfRegAlias<XMM4>;
+def YMM5: X86Reg<"ymm5", 5, [XMM5]>, DwarfRegAlias<XMM5>;
+def YMM6: X86Reg<"ymm6", 6, [XMM6]>, DwarfRegAlias<XMM6>;
+def YMM7: X86Reg<"ymm7", 7, [XMM7]>, DwarfRegAlias<XMM7>;
+def YMM8: X86Reg<"ymm8", 8, [XMM8]>, DwarfRegAlias<XMM8>;
+def YMM9: X86Reg<"ymm9", 9, [XMM9]>, DwarfRegAlias<XMM9>;
+def YMM10: X86Reg<"ymm10", 10, [XMM10]>, DwarfRegAlias<XMM10>;
+def YMM11: X86Reg<"ymm11", 11, [XMM11]>, DwarfRegAlias<XMM11>;
+def YMM12: X86Reg<"ymm12", 12, [XMM12]>, DwarfRegAlias<XMM12>;
+def YMM13: X86Reg<"ymm13", 13, [XMM13]>, DwarfRegAlias<XMM13>;
+def YMM14: X86Reg<"ymm14", 14, [XMM14]>, DwarfRegAlias<XMM14>;
+def YMM15: X86Reg<"ymm15", 15, [XMM15]>, DwarfRegAlias<XMM15>;
+}
+
+class STRegister<string n, bits<16> Enc, list<Register> A> : X86Reg<n, Enc> {
+ let Aliases = A;
+}
+
+// Floating point stack registers. These don't map one-to-one to the FP
+// pseudo registers, but we still mark them as aliasing FP registers. That
+// way both kinds can be live without exceeding the stack depth. ST registers
+// are only live around inline assembly.
+def ST0 : STRegister<"st(0)", 0, []>, DwarfRegNum<[33, 12, 11]>;
+def ST1 : STRegister<"st(1)", 1, [FP6]>, DwarfRegNum<[34, 13, 12]>;
+def ST2 : STRegister<"st(2)", 2, [FP5]>, DwarfRegNum<[35, 14, 13]>;
+def ST3 : STRegister<"st(3)", 3, [FP4]>, DwarfRegNum<[36, 15, 14]>;
+def ST4 : STRegister<"st(4)", 4, [FP3]>, DwarfRegNum<[37, 16, 15]>;
+def ST5 : STRegister<"st(5)", 5, [FP2]>, DwarfRegNum<[38, 17, 16]>;
+def ST6 : STRegister<"st(6)", 6, [FP1]>, DwarfRegNum<[39, 18, 17]>;
+def ST7 : STRegister<"st(7)", 7, [FP0]>, DwarfRegNum<[40, 19, 18]>;
+
+// Floating-point status word
+def FPSW : X86Reg<"fpsw", 0>;
+
+// Status flags register
+def EFLAGS : X86Reg<"flags", 0>;
+
+// Segment registers
+def CS : X86Reg<"cs", 1>;
+def DS : X86Reg<"ds", 3>;
+def SS : X86Reg<"ss", 2>;
+def ES : X86Reg<"es", 0>;
+def FS : X86Reg<"fs", 4>;
+def GS : X86Reg<"gs", 5>;
+
+// Debug registers
+def DR0 : X86Reg<"dr0", 0>;
+def DR1 : X86Reg<"dr1", 1>;
+def DR2 : X86Reg<"dr2", 2>;
+def DR3 : X86Reg<"dr3", 3>;
+def DR4 : X86Reg<"dr4", 4>;
+def DR5 : X86Reg<"dr5", 5>;
+def DR6 : X86Reg<"dr6", 6>;
+def DR7 : X86Reg<"dr7", 7>;
+
+// Control registers
+def CR0 : X86Reg<"cr0", 0>;
+def CR1 : X86Reg<"cr1", 1>;
+def CR2 : X86Reg<"cr2", 2>;
+def CR3 : X86Reg<"cr3", 3>;
+def CR4 : X86Reg<"cr4", 4>;
+def CR5 : X86Reg<"cr5", 5>;
+def CR6 : X86Reg<"cr6", 6>;
+def CR7 : X86Reg<"cr7", 7>;
+def CR8 : X86Reg<"cr8", 8>;
+def CR9 : X86Reg<"cr9", 9>;
+def CR10 : X86Reg<"cr10", 10>;
+def CR11 : X86Reg<"cr11", 11>;
+def CR12 : X86Reg<"cr12", 12>;
+def CR13 : X86Reg<"cr13", 13>;
+def CR14 : X86Reg<"cr14", 14>;
+def CR15 : X86Reg<"cr15", 15>;
+
+// Pseudo index registers
+def EIZ : X86Reg<"eiz", 4>;
+def RIZ : X86Reg<"riz", 4>;
+
//===----------------------------------------------------------------------===//
// Register Class Definitions... now that we have all of the pieces, define the
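The register definitions above encode two things later stages rely on: sub-register containment (AL and AH live inside AX, AX inside EAX, EAX inside RAX) and a CostPerUse of 1 on the REX-only registers so the allocator prefers the cheaper encodings when either would do. A small standalone C++ sketch of the containment/overlap query, with a hand-written table standing in for the TableGen-generated one (all names here are illustrative, not LLVM's):

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// Hand-written stand-in for the generated sub-register tables: each
// register lists what it directly contains.
static const std::map<std::string, std::vector<std::string>> SubRegs = {
    {"RAX", {"EAX"}}, {"EAX", {"AX"}}, {"AX", {"AL", "AH"}}};

// Collect a register plus everything it transitively contains.
static void collectUnits(const std::string &Reg, std::set<std::string> &Out) {
  if (!Out.insert(Reg).second)
    return;
  auto It = SubRegs.find(Reg);
  if (It == SubRegs.end())
    return;
  for (const std::string &Sub : It->second)
    collectUnits(Sub, Out);
}

// Two registers conflict if they share any unit; this is the property the
// allocator needs so it never assigns RAX while, say, AH is live.
static bool regsOverlap(const std::string &A, const std::string &B) {
  std::set<std::string> UA, UB;
  collectUnits(A, UA);
  collectUnits(B, UB);
  for (const std::string &R : UA)
    if (UB.count(R))
      return true;
  return false;
}

int main() {
  std::cout << std::boolalpha
            << regsOverlap("RAX", "AH") << "\n"  // true: AH is a unit of RAX
            << regsOverlap("AL", "AH") << "\n";  // false: disjoint 8-bit halves
}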
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index 00edcbc7d470..723e50cc1886 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -54,7 +54,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
if (const char *bzeroEntry = V &&
V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
EVT IntPtr = TLI.getPointerTy();
- Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
Entry.Node = Dst;
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 908785296d78..d1ed68028771 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -163,17 +163,6 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
}
-/// getSpecialAddressLatency - For targets where it is beneficial to
-/// backschedule instructions that compute addresses, return a value
-/// indicating the number of scheduling cycles of backscheduling that
-/// should be attempted.
-unsigned X86Subtarget::getSpecialAddressLatency() const {
- // For x86 out-of-order targets, back-schedule address computations so
- // that loads and stores aren't blocked.
- // This value was chosen arbitrarily.
- return 200;
-}
-
void X86Subtarget::AutoDetectSubtargetFeatures() {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLevel;
@@ -313,6 +302,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasBMI2 = true;
ToggleFeature(X86::FeatureBMI2);
}
+ if (IsIntel && ((EBX >> 11) & 0x1)) {
+ HasRTM = true;
+ ToggleFeature(X86::FeatureRTM);
+ }
}
}
}
@@ -341,11 +334,13 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, HasLZCNT(false)
, HasBMI(false)
, HasBMI2(false)
+ , HasRTM(false)
, IsBTMemSlow(false)
, IsUAMemFast(false)
, HasVectorUAMem(false)
, HasCmpxchg16b(false)
, UseLeaForSP(false)
+ , HasSlowDivide(false)
, PostRAScheduler(false)
, stackAlignment(4)
// FIXME: this is a known good value for Yonah. How about others?
@@ -400,6 +395,10 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
}
}
+ // CPUName may have been set by the CPU detection code. Make sure the
+ // new MCSchedModel is used.
+ InitMCProcessorInfo(CPUName, FS);
+
if (X86ProcFamily == IntelAtom)
PostRAScheduler = true;
@@ -416,12 +415,12 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
- // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux and Solaris (both
+ // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
// 32 and 64 bit) and for all 64-bit targets.
if (StackAlignOverride)
stackAlignment = StackAlignOverride;
- else if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() ||
- isTargetSolaris() || In64BitMode)
+ else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
+ In64BitMode)
stackAlignment = 16;
}
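The RTM probe added above is a plain CPUID check: leaf 7, sub-leaf 0, EBX bit 11. A self-contained sketch of the same check, assuming a reasonably recent GCC or Clang on x86 whose <cpuid.h> provides __get_cpuid_count:

#include <cpuid.h>
#include <cstdio>

// True if CPUID leaf 7 reports RTM (EBX bit 11), mirroring the
// AutoDetectSubtargetFeatures change above. __get_cpuid_count returns 0
// when the requested leaf is not supported by the CPU.
static bool cpuHasRTM() {
  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
  if (!__get_cpuid_count(7, 0, &EAX, &EBX, &ECX, &EDX))
    return false;
  return (EBX >> 11) & 0x1;
}

int main() {
  std::printf("RTM supported: %s\n", cpuHasRTM() ? "yes" : "no");
  return 0;
}

Note that the patch also gates the bit on IsIntel; a detector outside the subtarget would do the same by checking the vendor string from leaf 0 first.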
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 6841c5bafa32..8bf4cc77f762 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -118,6 +118,9 @@ protected:
/// HasBMI2 - Processor has BMI2 instructions.
bool HasBMI2;
+ /// HasRTM - Processor has RTM instructions.
+ bool HasRTM;
+
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
@@ -136,6 +139,10 @@ protected:
/// the stack pointer. This is an optimization for Intel Atom processors.
bool UseLeaForSP;
+ /// HasSlowDivide - True if smaller divides are significantly faster than
+ /// full divides and should be used when possible.
+ bool HasSlowDivide;
+
/// PostRAScheduler - True if using post-register-allocation scheduler.
bool PostRAScheduler;
@@ -205,7 +212,8 @@ public:
bool hasAES() const { return HasAES; }
bool hasPCLMUL() const { return HasPCLMUL; }
bool hasFMA() const { return HasFMA; }
- bool hasFMA4() const { return HasFMA4; }
+ // FIXME: Favor FMA when both are enabled. Is this the right thing to do?
+ bool hasFMA4() const { return HasFMA4 && !HasFMA; }
bool hasXOP() const { return HasXOP; }
bool hasMOVBE() const { return HasMOVBE; }
bool hasRDRAND() const { return HasRDRAND; }
@@ -214,11 +222,13 @@ public:
bool hasLZCNT() const { return HasLZCNT; }
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
+ bool hasRTM() const { return HasRTM; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
+ bool hasSlowDivide() const { return HasSlowDivide; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
@@ -231,10 +241,10 @@ public:
bool isTargetSolaris() const {
return TargetTriple.getOS() == Triple::Solaris;
}
-
- // ELF is a reasonably sane default and the only other X86 targets we
- // support are Darwin and Windows. Just use "not those".
- bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+ bool isTargetELF() const {
+ return (TargetTriple.getEnvironment() == Triple::ELF ||
+ TargetTriple.isOSBinFormatELF());
+ }
bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
bool isTargetNaCl() const {
return TargetTriple.getOS() == Triple::NativeClient;
@@ -245,7 +255,10 @@ public:
bool isTargetMingw() const { return TargetTriple.getOS() == Triple::MinGW32; }
bool isTargetCygwin() const { return TargetTriple.getOS() == Triple::Cygwin; }
bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); }
- bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
+ bool isTargetCOFF() const {
+ return (TargetTriple.getEnvironment() != Triple::ELF &&
+ TargetTriple.isOSBinFormatCOFF());
+ }
bool isTargetEnvMacho() const { return TargetTriple.isEnvironmentMachO(); }
bool isTargetWin64() const {
@@ -296,12 +309,6 @@ public:
/// returns null.
const char *getBZeroEntry() const;
- /// getSpecialAddressLatency - For targets where it is beneficial to
- /// backschedule instructions that compute addresses, return a value
- /// indicating the number of scheduling cycles of backscheduling that
- /// should be attempted.
- unsigned getSpecialAddressLatency() const;
-
/// enablePostRAScheduler - run for Atom optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index b7ba568394bc..158f9dc06693 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -36,7 +36,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
: X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false),
- DataLayout(getSubtargetImpl()->isTargetDarwin() ?
+ DL(getSubtargetImpl()->isTargetDarwin() ?
"e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-"
"n8:16:32-S128" :
(getSubtargetImpl()->isTargetCygMing() ||
@@ -48,7 +48,8 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
InstrInfo(*this),
TSInfo(*this),
TLInfo(*this),
- JITInfo(*this) {
+ JITInfo(*this),
+ STTI(&TLInfo), VTTI(&TLInfo) {
}
void X86_64TargetMachine::anchor() { }
@@ -59,12 +60,13 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
: X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
- DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+ DL("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
"n8:16:32:64-S128"),
InstrInfo(*this),
TSInfo(*this),
TLInfo(*this),
- JITInfo(*this) {
+ JITInfo(*this),
+ STTI(&TLInfo), VTTI(&TLInfo){
}
/// X86TargetMachine ctor - Create an X86 target.
@@ -78,7 +80,6 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS, Options.StackAlignmentOverride, is64Bit),
FrameLowering(*this, Subtarget),
- ELFWriterInfo(is64Bit, true),
InstrItins(Subtarget.getInstrItineraryData()){
// Determine the PICStyle based on the target selected.
if (getRelocationModel() == Reloc::Static) {
@@ -113,6 +114,12 @@ UseVZeroUpper("x86-use-vzeroupper",
cl::desc("Minimize AVX to SSE transition penalty"),
cl::init(true));
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt",
+ cl::desc("Enable early if-conversion on X86"));
+
//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
@@ -142,7 +149,7 @@ public:
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
X86PassConfig *PC = new X86PassConfig(this, PM);
- if (Subtarget.hasCMov())
+ if (X86EarlyIfConv && Subtarget.hasCMov())
PC->enablePass(&EarlyIfConverterID);
return PC;
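X86EarlyIfConv above is the usual idiom for a temporary tuning knob: a static cl::opt<bool> that defaults to off and is consulted while the pass pipeline is built, so early if-conversion only runs when -x86-early-ifcvt is given and the subtarget has CMOV. A minimal sketch of the same idiom in a standalone tool linked against LLVM's Support library; the option name and configurePasses are invented for illustration:

#include "llvm/Support/CommandLine.h"
#include <cstdio>

using namespace llvm;

// Off by default; enable with -demo-early-ifcvt, mirroring -x86-early-ifcvt.
static cl::opt<bool>
DemoEarlyIfConv("demo-early-ifcvt",
                cl::desc("Enable the demo early if-conversion pass"),
                cl::init(false));

// Stand-in for createPassConfig(): consult both the flag and the target.
static void configurePasses(bool TargetHasCMov) {
  std::printf("early if-conversion %s\n",
              (DemoEarlyIfConv && TargetHasCMov) ? "enabled" : "disabled");
}

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv, "cl::opt gating sketch\n");
  configurePasses(/*TargetHasCMov=*/true);
  return 0;
}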
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 8e935af67fe3..12311a1abfbd 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -15,7 +15,6 @@
#define X86TARGETMACHINE_H
#include "X86.h"
-#include "X86ELFWriterInfo.h"
#include "X86InstrInfo.h"
#include "X86ISelLowering.h"
#include "X86FrameLowering.h"
@@ -23,8 +22,9 @@
#include "X86SelectionDAGInfo.h"
#include "X86Subtarget.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
@@ -33,7 +33,6 @@ class StringRef;
class X86TargetMachine : public LLVMTargetMachine {
X86Subtarget Subtarget;
X86FrameLowering FrameLowering;
- X86ELFWriterInfo ELFWriterInfo;
InstrItineraryData InstrItins;
public:
@@ -62,9 +61,6 @@ public:
virtual const X86RegisterInfo *getRegisterInfo() const {
return &getInstrInfo()->getRegisterInfo();
}
- virtual const X86ELFWriterInfo *getELFWriterInfo() const {
- return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
- }
virtual const InstrItineraryData *getInstrItineraryData() const {
return &InstrItins;
}
@@ -80,17 +76,19 @@ public:
///
class X86_32TargetMachine : public X86TargetMachine {
virtual void anchor();
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
X86InstrInfo InstrInfo;
X86SelectionDAGInfo TSInfo;
X86TargetLowering TLInfo;
X86JITInfo JITInfo;
+ ScalarTargetTransformImpl STTI;
+ X86VectorTargetTransformInfo VTTI;
public:
X86_32TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const DataLayout *getDataLayout() const { return &DL; }
virtual const X86TargetLowering *getTargetLowering() const {
return &TLInfo;
}
@@ -103,23 +101,31 @@ public:
virtual X86JITInfo *getJITInfo() {
return &JITInfo;
}
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
};
/// X86_64TargetMachine - X86 64-bit target machine.
///
class X86_64TargetMachine : public X86TargetMachine {
virtual void anchor();
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
X86InstrInfo InstrInfo;
X86SelectionDAGInfo TSInfo;
X86TargetLowering TLInfo;
X86JITInfo JITInfo;
+ ScalarTargetTransformImpl STTI;
+ X86VectorTargetTransformInfo VTTI;
public:
X86_64TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL);
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const DataLayout *getDataLayout() const { return &DL; }
virtual const X86TargetLowering *getTargetLowering() const {
return &TLInfo;
}
@@ -132,6 +138,12 @@ public:
virtual X86JITInfo *getJITInfo() {
return &JITInfo;
}
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
};
} // End llvm namespace
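The STTI/VTTI members added to both target machines follow the same shape as the existing getters: a concrete per-target object lives in the subclass and is handed out through a virtual accessor that returns the abstract interface. A standalone sketch of that ownership-plus-accessor pattern with invented types (ScalarInfo and DemoTargetMachine are not LLVM classes):

#include <iostream>

// Invented stand-in for a scalar transform-info interface.
struct ScalarInfo {
  virtual ~ScalarInfo() {}
  virtual unsigned intImmCost(long long Imm) const { return 1; }
};

// A hypothetical target that models large immediates as more expensive.
struct DemoScalarInfo : ScalarInfo {
  unsigned intImmCost(long long Imm) const override {
    return (Imm > 0xFFFF || Imm < -0x10000) ? 2 : 1;
  }
};

// The "target machine" owns the concrete object and exposes it through a
// virtual getter, like getScalarTargetTransformInfo() above.
class DemoTargetMachine {
  DemoScalarInfo STTI;
public:
  virtual ~DemoTargetMachine() {}
  virtual const ScalarInfo *getScalarInfo() const { return &STTI; }
};

int main() {
  DemoTargetMachine TM;
  std::cout << TM.getScalarInfo()->intImmCost(1 << 20) << "\n"; // prints 2
}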
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 80b75dc5f992..c4a58874a414 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -42,7 +42,6 @@ namespace {
private:
const TargetInstrInfo *TII; // Machine instruction info.
- MachineBasicBlock *MBB; // Current basic block
// Any YMM register live-in to this function?
bool FnHasLiveInYmm;
@@ -84,7 +83,7 @@ namespace {
// 2) All states must be clean for the result to be clean
// 3) If none above and one unknown, the result state is also unknown
//
- unsigned computeState(unsigned PrevState, unsigned CurState) {
+ static unsigned computeState(unsigned PrevState, unsigned CurState) {
if (PrevState == ST_INIT)
return CurState;
@@ -122,7 +121,7 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
}
static bool hasYmmReg(MachineInstr *MI) {
- for (int i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
@@ -148,7 +147,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *RC = &X86::VR256RegClass;
for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end();
i != e; i++) {
- if (MRI.isPhysRegUsed(*i)) {
+ if (!MRI.reg_nodbg_empty(*i)) {
YMMUsed = true;
break;
}
@@ -189,7 +188,6 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
MachineBasicBlock &BB) {
bool Changed = false;
unsigned BBNum = BB.getNumber();
- MBB = &BB;
// Don't process already solved BBs
if (BBSolved[BBNum])
@@ -207,7 +205,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
// The entry MBB for the function may set the initial state to dirty if
// the function receives any YMM incoming arguments
- if (MBB == MF.begin()) {
+ if (&BB == MF.begin()) {
EntryState = ST_CLEAN;
if (FnHasLiveInYmm)
EntryState = ST_DIRTY;
@@ -253,7 +251,7 @@ bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
// When unknown, only compute the information within the block to have
// it available in the exit if possible, but don't change the block.
if (EntryState != ST_UNKNOWN) {
- BuildMI(*MBB, I, dl, TII->get(X86::VZEROUPPER));
+ BuildMI(BB, I, dl, TII->get(X86::VZEROUPPER));
++NumVZU;
}
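processBasicBlock threads a small per-block state (whether the upper YMM halves are dirty) across the CFG, and computeState, now static since it no longer touches the removed MBB member, merges predecessor exit states. A standalone sketch of a merge that follows the rules quoted in the hunk; the first rule sits above the visible context, so treating "any dirty input makes the result dirty" as that rule is an assumption, not a quote of the pass:

#include <cassert>

// Block states as in the vzeroupper pass, re-declared for this sketch only.
enum State { ST_INIT, ST_CLEAN, ST_DIRTY, ST_UNKNOWN };

// Merge one predecessor's exit state into the accumulated entry state:
//  - starting from ST_INIT, adopt the incoming state;
//  - any dirty input taints the result (assumed rule, see above);
//  - the result is clean only if every input seen so far is clean;
//  - otherwise some input is unknown, so the result is unknown.
static State mergeState(State Prev, State Cur) {
  if (Prev == ST_INIT)
    return Cur;
  if (Prev == ST_DIRTY || Cur == ST_DIRTY)
    return ST_DIRTY;
  if (Prev == ST_CLEAN && Cur == ST_CLEAN)
    return ST_CLEAN;
  return ST_UNKNOWN;
}

int main() {
  assert(mergeState(ST_INIT, ST_DIRTY) == ST_DIRTY);
  assert(mergeState(ST_CLEAN, ST_CLEAN) == ST_CLEAN);
  assert(mergeState(ST_CLEAN, ST_UNKNOWN) == ST_UNKNOWN);
  assert(mergeState(ST_UNKNOWN, ST_DIRTY) == ST_DIRTY);
  return 0;
}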
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index c76866f47bed..caae56227214 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -31,7 +31,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
@@ -112,7 +112,7 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
EmitSpecialLLVMGlobal(GV))
return;
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index a4e56472babc..e18d97384d3d 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -23,7 +23,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Support/ErrorHandling.h"
@@ -98,12 +98,13 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF) const {
DebugLoc dl = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
bool FP = hasFP(MF);
- bool Nested = MF.getFunction()->
- getAttributes().hasAttrSomewhere(Attribute::Nest);
+ const AttrListPtr &PAL = MF.getFunction()->getAttributes();
- if (Nested) {
- loadFromStack(MBB, MBBI, XCore::R11, 0, dl, TII);
- }
+ for (unsigned I = 0, E = PAL.getNumAttrs(); I != E; ++I)
+ if (PAL.getAttributesAtIndex(I).hasAttribute(Attributes::Nest)) {
+ loadFromStack(MBB, MBBI, XCore::R11, 0, dl, TII);
+ break;
+ }
// Work out frame sizes.
int FrameSize = MFI->getStackSize();
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 8643ffc19d09..9e7816e21f80 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -285,7 +285,7 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
llvm_unreachable(0);
}
SDValue base = getGlobalAddressWrapper(GA, GV, DAG);
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
unsigned Size = TD->getTypeAllocSize(Ty);
SDValue offset = DAG.getNode(ISD::MUL, dl, MVT::i32, BuildGetId(DAG, dl),
DAG.getConstant(Size, MVT::i32));
@@ -298,7 +298,7 @@ LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
DebugLoc DL = Op.getDebugLoc();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), /*isTarget=*/true);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
}
@@ -405,7 +405,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (allowsUnalignedMemoryAccesses(LD->getMemoryVT()))
return SDValue();
- unsigned ABIAlignment = getTargetData()->
+ unsigned ABIAlignment = getDataLayout()->
getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
// Leave aligned load alone.
if (LD->getAlignment() >= ABIAlignment)
@@ -477,7 +477,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
}
// Lower to a call to __misaligned_load(BasePtr).
- Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -507,7 +507,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
return SDValue();
}
- unsigned ABIAlignment = getTargetData()->
+ unsigned ABIAlignment = getDataLayout()->
getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
// Leave aligned store alone.
if (ST->getAlignment() >= ABIAlignment) {
@@ -536,7 +536,7 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
}
// Lower to a call to __misaligned_store(BasePtr, Value).
- Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+ Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
TargetLowering::ArgListTy Args;
TargetLowering::ArgListEntry Entry;
@@ -1499,7 +1499,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
if (StoreBits % 8) {
break;
}
- unsigned ABIAlignment = getTargetData()->getABITypeAlignment(
+ unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(
ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
unsigned Alignment = ST->getAlignment();
if (Alignment >= ABIAlignment) {
@@ -1570,7 +1570,7 @@ XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM,
if (Ty->getTypeID() == Type::VoidTyID)
return AM.Scale == 0 && isImmUs(AM.BaseOffs) && isImmUs4(AM.BaseOffs);
- const TargetData *TD = TM.getTargetData();
+ const DataLayout *TD = TM.getDataLayout();
unsigned Size = TD->getTypeAllocSize(Ty);
if (AM.BaseGV) {
return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
index ae646a248524..3e7666bdb936 100644
--- a/lib/Target/XCore/XCoreInstrInfo.td
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -33,7 +33,7 @@ def XCoreBranchLink : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
SDNPVariadic]>;
def XCoreRetsp : SDNode<"XCoreISD::RETSP", SDTBrind,
- [SDNPHasChain, SDNPOptInGlue]>;
+ [SDNPHasChain, SDNPOptInGlue, SDNPMayLoad]>;
def SDT_XCoreBR_JT : SDTypeProfile<0, 2,
[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
@@ -58,7 +58,7 @@ def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
def SDT_XCoreStwsp : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
def XCoreStwsp : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
- [SDNPHasChain]>;
+ [SDNPHasChain, SDNPMayStore]>;
// These are target-independent nodes, but have target-specific formats.
def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index cdd0a0893b98..be5855abcd0b 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -176,7 +176,7 @@ XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
#ifndef NDEBUG
DEBUG(errs() << "\nFunction : "
- << MF.getFunction()->getName() << "\n");
+ << MF.getName() << "\n");
DEBUG(errs() << "<--------->\n");
DEBUG(MI.print(errs()));
DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n");
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 11ec86b0fa8c..d5a932c5189d 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -27,12 +27,12 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT,
CodeGenOpt::Level OL)
: LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
Subtarget(TT, CPU, FS),
- DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
+ DL("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
"i16:16:32-i32:32:32-i64:32:32-n32"),
InstrInfo(),
FrameLowering(Subtarget),
TLInfo(*this),
- TSInfo(*this) {
+ TSInfo(*this), STTI(&TLInfo), VTTI(&TLInfo) {
}
namespace {
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index 254668142aaf..c60c6a37f95b 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -20,17 +20,20 @@
#include "XCoreISelLowering.h"
#include "XCoreSelectionDAGInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetTransformImpl.h"
+#include "llvm/DataLayout.h"
namespace llvm {
class XCoreTargetMachine : public LLVMTargetMachine {
XCoreSubtarget Subtarget;
- const TargetData DataLayout; // Calculates type size & alignment
+ const DataLayout DL; // Calculates type size & alignment
XCoreInstrInfo InstrInfo;
XCoreFrameLowering FrameLowering;
XCoreTargetLowering TLInfo;
XCoreSelectionDAGInfo TSInfo;
+ ScalarTargetTransformImpl STTI;
+ VectorTargetTransformImpl VTTI;
public:
XCoreTargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -53,7 +56,13 @@ public:
virtual const TargetRegisterInfo *getRegisterInfo() const {
return &InstrInfo.getRegisterInfo();
}
- virtual const TargetData *getTargetData() const { return &DataLayout; }
+ virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
+ return &STTI;
+ }
+ virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
+ return &VTTI;
+ }
+ virtual const DataLayout *getDataLayout() const { return &DL; }
// Pass Pipeline Configuration
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index b94dd69deb75..be48b2063fb6 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -153,7 +153,8 @@ CallGraphNode *ArgPromotion::PromoteArguments(CallGraphNode *CGN) {
SmallPtrSet<Argument*, 8> ArgsToPromote;
SmallPtrSet<Argument*, 8> ByValArgsToTransform;
for (unsigned i = 0; i != PointerArgs.size(); ++i) {
- bool isByVal = F->paramHasAttr(PointerArgs[i].second+1, Attribute::ByVal);
+ bool isByVal=F->getParamAttributes(PointerArgs[i].second+1).
+ hasAttribute(Attributes::ByVal);
Argument *PtrArg = PointerArgs[i].first;
Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
@@ -517,8 +518,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
const AttrListPtr &PAL = F->getAttributes();
// Add any return attributes.
- if (Attributes attrs = PAL.getRetAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+ Attributes attrs = PAL.getRetAttributes();
+ if (attrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ attrs));
// First, determine the new argument list
unsigned ArgIndex = 1;
@@ -534,7 +537,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
} else if (!ArgsToPromote.count(I)) {
// Unchanged argument
Params.push_back(I->getType());
- if (Attributes attrs = PAL.getParamAttributes(ArgIndex))
+ Attributes attrs = PAL.getParamAttributes(ArgIndex);
+ if (attrs.hasAttributes())
AttributesVec.push_back(AttributeWithIndex::get(Params.size(), attrs));
} else if (I->use_empty()) {
// Dead argument (which are always marked as promotable)
@@ -587,19 +591,13 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
}
// Add any function attributes.
- if (Attributes attrs = PAL.getFnAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+ attrs = PAL.getFnAttributes();
+ if (attrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ attrs));
Type *RetTy = FTy->getReturnType();
- // Work around LLVM bug PR56: the CWriter cannot emit varargs functions which
- // have zero fixed arguments.
- bool ExtraArgHack = false;
- if (Params.empty() && FTy->isVarArg()) {
- ExtraArgHack = true;
- Params.push_back(Type::getInt32Ty(F->getContext()));
- }
-
// Construct the new function type using the new arguments.
FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
@@ -613,7 +611,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
// Recompute the parameter attributes list based on the new arguments for
// the function.
- NF->setAttributes(AttrListPtr::get(AttributesVec));
+ NF->setAttributes(AttrListPtr::get(F->getContext(), AttributesVec));
AttributesVec.clear();
F->getParent()->getFunctionList().insert(F, NF);
@@ -641,8 +639,10 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
const AttrListPtr &CallPAL = CS.getAttributes();
// Add any return attributes.
- if (Attributes attrs = CallPAL.getRetAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(0, attrs));
+ Attributes attrs = CallPAL.getRetAttributes();
+ if (attrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ attrs));
// Loop over the operands, inserting GEP and loads in the caller as
// appropriate.
@@ -653,7 +653,8 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
if (!ArgsToPromote.count(I) && !ByValArgsToTransform.count(I)) {
Args.push_back(*AI); // Unmodified argument
- if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex))
+ Attributes Attrs = CallPAL.getParamAttributes(ArgIndex);
+ if (Attrs.hasAttributes())
AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
} else if (ByValArgsToTransform.count(I)) {
@@ -711,30 +712,32 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
}
}
- if (ExtraArgHack)
- Args.push_back(Constant::getNullValue(Type::getInt32Ty(F->getContext())));
-
// Push any varargs arguments on the list.
for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
Args.push_back(*AI);
- if (Attributes Attrs = CallPAL.getParamAttributes(ArgIndex))
+ Attributes Attrs = CallPAL.getParamAttributes(ArgIndex);
+ if (Attrs.hasAttributes())
AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
}
// Add any function attributes.
- if (Attributes attrs = CallPAL.getFnAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(~0, attrs));
+ attrs = CallPAL.getFnAttributes();
+ if (attrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ attrs));
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
Args, "", Call);
cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
- cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec));
+ cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(II->getContext(),
+ AttributesVec));
} else {
New = CallInst::Create(NF, Args, "", Call);
cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
- cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec));
+ cast<CallInst>(New)->setAttributes(AttrListPtr::get(New->getContext(),
+ AttributesVec));
if (cast<CallInst>(Call)->isTailCall())
cast<CallInst>(New)->setTailCall();
}
@@ -870,16 +873,9 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
}
// Increment I2 past all of the arguments added for this promoted pointer.
- for (unsigned i = 0, e = ArgIndices.size(); i != e; ++i)
- ++I2;
+ std::advance(I2, ArgIndices.size());
}
- // Notify the alias analysis implementation that we inserted a new argument.
- if (ExtraArgHack)
- AA.copyValue(Constant::getNullValue(Type::getInt32Ty(F->getContext())),
- NF->arg_begin());
-
-
// Tell the alias analysis that the old function is about to disappear.
AA.replaceWithNewValue(F, NF);
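Most of this hunk rebuilds the promoted function's attribute list: return attributes go to the dedicated return index, each surviving parameter's attributes are re-attached at its new (dense) position, function attributes go to the function index, and the old PR56 ExtraArgHack disappears. A standalone sketch of that re-indexing with invented containers (this is not LLVM's Attributes/AttrListPtr API):

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Invented stand-ins: an attribute set is a list of names, and an attribute
// list maps a slot index to a set. Index 0 is the return value, ~0U is the
// function itself, 1..N are parameters, as in AttrListPtr.
typedef std::vector<std::string> AttrSet;
typedef std::map<unsigned, AttrSet> AttrList;

static const unsigned ReturnIndex = 0U;
static const unsigned FunctionIndex = ~0U;

// Rebuild an attribute list after dropping some parameters: Keep[i] says
// whether original parameter i (0-based) survives; survivors are renumbered
// densely, mirroring what DoPromotion does above.
static AttrList rebuild(const AttrList &Old, const std::vector<bool> &Keep) {
  AttrList New;
  auto CopyIf = [&](unsigned From, unsigned To) {
    auto It = Old.find(From);
    if (It != Old.end() && !It->second.empty())
      New[To] = It->second;
  };
  CopyIf(ReturnIndex, ReturnIndex);
  unsigned NewPos = 1;
  for (unsigned i = 0; i < Keep.size(); ++i)
    if (Keep[i])
      CopyIf(i + 1, NewPos++);
  CopyIf(FunctionIndex, FunctionIndex);
  return New;
}

int main() {
  AttrList Old = {{ReturnIndex, {"zeroext"}},
                  {1, {"byval"}},
                  {2, {"nocapture"}},
                  {FunctionIndex, {"nounwind"}}};
  // Parameter 1 is promoted away; parameter 2 survives and becomes 1.
  AttrList New = rebuild(Old, {false, true});
  for (const auto &Slot : New)
    std::cout << Slot.first << ": " << Slot.second[0] << "\n";
  // Prints 0: zeroext, 1: nocapture, 4294967295: nounwind
}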
diff --git a/lib/Transforms/IPO/BarrierNoopPass.cpp b/lib/Transforms/IPO/BarrierNoopPass.cpp
new file mode 100644
index 000000000000..2e32240621f9
--- /dev/null
+++ b/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -0,0 +1,47 @@
+//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: DO NOT USE THIS IF AVOIDABLE
+//
+// This pass is a nonce pass intended to allow manipulation of the implicitly
+// nesting pass manager. For example, it can be used to cause a CGSCC pass
+// manager to be closed prior to running a new collection of function passes.
+//
+// FIXME: This is a huge HACK. This should be removed when the pass manager's
+// nesting is made explicit instead of implicit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+/// \brief A nonce module pass used to place a barrier in a pass manager.
+///
+/// There is no mechanism for ending a CGSCC pass manager once one is started.
+/// This prevents extension points from having clear deterministic ordering
+/// when they are phrased as non-module passes.
+class BarrierNoop : public ModulePass {
+public:
+ static char ID; // Pass identification.
+
+ BarrierNoop() : ModulePass(ID) {
+ initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) { return false; }
+};
+}
+
+ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
+
+char BarrierNoop::ID = 0;
+INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
+ false, false)
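The pass body is intentionally empty; its only effect is positional, forcing the implicitly created CGSCC pass manager to close before whatever is added next. A minimal usage sketch against the legacy PassManager, assuming this era's header layout (llvm/Module.h rather than llvm/IR/Module.h) and using factory functions from llvm/Transforms:

#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Scalar.h"

using namespace llvm;

int main() {
  LLVMContext Context;
  Module M("barrier-demo", Context);

  PassManager PM;
  PM.add(createFunctionInliningPass());      // CGSCC pass: opens a CGSCC manager
  PM.add(createBarrierNoopPass());           // module no-op: closes that manager
  PM.add(createInstructionCombiningPass());  // runs after the CGSCC batch
  PM.run(M);
  return 0;
}

Without the barrier, the function pass following the inliner could be pulled into the same implicitly nested manager, which is exactly the ordering problem the comment describes.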
diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt
index 3f6b1de614db..90c1c33e6dca 100644
--- a/lib/Transforms/IPO/CMakeLists.txt
+++ b/lib/Transforms/IPO/CMakeLists.txt
@@ -1,5 +1,6 @@
add_llvm_library(LLVMipo
ArgumentPromotion.cpp
+ BarrierNoopPass.cpp
ConstantMerge.cpp
DeadArgumentElimination.cpp
ExtractGV.cpp
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index d8fae8a4b2b9..e2f012657fdd 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -23,7 +23,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PointerIntPair.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -50,7 +50,7 @@ namespace {
// alignment to a concrete value.
unsigned getAlignment(GlobalVariable *GV) const;
- const TargetData *TD;
+ const DataLayout *TD;
};
}
@@ -98,7 +98,7 @@ unsigned ConstantMerge::getAlignment(GlobalVariable *GV) const {
}
bool ConstantMerge::runOnModule(Module &M) {
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
// Find all the globals that are marked "used". These cannot be merged.
SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
@@ -107,7 +107,7 @@ bool ConstantMerge::runOnModule(Module &M) {
// Map unique <constants, has-unknown-alignment> pairs to globals. We don't
// want to merge globals of unknown alignment with those of explicit
- // alignment. If we have TargetData, we always know the alignment.
+ // alignment. If we have DataLayout, we always know the alignment.
DenseMap<PointerIntPair<Constant*, 1, bool>, GlobalVariable*> CMap;
// Replacements - This vector contains a list of replacements to perform.
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index fd23a935b905..4cfd0b235ab8 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -21,7 +21,9 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/CallingConv.h"
#include "llvm/Constant.h"
+#include "llvm/DebugInfo.h"
#include "llvm/DerivedTypes.h"
+#include "llvm/DIBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/LLVMContext.h"
@@ -30,6 +32,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
@@ -121,6 +124,15 @@ namespace {
typedef SmallVector<RetOrArg, 5> UseVector;
+ // Map each LLVM function to its corresponding debug info metadata. If
+ // the function is replaced with another one, we patch the pointer to the
+ // LLVM function in the metadata.
+ // Since code generation for the module is finished (and DIBuilder is
+ // finalized), we assume the subprogram descriptors won't change, and they
+ // are only kept in the map for a short time anyway.
+ typedef DenseMap<Function*, DISubprogram> FunctionDIMap;
+ FunctionDIMap FunctionDIs;
+
protected:
// DAH uses this to specify a different ID.
explicit DAE(char &ID) : ModulePass(ID) {}
@@ -141,6 +153,7 @@ namespace {
unsigned RetValNum = 0);
Liveness SurveyUses(const Value *V, UseVector &MaybeLiveUses);
+ void CollectFunctionDIs(Module &M);
void SurveyFunction(const Function &F);
void MarkValue(const RetOrArg &RA, Liveness L,
const UseVector &MaybeLiveUses);
@@ -180,6 +193,33 @@ INITIALIZE_PASS(DAH, "deadarghaX0r",
ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
+/// CollectFunctionDIs - Map each function in the module to its debug info
+/// descriptor.
+void DAE::CollectFunctionDIs(Module &M) {
+ FunctionDIs.clear();
+
+ for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end(); I != E; ++I) {
+ NamedMDNode &NMD = *I;
+ for (unsigned MDIndex = 0, MDNum = NMD.getNumOperands();
+ MDIndex < MDNum; ++MDIndex) {
+ MDNode *Node = NMD.getOperand(MDIndex);
+ if (!DIDescriptor(Node).isCompileUnit())
+ continue;
+ DICompileUnit CU(Node);
+ const DIArray &SPs = CU.getSubprograms();
+ for (unsigned SPIndex = 0, SPNum = SPs.getNumElements();
+ SPIndex < SPNum; ++SPIndex) {
+ DISubprogram SP(SPs.getElement(SPIndex));
+ if (!SP.Verify())
+ continue;
+ if (Function *F = SP.getFunction())
+ FunctionDIs[F] = SP;
+ }
+ }
+ }
+}
+
+/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
/// llvm.vastart is never called, the varargs list is dead for the function.
bool DAE::DeleteDeadVarargs(Function &Fn) {
@@ -236,9 +276,11 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
SmallVector<AttributeWithIndex, 8> AttributesVec;
for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i)
AttributesVec.push_back(PAL.getSlot(i));
- if (Attributes FnAttrs = PAL.getFnAttributes())
- AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
- PAL = AttrListPtr::get(AttributesVec);
+ Attributes FnAttrs = PAL.getFnAttributes();
+ if (FnAttrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ FnAttrs));
+ PAL = AttrListPtr::get(Fn.getContext(), AttributesVec);
}
Instruction *New;
@@ -284,6 +326,11 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
I2->takeName(I);
}
+ // Patch the pointer to LLVM function in debug info descriptor.
+ FunctionDIMap::iterator DI = FunctionDIs.find(&Fn);
+ if (DI != FunctionDIs.end())
+ DI->second.replaceFunction(NF);
+
// Finally, nuke the old function.
Fn.eraseFromParent();
return true;
@@ -717,13 +764,17 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// here. Currently, this should not be possible, but special handling might be
// required when new return value attributes are added.
if (NRetTy->isVoidTy())
- RAttrs &= ~Attribute::typeIncompatible(NRetTy);
+ RAttrs =
+ Attributes::get(NRetTy->getContext(), AttrBuilder(RAttrs).
+ removeAttributes(Attributes::typeIncompatible(NRetTy)));
else
- assert((RAttrs & Attribute::typeIncompatible(NRetTy)) == 0
- && "Return attributes no longer compatible?");
+ assert(!AttrBuilder(RAttrs).
+ hasAttributes(Attributes::typeIncompatible(NRetTy)) &&
+ "Return attributes no longer compatible?");
- if (RAttrs)
- AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs));
+ if (RAttrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ RAttrs));
// Remember which arguments are still alive.
SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
@@ -740,7 +791,8 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
// Get the original parameter attributes (skipping the first one, that is
// for the return value).
- if (Attributes Attrs = PAL.getParamAttributes(i + 1))
+ Attributes Attrs = PAL.getParamAttributes(i + 1);
+ if (Attrs.hasAttributes())
AttributesVec.push_back(AttributeWithIndex::get(Params.size(), Attrs));
} else {
++NumArgumentsEliminated;
@@ -749,11 +801,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
}
}
- if (FnAttrs != Attribute::None)
- AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+ if (FnAttrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ FnAttrs));
// Reconstruct the AttributesList based on the vector we constructed.
- AttrListPtr NewPAL = AttrListPtr::get(AttributesVec);
+ AttrListPtr NewPAL = AttrListPtr::get(F->getContext(), AttributesVec);
// Create the new function type based on the recomputed parameters.
FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -786,9 +839,12 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
Attributes RAttrs = CallPAL.getRetAttributes();
Attributes FnAttrs = CallPAL.getFnAttributes();
// Adjust in case the function was changed to return void.
- RAttrs &= ~Attribute::typeIncompatible(NF->getReturnType());
- if (RAttrs)
- AttributesVec.push_back(AttributeWithIndex::get(0, RAttrs));
+ RAttrs =
+ Attributes::get(NF->getContext(), AttrBuilder(RAttrs).
+ removeAttributes(Attributes::typeIncompatible(NF->getReturnType())));
+ if (RAttrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ RAttrs));
// Declare these outside of the loops, so we can reuse them for the second
// loop, which loops the varargs.
@@ -800,22 +856,25 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
if (ArgAlive[i]) {
Args.push_back(*I);
// Get original parameter attributes, but skip return attributes.
- if (Attributes Attrs = CallPAL.getParamAttributes(i + 1))
+ Attributes Attrs = CallPAL.getParamAttributes(i + 1);
+ if (Attrs.hasAttributes())
AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
}
// Push any varargs arguments on the list. Don't forget their attributes.
for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
Args.push_back(*I);
- if (Attributes Attrs = CallPAL.getParamAttributes(i + 1))
+ Attributes Attrs = CallPAL.getParamAttributes(i + 1);
+ if (Attrs.hasAttributes())
AttributesVec.push_back(AttributeWithIndex::get(Args.size(), Attrs));
}
- if (FnAttrs != Attribute::None)
- AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+ if (FnAttrs.hasAttributes())
+ AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ FnAttrs));
// Reconstruct the AttributesList based on the vector we constructed.
- AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec);
+ AttrListPtr NewCallPAL = AttrListPtr::get(F->getContext(), AttributesVec);
Instruction *New;
if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
@@ -952,6 +1011,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
BB->getInstList().erase(RI);
}
+ // Patch the pointer to LLVM function in debug info descriptor.
+ FunctionDIMap::iterator DI = FunctionDIs.find(F);
+ if (DI != FunctionDIs.end())
+ DI->second.replaceFunction(NF);
+
// Now that the old function is dead, delete it.
F->eraseFromParent();
@@ -961,6 +1025,9 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
bool DAE::runOnModule(Module &M) {
bool Changed = false;
+ // Collect debug info descriptors for functions.
+ CollectFunctionDIs(M);
+
// First pass: Do a simple check to see if any functions can have their "..."
// removed. We can do this if they never call va_start. This loop cannot be
// fused with the next loop, because deleting a function invalidates
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 4c7f0ed23640..6716deb9e47b 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -51,32 +51,75 @@ namespace {
// Visit the GlobalVariables.
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
- if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
- I->setInitializer(0);
- } else {
+ bool Delete =
+ deleteStuff == (bool)Named.count(I) && !I->isDeclaration();
+ if (!Delete) {
if (I->hasAvailableExternallyLinkage())
continue;
if (I->getName() == "llvm.global_ctors")
continue;
}
- if (I->hasLocalLinkage())
+ bool Local = I->hasLocalLinkage();
+ if (Local)
I->setVisibility(GlobalValue::HiddenVisibility);
- I->setLinkage(GlobalValue::ExternalLinkage);
+
+ if (Local || Delete)
+ I->setLinkage(GlobalValue::ExternalLinkage);
+
+ if (Delete)
+ I->setInitializer(0);
}
// Visit the Functions.
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
- if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) {
- I->deleteBody();
- } else {
+ bool Delete =
+ deleteStuff == (bool)Named.count(I) && !I->isDeclaration();
+ if (!Delete) {
if (I->hasAvailableExternallyLinkage())
continue;
}
- if (I->hasLocalLinkage())
+ bool Local = I->hasLocalLinkage();
+ if (Local)
I->setVisibility(GlobalValue::HiddenVisibility);
- I->setLinkage(GlobalValue::ExternalLinkage);
+
+ if (Local || Delete)
+ I->setLinkage(GlobalValue::ExternalLinkage);
+
+ if (Delete)
+ I->deleteBody();
+ }
+
+ // Visit the Aliases.
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ Module::alias_iterator CurI = I;
+ ++I;
+
+ if (CurI->hasLocalLinkage()) {
+ CurI->setVisibility(GlobalValue::HiddenVisibility);
+ CurI->setLinkage(GlobalValue::ExternalLinkage);
+ }
+
+ if (deleteStuff == (bool)Named.count(CurI)) {
+ Type *Ty = CurI->getType()->getElementType();
+
+ CurI->removeFromParent();
+ llvm::Value *Declaration;
+ if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
+ Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
+ CurI->getName(), &M);
+
+ } else {
+ Declaration =
+ new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
+ 0, CurI->getName());
+
+ }
+ CurI->replaceAllUsesWith(Declaration);
+ delete CurI;
+ }
}
return true;
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index f3f622843340..18409f77b3fa 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -28,9 +28,9 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/UniqueVector.h"
#include "llvm/Support/InstIterator.h"
using namespace llvm;
@@ -212,10 +212,17 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
MadeChange = true;
// Clear out any existing attributes.
- F->removeAttribute(~0, Attribute::ReadOnly | Attribute::ReadNone);
+ AttrBuilder B;
+ B.addAttribute(Attributes::ReadOnly)
+ .addAttribute(Attributes::ReadNone);
+ F->removeAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(F->getContext(), B));
// Add in the new attribute.
- F->addAttribute(~0, ReadsMemory? Attribute::ReadOnly : Attribute::ReadNone);
+ B.clear();
+ B.addAttribute(ReadsMemory ? Attributes::ReadOnly : Attributes::ReadNone);
+ F->addAttribute(AttrListPtr::FunctionIndex,
+ Attributes::get(F->getContext(), B));
if (ReadsMemory)
++NumReadOnly;
@@ -276,8 +283,6 @@ namespace {
void tooManyUses() { Captured = true; }
- bool shouldExplore(Use *U) { return true; }
-
bool captured(Use *U) {
CallSite CS(U->getUser());
if (!CS.getInstruction()) { Captured = true; return true; }
@@ -352,6 +357,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
ArgumentGraph AG;
+ AttrBuilder B;
+ B.addAttribute(Attributes::NoCapture);
+
// Check each function in turn, determining which pointer arguments are not
// captured.
for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
@@ -373,7 +381,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end();
A != E; ++A) {
if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
- A->addAttr(Attribute::NoCapture);
+ A->addAttr(Attributes::get(F->getContext(), B));
++NumNoCapture;
Changed = true;
}
@@ -388,7 +396,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
if (!Tracker.Captured) {
if (Tracker.Uses.empty()) {
// If it's trivially not captured, mark it nocapture now.
- A->addAttr(Attribute::NoCapture);
+ A->addAttr(Attributes::get(F->getContext(), B));
++NumNoCapture;
Changed = true;
} else {
@@ -421,7 +429,9 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
// eg. "void f(int* x) { if (...) f(x); }"
if (ArgumentSCC[0]->Uses.size() == 1 &&
ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
- ArgumentSCC[0]->Definition->addAttr(Attribute::NoCapture);
+ ArgumentSCC[0]->
+ Definition->
+ addAttr(Attributes::get(ArgumentSCC[0]->Definition->getContext(), B));
++NumNoCapture;
Changed = true;
}
@@ -463,7 +473,7 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
Argument *A = ArgumentSCC[i]->Definition;
- A->addAttr(Attribute::NoCapture);
+ A->addAttr(Attributes::get(A->getContext(), B));
++NumNoCapture;
Changed = true;
}
@@ -476,13 +486,13 @@ bool FunctionAttrs::AddNoCaptureAttrs(const CallGraphSCC &SCC) {
/// or a pointer that doesn't alias any other pointer visible to the caller.
bool FunctionAttrs::IsFunctionMallocLike(Function *F,
SmallPtrSet<Function*, 8> &SCCNodes) const {
- UniqueVector<Value *> FlowsToReturn;
+ SmallSetVector<Value *, 8> FlowsToReturn;
for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I)
if (ReturnInst *Ret = dyn_cast<ReturnInst>(I->getTerminator()))
FlowsToReturn.insert(Ret->getReturnValue());
for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
- Value *RetVal = FlowsToReturn[i+1]; // UniqueVector[0] is reserved.
+ Value *RetVal = FlowsToReturn[i];
if (Constant *C = dyn_cast<Constant>(RetVal)) {
if (!C->isNullValue() && !isa<UndefValue>(C))
@@ -520,7 +530,7 @@ bool FunctionAttrs::IsFunctionMallocLike(Function *F,
case Instruction::Call:
case Instruction::Invoke: {
CallSite CS(RVI);
- if (CS.paramHasAttr(0, Attribute::NoAlias))
+ if (CS.paramHasAttr(0, Attributes::NoAlias))
break;
if (CS.getCalledFunction() &&
SCCNodes.count(CS.getCalledFunction()))
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 6d950d20240a..591278fa62c8 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -25,7 +25,7 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
@@ -83,7 +83,7 @@ namespace {
const GlobalStatus &GS);
bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn);
- TargetData *TD;
+ DataLayout *TD;
TargetLibraryInfo *TLI;
};
}
@@ -225,6 +225,7 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
// Don't hack on volatile stores.
if (SI->isVolatile()) return true;
+
GS.Ordering = StrongerOrdering(GS.Ordering, SI->getOrdering());
// If this is a direct store to the global (i.e., the global is a scalar
@@ -234,6 +235,14 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(
SI->getOperand(1))) {
Value *StoredVal = SI->getOperand(0);
+
+ if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+ if (C->isThreadDependent()) {
+ // The stored value changes between threads; don't track it.
+ return true;
+ }
+ }
+
if (StoredVal == GV->getInitializer()) {
if (GS.StoredType < GlobalStatus::isInitializerStored)
GS.StoredType = GlobalStatus::isInitializerStored;
@@ -346,7 +355,7 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) {
/// Given a value that is stored to a global but never read, determine whether
/// it's safe to remove the store and the chain of computation that feeds the
/// store.
-static bool IsSafeComputationToRemove(Value *V) {
+static bool IsSafeComputationToRemove(Value *V, const TargetLibraryInfo *TLI) {
do {
if (isa<Constant>(V))
return true;
@@ -355,7 +364,7 @@ static bool IsSafeComputationToRemove(Value *V) {
if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
isa<GlobalValue>(V))
return false;
- if (isAllocationFn(V))
+ if (isAllocationFn(V, TLI))
return true;
Instruction *I = cast<Instruction>(V);
@@ -376,7 +385,8 @@ static bool IsSafeComputationToRemove(Value *V) {
/// of the global and clean up any that obviously don't assign the global a
/// value that isn't dynamically allocated.
///
-static bool CleanupPointerRootUsers(GlobalVariable *GV) {
+static bool CleanupPointerRootUsers(GlobalVariable *GV,
+ const TargetLibraryInfo *TLI) {
// A brief explanation of leak checkers. The goal is to find bugs where
// pointers are forgotten, causing an accumulating growth in memory
// usage over time. The common strategy for leak checkers is to whitelist the
@@ -432,18 +442,18 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV) {
C->destroyConstant();
// This could have invalidated UI, start over from scratch.
Dead.clear();
- CleanupPointerRootUsers(GV);
+ CleanupPointerRootUsers(GV, TLI);
return true;
}
}
}
for (int i = 0, e = Dead.size(); i != e; ++i) {
- if (IsSafeComputationToRemove(Dead[i].first)) {
+ if (IsSafeComputationToRemove(Dead[i].first, TLI)) {
Dead[i].second->eraseFromParent();
Instruction *I = Dead[i].first;
do {
- if (isAllocationFn(I))
+ if (isAllocationFn(I, TLI))
break;
Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
if (!J)
@@ -463,7 +473,7 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV) {
/// quick scan over the use list to clean up the easy and obvious cruft. This
/// returns true if it made a change.
static bool CleanupConstantGlobalUsers(Value *V, Constant *Init,
- TargetData *TD, TargetLibraryInfo *TLI) {
+ DataLayout *TD, TargetLibraryInfo *TLI) {
bool Changed = false;
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) {
User *U = *UI++;
@@ -655,7 +665,7 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
/// behavior of the program in a more fine-grained way. We have determined that
/// this transformation is safe already. We return the first global variable we
/// insert so that the caller can reprocess it.
-static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) {
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &TD) {
// Make sure this global only has simple uses that we can SRA.
if (!GlobalUsersSafeToSRA(GV))
return 0;
@@ -931,7 +941,7 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
/// if the loaded value is dynamically null, then we know that they cannot be
/// reachable with a null optimize away the load.
static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
- TargetData *TD,
+ DataLayout *TD,
TargetLibraryInfo *TLI) {
bool Changed = false;
@@ -961,7 +971,9 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
// If we get here we could have other crazy uses that are transitively
// loaded.
assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
- isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser)) &&
+ isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
+ isa<BitCastInst>(GlobalUser) ||
+ isa<GetElementPtrInst>(GlobalUser)) &&
"Only expect load and stores!");
}
}
@@ -975,7 +987,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
// nor is the global.
if (AllNonStoreUsesGone) {
if (isLeakCheckerRoot(GV)) {
- Changed |= CleanupPointerRootUsers(GV);
+ Changed |= CleanupPointerRootUsers(GV, TLI);
} else {
Changed = true;
CleanupConstantGlobalUsers(GV, 0, TD, TLI);
@@ -993,7 +1005,7 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV,
/// ConstantPropUsersOf - Walk the use list of V, constant folding all of the
/// instructions that are foldable.
static void ConstantPropUsersOf(Value *V,
- TargetData *TD, TargetLibraryInfo *TLI) {
+ DataLayout *TD, TargetLibraryInfo *TLI) {
for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; )
if (Instruction *I = dyn_cast<Instruction>(*UI++))
if (Constant *NewC = ConstantFoldInstruction(I, TD, TLI)) {
@@ -1016,7 +1028,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV,
CallInst *CI,
Type *AllocTy,
ConstantInt *NElements,
- TargetData *TD,
+ DataLayout *TD,
TargetLibraryInfo *TLI) {
DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI << '\n');
@@ -1465,9 +1477,10 @@ static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
/// PerformHeapAllocSRoA - CI is an allocation of an array of structures. Break
/// it up into multiple allocations of arrays of the fields.
static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
- Value *NElems, TargetData *TD) {
+ Value *NElems, DataLayout *TD,
+ const TargetLibraryInfo *TLI) {
DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI << '\n');
- Type *MAT = getMallocAllocatedType(CI);
+ Type *MAT = getMallocAllocatedType(CI, TLI);
StructType *STy = cast<StructType>(MAT);
// There is guaranteed to be at least one use of the malloc (storing
@@ -1656,7 +1669,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
Type *AllocTy,
AtomicOrdering Ordering,
Module::global_iterator &GVI,
- TargetData *TD,
+ DataLayout *TD,
TargetLibraryInfo *TLI) {
if (!TD)
return false;
@@ -1688,7 +1701,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// This eliminates dynamic allocation, avoids an indirection accessing the
// data, and exposes the resultant global to further GlobalOpt.
// We cannot optimize the malloc if we cannot determine malloc array size.
- Value *NElems = getMallocArraySize(CI, TD, true);
+ Value *NElems = getMallocArraySize(CI, TD, TLI, true);
if (!NElems)
return false;
@@ -1725,7 +1738,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
// If this is a fixed size array, transform the Malloc to be an alloc of
// structs. malloc [100 x struct],1 -> malloc struct, 100
- if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI))) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
Type *IntPtrTy = TD->getIntPtrType(CI->getContext());
unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes();
Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
@@ -1742,7 +1755,8 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
CI = cast<CallInst>(Malloc);
}
- GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD);
+ GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, TLI, true),
+ TD, TLI);
return true;
}
@@ -1754,7 +1768,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV,
static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
AtomicOrdering Ordering,
Module::global_iterator &GVI,
- TargetData *TD, TargetLibraryInfo *TLI) {
+ DataLayout *TD, TargetLibraryInfo *TLI) {
// Ignore no-op GEPs and bitcasts.
StoredOnceVal = StoredOnceVal->stripPointerCasts();
@@ -1771,8 +1785,8 @@ static bool OptimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
// Optimize away any trapping uses of the loaded value.
if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, TD, TLI))
return true;
- } else if (CallInst *CI = extractMallocCall(StoredOnceVal)) {
- Type *MallocType = getMallocAllocatedType(CI);
+ } else if (CallInst *CI = extractMallocCall(StoredOnceVal, TLI)) {
+ Type *MallocType = getMallocAllocatedType(CI, TLI);
if (MallocType &&
TryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType, Ordering, GVI,
TD, TLI))
@@ -1964,7 +1978,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
bool Changed;
if (isLeakCheckerRoot(GV)) {
// Delete any constant stores to the global.
- Changed = CleanupPointerRootUsers(GV);
+ Changed = CleanupPointerRootUsers(GV, TLI);
} else {
// Delete any stores we can find to the global. We may not be able to
// make it completely dead though.
@@ -1997,7 +2011,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV,
++NumMarked;
return true;
} else if (!GV->getInitializer()->getType()->isSingleValueType()) {
- if (TargetData *TD = getAnalysisIfAvailable<TargetData>())
+ if (DataLayout *TD = getAnalysisIfAvailable<DataLayout>())
if (GlobalVariable *FirstNewGV = SRAGlobal(GV, *TD)) {
GVI = FirstNewGV; // Don't skip the newly produced globals!
return true;
@@ -2056,25 +2070,26 @@ static void ChangeCalleesToFastCall(Function *F) {
}
}
-static AttrListPtr StripNest(const AttrListPtr &Attrs) {
+static AttrListPtr StripNest(LLVMContext &C, const AttrListPtr &Attrs) {
for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
- if ((Attrs.getSlot(i).Attrs & Attribute::Nest) == 0)
+ if (!Attrs.getSlot(i).Attrs.hasAttribute(Attributes::Nest))
continue;
// There can be only one.
- return Attrs.removeAttr(Attrs.getSlot(i).Index, Attribute::Nest);
+ return Attrs.removeAttr(C, Attrs.getSlot(i).Index,
+ Attributes::get(C, Attributes::Nest));
}
return Attrs;
}
static void RemoveNestAttribute(Function *F) {
- F->setAttributes(StripNest(F->getAttributes()));
+ F->setAttributes(StripNest(F->getContext(), F->getAttributes()));
for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){
if (isa<BlockAddress>(*UI))
continue;
CallSite User(cast<Instruction>(*UI));
- User.setAttributes(StripNest(User.getAttributes()));
+ User.setAttributes(StripNest(F->getContext(), User.getAttributes()));
}
}
@@ -2103,7 +2118,7 @@ bool GlobalOpt::OptimizeFunctions(Module &M) {
Changed = true;
}
- if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+ if (F->getAttributes().hasAttrSomewhere(Attributes::Nest) &&
!F->hasAddressTaken()) {
// The function is not used by a trampoline intrinsic, so it is safe
// to remove the 'nest' attribute.
@@ -2251,7 +2266,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
static inline bool
isSimpleEnoughValueToCommit(Constant *C,
SmallPtrSet<Constant*, 8> &SimpleConstants,
- const TargetData *TD);
+ const DataLayout *TD);
/// isSimpleEnoughValueToCommit - Return true if the specified constant can be
@@ -2264,7 +2279,7 @@ isSimpleEnoughValueToCommit(Constant *C,
/// time.
static bool isSimpleEnoughValueToCommitHelper(Constant *C,
SmallPtrSet<Constant*, 8> &SimpleConstants,
- const TargetData *TD) {
+ const DataLayout *TD) {
// Simple integer, undef, constant aggregate zero, global addresses, etc are
// all supported.
if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
@@ -2319,7 +2334,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
static inline bool
isSimpleEnoughValueToCommit(Constant *C,
SmallPtrSet<Constant*, 8> &SimpleConstants,
- const TargetData *TD) {
+ const DataLayout *TD) {
// If we already checked this constant, we win.
if (!SimpleConstants.insert(C)) return true;
// Check the constant.
@@ -2450,7 +2465,7 @@ namespace {
/// Once an evaluation call fails, the evaluation object should not be reused.
class Evaluator {
public:
- Evaluator(const TargetData *TD, const TargetLibraryInfo *TLI)
+ Evaluator(const DataLayout *TD, const TargetLibraryInfo *TLI)
: TD(TD), TLI(TLI) {
ValueStack.push_back(new DenseMap<Value*, Constant*>);
}
@@ -2531,7 +2546,7 @@ private:
/// simple enough to live in a static initializer of a global.
SmallPtrSet<Constant*, 8> SimpleConstants;
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLibraryInfo *TLI;
};
@@ -2869,7 +2884,7 @@ bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
/// EvaluateStaticConstructor - Evaluate static constructors in the function, if
/// we can. Return true if we can, false otherwise.
-static bool EvaluateStaticConstructor(Function *F, const TargetData *TD,
+static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
// Call the function.
Evaluator Eval(TD, TLI);
@@ -3110,7 +3125,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
bool GlobalOpt::runOnModule(Module &M) {
bool Changed = false;
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
// Try to find the llvm.globalctors list.
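Taken together, the GlobalOpt hunks follow the two themes of this patch: the TargetData analysis becomes DataLayout (now under llvm/DataLayout.h), and the MemoryBuiltins helpers (isAllocationFn, extractMallocCall, getMallocAllocatedType, getMallocArraySize) take a TargetLibraryInfo so they only recognize allocation routines the target actually provides; AnalyzeGlobal additionally refuses to track stores of thread-dependent constants. A rough sketch of the analysis plumbing a pass needs against this revision; the pass itself is hypothetical and only illustrates the lookups used above:

    #include "llvm/Pass.h"
    #include "llvm/Module.h"
    #include "llvm/DataLayout.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    using namespace llvm;

    namespace {
    struct ExamplePass : public ModulePass {        // illustrative only
      static char ID;
      ExamplePass() : ModulePass(ID) {}
      virtual bool runOnModule(Module &M) {
        // DataLayout stays optional, exactly as GlobalOpt treats it.
        DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
        // TargetLibraryInfo is required and threaded into the malloc helpers.
        TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
        (void)TD; (void)TLI;
        return false;
      }
      virtual void getAnalysisUsage(AnalysisUsage &AU) const {
        AU.addRequired<TargetLibraryInfo>();
      }
    };
    char ExamplePass::ID = 0;
    }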
diff --git a/lib/Transforms/IPO/IPO.cpp b/lib/Transforms/IPO/IPO.cpp
index 6233922db927..5d563d8bbf51 100644
--- a/lib/Transforms/IPO/IPO.cpp
+++ b/lib/Transforms/IPO/IPO.cpp
@@ -1,4 +1,4 @@
-//===-- Scalar.cpp --------------------------------------------------------===//
+//===-- IPO.cpp -----------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -95,7 +95,10 @@ void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
}
void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
- unwrap(PM)->add(createInternalizePass(AllButMain != 0));
+ std::vector<const char *> Export;
+ if (AllButMain)
+ Export.push_back("main");
+ unwrap(PM)->add(createInternalizePass(Export));
}
void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
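The C binding keeps its boolean parameter but is now just a shim: since InternalizePass no longer has an implicit all-but-main mode (see the Internalize.cpp hunks below), the flag is translated into an explicit list of names to keep external. A minimal sketch of the same pattern from C++ client code; the helper and pass-manager names are placeholders:

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/IPO.h"
    #include <vector>
    using namespace llvm;

    static void addInternalize(PassManager &PM, bool AllButMain) {
      // Express the old AllButMain behaviour with an explicit export list.
      std::vector<const char *> Export;
      if (AllButMain)
        Export.push_back("main");
      PM.add(createInternalizePass(Export));
    }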
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 664ddf6f7a2b..b1c36c15db0b 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -23,7 +23,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/SmallPtrSet.h"
using namespace llvm;
@@ -65,7 +65,7 @@ Pass *llvm::createAlwaysInlinerPass(bool InsertLifetime) {
/// \brief Minimal filter to detect invalid constructs for inlining.
static bool isInlineViable(Function &F) {
- bool ReturnsTwice = F.hasFnAttr(Attribute::ReturnsTwice);
+ bool ReturnsTwice =F.getFnAttributes().hasAttribute(Attributes::ReturnsTwice);
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
// Disallow inlining of functions which contain an indirect branch.
if (isa<IndirectBrInst>(BI->getTerminator()))
@@ -114,7 +114,7 @@ InlineCost AlwaysInliner::getInlineCost(CallSite CS) {
if (Callee->isDeclaration()) return InlineCost::getNever();
// Return never for anything not marked as always inline.
- if (!Callee->hasFnAttr(Attribute::AlwaysInline))
+ if (!Callee->getFnAttributes().hasAttribute(Attributes::AlwaysInline))
return InlineCost::getNever();
// Do some minimal analysis to preclude non-viable functions.
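The attribute queries here move from Function::hasFnAttr(Attribute::X) to Function::getFnAttributes().hasAttribute(Attributes::X). A small sketch of the new spelling, mirroring the two cheap checks getInlineCost performs above; the helper name is invented for illustration:

    #include "llvm/Function.h"
    #include "llvm/Attributes.h"
    using namespace llvm;

    // Definition present and explicitly marked always_inline?
    static bool isAlwaysInlineCandidate(const Function *Callee) {
      if (Callee->isDeclaration())
        return false;
      return Callee->getFnAttributes().hasAttribute(Attributes::AlwaysInline);
    }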
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 50038d81161b..bf0b1f91a210 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -22,7 +22,7 @@
#include "llvm/Support/CallSite.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
@@ -62,7 +62,7 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
// doInitialization - Initializes the vector of functions that have been
// annotated with the noinline attribute.
bool SimpleInliner::doInitialization(CallGraph &CG) {
- CA.setTargetData(getAnalysisIfAvailable<TargetData>());
+ CA.setDataLayout(getAnalysisIfAvailable<DataLayout>());
return false;
}
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 712888aee9e5..abcb25fd4555 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -19,7 +19,8 @@
#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/IPO/InlinerPass.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -92,11 +93,11 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI,
// If the inlined function had a higher stack protection level than the
// calling function, then bump up the caller's stack protection level.
- if (Callee->hasFnAttr(Attribute::StackProtectReq))
- Caller->addFnAttr(Attribute::StackProtectReq);
- else if (Callee->hasFnAttr(Attribute::StackProtect) &&
- !Caller->hasFnAttr(Attribute::StackProtectReq))
- Caller->addFnAttr(Attribute::StackProtect);
+ if (Callee->getFnAttributes().hasAttribute(Attributes::StackProtectReq))
+ Caller->addFnAttr(Attributes::StackProtectReq);
+ else if (Callee->getFnAttributes().hasAttribute(Attributes::StackProtect) &&
+ !Caller->getFnAttributes().hasAttribute(Attributes::StackProtectReq))
+ Caller->addFnAttr(Attributes::StackProtect);
// Look at all of the allocas that we inlined through this call site. If we
// have already inlined other allocas through other calls into this function,
@@ -208,14 +209,15 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const {
// would decrease the threshold.
Function *Caller = CS.getCaller();
bool OptSize = Caller && !Caller->isDeclaration() &&
- Caller->hasFnAttr(Attribute::OptimizeForSize);
- if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres)
+ Caller->getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
+ if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
+ OptSizeThreshold < thres)
thres = OptSizeThreshold;
// Listen to the inlinehint attribute when it would increase the threshold.
Function *Callee = CS.getCalledFunction();
bool InlineHint = Callee && !Callee->isDeclaration() &&
- Callee->hasFnAttr(Attribute::InlineHint);
+ Callee->getFnAttributes().hasAttribute(Attributes::InlineHint);
if (InlineHint && HintThreshold > thres)
thres = HintThreshold;
@@ -338,7 +340,8 @@ static bool InlineHistoryIncludes(Function *F, int InlineHistoryID,
bool Inliner::runOnSCC(CallGraphSCC &SCC) {
CallGraph &CG = getAnalysis<CallGraph>();
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+ const TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
SmallPtrSet<Function*, 8> SCCFunctions;
DEBUG(dbgs() << "Inliner visiting SCC:");
@@ -417,7 +420,7 @@ bool Inliner::runOnSCC(CallGraphSCC &SCC) {
// just delete the call instead of trying to inline it, regardless of
// size. This happens because IPSCCP propagates the result out of the
// call and then we're left with the dead call.
- if (isInstructionTriviallyDead(CS.getInstruction())) {
+ if (isInstructionTriviallyDead(CS.getInstruction(), TLI)) {
DEBUG(dbgs() << " -> Deleting dead call: "
<< *CS.getInstruction() << "\n");
// Update the call graph by deleting the edge from Callee to Caller.
@@ -530,7 +533,8 @@ bool Inliner::removeDeadFunctions(CallGraph &CG, bool AlwaysInlineOnly) {
// Handle the case when this function is called and we only want to care
// about always-inline functions. This is a bit of a hack to share code
// between here and the InlineAlways pass.
- if (AlwaysInlineOnly && !F->hasFnAttr(Attribute::AlwaysInline))
+ if (AlwaysInlineOnly &&
+ !F->getFnAttributes().hasAttribute(Attributes::AlwaysInline))
continue;
// If the only remaining users of the function are dead constants, remove
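The threshold logic in getInlineThreshold is unchanged in substance, only the attribute spelling differs: start from the base threshold, lower it to OptSizeThreshold when the caller is optsize and no explicit -inline-threshold was given, and raise it to HintThreshold when the callee carries inlinehint. A hedged restatement as a free function; the parameter names are invented and stand in for the pass's cl::opt values:

    #include "llvm/Function.h"
    #include "llvm/Attributes.h"
    using namespace llvm;

    static unsigned pickInlineThreshold(unsigned Thres, unsigned OptSizeThres,
                                        unsigned HintThres, bool ExplicitLimit,
                                        const Function *Caller,
                                        const Function *Callee) {
      bool OptSize = Caller && !Caller->isDeclaration() &&
          Caller->getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
      if (!ExplicitLimit && OptSize && OptSizeThres < Thres)
        Thres = OptSizeThres;
      bool InlineHint = Callee && !Callee->isDeclaration() &&
          Callee->getFnAttributes().hasAttribute(Attributes::InlineHint);
      if (InlineHint && HintThres > Thres)
        Thres = HintThres;
      return Thres;
    }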
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index fb5869ede2bb..aa629cc0c6fb 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -7,9 +7,9 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass loops over all of the functions in the input module, looking for a
-// main function. If a main function is found, all other functions and all
-// global variables with initializers are marked as internal.
+// This pass loops over all of the functions and variables in the input module.
+// If the function or variable is not in the list of external names given to
+// the pass it is marked as internal.
//
//===----------------------------------------------------------------------===//
@@ -45,12 +45,9 @@ APIList("internalize-public-api-list", cl::value_desc("list"),
namespace {
class InternalizePass : public ModulePass {
std::set<std::string> ExternalNames;
- /// If no api symbols were specified and a main function is defined,
- /// assume the main function is the only API
- bool AllButMain;
public:
static char ID; // Pass identification, replacement for typeid
- explicit InternalizePass(bool AllButMain = true);
+ explicit InternalizePass();
explicit InternalizePass(const std::vector <const char *>& exportList);
void LoadFile(const char *Filename);
virtual bool runOnModule(Module &M);
@@ -66,8 +63,8 @@ char InternalizePass::ID = 0;
INITIALIZE_PASS(InternalizePass, "internalize",
"Internalize Global Symbols", false, false)
-InternalizePass::InternalizePass(bool AllButMain)
- : ModulePass(ID), AllButMain(AllButMain){
+InternalizePass::InternalizePass()
+ : ModulePass(ID) {
initializeInternalizePassPass(*PassRegistry::getPassRegistry());
if (!APIFile.empty()) // If a filename is specified, use it.
LoadFile(APIFile.c_str());
@@ -76,7 +73,7 @@ InternalizePass::InternalizePass(bool AllButMain)
}
InternalizePass::InternalizePass(const std::vector<const char *>&exportList)
- : ModulePass(ID), AllButMain(false){
+ : ModulePass(ID){
initializeInternalizePassPass(*PassRegistry::getPassRegistry());
for(std::vector<const char *>::const_iterator itr = exportList.begin();
itr != exportList.end(); itr++) {
@@ -103,23 +100,6 @@ void InternalizePass::LoadFile(const char *Filename) {
bool InternalizePass::runOnModule(Module &M) {
CallGraph *CG = getAnalysisIfAvailable<CallGraph>();
CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : 0;
-
- if (ExternalNames.empty()) {
- // Return if we're not in 'all but main' mode and have no external api
- if (!AllButMain)
- return false;
- // If no list or file of symbols was specified, check to see if there is a
- // "main" symbol defined in the module. If so, use it, otherwise do not
- // internalize the module, it must be a library or something.
- //
- Function *MainFunc = M.getFunction("main");
- if (MainFunc == 0 || MainFunc->isDeclaration())
- return false; // No main found, must be a library...
-
- // Preserve main, internalize all else.
- ExternalNames.insert(MainFunc->getName());
- }
-
bool Changed = false;
// Never internalize functions which code-gen might insert.
@@ -189,8 +169,8 @@ bool InternalizePass::runOnModule(Module &M) {
return Changed;
}
-ModulePass *llvm::createInternalizePass(bool AllButMain) {
- return new InternalizePass(AllButMain);
+ModulePass *llvm::createInternalizePass() {
+ return new InternalizePass();
}
ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) {
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 9f70f668a88b..44283ddce7ae 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -63,7 +63,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include <vector>
using namespace llvm;
@@ -92,19 +92,19 @@ static unsigned profileFunction(const Function *F) {
namespace {
/// ComparableFunction - A struct that pairs together functions with a
-/// TargetData so that we can keep them together as elements in the DenseSet.
+/// DataLayout so that we can keep them together as elements in the DenseSet.
class ComparableFunction {
public:
static const ComparableFunction EmptyKey;
static const ComparableFunction TombstoneKey;
- static TargetData * const LookupOnly;
+ static DataLayout * const LookupOnly;
- ComparableFunction(Function *Func, TargetData *TD)
+ ComparableFunction(Function *Func, DataLayout *TD)
: Func(Func), Hash(profileFunction(Func)), TD(TD) {}
Function *getFunc() const { return Func; }
unsigned getHash() const { return Hash; }
- TargetData *getTD() const { return TD; }
+ DataLayout *getTD() const { return TD; }
// Drops AssertingVH reference to the function. Outside of debug mode, this
// does nothing.
@@ -120,13 +120,13 @@ private:
AssertingVH<Function> Func;
unsigned Hash;
- TargetData *TD;
+ DataLayout *TD;
};
const ComparableFunction ComparableFunction::EmptyKey = ComparableFunction(0);
const ComparableFunction ComparableFunction::TombstoneKey =
ComparableFunction(1);
-TargetData *const ComparableFunction::LookupOnly = (TargetData*)(-1);
+DataLayout *const ComparableFunction::LookupOnly = (DataLayout*)(-1);
}
@@ -150,12 +150,12 @@ namespace llvm {
namespace {
/// FunctionComparator - Compares two functions to determine whether or not
-/// they will generate machine code with the same behaviour. TargetData is
+/// they will generate machine code with the same behaviour. DataLayout is
/// used if available. The comparator always fails conservatively (erring on the
/// side of claiming that two functions are different).
class FunctionComparator {
public:
- FunctionComparator(const TargetData *TD, const Function *F1,
+ FunctionComparator(const DataLayout *TD, const Function *F1,
const Function *F2)
: F1(F1), F2(F2), TD(TD) {}
@@ -190,7 +190,7 @@ private:
// The two functions undergoing comparison.
const Function *F1, *F2;
- const TargetData *TD;
+ const DataLayout *TD;
DenseMap<const Value *, const Value *> id_map;
DenseSet<const Value *> seen_values;
@@ -591,8 +591,8 @@ private:
/// to modify it.
FnSetType FnSet;
- /// TargetData for more accurate GEP comparisons. May be NULL.
- TargetData *TD;
+ /// DataLayout for more accurate GEP comparisons. May be NULL.
+ DataLayout *TD;
/// Whether or not the target supports global aliases.
bool HasGlobalAliases;
@@ -609,7 +609,7 @@ ModulePass *llvm::createMergeFunctionsPass() {
bool MergeFunctions::runOnModule(Module &M) {
bool Changed = false;
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
if (!I->isDeclaration() && !I->hasAvailableExternallyLinkage())
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 43b4ab5efa4d..05253fcddab3 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -33,13 +33,21 @@
using namespace llvm;
static cl::opt<bool>
-RunVectorization("vectorize", cl::desc("Run vectorization passes"));
+RunLoopVectorization("vectorize-loops",
+ cl::desc("Run the Loop vectorization passes"));
+
+static cl::opt<bool>
+RunBBVectorization("vectorize", cl::desc("Run the BB vectorization passes"));
static cl::opt<bool>
UseGVNAfterVectorization("use-gvn-after-vectorization",
cl::init(false), cl::Hidden,
cl::desc("Run GVN instead of Early CSE after vectorization passes"));
+static cl::opt<bool> UseNewSROA("use-new-sroa",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable the new, experimental SROA pass"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@@ -48,7 +56,8 @@ PassManagerBuilder::PassManagerBuilder() {
DisableSimplifyLibCalls = false;
DisableUnitAtATime = false;
DisableUnrollLoops = false;
- Vectorize = RunVectorization;
+ Vectorize = RunBBVectorization;
+ LoopVectorize = RunLoopVectorization;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -100,7 +109,10 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) {
addInitialAliasAnalysisPasses(FPM);
FPM.add(createCFGSimplificationPass());
- FPM.add(createScalarReplAggregatesPass());
+ if (UseNewSROA)
+ FPM.add(createSROAPass());
+ else
+ FPM.add(createScalarReplAggregatesPass());
FPM.add(createEarlyCSEPass());
FPM.add(createLowerExpectIntrinsicPass());
}
@@ -112,6 +124,14 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
MPM.add(Inliner);
Inliner = 0;
}
+
+ // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+ // pass manager, but we don't want to add extensions into that pass manager.
+ // To prevent this we must insert a no-op module pass to reset the pass
+ // manager to get the same behavior as EP_OptimizerLast in non-O0 builds.
+ if (!GlobalExtensions->empty() || !Extensions.empty())
+ MPM.add(createBarrierNoopPass());
+
addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
return;
}
@@ -147,7 +167,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
// Start of function pass.
// Break up aggregate allocas, using SSAUpdater.
- MPM.add(createScalarReplAggregatesPass(-1, false));
+ if (UseNewSROA)
+ MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+ else
+ MPM.add(createScalarReplAggregatesPass(-1, false));
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
if (!DisableSimplifyLibCalls)
MPM.add(createSimplifyLibCallsPass()); // Library Call Optimizations
@@ -166,6 +189,12 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+ if (LoopVectorize) {
+ MPM.add(createLoopVectorizePass());
+ MPM.add(createLICMPass());
+ }
+
if (!DisableUnrollLoops)
MPM.add(createLoopUnrollPass()); // Unroll small loops
addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
@@ -201,13 +230,12 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
// FIXME: We shouldn't bother with this anymore.
MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
- // GlobalOpt already deletes dead functions and globals, at -O3 try a
+ // GlobalOpt already deletes dead functions and globals, at -O2 try a
// late pass of GlobalDCE. It is capable of deleting dead cycles.
- if (OptLevel > 2)
+ if (OptLevel > 1) {
MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
-
- if (OptLevel > 1)
MPM.add(createConstantMergePass()); // Merge dup global constants
+ }
}
addExtensionsToPM(EP_OptimizerLast, MPM);
}
@@ -222,8 +250,11 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
// Now that composite has been compiled, scan through the module, looking
// for a main function. If main is defined, mark all other functions
// internal.
- if (Internalize)
- PM.add(createInternalizePass(true));
+ if (Internalize) {
+ std::vector<const char*> E;
+ E.push_back("main");
+ PM.add(createInternalizePass(E));
+ }
// Propagate constants at call sites into the functions they call. This
// opens opportunities for globalopt (and inlining) by substituting function
@@ -265,7 +296,10 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
PM.add(createInstructionCombiningPass());
PM.add(createJumpThreadingPass());
// Break up allocas
- PM.add(createScalarReplAggregatesPass());
+ if (UseNewSROA)
+ PM.add(createSROAPass());
+ else
+ PM.add(createScalarReplAggregatesPass());
// Run a few AA driven optimizations here and now, to cleanup the code.
PM.add(createFunctionAttrsPass()); // Add nocapture.
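PassManagerBuilder now distinguishes loop vectorization (-vectorize-loops, stored in the new LoopVectorize member and scheduled before LICM in the loop section) from basic-block vectorization (-vectorize), defaults to the new SROA pass unless -use-new-sroa=false, runs GlobalDCE and ConstantMerge already at -O2, and, for LTO, expresses Internalize through an explicit export list. A sketch of driving the builder from an embedding tool; the option values are arbitrary examples, not new defaults:

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"
    using namespace llvm;

    static void buildPipeline(PassManager &MPM, FunctionPassManager &FPM) {
      PassManagerBuilder PMB;
      PMB.OptLevel = 2;
      PMB.LoopVectorize = true;   // member added by this patch
      PMB.Vectorize = false;      // leave the BB vectorizer off
      PMB.populateFunctionPassManager(FPM);
      PMB.populateModulePassManager(MPM);
    }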
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index c8cc8fd1930b..fb4ecbfe7b08 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -137,16 +137,18 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
// If the SCC doesn't unwind or doesn't throw, note this fact.
if (!SCCMightUnwind || !SCCMightReturn)
for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
- Attributes NewAttributes = Attribute::None;
+ AttrBuilder NewAttributes;
if (!SCCMightUnwind)
- NewAttributes |= Attribute::NoUnwind;
+ NewAttributes.addAttribute(Attributes::NoUnwind);
if (!SCCMightReturn)
- NewAttributes |= Attribute::NoReturn;
+ NewAttributes.addAttribute(Attributes::NoReturn);
Function *F = (*I)->getFunction();
const AttrListPtr &PAL = F->getAttributes();
- const AttrListPtr &NPAL = PAL.addAttr(~0, NewAttributes);
+ const AttrListPtr &NPAL = PAL.addAttr(F->getContext(), ~0,
+ Attributes::get(F->getContext(),
+ NewAttributes));
if (PAL != NPAL) {
MadeChange = true;
F->setAttributes(NPAL);
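Attribute sets for PruneEH's inferred nounwind/noreturn are now built with an AttrBuilder and materialized through Attributes::get(Context, Builder); AttrListPtr::addAttr also takes the LLVMContext. The same pattern in isolation, as a hedged sketch; F is assumed to be a valid Function*:

    #include "llvm/Function.h"
    #include "llvm/Attributes.h"
    using namespace llvm;

    static void markNoUnwindNoReturn(Function *F) {
      AttrBuilder B;
      B.addAttribute(Attributes::NoUnwind);
      B.addAttribute(Attributes::NoReturn);
      const AttrListPtr &PAL = F->getAttributes();
      // ~0U addresses the function-attribute slot, as in the hunk above.
      const AttrListPtr &NPAL = PAL.addAttr(F->getContext(), ~0U,
                                            Attributes::get(F->getContext(), B));
      if (PAL != NPAL)
        F->setAttributes(NPAL);
    }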
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 0d5ef904ee47..7467eca7ab1f 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -18,10 +18,11 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/InstVisitor.h"
#include "llvm/Support/TargetFolder.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
namespace llvm {
class CallSite;
- class TargetData;
+ class DataLayout;
class TargetLibraryInfo;
class DbgDeclareInst;
class MemIntrinsic;
@@ -71,9 +72,10 @@ public:
class LLVM_LIBRARY_VISIBILITY InstCombiner
: public FunctionPass,
public InstVisitor<InstCombiner, Instruction*> {
- TargetData *TD;
+ DataLayout *TD;
TargetLibraryInfo *TLI;
bool MadeIRChange;
+ LibCallSimplifier *Simplifier;
public:
/// Worklist - All of the instructions that need to be simplified.
InstCombineWorklist Worklist;
@@ -95,7 +97,7 @@ public:
virtual void getAnalysisUsage(AnalysisUsage &AU) const;
- TargetData *getTargetData() const { return TD; }
+ DataLayout *getDataLayout() const { return TD; }
TargetLibraryInfo *getTargetLibraryInfo() const { return TLI; }
@@ -218,7 +220,7 @@ private:
Type *Ty);
Instruction *visitCallSite(CallSite CS);
- Instruction *tryOptimizeCall(CallInst *CI, const TargetData *TD);
+ Instruction *tryOptimizeCall(CallInst *CI, const DataLayout *TD);
bool transformConstExprCastCall(CallSite CS);
Instruction *transformCallThroughTrampoline(CallSite CS,
IntrinsicInst *Tramp);
@@ -365,6 +367,10 @@ private:
Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
+
+ /// Descale - Return a value X such that Val = X * Scale, or null if none. If
+ /// the multiplication is known not to overflow then NoSignedWrap is set.
+ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
};
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 99b62f8d05a7..d8257e64d837 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -13,7 +13,7 @@
#include "InstCombine.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index cbe1ca4ddcec..48f270429e5a 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -13,7 +13,7 @@
#include "InstCombine.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -29,6 +29,26 @@ static Type *getPromotedType(Type *Ty) {
return Ty;
}
+/// reduceToSingleValueType - Given an aggregate type which ultimately holds a
+/// single scalar element, like {{{type}}} or [1 x type], return type.
+static Type *reduceToSingleValueType(Type *T) {
+ while (!T->isSingleValueType()) {
+ if (StructType *STy = dyn_cast<StructType>(T)) {
+ if (STy->getNumElements() == 1)
+ T = STy->getElementType(0);
+ else
+ break;
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) {
+ if (ATy->getNumElements() == 1)
+ T = ATy->getElementType();
+ else
+ break;
+ } else
+ break;
+ }
+
+ return T;
+}
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD);
@@ -74,35 +94,37 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
// dest address will be promotable. See if we can find a better type than the
// integer datatype.
Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
+ MDNode *CopyMD = 0;
if (StrippedDest != MI->getArgOperand(0)) {
Type *SrcETy = cast<PointerType>(StrippedDest->getType())
->getElementType();
if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
// The SrcETy might be something like {{{double}}} or [1 x double]. Rip
// down through these levels if so.
- while (!SrcETy->isSingleValueType()) {
- if (StructType *STy = dyn_cast<StructType>(SrcETy)) {
- if (STy->getNumElements() == 1)
- SrcETy = STy->getElementType(0);
- else
- break;
- } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
- if (ATy->getNumElements() == 1)
- SrcETy = ATy->getElementType();
- else
- break;
- } else
- break;
- }
+ SrcETy = reduceToSingleValueType(SrcETy);
if (SrcETy->isSingleValueType()) {
NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp);
NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp);
+
+ // If the memcpy has metadata describing the members, see if we can
+ // get the TBAA tag describing our copy.
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 &&
+ M->getOperand(0) &&
+ isa<ConstantInt>(M->getOperand(0)) &&
+ cast<ConstantInt>(M->getOperand(0))->isNullValue() &&
+ M->getOperand(1) &&
+ isa<ConstantInt>(M->getOperand(1)) &&
+ cast<ConstantInt>(M->getOperand(1))->getValue() == Size &&
+ M->getOperand(2) &&
+ isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
+ }
}
}
}
-
// If the memcpy/memmove provides better alignment info than we can
// infer, use it.
SrcAlign = std::max(SrcAlign, CopyAlign);
@@ -112,8 +134,12 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile());
L->setAlignment(SrcAlign);
+ if (CopyMD)
+ L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
S->setAlignment(DstAlign);
+ if (CopyMD)
+ S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
// Set the size of the copy to 0, it will be deleted on the next iteration.
MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
@@ -168,7 +194,7 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) {
/// the heavy lifting.
///
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
- if (isFreeCall(&CI))
+ if (isFreeCall(&CI, TLI))
return visitFree(CI);
// If the caller function is nounwind, mark the call as nounwind, even if the
@@ -243,7 +269,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
default: break;
case Intrinsic::objectsize: {
uint64_t Size;
- if (getObjectSize(II->getArgOperand(0), Size, TD))
+ if (getObjectSize(II->getArgOperand(0), Size, TD, TLI))
return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size));
return 0;
}
@@ -731,7 +757,7 @@ Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
/// passed through the varargs area, we can eliminate the use of the cast.
static bool isSafeToEliminateVarargsCast(const CallSite CS,
const CastInst * const CI,
- const TargetData * const TD,
+ const DataLayout * const TD,
const int ix) {
if (!CI->isLosslessCast())
return false;
@@ -752,49 +778,17 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
return true;
}
-namespace {
-class InstCombineFortifiedLibCalls : public SimplifyFortifiedLibCalls {
- InstCombiner *IC;
-protected:
- void replaceCall(Value *With) {
- NewInstruction = IC->ReplaceInstUsesWith(*CI, With);
- }
- bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const {
- if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
- return true;
- if (ConstantInt *SizeCI =
- dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
- if (SizeCI->isAllOnesValue())
- return true;
- if (isString) {
- uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
- // If the length is 0 we don't know how long it is and so we can't
- // remove the check.
- if (Len == 0) return false;
- return SizeCI->getZExtValue() >= Len;
- }
- if (ConstantInt *Arg = dyn_cast<ConstantInt>(
- CI->getArgOperand(SizeArgOp)))
- return SizeCI->getZExtValue() >= Arg->getZExtValue();
- }
- return false;
- }
-public:
- InstCombineFortifiedLibCalls(InstCombiner *IC) : IC(IC), NewInstruction(0) { }
- Instruction *NewInstruction;
-};
-} // end anonymous namespace
-
// Try to fold some different type of calls here.
// Currently we're only working with the checking functions, memcpy_chk,
// mempcpy_chk, memmove_chk, memset_chk, strcpy_chk, stpcpy_chk, strncpy_chk,
// strcat_chk and strncat_chk.
-Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) {
+Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) {
if (CI->getCalledFunction() == 0) return 0;
- InstCombineFortifiedLibCalls Simplifier(this);
- Simplifier.fold(CI, TD, TLI);
- return Simplifier.NewInstruction;
+ if (Value *With = Simplifier->optimizeCall(CI))
+ return ReplaceInstUsesWith(*CI, With);
+
+ return 0;
}
static IntrinsicInst *FindInitTrampolineFromAlloca(Value *TrampMem) {
@@ -877,7 +871,7 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) {
// visitCallSite - Improvements for call and invoke instructions.
//
Instruction *InstCombiner::visitCallSite(CallSite CS) {
- if (isAllocLikeFn(CS.getInstruction()))
+ if (isAllocLikeFn(CS.getInstruction(), TLI))
return visitAllocSite(*CS.getInstruction());
bool Changed = false;
@@ -961,7 +955,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
Changed = true;
}
- // Try to optimize the call if possible, we require TargetData for most of
+ // Try to optimize the call if possible, we require DataLayout for most of
// this. None of these calls are seen as possibly dead so go ahead and
// delete the instruction now.
if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
@@ -1013,8 +1007,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
return false; // Cannot transform this return value.
if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- Attributes RAttrs = CallerPAL.getRetAttributes();
- if (RAttrs & Attribute::typeIncompatible(NewRetTy))
+ AttrBuilder RAttrs = CallerPAL.getRetAttributes();
+ if (RAttrs.hasAttributes(Attributes::typeIncompatible(NewRetTy)))
return false; // Attribute not compatible with transformed value.
}
@@ -1044,12 +1038,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
return false; // Cannot transform this parameter value.
Attributes Attrs = CallerPAL.getParamAttributes(i + 1);
- if (Attrs & Attribute::typeIncompatible(ParamTy))
+ if (AttrBuilder(Attrs).
+ hasAttributes(Attributes::typeIncompatible(ParamTy)))
return false; // Attribute not compatible with transformed value.
// If the parameter is passed as a byval argument, then we have to have a
// sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy && (Attrs & Attribute::ByVal)) {
+ if (ParamTy != ActTy && Attrs.hasAttribute(Attributes::ByVal)) {
PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
if (ParamPTy == 0 || !ParamPTy->getElementType()->isSized() || TD == 0)
return false;
@@ -1101,7 +1096,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
if (CallerPAL.getSlot(i - 1).Index <= FT->getNumParams())
break;
Attributes PAttrs = CallerPAL.getSlot(i - 1).Attrs;
- if (PAttrs & Attribute::VarArgsIncompatible)
+ if (PAttrs.hasIncompatibleWithVarArgsAttrs())
return false;
}
@@ -1114,15 +1109,17 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
attrVec.reserve(NumCommonArgs);
// Get any return attributes.
- Attributes RAttrs = CallerPAL.getRetAttributes();
+ AttrBuilder RAttrs = CallerPAL.getRetAttributes();
// If the return value is not being used, the type may not be compatible
// with the existing attributes. Wipe out any problematic attributes.
- RAttrs &= ~Attribute::typeIncompatible(NewRetTy);
+ RAttrs.removeAttributes(Attributes::typeIncompatible(NewRetTy));
// Add the new return attributes.
- if (RAttrs)
- attrVec.push_back(AttributeWithIndex::get(0, RAttrs));
+ if (RAttrs.hasAttributes())
+ attrVec.push_back(
+ AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ Attributes::get(FT->getContext(), RAttrs)));
AI = CS.arg_begin();
for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
@@ -1136,7 +1133,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
// Add any parameter attributes.
- if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
+ Attributes PAttrs = CallerPAL.getParamAttributes(i + 1);
+ if (PAttrs.hasAttributes())
attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
}
@@ -1164,19 +1162,23 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
}
// Add any parameter attributes.
- if (Attributes PAttrs = CallerPAL.getParamAttributes(i + 1))
+ Attributes PAttrs = CallerPAL.getParamAttributes(i + 1);
+ if (PAttrs.hasAttributes())
attrVec.push_back(AttributeWithIndex::get(i + 1, PAttrs));
}
}
}
- if (Attributes FnAttrs = CallerPAL.getFnAttributes())
- attrVec.push_back(AttributeWithIndex::get(~0, FnAttrs));
+ Attributes FnAttrs = CallerPAL.getFnAttributes();
+ if (FnAttrs.hasAttributes())
+ attrVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ FnAttrs));
if (NewRetTy->isVoidTy())
Caller->setName(""); // Void type should not have a name.
- const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec);
+ const AttrListPtr &NewCallerPAL = AttrListPtr::get(Callee->getContext(),
+ attrVec);
Instruction *NC;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
@@ -1240,8 +1242,9 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// If the call already has the 'nest' attribute somewhere then give up -
// otherwise 'nest' would occur twice after splicing in the chain.
- if (Attrs.hasAttrSomewhere(Attribute::Nest))
- return 0;
+ for (unsigned I = 0, E = Attrs.getNumAttrs(); I != E; ++I)
+ if (Attrs.getAttributesAtIndex(I).hasAttribute(Attributes::Nest))
+ return 0;
assert(Tramp &&
"transformCallThroughTrampoline called with incorrect CallSite.");
@@ -1254,12 +1257,12 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
if (!NestAttrs.isEmpty()) {
unsigned NestIdx = 1;
Type *NestTy = 0;
- Attributes NestAttr = Attribute::None;
+ Attributes NestAttr;
// Look for a parameter marked with the 'nest' attribute.
for (FunctionType::param_iterator I = NestFTy->param_begin(),
E = NestFTy->param_end(); I != E; ++NestIdx, ++I)
- if (NestAttrs.paramHasAttr(NestIdx, Attribute::Nest)) {
+ if (NestAttrs.getParamAttributes(NestIdx).hasAttribute(Attributes::Nest)){
// Record the parameter type and any other attributes.
NestTy = *I;
NestAttr = NestAttrs.getParamAttributes(NestIdx);
@@ -1278,8 +1281,10 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// mean appending it. Likewise for attributes.
// Add any result attributes.
- if (Attributes Attr = Attrs.getRetAttributes())
- NewAttrs.push_back(AttributeWithIndex::get(0, Attr));
+ Attributes Attr = Attrs.getRetAttributes();
+ if (Attr.hasAttributes())
+ NewAttrs.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+ Attr));
{
unsigned Idx = 1;
@@ -1299,7 +1304,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
// Add the original argument and attributes.
NewArgs.push_back(*I);
- if (Attributes Attr = Attrs.getParamAttributes(Idx))
+ Attr = Attrs.getParamAttributes(Idx);
+ if (Attr.hasAttributes())
NewAttrs.push_back
(AttributeWithIndex::get(Idx + (Idx >= NestIdx), Attr));
@@ -1308,8 +1314,10 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
}
// Add any function attributes.
- if (Attributes Attr = Attrs.getFnAttributes())
- NewAttrs.push_back(AttributeWithIndex::get(~0, Attr));
+ Attr = Attrs.getFnAttributes();
+ if (Attr.hasAttributes())
+ NewAttrs.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+ Attr));
// The trampoline may have been bitcast to a bogus type (FTy).
// Handle this by synthesizing a new function type, equal to FTy
@@ -1348,7 +1356,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
NestF->getType() == PointerType::getUnqual(NewFTy) ?
NestF : ConstantExpr::getBitCast(NestF,
PointerType::getUnqual(NewFTy));
- const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs);
+ const AttrListPtr &NewPAL = AttrListPtr::get(FTy->getContext(), NewAttrs);
Instruction *NewCaller;
if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
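Besides the DataLayout rename, the substantive changes above are: the {{{double}}}/[1 x double] peeling loop is hoisted into reduceToSingleValueType; when a memcpy rewritten into a scalar load/store carries !tbaa.struct metadata describing a single field covering the whole copy, that field's TBAA tag is propagated to the new load and store; and the fortified-libcall folding (memcpy_chk and friends) is delegated to the new LibCallSimplifier instead of the local SimplifyFortifiedLibCalls subclass. A sketch of the metadata extraction, assuming the usual (offset, size, tag) triple layout of tbaa.struct and using the same guards as the hunk; the helper name is invented:

    #include "llvm/Metadata.h"
    #include "llvm/Constants.h"
    #include "llvm/Support/DataTypes.h"
    using namespace llvm;

    // Returns the TBAA tag when M describes exactly one field that starts at
    // offset 0 and spans Size bytes; returns null otherwise.
    static MDNode *extractWholeCopyTBAATag(MDNode *M, uint64_t Size) {
      if (M->getNumOperands() != 3)
        return 0;
      ConstantInt *Off = dyn_cast_or_null<ConstantInt>(M->getOperand(0));
      ConstantInt *Len = dyn_cast_or_null<ConstantInt>(M->getOperand(1));
      if (!Off || !Len || !Off->isNullValue() || Len->getValue() != Size)
        return 0;
      return dyn_cast_or_null<MDNode>(M->getOperand(2));
    }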
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 555b4428d2e8..bb59db8e7ba1 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -13,7 +13,7 @@
#include "InstCombine.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/PatternMatch.h"
using namespace llvm;
@@ -78,7 +78,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
/// try to eliminate the cast by moving the type information into the alloc.
Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
- // This requires TargetData to get the alloca alignment and size information.
+ // This requires DataLayout to get the alloca alignment and size information.
if (!TD) return 0;
PointerType *PTy = cast<PointerType>(CI.getType());
@@ -229,7 +229,7 @@ isEliminableCastPair(
const CastInst *CI, ///< The first cast instruction
unsigned opcode, ///< The opcode of the second cast instruction
Type *DstTy, ///< The target type for the second cast instruction
- TargetData *TD ///< The target data for pointer size
+ DataLayout *TD ///< The target data for pointer size
) {
Type *SrcTy = CI->getOperand(0)->getType(); // A from above
@@ -238,17 +238,20 @@ isEliminableCastPair(
// Get the opcodes of the two Cast instructions
Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
Instruction::CastOps secondOp = Instruction::CastOps(opcode);
-
+ Type *SrcIntPtrTy = TD && SrcTy->isPtrOrPtrVectorTy() ?
+ TD->getIntPtrType(SrcTy) : 0;
+ Type *MidIntPtrTy = TD && MidTy->isPtrOrPtrVectorTy() ?
+ TD->getIntPtrType(MidTy) : 0;
+ Type *DstIntPtrTy = TD && DstTy->isPtrOrPtrVectorTy() ?
+ TD->getIntPtrType(DstTy) : 0;
unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
- DstTy,
- TD ? TD->getIntPtrType(CI->getContext()) : 0);
-
+ DstTy, SrcIntPtrTy, MidIntPtrTy,
+ DstIntPtrTy);
+
// We don't want to form an inttoptr or ptrtoint that converts to an integer
// type that differs from the pointer size.
- if ((Res == Instruction::IntToPtr &&
- (!TD || SrcTy != TD->getIntPtrType(CI->getContext()))) ||
- (Res == Instruction::PtrToInt &&
- (!TD || DstTy != TD->getIntPtrType(CI->getContext()))))
+ if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
+ (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
Res = 0;
return Instruction::CastOps(Res);
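isEliminableCastPair now receives a separate intptr type for the source, middle, and destination types instead of a single TD->getIntPtrType(Context), presumably so that pointers whose width depends on their address space are compared against the right integer width; non-pointer positions simply pass null. The rule applied three times above, written once; this helper is illustrative only:

    #include "llvm/Type.h"
    #include "llvm/DataLayout.h"
    using namespace llvm;

    // Null when no DataLayout is available or Ty is not a pointer (or vector
    // of pointers); otherwise the matching integer type for that pointer.
    static Type *intPtrFor(DataLayout *TD, Type *Ty) {
      return (TD && Ty->isPtrOrPtrVectorTy()) ? TD->getIntPtrType(Ty) : 0;
    }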
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index bdd310e97f6c..7c3f8fe15d30 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -16,7 +16,8 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/ConstantRange.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
@@ -473,7 +474,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV,
/// If we can't emit an optimized form for this expression, this returns null.
///
static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) {
- TargetData &TD = *IC.getTargetData();
+ DataLayout &TD = *IC.getDataLayout();
gep_type_iterator GTI = gep_type_begin(GEP);
// Check to see if this gep only has a single variable index. If so, and if
@@ -2355,8 +2356,25 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
// Try not to increase register pressure.
BO0->hasOneUse() && BO1->hasOneUse()) {
// Determine Y and Z in the form icmp (X+Y), (X+Z).
- Value *Y = (A == C || A == D) ? B : A;
- Value *Z = (C == A || C == B) ? D : C;
+ Value *Y, *Z;
+ if (A == C) {
+ // C + B == C + D -> B == D
+ Y = B;
+ Z = D;
+ } else if (A == D) {
+ // D + B == C + D -> B == C
+ Y = B;
+ Z = C;
+ } else if (B == C) {
+ // A + C == C + D -> A == D
+ Y = A;
+ Z = D;
+ } else {
+ assert(B == D);
+ // A + D == C + D -> A == C
+ Y = A;
+ Z = C;
+ }
return new ICmpInst(Pred, Y, Z);
}
@@ -2894,10 +2912,6 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
if (!RHSF)
break;
- // We can't convert a PPC double double.
- if (RHSF->getType()->isPPC_FP128Ty())
- break;
-
const fltSemantics *Sem;
// FIXME: This shouldn't be here.
if (LHSExt->getSrcTy()->isHalfTy())
@@ -2910,6 +2924,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
Sem = &APFloat::IEEEquad;
else if (LHSExt->getSrcTy()->isX86_FP80Ty())
Sem = &APFloat::x87DoubleExtended;
+ else if (LHSExt->getSrcTy()->isPPC_FP128Ty())
+ Sem = &APFloat::PPCDoubleDouble;
else
break;
@@ -2985,6 +3001,44 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
return Res;
}
break;
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(LHSI);
+ LibFunc::Func Func;
+ // Various optimization for fabs compared with zero.
+ if (RHSC->isNullValue() && CI->getCalledFunction() &&
+ TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
+ TLI->has(Func)) {
+ if (Func == LibFunc::fabs || Func == LibFunc::fabsf ||
+ Func == LibFunc::fabsl) {
+ switch (I.getPredicate()) {
+ default: break;
+ // fabs(x) < 0 --> false
+ case FCmpInst::FCMP_OLT:
+ return ReplaceInstUsesWith(I, Builder->getFalse());
+ // fabs(x) > 0 --> x != 0
+ case FCmpInst::FCMP_OGT:
+ return new FCmpInst(FCmpInst::FCMP_ONE, CI->getArgOperand(0),
+ RHSC);
+ // fabs(x) <= 0 --> x == 0
+ case FCmpInst::FCMP_OLE:
+ return new FCmpInst(FCmpInst::FCMP_OEQ, CI->getArgOperand(0),
+ RHSC);
+ // fabs(x) >= 0 --> !isnan(x)
+ case FCmpInst::FCMP_OGE:
+ return new FCmpInst(FCmpInst::FCMP_ORD, CI->getArgOperand(0),
+ RHSC);
+ // fabs(x) == 0 --> x == 0
+ // fabs(x) != 0 --> x != 0
+ case FCmpInst::FCMP_OEQ:
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_ONE:
+ case FCmpInst::FCMP_UNE:
+ return new FCmpInst(I.getPredicate(), CI->getArgOperand(0),
+ RHSC);
+ }
+ }
+ }
+ }
}
}
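Three independent fixes land in InstCombineCompares: the icmp (A+B), (C+D) reduction now picks Y and Z case by case, since the old one-liner could pair the wrong operands (for example, with A == D and B == C, as in icmp (x+y), (y+x), it would compare y with x instead of reducing to a trivially equal pair); fpext shrinking of fcmp learns to handle PPC double-double via APFloat::PPCDoubleDouble instead of bailing out; and fcmp of fabs/fabsf/fabsl against zero is folded when TargetLibraryInfo recognizes the call. A small standalone check of the real-number identities behind the fabs folds, using plain libm fabs as a stand-in and ignoring the NaN-aware ordered/unordered cases:

    #include <cmath>
    #include <cassert>

    int main() {
      const double xs[] = { -2.5, -0.0, 0.0, 3.0 };
      for (unsigned i = 0; i != sizeof(xs) / sizeof(xs[0]); ++i) {
        double x = xs[i];
        assert((std::fabs(x) > 0.0)  == (x != 0.0));  // fabs(x) >  0 -> x != 0
        assert((std::fabs(x) <= 0.0) == (x == 0.0));  // fabs(x) <= 0 -> x == 0
        assert(!(std::fabs(x) < 0.0));                // fabs(x) <  0 -> false
      }
      return 0;
    }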
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c485844aaeb4..4d106fc18853 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -14,13 +14,161 @@
#include "InstCombine.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
-STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
+
+/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable. This intentionally only accepts
+/// constant expressions because we can't rewrite arbitrary instructions.
+static bool pointsToConstantGlobal(Value *V) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ return GV->isConstant();
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::BitCast ||
+ CE->getOpcode() == Instruction::GetElementPtr)
+ return pointsToConstantGlobal(CE->getOperand(0));
+ return false;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
+/// pointer to an alloca. Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses. If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
+/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
+static bool
+isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+ SmallVectorImpl<Instruction *> &ToDelete,
+ bool IsOffset = false) {
+ // We track lifetime intrinsics as we encounter them. If we decide to go
+ // ahead and replace the value with the global, this lets the caller quickly
+ // eliminate the markers.
+
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
+ User *U = cast<Instruction>(*UI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ // Ignore non-volatile loads, they are always ok.
+ if (!LI->isSimple()) return false;
+ continue;
+ }
+
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ // If uses of the bitcast are ok, we are ok.
+ if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, ToDelete, IsOffset))
+ return false;
+ continue;
+ }
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ // If the GEP has all zero indices, it doesn't offset the pointer. If it
+ // doesn't, it does.
+ if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, ToDelete,
+ IsOffset || !GEP->hasAllZeroIndices()))
+ return false;
+ continue;
+ }
+
+ if (CallSite CS = U) {
+ // If this is the function being called then we treat it like a load and
+ // ignore it.
+ if (CS.isCallee(UI))
+ continue;
+
+ // If this is a readonly/readnone call site, then we know it is just a
+ // load (but one that potentially returns the value itself), so we can
+ // ignore it if we know that the value isn't captured.
+ unsigned ArgNo = CS.getArgumentNo(UI);
+ if (CS.onlyReadsMemory() &&
+ (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
+ continue;
+
+ // If this is being passed as a byval argument, the caller is making a
+ // copy, so it is only a read of the alloca.
+ if (CS.isByValArgument(ArgNo))
+ continue;
+ }
+
+ // Lifetime intrinsics can be handled by the caller.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end) {
+ assert(II->use_empty() && "Lifetime markers have no result to use!");
+ ToDelete.push_back(II);
+ continue;
+ }
+ }
+
+ // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
+ if (MI == 0)
+ return false;
+
+ // If the transfer is using the alloca as a source of the transfer, then
+ // ignore it since it is a load (unless the transfer is volatile).
+ if (UI.getOperandNo() == 1) {
+ if (MI->isVolatile()) return false;
+ continue;
+ }
+
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
+
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (IsOffset) return false;
+
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (UI.getOperandNo() != 0) return false;
+
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!pointsToConstantGlobal(MI->getSource()))
+ return false;
+
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
+ return true;
+}
+
+/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
+/// modified by a copy from a constant global. If we can prove this, we can
+/// replace any uses of the alloca with uses of the global directly.
+static MemTransferInst *
+isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ MemTransferInst *TheCopy = 0;
+ if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+ return TheCopy;
+ return 0;
+}
+
+/// getPointeeAlignment - Compute the minimum alignment of the value pointed
+/// to by the given pointer.
+static unsigned getPointeeAlignment(Value *V, const DataLayout &TD) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::BitCast ||
+ (CE->getOpcode() == Instruction::GetElementPtr &&
+ cast<GEPOperator>(CE)->hasAllZeroIndices()))
+ return getPointeeAlignment(CE->getOperand(0), TD);
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (!GV->isDeclaration())
+ return TD.getPreferredAlignment(GV);
+
+ if (PointerType *PT = dyn_cast<PointerType>(V->getType()))
+ if (PT->getElementType()->isSized())
+ return TD.getABITypeAlignment(PT->getElementType());
+
+ return 0;
+}
Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// Ensure that the alloca array size argument has type intptr_t, so that
@@ -99,12 +247,16 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
return &AI;
}
+ // If the alignment of the entry block alloca is 0 (unspecified),
+ // assign it the preferred alignment.
+ if (EntryAI->getAlignment() == 0)
+ EntryAI->setAlignment(
+ TD->getPrefTypeAlignment(EntryAI->getAllocatedType()));
// Replace this zero-sized alloca with the one at the start of the entry
// block after ensuring that the address will be aligned enough for both
// types.
- unsigned MaxAlign =
- std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()),
- TD->getPrefTypeAlignment(AI.getAllocatedType()));
+ unsigned MaxAlign = std::max(EntryAI->getAlignment(),
+ AI.getAlignment());
EntryAI->setAlignment(MaxAlign);
if (AI.getType() != EntryAI->getType())
return new BitCastInst(EntryAI, AI.getType());
@@ -113,6 +265,31 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
}
}
+ if (TD) {
+ // Check to see if this allocation is only modified by a memcpy/memmove from
+ // a constant global whose alignment is equal to or exceeds that of the
+ // allocation. If this is the case, we can change all users to use
+ // the constant global instead. This is commonly produced by the CFE by
+ // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+ // is only subsequently read.
+ SmallVector<Instruction *, 4> ToDelete;
+ if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
+ if (AI.getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) {
+ DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
+ EraseInstFromFunction(*ToDelete[i]);
+ Constant *TheSrc = cast<Constant>(Copy->getSource());
+ Instruction *NewI
+ = ReplaceInstUsesWith(AI, ConstantExpr::getBitCast(TheSrc,
+ AI.getType()));
+ EraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
+ }
+ }
+ }
+
// At last, use the generic allocation site handler to aggressively remove
// unused allocas.
return visitAllocSite(AI);
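
As an illustration of the new alloca fold, a hand-written IR sketch (not taken from the patch), assuming the constant global's alignment is at least the alloca's:

  ; before
  @C = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4], align 4
  declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
  define i32 @f(i32 %i) {
    %A = alloca [4 x i32], align 4
    %dst = bitcast [4 x i32]* %A to i8*
    call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst,
        i8* bitcast ([4 x i32]* @C to i8*), i64 16, i32 4, i1 false)
    %p = getelementptr [4 x i32]* %A, i32 0, i32 %i
    %v = load i32* %p
    ret i32 %v
  }

  ; after: the alloca and the memcpy are gone, reads go through @C
  define i32 @f(i32 %i) {
    %p = getelementptr [4 x i32]* @C, i32 0, i32 %i
    %v = load i32* %p
    ret i32 %v
  }
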
@@ -121,7 +298,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
/// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
- const TargetData *TD) {
+ const DataLayout *TD) {
User *CI = cast<User>(LI.getOperand(0));
Value *CastOp = CI->getOperand(0);
@@ -151,14 +328,14 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
SrcPTy = SrcTy->getElementType();
}
- if (IC.getTargetData() &&
+ if (IC.getDataLayout() &&
(SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() ||
SrcPTy->isVectorTy()) &&
// Do not allow turning this into a load of an integer, which is then
// casted to a pointer, this pessimizes pointer analysis a lot.
(SrcPTy->isPointerTy() == LI.getType()->isPointerTy()) &&
- IC.getTargetData()->getTypeSizeInBits(SrcPTy) ==
- IC.getTargetData()->getTypeSizeInBits(DestPTy)) {
+ IC.getDataLayout()->getTypeSizeInBits(SrcPTy) ==
+ IC.getDataLayout()->getTypeSizeInBits(DestPTy)) {
// Okay, we are casting from one integer or pointer type to another of
// the same size. Instead of casting the pointer before the load, cast
@@ -336,11 +513,11 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
// If the pointers point into different address spaces or if they point to
// values with different sizes, we can't do the transformation.
- if (!IC.getTargetData() ||
+ if (!IC.getDataLayout() ||
SrcTy->getAddressSpace() !=
cast<PointerType>(CI->getType())->getAddressSpace() ||
- IC.getTargetData()->getTypeSizeInBits(SrcPTy) !=
- IC.getTargetData()->getTypeSizeInBits(DestPTy))
+ IC.getDataLayout()->getTypeSizeInBits(SrcPTy) !=
+ IC.getDataLayout()->getTypeSizeInBits(DestPTy))
return 0;
// Okay, we are casting from one integer or pointer type to another of
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 35a0bbb76146..cefe45ec862c 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -37,7 +37,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))),
m_Value(B))) &&
// The "1" can be any value known to be a power of 2.
- isPowerOfTwo(PowerOf2, IC.getTargetData())) {
+ isPowerOfTwo(PowerOf2, IC.getDataLayout())) {
A = IC.Builder->CreateSub(A, B);
return IC.Builder->CreateShl(PowerOf2, A);
}
@@ -46,7 +46,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) {
// inexact. Similarly for <<.
if (BinaryOperator *I = dyn_cast<BinaryOperator>(V))
if (I->isLogicalShift() &&
- isPowerOfTwo(I->getOperand(0), IC.getTargetData())) {
+ isPowerOfTwo(I->getOperand(0), IC.getDataLayout())) {
// We know that this is an exact/nuw shift and that the input is a
// non-zero context as well.
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) {
@@ -462,12 +462,23 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
}
}
+ // (x lshr C1) udiv C2 --> x udiv (C2 << C1)
+ if (ConstantInt *C2 = dyn_cast<ConstantInt>(Op1)) {
+ Value *X;
+ ConstantInt *C1;
+ if (match(Op0, m_LShr(m_Value(X), m_ConstantInt(C1)))) {
+ APInt NC = C2->getValue().shl(C1->getLimitedValue(C1->getBitWidth()-1));
+ return BinaryOperator::CreateUDiv(X, Builder->getInt(NC));
+ }
+ }
+
// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
{ const APInt *CI; Value *N;
if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) ||
match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) {
if (*CI != 1)
- N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2()));
+ N = Builder->CreateAdd(N,
+ ConstantInt::get(N->getType(), CI->logBase2()));
if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
N = Builder->CreateZExt(N, Z->getDestTy());
if (I.isExact())
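
A quick hand-worked check of the new (x lshr C1) udiv C2 fold, with C1 = 2 and C2 = 3, assuming C2 << C1 does not overflow the type:

  ; before: computes floor(floor(x/4) / 3)
  %s = lshr i32 %x, 2
  %r = udiv i32 %s, 3

  ; after: computes floor(x/12), the same value for unsigned division
  %r = udiv i32 %x, 12
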
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 664546c16551..de9c77e6005a 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -13,7 +13,7 @@
#include "InstCombine.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/STLExtras.h"
using namespace llvm;
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 291e80019e8d..a2d4c888f2cf 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -287,7 +287,7 @@ Instruction *InstCombiner::FoldSelectIntoOp(SelectInst &SI, Value *TrueVal,
/// SimplifyWithOpReplaced - See if V simplifies when its operand Op is
/// replaced with RepOp.
static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI) {
// Trivial replacement.
if (V == Op)
@@ -333,6 +333,10 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp,
// All operands were constants, fold it.
if (ConstOps.size() == I->getNumOperands()) {
+ if (CmpInst *C = dyn_cast<CmpInst>(I))
+ return ConstantFoldCompareInstOperands(C->getPredicate(), ConstOps[0],
+ ConstOps[1], TD, TLI);
+
if (LoadInst *LI = dyn_cast<LoadInst>(I))
if (!LI->isVolatile())
return ConstantFoldLoadFromConstPtr(ConstOps[0], TD);
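
A hand-written sketch of what the added compare folding buys: the surrounding equality-select logic substitutes the compared constant into the selected arm and checks whether it simplifies to the other arm, which can now succeed when that arm is itself a compare:

  %c = icmp eq i32 %x, 7
  %d = icmp ult i32 %x, 10
  %s = select i1 %c, i1 %d, i1 true

  ; With %x replaced by 7, %d constant-folds to (icmp ult i32 7, 10) = true,
  ; which equals the false arm, so the whole select simplifies to true.
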
@@ -903,7 +907,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return &SI;
}
- if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) {
+ if (VectorType *VecTy = dyn_cast<VectorType>(SI.getType())) {
unsigned VWidth = VecTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
@@ -912,6 +916,28 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return ReplaceInstUsesWith(SI, V);
return &SI;
}
+
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(CondVal)) {
+ // Form a shufflevector instruction.
+ SmallVector<Constant *, 8> Mask(VWidth);
+ Type *Int32Ty = Type::getInt32Ty(CV->getContext());
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Constant *Elem = cast<Constant>(CV->getOperand(i));
+ if (ConstantInt *E = dyn_cast<ConstantInt>(Elem))
+ Mask[i] = ConstantInt::get(Int32Ty, i + (E->isZero() ? VWidth : 0));
+ else if (isa<UndefValue>(Elem))
+ Mask[i] = UndefValue::get(Int32Ty);
+ else
+ return 0;
+ }
+ Constant *MaskVal = ConstantVector::get(Mask);
+ Value *V = Builder->CreateShuffleVector(TrueVal, FalseVal, MaskVal);
+ return ReplaceInstUsesWith(SI, V);
+ }
+
+ if (isa<ConstantAggregateZero>(CondVal)) {
+ return ReplaceInstUsesWith(SI, FalseVal);
+ }
}
return 0;
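
For illustration, the select-to-shuffle rewrite above maps a true lane to element i of the true operand and a false lane to element i + VWidth of the false operand (hand-written sketch):

  ; before
  %s = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
              <4 x i32> %a, <4 x i32> %b

  ; after: lanes 0 and 2 come from %a, lanes 1 and 3 from %b
  %s = shufflevector <4 x i32> %a, <4 x i32> %b,
              <4 x i32> <i32 0, i32 5, i32 2, i32 7>

  ; an all-zero condition (zeroinitializer) simply yields %b
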
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 4bb2403299ce..57021f1bef84 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -190,7 +190,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
V = IC.Builder->CreateLShr(C, NumBits);
// If we got a constantexpr back, try to simplify it with TD info.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- V = ConstantFoldConstantExpression(CE, IC.getTargetData(),
+ V = ConstantFoldConstantExpression(CE, IC.getDataLayout(),
IC.getTargetLibraryInfo());
return V;
}
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 54be8ed3fa90..602b20337144 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -14,7 +14,7 @@
#include "InstCombine.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/IntrinsicInst.h"
using namespace llvm;
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index cf60f0f426dc..dd7ea14e8a89 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -636,8 +636,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// If LHS's width is changed, shift the mask value accordingly.
// If newRHS == NULL, i.e. LHSOp0 == RHSOp0, we want to remap any
- // references to RHSOp0 to LHSOp0, so we don't need to shift the mask.
- if (eltMask >= 0 && newRHS != NULL)
+ // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
+ // If newRHS == newLHS, we want to remap any references from newRHS to
+ // newLHS so that we can properly identify splats that may occur due to
+ // obfuscation across the two vectors.
+ if (eltMask >= 0 && newRHS != NULL && newLHS != newRHS)
eltMask += newLHSWidth;
}
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index 99a02fc0df3f..ea654ae9ed0a 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -26,8 +26,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombineWorklist {
SmallVector<Instruction*, 256> Worklist;
DenseMap<Instruction*, unsigned> WorklistMap;
- void operator=(const InstCombineWorklist&RHS); // DO NOT IMPLEMENT
- InstCombineWorklist(const InstCombineWorklist&); // DO NOT IMPLEMENT
+ void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION;
+ InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION;
public:
InstCombineWorklist() {}
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 68ecd516049d..9a46f25e66ff 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -40,7 +40,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/CFG.h"
@@ -88,7 +88,7 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
Value *InstCombiner::EmitGEPOffset(User *GEP) {
- return llvm::EmitGEPOffset(Builder, *getTargetData(), GEP);
+ return llvm::EmitGEPOffset(Builder, *getDataLayout(), GEP);
}
/// ShouldChangeType - Return true if it is desirable to convert a computation
@@ -805,6 +805,244 @@ static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
return true;
}
+/// Descale - Return a value X such that Val = X * Scale, or null if none. If
+/// the multiplication is known not to overflow then NoSignedWrap is set.
+Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
+ assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
+ assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
+ Scale.getBitWidth() && "Scale not compatible with value!");
+
+ // If Val is zero or Scale is one then Val = Val * Scale.
+ if (match(Val, m_Zero()) || Scale == 1) {
+ NoSignedWrap = true;
+ return Val;
+ }
+
+ // If Scale is zero then it does not divide Val.
+ if (Scale.isMinValue())
+ return 0;
+
+ // Look through chains of multiplications, searching for a constant that is
+ // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
+ // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
+ // a factor of 4 will produce X*(Y*2). The principle of operation is to bore
+ // down from Val:
+ //
+ // Val = M1 * X || Analysis starts here and works down
+ // M1 = M2 * Y || Doesn't descend into terms with more
+ // M2 = Z * 4 \/ than one use
+ //
+ // Then to modify a term at the bottom:
+ //
+ // Val = M1 * X
+ // M1 = Z * Y || Replaced M2 with Z
+ //
+ // Then to work back up correcting nsw flags.
+
+ // Op - the term we are currently analyzing. Starts at Val then drills down.
+ // Replaced with its descaled value before exiting from the drill down loop.
+ Value *Op = Val;
+
+ // Parent - initially null, but after drilling down notes where Op came from.
+ // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
+ // 0'th operand of Val.
+ std::pair<Instruction*, unsigned> Parent;
+
+ // RequireNoSignedWrap - Set if the transform requires a descaling at deeper
+ // levels that doesn't overflow.
+ bool RequireNoSignedWrap = false;
+
+ // logScale - log base 2 of the scale. Negative if not a power of 2.
+ int32_t logScale = Scale.exactLogBase2();
+
+ for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // If Op is a constant divisible by Scale then descale to the quotient.
+ APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
+ APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
+ if (!Remainder.isMinValue())
+ // Not divisible by Scale.
+ return 0;
+ // Replace with the quotient in the parent.
+ Op = ConstantInt::get(CI->getType(), Quotient);
+ NoSignedWrap = true;
+ break;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
+
+ if (BO->getOpcode() == Instruction::Mul) {
+ // Multiplication.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return 0;
+
+ // There are three cases for multiplication: multiplication by exactly
+ // the scale, multiplication by a constant different to the scale, and
+ // multiplication by something else.
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ // Multiplication by a constant.
+ if (CI->getValue() == Scale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+
+ // Otherwise drill down into the constant.
+ if (!Op->hasOneUse())
+ return 0;
+
+ Parent = std::make_pair(BO, 1);
+ continue;
+ }
+
+ // Multiplication by something else. Drill down into the left-hand side
+ // since that's where the reassociate pass puts the good stuff.
+ if (!Op->hasOneUse())
+ return 0;
+
+ Parent = std::make_pair(BO, 0);
+ continue;
+ }
+
+ if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(BO->getOperand(1))) {
+ // Multiplication by a power of 2.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return 0;
+
+ Value *LHS = BO->getOperand(0);
+ int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
+ getLimitedValue(Scale.getBitWidth());
+ // Op = LHS << Amt.
+
+ if (Amt == logScale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+ if (Amt < logScale || !Op->hasOneUse())
+ return 0;
+
+ // Multiplication by more than the scale. Reduce the multiplying amount
+ // by the scale in the parent.
+ Parent = std::make_pair(BO, 1);
+ Op = ConstantInt::get(BO->getType(), Amt - logScale);
+ break;
+ }
+ }
+
+ if (!Op->hasOneUse())
+ return 0;
+
+ if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getOpcode() == Instruction::SExt) {
+ // Op is sign-extended from a smaller type, descale in the smaller type.
+ unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ APInt SmallScale = Scale.trunc(SmallSize);
+ // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
+ // descale Op as (sext Y) * Scale. In order to have
+ // sext (Y * SmallScale) = (sext Y) * Scale
+ // some conditions need to hold however: SmallScale must sign-extend to
+ // Scale and the multiplication Y * SmallScale should not overflow.
+ if (SmallScale.sext(Scale.getBitWidth()) != Scale)
+ // SmallScale does not sign-extend to Scale.
+ return 0;
+ assert(SmallScale.exactLogBase2() == logScale);
+ // Require that Y * SmallScale must not overflow.
+ RequireNoSignedWrap = true;
+
+ // Drill down through the cast.
+ Parent = std::make_pair(Cast, 0);
+ Scale = SmallScale;
+ continue;
+ }
+
+ if (Cast->getOpcode() == Instruction::Trunc) {
+ // Op is truncated from a larger type, descale in the larger type.
+ // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
+ // trunc (Y * sext Scale) = (trunc Y) * Scale
+ // always holds. However (trunc Y) * Scale may overflow even if
+ // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
+ // from this point up in the expression (see later).
+ if (RequireNoSignedWrap)
+ return 0;
+
+ // Drill down through the cast.
+ unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ Parent = std::make_pair(Cast, 0);
+ Scale = Scale.sext(LargeSize);
+ if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
+ logScale = -1;
+ assert(Scale.exactLogBase2() == logScale);
+ continue;
+ }
+ }
+
+ // Unsupported expression, bail out.
+ return 0;
+ }
+
+ // We know that we can successfully descale, so from here on we can safely
+ // modify the IR. Op holds the descaled version of the deepest term in the
+ // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
+ // not to overflow.
+
+ if (!Parent.first)
+ // The expression only had one term.
+ return Op;
+
+ // Rewrite the parent using the descaled version of its operand.
+ assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
+ assert(Op != Parent.first->getOperand(Parent.second) &&
+ "Descaling was a no-op?");
+ Parent.first->setOperand(Parent.second, Op);
+ Worklist.Add(Parent.first);
+
+ // Now work back up the expression correcting nsw flags. The logic is based
+ // on the following observation: if X * Y is known not to overflow as a signed
+ // multiplication, and Y is replaced by a value Z with smaller absolute value,
+ // then X * Z will not overflow as a signed multiplication either. As we work
+ // our way up, having NoSignedWrap 'true' means that the descaled value at the
+ // current level has strictly smaller absolute value than the original.
+ Instruction *Ancestor = Parent.first;
+ do {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
+ // If the multiplication wasn't nsw then we can't say anything about the
+ // value of the descaled multiplication, and we have to clear nsw flags
+ // from this point on up.
+ bool OpNoSignedWrap = BO->hasNoSignedWrap();
+ NoSignedWrap &= OpNoSignedWrap;
+ if (NoSignedWrap != OpNoSignedWrap) {
+ BO->setHasNoSignedWrap(NoSignedWrap);
+ Worklist.Add(Ancestor);
+ }
+ } else if (Ancestor->getOpcode() == Instruction::Trunc) {
+ // The fact that the descaled input to the trunc has smaller absolute
+ // value than the original input doesn't tell us anything useful about
+ // the absolute values of the truncations.
+ NoSignedWrap = false;
+ }
+ assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
+ "Failed to keep proper track of nsw flags while drilling down?");
+
+ if (Ancestor == Val)
+ // Got to the top, all done!
+ return Val;
+
+ // Move up one level in the expression.
+ assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
+ Ancestor = Ancestor->use_back();
+ } while (1);
+}
+
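
A small hand-worked example of Descale (values chosen for illustration): descaling %v by 4 drills down the single-use multiplication chain to a constant divisible by the scale and rewrites that term in place, keeping nsw because the replacement constant has smaller absolute value:

  ; Descale(%v, 4) on
  %m = mul nsw i64 %z, 8
  %v = mul nsw i64 %m, %y
  ; rewrites the inner constant and returns %v, which now computes
  ; the old value divided by 4:
  %m = mul nsw i64 %z, 2
  %v = mul nsw i64 %m, %y
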
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
@@ -817,7 +1055,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// by multiples of a zero size type with zero.
if (TD) {
bool MadeChange = false;
- Type *IntPtrTy = TD->getIntPtrType(GEP.getContext());
+ Type *IntPtrTy = TD->getIntPtrType(GEP.getPointerOperandType());
gep_type_iterator GTI = gep_type_begin(GEP);
for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end();
@@ -836,7 +1074,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
Type *IndexTy = (*I)->getType();
- if (IndexTy != IntPtrTy && !IndexTy->isVectorTy()) {
+ if (IndexTy != IntPtrTy) {
// If we are using a wider index than needed for this platform, shrink
// it to what we need. If narrower, sign-extend it to what we need.
// This explicit cast can make subsequent optimizations more obvious.
@@ -855,7 +1093,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
return 0;
- // Note that if our source is a gep chain itself that we wait for that
+ // Note that if our source is a gep chain itself then we wait for that
// chain to be resolved before we perform this transformation. This
// avoids us creating a TON of code in some cases.
if (GEPOperator *SrcGEP =
@@ -987,63 +1225,74 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
// Transform things like:
+ // %V = mul i64 %N, 4
+ // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
+ // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
+ if (TD && ResElTy->isSized() && SrcElTy->isSized()) {
+ // Check that changing the type amounts to dividing the index by a scale
+ // factor.
+ uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
+ uint64_t SrcSize = TD->getTypeAllocSize(SrcElTy);
+ if (ResSize && SrcSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = SrcSize / ResSize;
+
+ // Earlier transforms ensure that the index has type IntPtrType, which
+ // considerably simplifies the logic by eliminating implicit casts.
+ assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+ "Index not cast to pointer width?");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Value *NewGEP = GEP.isInBounds() && NSW ?
+ Builder->CreateInBoundsGEP(StrippedPtr, NewIdx, GEP.getName()) :
+ Builder->CreateGEP(StrippedPtr, NewIdx, GEP.getName());
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return new BitCastInst(NewGEP, GEP.getType());
+ }
+ }
+ }
+
+ // Similarly, transform things like:
// getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
// (where tmp = 8*tmp2) into:
// getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
-
- if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) {
+ if (TD && ResElTy->isSized() && SrcElTy->isSized() &&
+ SrcElTy->isArrayTy()) {
+ // Check that changing to the array element type amounts to dividing the
+ // index by a scale factor.
+ uint64_t ResSize = TD->getTypeAllocSize(ResElTy);
uint64_t ArrayEltSize =
- TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
-
- // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We
- // allow either a mul, shift, or constant here.
- Value *NewIdx = 0;
- ConstantInt *Scale = 0;
- if (ArrayEltSize == 1) {
- NewIdx = GEP.getOperand(1);
- Scale = ConstantInt::get(cast<IntegerType>(NewIdx->getType()), 1);
- } else if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP.getOperand(1))) {
- NewIdx = ConstantInt::get(CI->getType(), 1);
- Scale = CI;
- } else if (Instruction *Inst =dyn_cast<Instruction>(GEP.getOperand(1))){
- if (Inst->getOpcode() == Instruction::Shl &&
- isa<ConstantInt>(Inst->getOperand(1))) {
- ConstantInt *ShAmt = cast<ConstantInt>(Inst->getOperand(1));
- uint32_t ShAmtVal = ShAmt->getLimitedValue(64);
- Scale = ConstantInt::get(cast<IntegerType>(Inst->getType()),
- 1ULL << ShAmtVal);
- NewIdx = Inst->getOperand(0);
- } else if (Inst->getOpcode() == Instruction::Mul &&
- isa<ConstantInt>(Inst->getOperand(1))) {
- Scale = cast<ConstantInt>(Inst->getOperand(1));
- NewIdx = Inst->getOperand(0);
+ TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType());
+ if (ResSize && ArrayEltSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = ArrayEltSize / ResSize;
+
+ // Earlier transforms ensure that the index has type IntPtrType, which
+ // considerably simplifies the logic by eliminating implicit casts.
+ assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) &&
+ "Index not cast to pointer width?");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Value *Off[2];
+ Off[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
+ Off[1] = NewIdx;
+ Value *NewGEP = GEP.isInBounds() && NSW ?
+ Builder->CreateInBoundsGEP(StrippedPtr, Off, GEP.getName()) :
+ Builder->CreateGEP(StrippedPtr, Off, GEP.getName());
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return new BitCastInst(NewGEP, GEP.getType());
}
}
-
- // If the index will be to exactly the right offset with the scale taken
- // out, perform the transformation. Note, we don't know whether Scale is
- // signed or not. We'll use unsigned version of division/modulo
- // operation after making sure Scale doesn't have the sign bit set.
- if (ArrayEltSize && Scale && Scale->getSExtValue() >= 0LL &&
- Scale->getZExtValue() % ArrayEltSize == 0) {
- Scale = ConstantInt::get(Scale->getType(),
- Scale->getZExtValue() / ArrayEltSize);
- if (Scale->getZExtValue() != 1) {
- Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(),
- false /*ZExt*/);
- NewIdx = Builder->CreateMul(NewIdx, C, "idxscale");
- }
-
- // Insert the new GEP instruction.
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(GEP.getContext()));
- Idx[1] = NewIdx;
- Value *NewGEP = GEP.isInBounds() ?
- Builder->CreateInBoundsGEP(StrippedPtr, Idx, GEP.getName()):
- Builder->CreateGEP(StrippedPtr, Idx, GEP.getName());
- // The NewGEP must be pointer typed, so must the old one -> BitCast
- return new BitCastInst(NewGEP, GEP.getType());
- }
}
}
}
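
Putting the pieces together, a hand-written sketch of the array case on a 64-bit target (@X is illustrative): the i8 GEP index is descaled by sizeof(double) = 8 and the result is bitcast back to the original pointer type:

  ; before
  %mul = mul i64 %n, 8
  %t = getelementptr i8* bitcast ([100 x double]* @X to i8*), i64 %mul

  ; after
  %t1 = getelementptr [100 x double]* @X, i32 0, i64 %n
  %t  = bitcast double* %t1 to i8*
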
@@ -1068,7 +1317,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// If the bitcast is of an allocation, and the allocation will be
// converted to match the type of the cast, don't touch this.
if (isa<AllocaInst>(BCI->getOperand(0)) ||
- isAllocationFn(BCI->getOperand(0))) {
+ isAllocationFn(BCI->getOperand(0), TLI)) {
// See if the bitcast simplifies, if so, don't nuke this GEP yet.
if (Instruction *I = visitBitCast(*BCI)) {
if (I != BCI) {
@@ -1107,7 +1356,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
static bool
-isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) {
+isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users,
+ const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 4> Worklist;
Worklist.push_back(AI);
@@ -1163,7 +1413,7 @@ isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) {
}
}
- if (isFreeCall(I)) {
+ if (isFreeCall(I, TLI)) {
Users.push_back(I);
continue;
}
@@ -1188,7 +1438,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
// to null and free calls, delete the calls and replace the comparisons with
// true or false as appropriate.
SmallVector<WeakVH, 64> Users;
- if (isAllocSiteRemovable(&MI, Users)) {
+ if (isAllocSiteRemovable(&MI, Users, TLI)) {
for (unsigned i = 0, e = Users.size(); i != e; ++i) {
Instruction *I = cast_or_null<Instruction>(&*Users[i]);
if (!I) continue;
@@ -1853,7 +2103,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
static bool AddReachableCodeToWorklist(BasicBlock *BB,
SmallPtrSet<BasicBlock*, 64> &Visited,
InstCombiner &IC,
- const TargetData *TD,
+ const DataLayout *TD,
const TargetLibraryInfo *TLI) {
bool MadeIRChange = false;
SmallVector<BasicBlock*, 256> Worklist;
@@ -1872,7 +2122,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB,
Instruction *Inst = BBI++;
// DCE instruction if trivially dead.
- if (isInstructionTriviallyDead(Inst)) {
+ if (isInstructionTriviallyDead(Inst, TLI)) {
++NumDeadInst;
DEBUG(errs() << "IC: DCE: " << *Inst << '\n');
Inst->eraseFromParent();
@@ -2002,7 +2252,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
if (I == 0) continue; // skip null values.
// Check to see if we can DCE the instruction.
- if (isInstructionTriviallyDead(I)) {
+ if (isInstructionTriviallyDead(I, TLI)) {
DEBUG(errs() << "IC: DCE: " << *I << '\n');
EraseInstFromFunction(*I);
++NumDeadInst;
@@ -2102,7 +2352,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
// If the instruction was modified, it's possible that it is now dead.
// if so, remove it.
- if (isInstructionTriviallyDead(I)) {
+ if (isInstructionTriviallyDead(I, TLI)) {
EraseInstFromFunction(*I);
} else {
Worklist.Add(I);
@@ -2117,9 +2367,27 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
return MadeIRChange;
}
+namespace {
+class InstCombinerLibCallSimplifier : public LibCallSimplifier {
+ InstCombiner *IC;
+public:
+ InstCombinerLibCallSimplifier(const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
+ InstCombiner *IC)
+ : LibCallSimplifier(TD, TLI) {
+ this->IC = IC;
+ }
+
+ /// replaceAllUsesWith - override so that instruction replacement
+ /// can be defined in terms of the instruction combiner framework.
+ virtual void replaceAllUsesWith(Instruction *I, Value *With) const {
+ IC->ReplaceInstUsesWith(*I, With);
+ }
+};
+}
bool InstCombiner::runOnFunction(Function &F) {
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
/// Builder - This is an IRBuilder that automatically inserts new
@@ -2129,6 +2397,9 @@ bool InstCombiner::runOnFunction(Function &F) {
InstCombineIRInserter(Worklist));
Builder = &TheBuilder;
+ InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
+ Simplifier = &TheSimplifier;
+
bool EverMadeChange = false;
// Lower dbg.declare intrinsics otherwise their value may be clobbered
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 17b83ceee194..b7be4625ca8d 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -15,7 +15,7 @@
#define DEBUG_TYPE "asan"
-#include "FunctionBlackList.h"
+#include "BlackList.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/InlineAsm.h"
@@ -35,7 +35,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/system_error.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -61,6 +61,8 @@ static const int kAsanCtorAndCtorPriority = 1;
static const char *kAsanReportErrorTemplate = "__asan_report_";
static const char *kAsanRegisterGlobalsName = "__asan_register_globals";
static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals";
+static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
+static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
static const char *kAsanInitName = "__asan_init";
static const char *kAsanHandleNoReturnName = "__asan_handle_no_return";
static const char *kAsanMappingOffsetName = "__asan_mapping_offset";
@@ -106,6 +108,8 @@ static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
// This flag may need to be replaced with -f[no]asan-globals.
static cl::opt<bool> ClGlobals("asan-globals",
cl::desc("Handle global objects"), cl::Hidden, cl::init(true));
+static cl::opt<bool> ClInitializers("asan-initialization-order",
+ cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
static cl::opt<bool> ClMemIntrin("asan-memintrin",
cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true));
// This flag may need to be replaced with -fasan-blacklist.
@@ -144,41 +148,33 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"),
cl::Hidden, cl::init(-1));
namespace {
-
-/// An object of this type is created while instrumenting every function.
-struct AsanFunctionContext {
- AsanFunctionContext(Function &Function) : F(Function) { }
-
- Function &F;
-};
-
/// AddressSanitizer: instrument the code in module to find memory bugs.
-struct AddressSanitizer : public ModulePass {
+struct AddressSanitizer : public FunctionPass {
AddressSanitizer();
virtual const char *getPassName() const;
- void instrumentMop(AsanFunctionContext &AFC, Instruction *I);
- void instrumentAddress(AsanFunctionContext &AFC,
- Instruction *OrigIns, IRBuilder<> &IRB,
+ void instrumentMop(Instruction *I);
+ void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB,
Value *Addr, uint32_t TypeSize, bool IsWrite);
Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
Value *ShadowValue, uint32_t TypeSize);
Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
bool IsWrite, size_t AccessSizeIndex);
- bool instrumentMemIntrinsic(AsanFunctionContext &AFC, MemIntrinsic *MI);
- void instrumentMemIntrinsicParam(AsanFunctionContext &AFC,
- Instruction *OrigIns, Value *Addr,
+ bool instrumentMemIntrinsic(MemIntrinsic *MI);
+ void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr,
Value *Size,
Instruction *InsertBefore, bool IsWrite);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
- bool handleFunction(Module &M, Function &F);
+ bool runOnFunction(Function &F);
+ void createInitializerPoisonCalls(Module &M,
+ Value *FirstAddr, Value *LastAddr);
bool maybeInsertAsanInitAtFunctionEntry(Function &F);
- bool poisonStackInFunction(Module &M, Function &F);
- virtual bool runOnModule(Module &M);
+ bool poisonStackInFunction(Function &F);
+ virtual bool doInitialization(Module &M);
+ virtual bool doFinalization(Module &M);
bool insertGlobalRedzones(Module &M);
static char ID; // Pass identification, replacement for typeid
private:
-
uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
Type *Ty = AI->getAllocatedType();
uint64_t SizeInBytes = TD->getTypeAllocSize(Ty);
@@ -194,12 +190,15 @@ struct AddressSanitizer : public ModulePass {
}
Function *checkInterfaceFunction(Constant *FuncOrBitcast);
+ bool ShouldInstrumentGlobal(GlobalVariable *G);
void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB,
Value *ShadowBase, bool DoPoison);
bool LooksLikeCodeInBug11395(Instruction *I);
+ void FindDynamicInitializers(Module &M);
+ bool HasDynamicInitializer(GlobalVariable *G);
LLVMContext *C;
- TargetData *TD;
+ DataLayout *TD;
uint64_t MappingOffset;
int MappingScale;
size_t RedzoneSize;
@@ -208,11 +207,15 @@ struct AddressSanitizer : public ModulePass {
Type *IntptrPtrTy;
Function *AsanCtorFunction;
Function *AsanInitFunction;
+ Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+ Function *AsanHandleNoReturnFunc;
Instruction *CtorInsertBefore;
- OwningPtr<FunctionBlackList> BL;
+ OwningPtr<BlackList> BL;
// This array is indexed by AccessIsWrite and log2(AccessSize).
Function *AsanErrorCallback[2][kNumberOfAccessSizes];
InlineAsm *EmptyAsm;
+ SmallSet<GlobalValue*, 32> DynamicallyInitializedGlobals;
+ SmallSet<GlobalValue*, 32> GlobalsCreatedByAsan;
};
} // namespace
@@ -221,8 +224,8 @@ char AddressSanitizer::ID = 0;
INITIALIZE_PASS(AddressSanitizer, "asan",
"AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
false, false)
-AddressSanitizer::AddressSanitizer() : ModulePass(ID) { }
-ModulePass *llvm::createAddressSanitizerPass() {
+AddressSanitizer::AddressSanitizer() : FunctionPass(ID) { }
+FunctionPass *llvm::createAddressSanitizerPass() {
return new AddressSanitizer();
}
@@ -243,38 +246,6 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) {
GlobalValue::PrivateLinkage, StrConst, "");
}
-// Split the basic block and insert an if-then code.
-// Before:
-// Head
-// Cmp
-// Tail
-// After:
-// Head
-// if (Cmp)
-// ThenBlock
-// Tail
-//
-// ThenBlock block is created and its terminator is returned.
-// If Unreachable, ThenBlock is terminated with UnreachableInst, otherwise
-// it is terminated with BranchInst to Tail.
-static TerminatorInst *splitBlockAndInsertIfThen(Value *Cmp, bool Unreachable) {
- Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode();
- BasicBlock *Head = SplitBefore->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
- TerminatorInst *HeadOldTerm = Head->getTerminator();
- LLVMContext &C = Head->getParent()->getParent()->getContext();
- BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- TerminatorInst *CheckTerm;
- if (Unreachable)
- CheckTerm = new UnreachableInst(C, ThenBlock);
- else
- CheckTerm = BranchInst::Create(Tail, ThenBlock);
- BranchInst *HeadNewTerm =
- BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp);
- ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
- return CheckTerm;
-}
-
Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
// Shadow >> scale
Shadow = IRB.CreateLShr(Shadow, MappingScale);
@@ -286,12 +257,12 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
}
void AddressSanitizer::instrumentMemIntrinsicParam(
- AsanFunctionContext &AFC, Instruction *OrigIns,
+ Instruction *OrigIns,
Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) {
// Check the first byte.
{
IRBuilder<> IRB(InsertBefore);
- instrumentAddress(AFC, OrigIns, IRB, Addr, 8, IsWrite);
+ instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite);
}
// Check the last byte.
{
@@ -301,13 +272,12 @@ void AddressSanitizer::instrumentMemIntrinsicParam(
SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false);
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne);
- instrumentAddress(AFC, OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite);
+ instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite);
}
}
// Instrument memset/memmove/memcpy
-bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC,
- MemIntrinsic *MI) {
+bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
Value *Dst = MI->getDest();
MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI);
Value *Src = MemTran ? MemTran->getSource() : 0;
@@ -323,12 +293,12 @@ bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC,
Value *Cmp = IRB.CreateICmpNE(Length,
Constant::getNullValue(Length->getType()));
- InsertBefore = splitBlockAndInsertIfThen(Cmp, false);
+ InsertBefore = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
}
- instrumentMemIntrinsicParam(AFC, MI, Dst, Length, InsertBefore, true);
+ instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true);
if (Src)
- instrumentMemIntrinsicParam(AFC, MI, Src, Length, InsertBefore, false);
+ instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false);
return true;
}
@@ -358,14 +328,50 @@ static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) {
return NULL;
}
-void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) {
- bool IsWrite;
+void AddressSanitizer::FindDynamicInitializers(Module& M) {
+ // Clang generates metadata identifying all dynamically initialized globals.
+ NamedMDNode *DynamicGlobals =
+ M.getNamedMetadata("llvm.asan.dynamically_initialized_globals");
+ if (!DynamicGlobals)
+ return;
+ for (int i = 0, n = DynamicGlobals->getNumOperands(); i < n; ++i) {
+ MDNode *MDN = DynamicGlobals->getOperand(i);
+ assert(MDN->getNumOperands() == 1);
+ Value *VG = MDN->getOperand(0);
+ // The optimizer may optimize away a global entirely, in which case we
+ // cannot instrument access to it.
+ if (!VG)
+ continue;
+
+ GlobalVariable *G = cast<GlobalVariable>(VG);
+ DynamicallyInitializedGlobals.insert(G);
+ }
+}
+// Returns true if a global variable is initialized dynamically in this TU.
+bool AddressSanitizer::HasDynamicInitializer(GlobalVariable *G) {
+ return DynamicallyInitializedGlobals.count(G);
+}
+
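
For reference, the metadata this walks is expected to look roughly like the following (reconstructed from how the pass reads it, 3.2-era syntax; one MDNode per dynamically initialized global):

  @g = global i32 0
  !llvm.asan.dynamically_initialized_globals = !{!0}
  !0 = metadata !{i32* @g}
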
+void AddressSanitizer::instrumentMop(Instruction *I) {
+ bool IsWrite = false;
Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
assert(Addr);
- if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) {
- // We are accessing a global scalar variable. Nothing to catch here.
- return;
+ if (ClOpt && ClOptGlobals) {
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
+ // If initialization order checking is disabled, a simple access to a
+ // dynamically initialized global is always valid.
+ if (!ClInitializers)
+ return;
+ // If a global variable does not have dynamic initialization we don't
+ // have to instrument it. However, if a global has external linkage, we
+ // assume it has dynamic initialization, as it may have an initializer
+ // in a different TU.
+ if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
+ !HasDynamicInitializer(G))
+ return;
+ }
}
+
Type *OrigPtrTy = Addr->getType();
Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
@@ -379,7 +385,7 @@ void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) {
}
IRBuilder<> IRB(I);
- instrumentAddress(AFC, I, IRB, Addr, TypeSize, IsWrite);
+ instrumentAddress(I, IRB, Addr, TypeSize, IsWrite);
}
// Validate the result of Module::getOrInsertFunction called for an interface
@@ -424,8 +430,7 @@ Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
}
-void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC,
- Instruction *OrigIns,
+void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
IRBuilder<> &IRB, Value *Addr,
uint32_t TypeSize, bool IsWrite) {
Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
@@ -444,17 +449,19 @@ void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC,
TerminatorInst *CrashTerm = 0;
if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
- TerminatorInst *CheckTerm = splitBlockAndInsertIfThen(Cmp, false);
+ TerminatorInst *CheckTerm =
+ SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false);
assert(dyn_cast<BranchInst>(CheckTerm)->isUnconditional());
BasicBlock *NextBB = CheckTerm->getSuccessor(0);
IRB.SetInsertPoint(CheckTerm);
Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
- BasicBlock *CrashBlock = BasicBlock::Create(*C, "", &AFC.F, NextBB);
+ BasicBlock *CrashBlock =
+ BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
CrashTerm = new UnreachableInst(*C, CrashBlock);
BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
ReplaceInstWithInst(CheckTerm, NewTerm);
} else {
- CrashTerm = splitBlockAndInsertIfThen(Cmp, true);
+ CrashTerm = SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), true);
}
Instruction *Crash =
@@ -462,68 +469,108 @@ void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC,
Crash->setDebugLoc(OrigIns->getDebugLoc());
}
+void AddressSanitizer::createInitializerPoisonCalls(Module &M,
+ Value *FirstAddr,
+ Value *LastAddr) {
+ // We do all of our poisoning and unpoisoning within _GLOBAL__I_a.
+ Function *GlobalInit = M.getFunction("_GLOBAL__I_a");
+ // If that function is not present, this TU contains no globals, or they have
+ // all been optimized away.
+ if (!GlobalInit)
+ return;
+
+ // Set up the arguments to our poison/unpoison functions.
+ IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt());
+
+ // Declare our poisoning and unpoisoning functions.
+ Function *AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+ AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
+ Function *AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL));
+ AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
+
+ // Add a call to poison all external globals before the given function starts.
+ IRB.CreateCall2(AsanPoisonGlobals, FirstAddr, LastAddr);
+
+ // Add calls to unpoison all globals before each return instruction.
+ for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end();
+ I != E; ++I) {
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator())) {
+ CallInst::Create(AsanUnpoisonGlobals, "", RI);
+ }
+ }
+}
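
The net effect on the module initializer is roughly the following (hand-written sketch; @first and @last stand for the first and last instrumented dynamically initialized globals, and the @__cxx_global_var_init call is an assumption about typical Clang output):

  @first = global i32 0
  @last  = global i32 0
  declare void @__asan_before_dynamic_init(i64, i64)
  declare void @__asan_after_dynamic_init()
  declare void @__cxx_global_var_init()

  define internal void @_GLOBAL__I_a() {
    call void @__asan_before_dynamic_init(
        i64 ptrtoint (i32* @first to i64),
        i64 ptrtoint (i32* @last to i64))
    call void @__cxx_global_var_init()
    call void @__asan_after_dynamic_init()
    ret void
  }
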
+
+bool AddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
+ Type *Ty = cast<PointerType>(G->getType())->getElementType();
+ DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+
+ if (BL->isIn(*G)) return false;
+ if (!Ty->isSized()) return false;
+ if (!G->hasInitializer()) return false;
+ if (GlobalsCreatedByAsan.count(G)) return false; // Our own global.
+ // Touch only those globals that will not be defined in other modules.
+ // Don't handle ODR type linkages since other modules may be built w/o asan.
+ if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
+ G->getLinkage() != GlobalVariable::PrivateLinkage &&
+ G->getLinkage() != GlobalVariable::InternalLinkage)
+ return false;
+ // Two problems with thread-locals:
+ // - The address of the main thread's copy can't be computed at link-time.
+ // - Need to poison all copies, not just the main thread's one.
+ if (G->isThreadLocal())
+ return false;
+ // For now, just ignore this global if the alignment is large.
+ if (G->getAlignment() > RedzoneSize) return false;
+
+ // Ignore all the globals with the names starting with "\01L_OBJC_".
+ // Many of those are put into the .cstring section. The linker compresses
+ // that section by removing the spare \0s after the string terminator, so
+ // our redzones get broken.
+ if ((G->getName().find("\01L_OBJC_") == 0) ||
+ (G->getName().find("\01l_OBJC_") == 0)) {
+ DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G);
+ return false;
+ }
+
+ if (G->hasSection()) {
+ StringRef Section(G->getSection());
+ // Ignore the globals from the __OBJC section. The ObjC runtime assumes
+ // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
+ // them.
+ if ((Section.find("__OBJC,") == 0) ||
+ (Section.find("__DATA, __objc_") == 0)) {
+ DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G);
+ return false;
+ }
+ // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
+ // Constant CFString instances are compiled in the following way:
+ // -- the string buffer is emitted into
+ // __TEXT,__cstring,cstring_literals
+ // -- the constant NSConstantString structure referencing that buffer
+ // is placed into __DATA,__cfstring
+ // Therefore there's no point in placing redzones into __DATA,__cfstring.
+ // Moreover, it causes the linker to crash on OS X 10.7
+ if (Section.find("__DATA,__cfstring") == 0) {
+ DEBUG(dbgs() << "Ignoring CFString: " << *G);
+ return false;
+ }
+ }
+
+ return true;
+}
+
// This function replaces all global variables with new variables that have
// trailing redzones. It also creates a function that poisons
// redzones and inserts this function into llvm.global_ctors.
bool AddressSanitizer::insertGlobalRedzones(Module &M) {
SmallVector<GlobalVariable *, 16> GlobalsToChange;
- for (Module::GlobalListType::iterator G = M.getGlobalList().begin(),
- E = M.getGlobalList().end(); G != E; ++G) {
- Type *Ty = cast<PointerType>(G->getType())->getElementType();
- DEBUG(dbgs() << "GLOBAL: " << *G);
-
- if (!Ty->isSized()) continue;
- if (!G->hasInitializer()) continue;
- // Touch only those globals that will not be defined in other modules.
- // Don't handle ODR type linkages since other modules may be built w/o asan.
- if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
- G->getLinkage() != GlobalVariable::PrivateLinkage &&
- G->getLinkage() != GlobalVariable::InternalLinkage)
- continue;
- // Two problems with thread-locals:
- // - The address of the main thread's copy can't be computed at link-time.
- // - Need to poison all copies, not just the main thread's one.
- if (G->isThreadLocal())
- continue;
- // For now, just ignore this Alloca if the alignment is large.
- if (G->getAlignment() > RedzoneSize) continue;
-
- // Ignore all the globals with the names starting with "\01L_OBJC_".
- // Many of those are put into the .cstring section. The linker compresses
- // that section by removing the spare \0s after the string terminator, so
- // our redzones get broken.
- if ((G->getName().find("\01L_OBJC_") == 0) ||
- (G->getName().find("\01l_OBJC_") == 0)) {
- DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G);
- continue;
- }
-
- if (G->hasSection()) {
- StringRef Section(G->getSection());
- // Ignore the globals from the __OBJC section. The ObjC runtime assumes
- // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
- // them.
- if ((Section.find("__OBJC,") == 0) ||
- (Section.find("__DATA, __objc_") == 0)) {
- DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G);
- continue;
- }
- // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
- // Constant CFString instances are compiled in the following way:
- // -- the string buffer is emitted into
- // __TEXT,__cstring,cstring_literals
- // -- the constant NSConstantString structure referencing that buffer
- // is placed into __DATA,__cfstring
- // Therefore there's no point in placing redzones into __DATA,__cfstring.
- // Moreover, it causes the linker to crash on OS X 10.7
- if (Section.find("__DATA,__cfstring") == 0) {
- DEBUG(dbgs() << "Ignoring CFString: " << *G);
- continue;
- }
- }
-
- GlobalsToChange.push_back(G);
+ for (Module::GlobalListType::iterator G = M.global_begin(),
+ E = M.global_end(); G != E; ++G) {
+ if (ShouldInstrumentGlobal(G))
+ GlobalsToChange.push_back(G);
}
size_t n = GlobalsToChange.size();
@@ -534,13 +581,22 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
// size_t size;
// size_t size_with_redzone;
// const char *name;
+ // size_t has_dynamic_init;
// We initialize an array of such structures and pass it to a run-time call.
StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, NULL);
- SmallVector<Constant *, 16> Initializers(n);
+ IntptrTy, IntptrTy,
+ IntptrTy, NULL);
+ SmallVector<Constant *, 16> Initializers(n), DynamicInit;
IRBuilder<> IRB(CtorInsertBefore);
+ if (ClInitializers)
+ FindDynamicInitializers(M);
+
+ // The addresses of the first and last dynamically initialized globals in
+ // this TU. Used in initialization order checking.
+ Value *FirstDynamic = 0, *LastDynamic = 0;
+
for (size_t i = 0; i < n; i++) {
GlobalVariable *G = GlobalsToChange[i];
PointerType *PtrTy = cast<PointerType>(G->getType());
@@ -549,6 +605,10 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
uint64_t RightRedzoneSize = RedzoneSize +
(RedzoneSize - (SizeInBytes % RedzoneSize));
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
+ // Determine whether this global should be poisoned in initialization.
+ bool GlobalHasDynamicInitializer = HasDynamicInitializer(G);
+ // Don't check initialization order if this global is blacklisted.
+ GlobalHasDynamicInitializer &= !BL->isInInit(*G);
StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL);
Constant *NewInitializer = ConstantStruct::get(
@@ -583,8 +643,17 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
ConstantInt::get(IntptrTy, SizeInBytes),
ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
ConstantExpr::getPointerCast(Name, IntptrTy),
+ ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer),
NULL);
- DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal);
+
+ // Populate the first and last globals declared in this TU.
+ if (ClInitializers && GlobalHasDynamicInitializer) {
+ LastDynamic = ConstantExpr::getPointerCast(NewGlobal, IntptrTy);
+ if (FirstDynamic == 0)
+ FirstDynamic = LastDynamic;
+ }
+
+ DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
}
ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n);
@@ -592,8 +661,13 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage,
ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
+ // Create calls for poisoning before initializers run and unpoisoning after.
+ if (ClInitializers && FirstDynamic && LastDynamic)
+ createInitializerPoisonCalls(M, FirstDynamic, LastDynamic);
+
Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+ kAsanRegisterGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, NULL));
AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
IRB.CreateCall2(AsanRegisterGlobals,
@@ -623,12 +697,13 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) {
}
// virtual
-bool AddressSanitizer::runOnModule(Module &M) {
+bool AddressSanitizer::doInitialization(Module &M) {
// Initialize the private fields. No one has accessed them before.
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
+
if (!TD)
return false;
- BL.reset(new FunctionBlackList(ClBlackListFile));
+ BL.reset(new BlackList(ClBlackListFile));
C = &(M.getContext());
LongSize = TD->getPointerSizeInBits();
@@ -656,17 +731,27 @@ bool AddressSanitizer::runOnModule(Module &M) {
std::string FunctionName = std::string(kAsanReportErrorTemplate) +
(AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex);
// If we are merging crash callbacks, they have two parameters.
- AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>(
- M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
+ AsanErrorCallback[AccessIsWrite][AccessSizeIndex] =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
}
}
+
+ AsanStackMallocFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL));
+ AsanStackFreeFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanStackFreeName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy, NULL));
+ AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
+ kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+
// We insert an empty inline asm after __asan_report* to avoid callback merge.
EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
StringRef(""), StringRef(""),
/*hasSideEffects=*/true);
llvm::Triple targetTriple(M.getTargetTriple());
- bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI;
+ bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android;
MappingOffset = isAndroid ? kDefaultShadowOffsetAndroid :
(LongSize == 32 ? kDefaultShadowOffset32 : kDefaultShadowOffset64);
@@ -686,10 +771,6 @@ bool AddressSanitizer::runOnModule(Module &M) {
// For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
RedzoneSize = std::max(32, (int)(1 << MappingScale));
- bool Res = false;
-
- if (ClGlobals)
- Res |= insertGlobalRedzones(M);
if (ClMappingOffsetLog >= 0) {
// Tell the run-time the current values of mapping offset and scale.
@@ -709,17 +790,20 @@ bool AddressSanitizer::runOnModule(Module &M) {
IRB.CreateLoad(asan_mapping_scale, true);
}
-
- for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
- if (F->isDeclaration()) continue;
- Res |= handleFunction(M, *F);
- }
-
appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndCtorPriority);
- return Res;
+ return true;
+}
+
+bool AddressSanitizer::doFinalization(Module &M) {
+ // We transform the globals at the very end so that the optimization analysis
+ // works on the original globals.
+ if (ClGlobals)
+ return insertGlobalRedzones(M);
+ return false;
}
+
bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
// For each NSObject descendant having a +load method, this method is invoked
// by the ObjC runtime before any of the static constructors is called.
@@ -736,19 +820,22 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
return false;
}
-bool AddressSanitizer::handleFunction(Module &M, Function &F) {
+bool AddressSanitizer::runOnFunction(Function &F) {
if (BL->isIn(F)) return false;
if (&F == AsanCtorFunction) return false;
+ DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
// If needed, insert __asan_init before checking for AddressSafety attr.
maybeInsertAsanInitAtFunctionEntry(F);
- if (!F.hasFnAttr(Attribute::AddressSafety)) return false;
+ if (!F.getFnAttributes().hasAttribute(Attributes::AddressSafety))
+ return false;
if (!ClDebugFunc.empty() && ClDebugFunc != F.getName())
return false;
- // We want to instrument every address only once per basic block
- // (unless there are calls between uses).
+
+ // We want to instrument every address only once per basic block (unless there
+ // are calls between uses).
SmallSet<Value*, 16> TempsToInstrument;
SmallVector<Instruction*, 16> ToInstrument;
SmallVector<Instruction*, 8> NoReturnCalls;
@@ -786,8 +873,6 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) {
}
}
- AsanFunctionContext AFC(F);
-
// Instrument.
int NumInstrumented = 0;
for (size_t i = 0, n = ToInstrument.size(); i != n; i++) {
@@ -795,25 +880,23 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) {
if (ClDebugMin < 0 || ClDebugMax < 0 ||
(NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
if (isInterestingMemoryAccess(Inst, &IsWrite))
- instrumentMop(AFC, Inst);
+ instrumentMop(Inst);
else
- instrumentMemIntrinsic(AFC, cast<MemIntrinsic>(Inst));
+ instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
}
NumInstrumented++;
}
- DEBUG(dbgs() << F);
-
- bool ChangedStack = poisonStackInFunction(M, F);
+ bool ChangedStack = poisonStackInFunction(F);
// We must unpoison the stack before every NoReturn call (throw, _exit, etc).
// See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37
for (size_t i = 0, n = NoReturnCalls.size(); i != n; i++) {
Instruction *CI = NoReturnCalls[i];
IRBuilder<> IRB(CI);
- IRB.CreateCall(M.getOrInsertFunction(kAsanHandleNoReturnName,
- IRB.getVoidTy(), NULL));
+ IRB.CreateCall(AsanHandleNoReturnFunc);
}
+ DEBUG(dbgs() << "ASAN done instrumenting:\n" << F << "\n");
return NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty();
}
@@ -926,7 +1009,7 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
// compiler hoists the load of the shadow value somewhere too high.
// This causes asan to report a non-existing bug on 453.povray.
// It sounds like an LLVM bug.
-bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) {
+bool AddressSanitizer::poisonStackInFunction(Function &F) {
if (!ClStack) return false;
SmallVector<AllocaInst*, 16> AllocaVec;
SmallVector<Instruction*, 8> RetVec;
@@ -976,8 +1059,6 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) {
Value *LocalStackBase = OrigStackBase;
if (DoStackMalloc) {
- Value *AsanStackMallocFunc = M.getOrInsertFunction(
- kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL);
LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc,
ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase);
}
@@ -1012,22 +1093,16 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) {
Value *BasePlus1 = IRB.CreateAdd(LocalStackBase,
ConstantInt::get(IntptrTy, LongSize/8));
BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy);
- Value *Description = IRB.CreatePointerCast(
- createPrivateGlobalForString(M, StackDescription.str()),
- IntptrTy);
+ GlobalVariable *StackDescriptionGlobal =
+ createPrivateGlobalForString(*F.getParent(), StackDescription.str());
+ GlobalsCreatedByAsan.insert(StackDescriptionGlobal);
+ Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
IRB.CreateStore(Description, BasePlus1);
// Poison the stack redzones at the entry.
Value *ShadowBase = memToShadow(LocalStackBase, IRB);
PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRB, ShadowBase, true);
- Value *AsanStackFreeFunc = NULL;
- if (DoStackMalloc) {
- AsanStackFreeFunc = M.getOrInsertFunction(
- kAsanStackFreeName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy, NULL);
- }
-
// Unpoison the stack before all ret instructions.
for (size_t i = 0, n = RetVec.size(); i < n; i++) {
Instruction *Ret = RetVec[i];
@@ -1046,6 +1121,10 @@ bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) {
}
}
+ // We are done. Remove the old unused alloca instructions.
+ for (size_t i = 0, n = AllocaVec.size(); i < n; i++)
+ AllocaVec[i]->eraseFromParent();
+
if (ClDebugStack) {
DEBUG(dbgs() << F);
}
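
For orientation, the per-global descriptor built above now carries five pointer-sized fields, with the new has_dynamic_init flag last, and the address and element count of the descriptor array are passed to __asan_register_globals. A minimal sketch of that layout, with illustrative field names (the pass itself only emits anonymous IntptrTy members):

    // Sketch only: the field names are not from the source, but the order
    // mirrors the five IntptrTy members of GlobalStructTy above.
    #include <stdint.h>

    struct GlobalDescriptorSketch {
      uintptr_t beg;               // address of the instrumented global
      uintptr_t size;              // original size in bytes
      uintptr_t size_with_redzone; // size including the right redzone
      uintptr_t name;              // address of the name string
      uintptr_t has_dynamic_init;  // non-zero if it has a dynamic initializer
    };

    // As declared by the pass, the runtime hook takes two pointer-sized
    // integers: the array address and the number of descriptors.
    extern "C" void __asan_register_globals(uintptr_t descriptors, uintptr_t n);

With ClInitializers enabled, FirstDynamic and LastDynamic bracket the descriptors whose has_dynamic_init field is set, which is what createInitializerPoisonCalls uses for the init-order checks.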
diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp
new file mode 100644
index 000000000000..ef34b8a56d88
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BlackList.cpp
@@ -0,0 +1,105 @@
+//===-- BlackList.cpp - blacklist for sanitizers --------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class for instrumentation passes (like AddressSanitizer
+// or ThreadSanitizer) to avoid instrumenting some functions or global
+// variables based on a user-supplied blacklist.
+//
+//===----------------------------------------------------------------------===//
+
+#include <utility>
+#include <string>
+
+#include "BlackList.h"
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Module.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+
+namespace llvm {
+
+BlackList::BlackList(const StringRef Path) {
+ // Validate and open blacklist file.
+ if (!Path.size()) return;
+ OwningPtr<MemoryBuffer> File;
+ if (error_code EC = MemoryBuffer::getFile(Path, File)) {
+ report_fatal_error("Can't open blacklist file: " + Path + ": " +
+ EC.message());
+ }
+
+ // Iterate through each line in the blacklist file.
+ SmallVector<StringRef, 16> Lines;
+ SplitString(File.take()->getBuffer(), Lines, "\n\r");
+ StringMap<std::string> Regexps;
+ for (SmallVector<StringRef, 16>::iterator I = Lines.begin(), E = Lines.end();
+ I != E; ++I) {
+ // Ignore empty lines and lines starting with "#"
+ if (I->empty() || I->startswith("#"))
+ continue;
+ // Get our prefix and unparsed regexp.
+ std::pair<StringRef, StringRef> SplitLine = I->split(":");
+ StringRef Prefix = SplitLine.first;
+ std::string Regexp = SplitLine.second;
+
+ // Replace * with .*
+ for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
+ pos += strlen(".*")) {
+ Regexp.replace(pos, strlen("*"), ".*");
+ }
+
+ // Check that the regexp is valid.
+ Regex CheckRE(Regexp);
+ std::string Error;
+ if (!CheckRE.isValid(Error)) {
+ report_fatal_error("malformed blacklist regex: " + SplitLine.second +
+ ": " + Error);
+ }
+
+ // Add this regexp into the proper group by its prefix.
+ if (Regexps[Prefix].size())
+ Regexps[Prefix] += "|";
+ Regexps[Prefix] += Regexp;
+ }
+
+ // Iterate through each of the prefixes and create a Regex for each.
+ for (StringMap<std::string>::iterator I = Regexps.begin(), E = Regexps.end();
+ I != E; ++I) {
+ Entries[I->getKey()] = new Regex(I->getValue());
+ }
+}
+
+bool BlackList::isIn(const Function &F) {
+ return isIn(*F.getParent()) || inSection("fun", F.getName());
+}
+
+bool BlackList::isIn(const GlobalVariable &G) {
+ return isIn(*G.getParent()) || inSection("global", G.getName());
+}
+
+bool BlackList::isIn(const Module &M) {
+ return inSection("src", M.getModuleIdentifier());
+}
+
+bool BlackList::isInInit(const GlobalVariable &G) {
+ return isIn(*G.getParent()) || inSection("global-init", G.getName());
+}
+
+bool BlackList::inSection(const StringRef Section,
+ const StringRef Query) {
+ Regex *FunctionRegex = Entries[Section];
+ return FunctionRegex ? FunctionRegex->match(Query) : false;
+}
+
+} // namespace llvm
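
A brief usage sketch of the parser above, assuming a blacklist file in the format described in BlackList.h; the file name and entries are made up for illustration. Every '*' in an entry becomes '.*', and all entries sharing a prefix are OR-ed into a single llvm::Regex stored in Entries:

    // Hypothetical blacklist file "asan_ignores.txt":
    //   fun:*_ZN4base6subtle*
    //   global:g_counters*
    // After parsing, the "fun" group is the single regex ".*_ZN4base6subtle.*".
    #include "BlackList.h"
    #include "llvm/Function.h"

    static bool shouldSkip(const llvm::Function &F) {
      static llvm::BlackList BL("asan_ignores.txt"); // illustrative path
      return BL.isIn(F); // true if F or its source file matches
    }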
diff --git a/lib/Transforms/Instrumentation/BlackList.h b/lib/Transforms/Instrumentation/BlackList.h
new file mode 100644
index 000000000000..f3c05a5058cc
--- /dev/null
+++ b/lib/Transforms/Instrumentation/BlackList.h
@@ -0,0 +1,57 @@
+//===-- BlackList.h - blacklist for sanitizers ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+//
+// This is a utility class for instrumentation passes (like AddressSanitizer
+// or ThreadSanitizer) to avoid instrumenting some functions or global
+// variables based on a user-supplied blacklist.
+//
+// The blacklist disables instrumentation of various functions and global
+// variables. Each line contains a prefix, followed by a wild card expression.
+// Empty lines and lines starting with "#" are ignored.
+// ---
+// # Blacklisted items:
+// fun:*_ZN4base6subtle*
+// global:*global_with_bad_access_or_initialization*
+// global-init:*global_with_initialization_issues*
+// src:file_with_tricky_code.cc
+// ---
+// Note that the wild card is in fact an llvm::Regex, but * is automatically
+// replaced with .*
+// This is similar to the "ignore" feature of ThreadSanitizer.
+// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
+//
+//===----------------------------------------------------------------------===//
+//
+
+#include "llvm/ADT/StringMap.h"
+
+namespace llvm {
+class Function;
+class GlobalVariable;
+class Module;
+class Regex;
+class StringRef;
+
+class BlackList {
+ public:
+ BlackList(const StringRef Path);
+ // Returns whether this function or its source file is blacklisted.
+ bool isIn(const Function &F);
+ // Returns whether this global or its source file is blacklisted.
+ bool isIn(const GlobalVariable &G);
+ // Returns whether this module is blacklisted by filename.
+ bool isIn(const Module &M);
+ // Returns whether a global should be excluded from initialization checking.
+ bool isInInit(const GlobalVariable &G);
+ private:
+ StringMap<Regex*> Entries;
+
+ bool inSection(const StringRef Section, const StringRef Query);
+};
+
+} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 09e0f1445126..7810b1b8a3ef 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -23,7 +23,8 @@
#include "llvm/Support/InstIterator.h"
#include "llvm/Support/TargetFolder.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Instrumentation.h"
using namespace llvm;
@@ -47,11 +48,13 @@ namespace {
virtual bool runOnFunction(Function &F);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetData>();
+ AU.addRequired<DataLayout>();
+ AU.addRequired<TargetLibraryInfo>();
}
private:
- const TargetData *TD;
+ const DataLayout *TD;
+ const TargetLibraryInfo *TLI;
ObjectSizeOffsetEvaluator *ObjSizeEval;
BuilderTy *Builder;
Instruction *Inst;
@@ -140,7 +143,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
Value *Offset = SizeOffset.second;
ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
- IntegerType *IntTy = TD->getIntPtrType(Inst->getContext());
+ Type *IntTy = TD->getIntPtrType(Ptr->getType());
Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
// three checks are required to ensure safety:
@@ -165,12 +168,13 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
}
bool BoundsChecking::runOnFunction(Function &F) {
- TD = &getAnalysis<TargetData>();
+ TD = &getAnalysis<DataLayout>();
+ TLI = &getAnalysis<TargetLibraryInfo>();
TrapBB = 0;
BuilderTy TheBuilder(F.getContext(), TargetFolder(TD));
Builder = &TheBuilder;
- ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext());
+ ObjectSizeOffsetEvaluator TheObjSizeEval(TD, TLI, F.getContext());
ObjSizeEval = &TheObjSizeEval;
// check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 00de882f1711..058f68c7cecd 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -1,8 +1,8 @@
add_llvm_library(LLVMInstrumentation
AddressSanitizer.cpp
+ BlackList.cpp
BoundsChecking.cpp
EdgeProfiling.cpp
- FunctionBlackList.cpp
GCOVProfiling.cpp
Instrumentation.cpp
OptimalEdgeProfiling.cpp
diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.cpp b/lib/Transforms/Instrumentation/FunctionBlackList.cpp
deleted file mode 100644
index 188ea4d9b3cb..000000000000
--- a/lib/Transforms/Instrumentation/FunctionBlackList.cpp
+++ /dev/null
@@ -1,79 +0,0 @@
-//===-- FunctionBlackList.cpp - blacklist of functions --------------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions based on
-// user-supplied blacklist.
-//
-//===----------------------------------------------------------------------===//
-
-#include "FunctionBlackList.h"
-#include "llvm/ADT/OwningPtr.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Function.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/system_error.h"
-
-namespace llvm {
-
-FunctionBlackList::FunctionBlackList(const std::string &Path) {
- Functions = NULL;
- const char *kFunPrefix = "fun:";
- if (!Path.size()) return;
- std::string Fun;
-
- OwningPtr<MemoryBuffer> File;
- if (error_code EC = MemoryBuffer::getFile(Path.c_str(), File)) {
- report_fatal_error("Can't open blacklist file " + Path + ": " +
- EC.message());
- }
- MemoryBuffer *Buff = File.take();
- const char *Data = Buff->getBufferStart();
- size_t DataLen = Buff->getBufferSize();
- SmallVector<StringRef, 16> Lines;
- SplitString(StringRef(Data, DataLen), Lines, "\n\r");
- for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) {
- if (Lines[i].startswith(kFunPrefix)) {
- std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix));
- std::string ThisFuncRE;
- // add ThisFunc replacing * with .*
- for (size_t j = 0, n = ThisFunc.size(); j < n; j++) {
- if (ThisFunc[j] == '*')
- ThisFuncRE += '.';
- ThisFuncRE += ThisFunc[j];
- }
- // Check that the regexp is valid.
- Regex CheckRE(ThisFuncRE);
- std::string Error;
- if (!CheckRE.isValid(Error))
- report_fatal_error("malformed blacklist regex: " + ThisFunc +
- ": " + Error);
- // Append to the final regexp.
- if (Fun.size())
- Fun += "|";
- Fun += ThisFuncRE;
- }
- }
- if (Fun.size()) {
- Functions = new Regex(Fun);
- }
-}
-
-bool FunctionBlackList::isIn(const Function &F) {
- if (Functions) {
- bool Res = Functions->match(F.getName());
- return Res;
- }
- return false;
-}
-
-} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/FunctionBlackList.h b/lib/Transforms/Instrumentation/FunctionBlackList.h
deleted file mode 100644
index c1239b9b7e0d..000000000000
--- a/lib/Transforms/Instrumentation/FunctionBlackList.h
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- FunctionBlackList.cpp - blacklist of functions ----------*- C++ -*-===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//===----------------------------------------------------------------------===//
-//
-// This is a utility class for instrumentation passes (like AddressSanitizer
-// or ThreadSanitizer) to avoid instrumenting some functions based on
-// user-supplied blacklist.
-//
-//===----------------------------------------------------------------------===//
-//
-
-#include <string>
-
-namespace llvm {
-class Function;
-class Regex;
-
-// Blacklisted functions are not instrumented.
-// The blacklist file contains one or more lines like this:
-// ---
-// fun:FunctionWildCard
-// ---
-// This is similar to the "ignore" feature of ThreadSanitizer.
-// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores
-class FunctionBlackList {
- public:
- FunctionBlackList(const std::string &Path);
- bool isIn(const Function &F);
- private:
- Regex *Functions;
-};
-
-} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 264a6a615361..e9192e5cdd52 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -88,11 +88,11 @@ namespace {
// Add the function to write out all our counters to the global destructor
// list.
- void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *,
- MDNode *>, 8> &);
+ void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
void insertIndirectCounterIncrement();
+ void insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
- std::string mangleName(DICompileUnit CU, std::string NewStem);
+ std::string mangleName(DICompileUnit CU, const char *NewStem);
bool EmitNotes;
bool EmitData;
@@ -329,7 +329,7 @@ namespace {
};
}
-std::string GCOVProfiler::mangleName(DICompileUnit CU, std::string NewStem) {
+std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) {
if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
MDNode *N = GCov->getOperand(i);
@@ -519,6 +519,7 @@ bool GCOVProfiler::emitProfileArcs() {
}
insertCounterWriteout(CountersBySP);
+ insertFlush(CountersBySP);
}
if (InsertIndCounterIncrCode)
@@ -630,14 +631,15 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() {
}
void GCOVProfiler::insertCounterWriteout(
- SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &CountersBySP) {
- FunctionType *WriteoutFTy =
- FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *WriteoutF = Function::Create(WriteoutFTy,
- GlobalValue::InternalLinkage,
- "__llvm_gcov_writeout", M);
+ ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
+ FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ if (!WriteoutF)
+ WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_writeout", M);
WriteoutF->setUnnamedAddr(true);
- BasicBlock *BB = BasicBlock::Create(*Ctx, "", WriteoutF);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
IRBuilder<> Builder(BB);
Constant *StartFile = getStartFileFunc();
@@ -648,11 +650,11 @@ void GCOVProfiler::insertCounterWriteout(
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
if (CU_Nodes) {
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
- DICompileUnit compile_unit(CU_Nodes->getOperand(i));
- std::string FilenameGcda = mangleName(compile_unit, "gcda");
+ DICompileUnit CU(CU_Nodes->getOperand(i));
+ std::string FilenameGcda = mangleName(CU, "gcda");
Builder.CreateCall(StartFile,
Builder.CreateGlobalStringPtr(FilenameGcda));
- for (SmallVector<std::pair<GlobalVariable *, MDNode *>, 8>::iterator
+ for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
I = CountersBySP.begin(), E = CountersBySP.end();
I != E; ++I) {
DISubprogram SP(I->second);
@@ -680,7 +682,7 @@ void GCOVProfiler::insertCounterWriteout(
"__llvm_gcov_init", M);
F->setUnnamedAddr(true);
F->setLinkage(GlobalValue::InternalLinkage);
- F->addFnAttr(Attribute::NoInline);
+ F->addFnAttr(Attributes::NoInline);
BB = BasicBlock::Create(*Ctx, "entry", F);
Builder.SetInsertPoint(BB);
@@ -699,7 +701,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc());
Fn->setUnnamedAddr(true);
Fn->setLinkage(GlobalValue::InternalLinkage);
- Fn->addFnAttr(Attribute::NoInline);
+ Fn->addFnAttr(Attributes::NoInline);
Type *Int32Ty = Type::getInt32Ty(*Ctx);
Type *Int64Ty = Type::getInt64Ty(*Ctx);
@@ -745,3 +747,42 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
Builder.SetInsertPoint(Exit);
Builder.CreateRetVoid();
}
+
+void GCOVProfiler::
+insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *FlushF = M->getFunction("__gcov_flush");
+ if (!FlushF)
+ FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__gcov_flush", M);
+ else
+ FlushF->setLinkage(GlobalValue::InternalLinkage);
+ FlushF->setUnnamedAddr(true);
+
+ BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
+
+ // Write out the current counters.
+ Constant *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ assert(WriteoutF && "Need to create the writeout function first!");
+
+ IRBuilder<> Builder(Entry);
+ Builder.CreateCall(WriteoutF);
+
+ // Zero out the counters.
+ for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
+ I = CountersBySP.begin(), E = CountersBySP.end();
+ I != E; ++I) {
+ GlobalVariable *GV = I->first;
+ Constant *Null = Constant::getNullValue(GV->getType()->getElementType());
+ Builder.CreateStore(Null, GV);
+ }
+
+ Type *RetTy = FlushF->getReturnType();
+ if (RetTy == Type::getVoidTy(*Ctx))
+ Builder.CreateRetVoid();
+ else if (RetTy->isIntegerTy())
+ // Used if __gcov_flush was implicitly declared.
+ Builder.CreateRet(ConstantInt::get(RetTy, 0));
+ else
+ report_fatal_error("invalid return type for __gcov_flush");
+}
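
To make the effect of insertFlush concrete, the generated __gcov_flush behaves roughly like the hand-written sketch below; the counter arrays and the writeout stub are stand-ins for what the pass actually references in the module:

    // Stand-in for the internal function created by insertCounterWriteout;
    // in the real module it emits the .gcda files for this TU.
    static void llvm_gcov_writeout_stub() {}

    // Illustrative per-function edge counters (the pass zeroes the real
    // GlobalVariables recorded in CountersBySP).
    static long long counters_main[4];
    static long long counters_helper[2];

    extern "C" void __gcov_flush() {
      llvm_gcov_writeout_stub();  // write the current counters out
      // Reset every counter so a later flush only reports new executions.
      for (int i = 0; i < 4; ++i) counters_main[i] = 0;
      for (int i = 0; i < 2; ++i) counters_helper[i] = 0;
    }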
diff --git a/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/lib/Transforms/Instrumentation/MaximumSpanningTree.h
index f76c77e1bdbf..a4bb5a66af6d 100644
--- a/lib/Transforms/Instrumentation/MaximumSpanningTree.h
+++ b/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -26,30 +26,6 @@ namespace llvm {
/// The type parameter T determines the type of the nodes of the graph.
template <typename T>
class MaximumSpanningTree {
-
- // A comparing class for comparing weighted edges.
- template <typename CT>
- struct EdgeWeightCompare {
- bool operator()(typename MaximumSpanningTree<CT>::EdgeWeight X,
- typename MaximumSpanningTree<CT>::EdgeWeight Y) const {
- if (X.second > Y.second) return true;
- if (X.second < Y.second) return false;
- if (const BasicBlock *BBX = dyn_cast<BasicBlock>(X.first.first)) {
- if (const BasicBlock *BBY = dyn_cast<BasicBlock>(Y.first.first)) {
- if (BBX->size() > BBY->size()) return true;
- if (BBX->size() < BBY->size()) return false;
- }
- }
- if (const BasicBlock *BBX = dyn_cast<BasicBlock>(X.first.second)) {
- if (const BasicBlock *BBY = dyn_cast<BasicBlock>(Y.first.second)) {
- if (BBX->size() > BBY->size()) return true;
- if (BBX->size() < BBY->size()) return false;
- }
- }
- return false;
- }
- };
-
public:
typedef std::pair<const T*, const T*> Edge;
typedef std::pair<Edge, double> EdgeWeight;
@@ -59,6 +35,33 @@ namespace llvm {
MaxSpanTree MST;
+ private:
+ // A comparing class for comparing weighted edges.
+ struct EdgeWeightCompare {
+ static size_t getBlockSize(const T *X) {
+ const BasicBlock *BB = dyn_cast_or_null<BasicBlock>(X);
+ return BB ? BB->size() : 0;
+ }
+
+ bool operator()(EdgeWeight X, EdgeWeight Y) const {
+ if (X.second > Y.second) return true;
+ if (X.second < Y.second) return false;
+
+ // Equal edge weights: break ties by comparing block sizes.
+ size_t XSizeA = getBlockSize(X.first.first);
+ size_t YSizeA = getBlockSize(Y.first.first);
+ if (XSizeA > YSizeA) return true;
+ if (XSizeA < YSizeA) return false;
+
+ size_t XSizeB = getBlockSize(X.first.second);
+ size_t YSizeB = getBlockSize(Y.first.second);
+ if (XSizeB > YSizeB) return true;
+ if (XSizeB < YSizeB) return false;
+
+ return false;
+ }
+ };
+
public:
static char ID; // Class identification, replacement for typeinfo
@@ -66,7 +69,7 @@ namespace llvm {
/// spanning tree.
MaximumSpanningTree(EdgeWeights &EdgeVector) {
- std::stable_sort(EdgeVector.begin(), EdgeVector.end(), EdgeWeightCompare<T>());
+ std::stable_sort(EdgeVector.begin(), EdgeVector.end(), EdgeWeightCompare());
// Create spanning tree, Forest contains a special data structure
// that makes checking if two nodes are already in a common (sub-)tree
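
A standalone illustration of the ordering EdgeWeightCompare establishes, with plain numbers standing in for BasicBlock sizes: heavier edges sort first, and equal weights are broken by the larger source block, then the larger destination block.

    #include <algorithm>
    #include <cstddef>
    #include <cstdio>

    struct EdgeSketch { double Weight; size_t SrcSize, DstSize; };

    static bool Before(const EdgeSketch &X, const EdgeSketch &Y) {
      if (X.Weight != Y.Weight) return X.Weight > Y.Weight;
      if (X.SrcSize != Y.SrcSize) return X.SrcSize > Y.SrcSize;
      return X.DstSize > Y.DstSize;
    }

    int main() {
      EdgeSketch Edges[] = { {1.0, 3, 1}, {2.0, 1, 1}, {1.0, 3, 5} };
      std::stable_sort(Edges, Edges + 3, Before);
      for (unsigned i = 0; i != 3; ++i)
        std::printf("w=%.1f src=%u dst=%u\n", Edges[i].Weight,
                    (unsigned)Edges[i].SrcSize, (unsigned)Edges[i].DstSize);
      // Order: the w=2.0 edge, then (1.0, 3, 5), then (1.0, 3, 1).
      return 0;
    }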
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index dc0fa7175d98..9e10fc4416de 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -21,7 +21,7 @@
#define DEBUG_TYPE "tsan"
-#include "FunctionBlackList.h"
+#include "BlackList.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/Intrinsics.h"
@@ -38,7 +38,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -47,10 +47,19 @@ using namespace llvm;
static cl::opt<std::string> ClBlackListFile("tsan-blacklist",
cl::desc("Blacklist file"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemoryAccesses(
+ "tsan-instrument-memory-accesses", cl::init(true),
+ cl::desc("Instrument memory accesses"), cl::Hidden);
+static cl::opt<bool> ClInstrumentFuncEntryExit(
+ "tsan-instrument-func-entry-exit", cl::init(true),
+ cl::desc("Instrument function entry and exit"), cl::Hidden);
+static cl::opt<bool> ClInstrumentAtomics(
+ "tsan-instrument-atomics", cl::init(true),
+ cl::desc("Instrument atomics"), cl::Hidden);
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
-STATISTIC(NumOmittedReadsBeforeWrite,
+STATISTIC(NumOmittedReadsBeforeWrite,
"Number of reads ignored due to following writes");
STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
@@ -76,8 +85,8 @@ struct ThreadSanitizer : public FunctionPass {
bool addrPointsToConstantData(Value *Addr);
int getMemoryAccessFuncIndex(Value *Addr);
- TargetData *TD;
- OwningPtr<FunctionBlackList> BL;
+ DataLayout *TD;
+ OwningPtr<BlackList> BL;
IntegerType *OrdTy;
// Callbacks to run-time library are computed in doInitialization.
Function *TsanFuncEntry;
@@ -88,6 +97,10 @@ struct ThreadSanitizer : public FunctionPass {
Function *TsanWrite[kNumberOfAccessSizes];
Function *TsanAtomicLoad[kNumberOfAccessSizes];
Function *TsanAtomicStore[kNumberOfAccessSizes];
+ Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes];
+ Function *TsanAtomicCAS[kNumberOfAccessSizes];
+ Function *TsanAtomicThreadFence;
+ Function *TsanAtomicSignalFence;
Function *TsanVptrUpdate;
};
} // namespace
@@ -118,10 +131,10 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
}
bool ThreadSanitizer::doInitialization(Module &M) {
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
if (!TD)
return false;
- BL.reset(new FunctionBlackList(ClBlackListFile));
+ BL.reset(new BlackList(ClBlackListFile));
// Always insert a call to __tsan_init into the module's CTORs.
IRBuilder<> IRB(M.getContext());
@@ -158,10 +171,42 @@ bool ThreadSanitizer::doInitialization(Module &M) {
TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction(
AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy,
NULL));
+
+ for (int op = AtomicRMWInst::FIRST_BINOP;
+ op <= AtomicRMWInst::LAST_BINOP; ++op) {
+ TsanAtomicRMW[op][i] = NULL;
+ const char *NamePart = NULL;
+ if (op == AtomicRMWInst::Xchg)
+ NamePart = "_exchange";
+ else if (op == AtomicRMWInst::Add)
+ NamePart = "_fetch_add";
+ else if (op == AtomicRMWInst::Sub)
+ NamePart = "_fetch_sub";
+ else if (op == AtomicRMWInst::And)
+ NamePart = "_fetch_and";
+ else if (op == AtomicRMWInst::Or)
+ NamePart = "_fetch_or";
+ else if (op == AtomicRMWInst::Xor)
+ NamePart = "_fetch_xor";
+ else
+ continue;
+ SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+ TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction(
+ RMWName, Ty, PtrTy, Ty, OrdTy, NULL));
+ }
+
+ SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) +
+ "_compare_exchange_val");
+ TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, NULL));
}
TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), NULL));
+ TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
+ TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
return true;
}
@@ -186,7 +231,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
NumOmittedReadsFromConstantGlobals++;
return true;
}
- } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+ } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
if (isVtableAccess(L)) {
// Reads from a vtable pointer can not race with any writes.
NumOmittedReadsFromVtable++;
@@ -244,8 +289,8 @@ static bool isAtomic(Instruction *I) {
return true;
if (isa<AtomicCmpXchgInst>(I))
return true;
- if (FenceInst *FI = dyn_cast<FenceInst>(I))
- return FI->getSynchScope() == CrossThread;
+ if (isa<FenceInst>(I))
+ return true;
return false;
}
@@ -284,17 +329,19 @@ bool ThreadSanitizer::runOnFunction(Function &F) {
// (e.g. variables that do not escape, etc).
// Instrument memory accesses.
- for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
- Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
- }
+ if (ClInstrumentMemoryAccesses)
+ for (size_t i = 0, n = AllLoadsAndStores.size(); i < n; ++i) {
+ Res |= instrumentLoadOrStore(AllLoadsAndStores[i]);
+ }
// Instrument atomic memory accesses.
- for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) {
- Res |= instrumentAtomic(AtomicAccesses[i]);
- }
+ if (ClInstrumentAtomics)
+ for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) {
+ Res |= instrumentAtomic(AtomicAccesses[i]);
+ }
// Instrument function entry/exit points if there were instrumented accesses.
- if (Res || HasCalls) {
+ if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
Value *ReturnAddress = IRB.CreateCall(
Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
@@ -343,12 +390,12 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
switch (ord) {
case NotAtomic: assert(false);
case Unordered: // Fall-through.
- case Monotonic: v = 1 << 0; break;
- // case Consume: v = 1 << 1; break; // Not specified yet.
- case Acquire: v = 1 << 2; break;
- case Release: v = 1 << 3; break;
- case AcquireRelease: v = 1 << 4; break;
- case SequentiallyConsistent: v = 1 << 5; break;
+ case Monotonic: v = 0; break;
+ // case Consume: v = 1; break; // Not specified yet.
+ case Acquire: v = 2; break;
+ case Release: v = 3; break;
+ case AcquireRelease: v = 4; break;
+ case SequentiallyConsistent: v = 5; break;
}
return IRB->getInt32(v);
}
@@ -385,12 +432,44 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
CallInst *C = CallInst::Create(TsanAtomicStore[Idx],
ArrayRef<Value*>(Args));
ReplaceInstWithInst(I, C);
- } else if (isa<AtomicRMWInst>(I)) {
- // FIXME: Not yet supported.
- } else if (isa<AtomicCmpXchgInst>(I)) {
- // FIXME: Not yet supported.
- } else if (isa<FenceInst>(I)) {
- // FIXME: Not yet supported.
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ Value *Addr = RMWI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+ if (F == NULL)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+ createOrdering(&IRB, RMWI->getOrdering())};
+ CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ Value *Addr = CASI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
+ IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
+ createOrdering(&IRB, CASI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+ Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+ Function *F = FI->getSynchScope() == SingleThread ?
+ TsanAtomicSignalFence : TsanAtomicThreadFence;
+ CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
}
return true;
}
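
As a concrete picture of the new atomic instrumentation, an atomicrmw add on a 32-bit location is rewritten into a call to the runtime function whose name is assembled in doInitialization, and orderings are now passed as the plain values 0 through 5 from createOrdering (matching the C++11 memory_order enumerators) instead of the old bit-mask encoding. The declaration below is only inferred from the pass; the authoritative prototypes live in compiler-rt:

    #include <stdint.h>

    // Inferred runtime-facing shape; the parameter types are an assumption.
    extern "C" int32_t __tsan_atomic32_fetch_add(volatile int32_t *addr,
                                                 int32_t value, int32_t order);

    // Before instrumentation (source view):
    //   __atomic_fetch_add(&counter, 1, __ATOMIC_ACQ_REL);
    // After instrumentation (conceptually):
    //   __tsan_atomic32_fetch_add(&counter, 1, /*AcquireRelease=*/4);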
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index a01e0661b1ff..b3fc6e338c00 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMScalarOpts
Reassociate.cpp
Reg2Mem.cpp
SCCP.cpp
+ SROA.cpp
Scalar.cpp
ScalarReplAggregates.cpp
SimplifyCFGPass.cpp
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index a3c426a714e0..123ed0f4f3de 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -27,6 +27,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/DominatorInternals.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ProfileInfo.h"
#include "llvm/Assembly/Writer.h"
@@ -37,12 +38,13 @@
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Transforms/Utils/AddrModeMatcher.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace llvm::PatternMatch;
@@ -146,9 +148,18 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
TLInfo = &getAnalysis<TargetLibraryInfo>();
DT = getAnalysisIfAvailable<DominatorTree>();
PFI = getAnalysisIfAvailable<ProfileInfo>();
- OptSize = F.hasFnAttr(Attribute::OptimizeForSize);
+ OptSize = F.getFnAttributes().hasAttribute(Attributes::OptimizeForSize);
+
+ /// This optimization identifies DIV instructions that can be
+ /// profitably bypassed and carried out with a shorter, faster divide.
+ if (TLI && TLI->isSlowDivBypassed()) {
+ const DenseMap<unsigned int, unsigned int> &BypassWidths =
+ TLI->getBypassSlowDivWidths();
+ for (Function::iterator I = F.begin(); I != F.end(); I++)
+ EverMadeChange |= bypassSlowDivision(F, I, BypassWidths);
+ }
- // First pass, eliminate blocks that contain only PHI nodes and an
+ // Eliminate blocks that contain only PHI nodes and an
// unconditional branch.
EverMadeChange |= EliminateMostlyEmptyBlocks(F);
@@ -160,7 +171,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
bool MadeChange = true;
while (MadeChange) {
MadeChange = false;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+ for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = I++;
MadeChange |= OptimizeBlock(*BB);
}
@@ -215,11 +226,13 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) {
// edge, just collapse it.
BasicBlock *SinglePred = BB->getSinglePredecessor();
- if (!SinglePred || SinglePred == BB) continue;
+ // Don't merge if BB's address is taken.
+ if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
if (Term && !Term->isConditional()) {
Changed = true;
+ DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
// Remember if SinglePred was the entry block of the function.
// If so, we will need to move BB back to the entry position.
bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
@@ -230,7 +243,6 @@ bool CodeGenPrepare::EliminateFallThrough(Function &F) {
// We have erased a block. Update the iterator.
I = BB;
- DEBUG(dbgs() << "Merged:\n"<< *SinglePred << "\n\n\n");
}
}
return Changed;
@@ -610,7 +622,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
// happens.
WeakVH IterHandle(CurInstIterator);
- replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0,
+ replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getDataLayout() : 0,
TLInfo, ModifiedDT ? 0 : DT);
// If the iterator instruction was recursively deleted, start over at the
@@ -634,8 +646,8 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
// From here on out we're working with named functions.
if (CI->getCalledFunction() == 0) return false;
- // We'll need TargetData from here on out.
- const TargetData *TD = TLI ? TLI->getTargetData() : 0;
+ // We'll need DataLayout from here on out.
+ const DataLayout *TD = TLI ? TLI->getDataLayout() : 0;
if (!TD) return false;
// Lower all default uses of _chk calls. This is very similar
@@ -649,6 +661,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
/// instructions to the predecessor to enable tail call optimizations. The
/// case it is currently looking for is:
+/// @code
/// bb0:
/// %tmp0 = tail call i32 @f0()
/// br label %return
@@ -661,9 +674,11 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// return:
/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
/// ret i32 %retval
+/// @endcode
///
/// =>
///
+/// @code
/// bb0:
/// %tmp0 = tail call i32 @f0()
/// ret i32 %tmp0
@@ -673,7 +688,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// bb2:
/// %tmp2 = tail call i32 @f2()
/// ret i32 %tmp2
-///
+/// @endcode
bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
if (!TLI)
return false;
@@ -699,7 +714,8 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
// See llvm::isInTailCallPosition().
const Function *F = BB->getParent();
Attributes CallerRetAttr = F->getAttributes().getRetAttributes();
- if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt))
+ if (CallerRetAttr.hasAttribute(Attributes::ZExt) ||
+ CallerRetAttr.hasAttribute(Attributes::SExt))
return false;
// Make sure there are no instructions between the PHI and return, or that the
@@ -757,7 +773,10 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
// Conservatively require the attributes of the call to match those of the
// return. Ignore noalias because it doesn't affect the call sequence.
Attributes CalleeRetAttr = CS.getAttributes().getRetAttributes();
- if ((CalleeRetAttr ^ CallerRetAttr) & ~Attribute::NoAlias)
+ if (AttrBuilder(CalleeRetAttr).
+ removeAttribute(Attributes::NoAlias) !=
+ AttrBuilder(CallerRetAttr).
+ removeAttribute(Attributes::NoAlias))
continue;
// Make sure the call instruction is followed by an unconditional branch to
@@ -774,7 +793,7 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
}
// If we eliminated all predecessors of the block, delete the block now.
- if (Changed && pred_begin(BB) == pred_end(BB))
+ if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
BB->eraseFromParent();
return Changed;
@@ -914,7 +933,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
<< *MemoryInst);
Type *IntPtrTy =
- TLI->getTargetData()->getIntPtrType(AccessTy->getContext());
+ TLI->getDataLayout()->getIntPtrType(AccessTy->getContext());
Value *Result = 0;
@@ -988,7 +1007,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
WeakVH IterHandle(CurInstIterator);
BasicBlock *BB = CurInstIterator->getParent();
- RecursivelyDeleteTriviallyDeadInstructions(Repl);
+ RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
if (IterHandle != CurInstIterator) {
// If the iterator instruction was recursively deleted, start over at the
@@ -1174,17 +1193,32 @@ static bool isFormingBranchFromSelectProfitable(SelectInst *SI) {
}
+/// If we have a SelectInst that will likely profit from branch prediction,
+/// turn it into a branch.
bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
- // If we have a SelectInst that will likely profit from branch prediction,
- // turn it into a branch.
- if (DisableSelectToBranch || OptSize || !TLI ||
- !TLI->isPredictableSelectExpensive())
- return false;
+ bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
- if (!SI->getCondition()->getType()->isIntegerTy(1) ||
- !isFormingBranchFromSelectProfitable(SI))
+ // Can we convert the 'select' into control flow?
+ if (DisableSelectToBranch || OptSize || !TLI || VectorCond)
return false;
+ TargetLowering::SelectSupportKind SelectKind;
+ if (VectorCond)
+ SelectKind = TargetLowering::VectorMaskSelect;
+ else if (SI->getType()->isVectorTy())
+ SelectKind = TargetLowering::ScalarCondVectorVal;
+ else
+ SelectKind = TargetLowering::ScalarValSelect;
+
+ // Do we have efficient codegen support for this kind of 'select'?
+ if (TLI->isSelectSupported(SelectKind)) {
+ // We have efficient codegen support for the select instruction.
+ // Check if it is profitable to keep this 'select'.
+ if (!TLI->isPredictableSelectExpensive() ||
+ !isFormingBranchFromSelectProfitable(SI))
+ return false;
+ }
+
ModifiedDT = true;
// First, we split the block containing the select into 2 blocks.
@@ -1302,7 +1336,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
bool MadeChange = false;
CurInstIterator = BB.begin();
- for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; )
+ while (CurInstIterator != BB.end())
MadeChange |= OptimizeInst(CurInstIterator++);
return MadeChange;
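
The new bypassSlowDivision call above is easiest to picture at the source level: on targets whose TargetLowering reports a slow wide divide, the division is guarded by a runtime check and, when both operands fit in the narrower width, performed with the cheaper divide. A hand-written sketch of the transformed shape; the 64/32 width pair is illustrative, the real pairs come from getBypassSlowDivWidths:

    #include <stdint.h>

    // Conceptual, source-level picture; the pass does the equivalent rewrite
    // on the IR of each basic block, not on C++.
    uint64_t div_bypassed(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0) {
        // Both operands fit in 32 bits: take the fast narrow divide.
        return (uint64_t)((uint32_t)a / (uint32_t)b);
      }
      return a / b; // rare slow path with the full-width divide
    }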
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 5430f6253884..369720b3dcef 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -24,7 +24,7 @@
#include "llvm/Constant.h"
#include "llvm/Instruction.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/InstIterator.h"
#include "llvm/ADT/Statistic.h"
@@ -67,7 +67,7 @@ bool ConstantPropagation::runOnFunction(Function &F) {
WorkList.insert(&*i);
}
bool Changed = false;
- TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
while (!WorkList.empty()) {
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 9b0aadb0b5b0..3ec6f3dcc31b 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -235,6 +235,11 @@ bool CorrelatedValuePropagation::processSwitch(SwitchInst *SI) {
// This case never fires - remove it.
CI.getCaseSuccessor()->removePredecessor(BB);
SI->removeCase(CI); // Does not invalidate the iterator.
+
+ // The condition can be modified by removePredecessor's PHI simplification
+ // logic.
+ Cond = SI->getCondition();
+
++NumDeadCases;
Changed = true;
} else if (State == LazyValueInfo::True) {
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index 8dbcc23d7ec8..a2e074fae896 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -22,6 +22,7 @@
#include "llvm/Instruction.h"
#include "llvm/Pass.h"
#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -38,10 +39,11 @@ namespace {
initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry());
}
virtual bool runOnBasicBlock(BasicBlock &BB) {
+ TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
bool Changed = false;
for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) {
Instruction *Inst = DI++;
- if (isInstructionTriviallyDead(Inst)) {
+ if (isInstructionTriviallyDead(Inst, TLI)) {
Inst->eraseFromParent();
Changed = true;
++DIEEliminated;
@@ -87,6 +89,8 @@ char DCE::ID = 0;
INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false)
bool DCE::runOnFunction(Function &F) {
+ TargetLibraryInfo *TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
+
// Start out with all of the instructions in the worklist...
std::vector<Instruction*> WorkList;
for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
@@ -101,7 +105,7 @@ bool DCE::runOnFunction(Function &F) {
Instruction *I = WorkList.back();
WorkList.pop_back();
- if (isInstructionTriviallyDead(I)) { // If the instruction is dead.
+ if (isInstructionTriviallyDead(I, TLI)) { // If the instruction is dead.
// Loop over all of the values that the instruction uses, if there are
// instructions being used, add them to the worklist, because they might
// go dead after this one is removed.
@@ -114,13 +118,8 @@ bool DCE::runOnFunction(Function &F) {
I->eraseFromParent();
// Remove the instruction from the worklist if it still exists in it.
- for (std::vector<Instruction*>::iterator WI = WorkList.begin();
- WI != WorkList.end(); ) {
- if (*WI == I)
- WI = WorkList.erase(WI);
- else
- ++WI;
- }
+ WorkList.erase(std::remove(WorkList.begin(), WorkList.end(), I),
+ WorkList.end());
MadeChange = true;
++DCEEliminated;
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 8b1283ff2531..736cc05e043e 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -29,7 +29,8 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/Debug.h"
#include "llvm/ADT/SetVector.h"
@@ -45,6 +46,7 @@ namespace {
AliasAnalysis *AA;
MemoryDependenceAnalysis *MD;
DominatorTree *DT;
+ const TargetLibraryInfo *TLI;
static char ID; // Pass identification, replacement for typeid
DSE() : FunctionPass(ID), AA(0), MD(0), DT(0) {
@@ -55,6 +57,7 @@ namespace {
AA = &getAnalysis<AliasAnalysis>();
MD = &getAnalysis<MemoryDependenceAnalysis>();
DT = &getAnalysis<DominatorTree>();
+ TLI = AA->getTargetLibraryInfo();
bool Changed = false;
for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
@@ -106,6 +109,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
///
static void DeleteDeadInstruction(Instruction *I,
MemoryDependenceAnalysis &MD,
+ const TargetLibraryInfo *TLI,
SmallSetVector<Value*, 16> *ValueSet = 0) {
SmallVector<Instruction*, 32> NowDeadInsts;
@@ -130,7 +134,7 @@ static void DeleteDeadInstruction(Instruction *I,
if (!Op->use_empty()) continue;
if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI))
+ if (isInstructionTriviallyDead(OpI, TLI))
NowDeadInsts.push_back(OpI);
}
@@ -143,7 +147,7 @@ static void DeleteDeadInstruction(Instruction *I,
/// hasMemoryWrite - Does this instruction write some memory? This only returns
/// true for things that we can analyze with other helpers below.
-static bool hasMemoryWrite(Instruction *I) {
+static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo *TLI) {
if (isa<StoreInst>(I))
return true;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
@@ -158,6 +162,26 @@ static bool hasMemoryWrite(Instruction *I) {
return true;
}
}
+ if (CallSite CS = I) {
+ if (Function *F = CS.getCalledFunction()) {
+ if (TLI && TLI->has(LibFunc::strcpy) &&
+ F->getName() == TLI->getName(LibFunc::strcpy)) {
+ return true;
+ }
+ if (TLI && TLI->has(LibFunc::strncpy) &&
+ F->getName() == TLI->getName(LibFunc::strncpy)) {
+ return true;
+ }
+ if (TLI && TLI->has(LibFunc::strcat) &&
+ F->getName() == TLI->getName(LibFunc::strcat)) {
+ return true;
+ }
+ if (TLI && TLI->has(LibFunc::strncat) &&
+ F->getName() == TLI->getName(LibFunc::strncat)) {
+ return true;
+ }
+ }
+ }
return false;
}
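
With the string libcalls now recognized as memory writes, DSE can delete a call whose written bytes and return value are never used. A small example of the pattern the extended hasMemoryWrite and isRemovable checks are aimed at, assuming TargetLibraryInfo knows strncpy:

    #include <cstring>

    // buf is a local that is never read again, so the bytes strncpy stores
    // are dead when the function returns; with the call's result unused,
    // DSE can now remove the whole call.
    int dead_local_copy(const char *src) {
      char buf[64];
      std::strncpy(buf, src, sizeof(buf) - 1); // removable dead store
      return 0;
    }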
@@ -175,7 +199,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
// If we don't have target data around, an unknown size in Location means
// that we should use the size of the pointee type. This isn't valid for
// memset/memcpy, which writes more than an i8.
- if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0)
+ if (Loc.Size == AliasAnalysis::UnknownSize && AA.getDataLayout() == 0)
return AliasAnalysis::Location();
return Loc;
}
@@ -189,7 +213,7 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
// If we don't have target data around, an unknown size in Location means
// that we should use the size of the pointee type. This isn't valid for
// init.trampoline, which writes more than an i8.
- if (AA.getTargetData() == 0) return AliasAnalysis::Location();
+ if (AA.getDataLayout() == 0) return AliasAnalysis::Location();
// FIXME: We don't know the size of the trampoline, so we can't really
// handle it here.
@@ -205,7 +229,8 @@ getLocForWrite(Instruction *Inst, AliasAnalysis &AA) {
/// instruction if any.
static AliasAnalysis::Location
getLocForRead(Instruction *Inst, AliasAnalysis &AA) {
- assert(hasMemoryWrite(Inst) && "Unknown instruction case");
+ assert(hasMemoryWrite(Inst, AA.getTargetLibraryInfo()) &&
+ "Unknown instruction case");
// The only instructions that both read and write are the mem transfer
// instructions (memcpy/memmove).
@@ -222,23 +247,29 @@ static bool isRemovable(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return SI->isUnordered();
- IntrinsicInst *II = cast<IntrinsicInst>(I);
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
- case Intrinsic::lifetime_end:
- // Never remove dead lifetime_end's, e.g. because it is followed by a
- // free.
- return false;
- case Intrinsic::init_trampoline:
- // Always safe to remove init_trampoline.
- return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("doesn't pass 'hasMemoryWrite' predicate");
+ case Intrinsic::lifetime_end:
+ // Never remove dead lifetime_end's, e.g. because it is followed by a
+ // free.
+ return false;
+ case Intrinsic::init_trampoline:
+ // Always safe to remove init_trampoline.
+ return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- // Don't remove volatile memory intrinsics.
- return !cast<MemIntrinsic>(II)->isVolatile();
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ // Don't remove volatile memory intrinsics.
+ return !cast<MemIntrinsic>(II)->isVolatile();
+ }
}
+
+ if (CallSite CS = I)
+ return CS.getInstruction()->use_empty();
+
+ return false;
}
@@ -249,14 +280,19 @@ static bool isShortenable(Instruction *I) {
if (isa<StoreInst>(I))
return false;
- IntrinsicInst *II = cast<IntrinsicInst>(I);
- switch (II->getIntrinsicID()) {
- default: return false;
- case Intrinsic::memset:
- case Intrinsic::memcpy:
- // Do shorten memory intrinsics.
- return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ // Do shorten memory intrinsics.
+ return true;
+ }
}
+
+ // Don't shorten libcalls for now.
+
+ return false;
}
/// getStoredPointerOperand - Return the pointer that is being written to.
@@ -266,17 +302,23 @@ static Value *getStoredPointerOperand(Instruction *I) {
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
return MI->getDest();
- IntrinsicInst *II = cast<IntrinsicInst>(I);
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::init_trampoline:
- return II->getArgOperand(0);
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::init_trampoline:
+ return II->getArgOperand(0);
+ }
}
+
+ CallSite CS = I;
+ // All the supported functions so far happen to have dest as their first
+ // argument.
+ return CS.getArgument(0);
}
static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) {
uint64_t Size;
- if (getObjectSize(V, Size, AA.getTargetData()))
+ if (getObjectSize(V, Size, AA.getDataLayout(), AA.getTargetLibraryInfo()))
return Size;
return AliasAnalysis::UnknownSize;
}
@@ -309,10 +351,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
// comparison.
if (Later.Size == AliasAnalysis::UnknownSize ||
Earlier.Size == AliasAnalysis::UnknownSize) {
- // If we have no TargetData information around, then the size of the store
+ // If we have no DataLayout information around, then the size of the store
// is inferrable from the pointee type. If they are the same type, then
// we know that the store is safe.
- if (AA.getTargetData() == 0 &&
+ if (AA.getDataLayout() == 0 &&
Later.Ptr->getType() == Earlier.Ptr->getType())
return OverwriteComplete;
@@ -328,13 +370,13 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later,
// larger than the earlier one.
if (Later.Size == AliasAnalysis::UnknownSize ||
Earlier.Size == AliasAnalysis::UnknownSize ||
- AA.getTargetData() == 0)
+ AA.getDataLayout() == 0)
return OverwriteUnknown;
// Check to see if the later store is to the entire object (either a global,
// an alloca, or a byval argument). If so, then it clearly overwrites any
// other store to the same object.
- const TargetData &TD = *AA.getTargetData();
+ const DataLayout &TD = *AA.getDataLayout();
const Value *UO1 = GetUnderlyingObject(P1, &TD),
*UO2 = GetUnderlyingObject(P2, &TD);
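These hunks only thread DataLayout and TargetLibraryInfo into the existing size reasoning; when either size is unknown and no DataLayout is available, only the identical-pointee-type case is treated as a complete overwrite. As a reminder of the byte-range test the known-size path boils down to, here is a minimal, self-contained sketch (the offsets and the helper name are illustrative, not the pass's actual interface):

// Sketch: with both sizes known, a later store completely kills an earlier
// one to the same underlying object when its byte range covers the earlier
// range. Offsets and the helper name are illustrative only.
static bool overwritesCompletely(unsigned long long EarlierOff,
                                 unsigned long long EarlierSize,
                                 unsigned long long LaterOff,
                                 unsigned long long LaterSize) {
  return LaterOff <= EarlierOff &&
         LaterOff + LaterSize >= EarlierOff + EarlierSize;
}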
@@ -454,13 +496,13 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
Instruction *Inst = BBI++;
// Handle 'free' calls specially.
- if (CallInst *F = isFreeCall(Inst)) {
+ if (CallInst *F = isFreeCall(Inst, TLI)) {
MadeChange |= HandleFree(F);
continue;
}
// If we find something that writes memory, get its memory dependence.
- if (!hasMemoryWrite(Inst))
+ if (!hasMemoryWrite(Inst, TLI))
continue;
MemDepResult InstDep = MD->getDependency(Inst);
@@ -483,7 +525,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
// in case we need it.
WeakVH NextInst(BBI);
- DeleteDeadInstruction(SI, *MD);
+ DeleteDeadInstruction(SI, *MD, TLI);
if (NextInst == 0) // Next instruction deleted.
BBI = BB.begin();
@@ -530,7 +572,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) {
<< *DepWrite << "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
- DeleteDeadInstruction(DepWrite, *MD);
+ DeleteDeadInstruction(DepWrite, *MD, TLI);
++NumFastStores;
MadeChange = true;
@@ -627,7 +669,7 @@ bool DSE::HandleFree(CallInst *F) {
MemDepResult Dep = MD->getPointerDependencyFrom(Loc, false, InstPt, BB);
while (Dep.isDef() || Dep.isClobber()) {
Instruction *Dependency = Dep.getInst();
- if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency))
+ if (!hasMemoryWrite(Dependency, TLI) || !isRemovable(Dependency))
break;
Value *DepPointer =
@@ -640,7 +682,7 @@ bool DSE::HandleFree(CallInst *F) {
Instruction *Next = llvm::next(BasicBlock::iterator(Dependency));
// DCE instructions only used to calculate that store
- DeleteDeadInstruction(Dependency, *MD);
+ DeleteDeadInstruction(Dependency, *MD, TLI);
++NumFastStores;
MadeChange = true;
@@ -659,6 +701,22 @@ bool DSE::HandleFree(CallInst *F) {
return MadeChange;
}
+namespace {
+ struct CouldRef {
+ typedef Value *argument_type;
+ const CallSite CS;
+ AliasAnalysis *AA;
+
+ bool operator()(Value *I) {
+ // See if the call site touches the value.
+ AliasAnalysis::ModRefResult A =
+ AA->getModRefInfo(CS, I, getPointerSize(I, *AA));
+
+ return A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref;
+ }
+ };
+}
+
/// handleEndBlock - Remove dead stores to stack-allocated locations in the
/// function end block. Ex:
/// %A = alloca i32
@@ -680,7 +738,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// Okay, so these are dead heap objects, but if the pointer never escapes
// then it's leaked by this function anyway.
- else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true))
+ else if (isAllocLikeFn(I, TLI) && !PointerMayBeCaptured(I, true, true))
DeadStackObjects.insert(I);
}
@@ -696,7 +754,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
--BBI;
// If we find a store, check to see if it points into a dead stack value.
- if (hasMemoryWrite(BBI) && isRemovable(BBI)) {
+ if (hasMemoryWrite(BBI, TLI) && isRemovable(BBI)) {
// See through pointer-to-pointer bitcasts
SmallVector<Value *, 4> Pointers;
GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers);
@@ -724,7 +782,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
dbgs() << '\n');
// DCE instructions only used to calculate that store.
- DeleteDeadInstruction(Dead, *MD, &DeadStackObjects);
+ DeleteDeadInstruction(Dead, *MD, TLI, &DeadStackObjects);
++NumFastStores;
MadeChange = true;
continue;
@@ -732,9 +790,9 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
}
// Remove any dead non-memory-mutating instructions.
- if (isInstructionTriviallyDead(BBI)) {
+ if (isInstructionTriviallyDead(BBI, TLI)) {
Instruction *Inst = BBI++;
- DeleteDeadInstruction(Inst, *MD, &DeadStackObjects);
+ DeleteDeadInstruction(Inst, *MD, TLI, &DeadStackObjects);
++NumFastOther;
MadeChange = true;
continue;
@@ -750,7 +808,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
if (CallSite CS = cast<Value>(BBI)) {
// Remove allocation function calls from the list of dead stack objects;
// there can't be any references before the definition.
- if (isAllocLikeFn(BBI))
+ if (isAllocLikeFn(BBI, TLI))
DeadStackObjects.remove(BBI);
// If this call does not access memory, it can't be loading any of our
@@ -760,20 +818,8 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
// If the call might load from any of our allocas, then any store above
// the call is live.
- SmallVector<Value*, 8> LiveAllocas;
- for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(),
- E = DeadStackObjects.end(); I != E; ++I) {
- // See if the call site touches it.
- AliasAnalysis::ModRefResult A =
- AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA));
-
- if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref)
- LiveAllocas.push_back(*I);
- }
-
- for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(),
- E = LiveAllocas.end(); I != E; ++I)
- DeadStackObjects.remove(*I);
+ CouldRef Pred = { CS, AA };
+ DeadStackObjects.remove_if(Pred);
// If all of the allocas were clobbered by the call then we're not going
// to find anything else to process.
@@ -816,6 +862,20 @@ bool DSE::handleEndBlock(BasicBlock &BB) {
return MadeChange;
}
+namespace {
+ struct CouldAlias {
+ typedef Value *argument_type;
+ const AliasAnalysis::Location &LoadedLoc;
+ AliasAnalysis *AA;
+
+ bool operator()(Value *I) {
+ // See if the loaded location could alias the stack location.
+ AliasAnalysis::Location StackLoc(I, getPointerSize(I, *AA));
+ return !AA->isNoAlias(StackLoc, LoadedLoc);
+ }
+ };
+}
+
/// RemoveAccessedObjects - Check to see if the specified location may alias any
/// of the stack objects in the DeadStackObjects set. If so, they become live
/// because the location is being loaded.
@@ -834,16 +894,7 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc,
return;
}
- SmallVector<Value*, 16> NowLive;
- for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(),
- E = DeadStackObjects.end(); I != E; ++I) {
- // See if the loaded location could alias the stack location.
- AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA));
- if (!AA->isNoAlias(StackLoc, LoadedLoc))
- NowLive.push_back(*I);
- }
-
- for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end();
- I != E; ++I)
- DeadStackObjects.remove(*I);
+ // Remove objects that could alias LoadedLoc.
+ CouldAlias Pred = { LoadedLoc, AA };
+ DeadStackObjects.remove_if(Pred);
}
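The two predicate structs added above (CouldRef, CouldAlias) exist so the pass can hand a single callable to DeadStackObjects.remove_if instead of building a LiveAllocas/NowLive scratch vector and erasing in a second loop. A minimal standalone sketch of that pattern, using only the standard library and made-up names (Analysis, CouldTouch) rather than LLVM's SetVector and alias queries:

#include <algorithm>
#include <vector>

// Stand-in for the alias query: says whether a call might touch an object.
struct Analysis {
  bool touches(int Object) const { return Object % 2 == 0; }
};

// Predicate object in the same shape as CouldRef/CouldAlias: it captures the
// state it needs and answers "should this object leave the dead set?".
struct CouldTouch {
  const Analysis *A;
  bool operator()(int Object) const { return A->touches(Object); }
};

int main() {
  Analysis A;
  std::vector<int> DeadObjects = {1, 2, 3, 4, 5};
  CouldTouch Pred = {&A};
  // One pass: drop every object the analysis says might be referenced,
  // instead of collecting survivors into a temporary vector first.
  DeadObjects.erase(
      std::remove_if(DeadObjects.begin(), DeadObjects.end(), Pred),
      DeadObjects.end());
  return 0;
}

The predicate captures exactly the state each query needs, which is why the patch can use the same remove_if call site for both the call-site case and the loaded-location case.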
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 975954953b29..101009dd64c7 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -18,11 +18,12 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
#include <deque>
@@ -90,35 +91,56 @@ template<> struct DenseMapInfo<SimpleValue> {
unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
Instruction *Inst = Val.Inst;
-
// Hash in all of the operands as pointers.
- unsigned Res = 0;
- for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
- Res ^= getHash(Inst->getOperand(i)) << (i & 0xF);
+ if (BinaryOperator* BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+ std::swap(LHS, RHS);
+
+ if (isa<OverflowingBinaryOperator>(BinOp)) {
+ // Hash the overflow behavior
+ unsigned Overflow =
+ BinOp->hasNoSignedWrap() * OverflowingBinaryOperator::NoSignedWrap |
+ BinOp->hasNoUnsignedWrap() * OverflowingBinaryOperator::NoUnsignedWrap;
+ return hash_combine(BinOp->getOpcode(), Overflow, LHS, RHS);
+ }
- if (CastInst *CI = dyn_cast<CastInst>(Inst))
- Res ^= getHash(CI->getType());
- else if (CmpInst *CI = dyn_cast<CmpInst>(Inst))
- Res ^= CI->getPredicate();
- else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) {
- for (ExtractValueInst::idx_iterator I = EVI->idx_begin(),
- E = EVI->idx_end(); I != E; ++I)
- Res ^= *I;
- } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) {
- for (InsertValueInst::idx_iterator I = IVI->idx_begin(),
- E = IVI->idx_end(); I != E; ++I)
- Res ^= *I;
- } else {
- // nothing extra to hash in.
- assert((isa<CallInst>(Inst) ||
- isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
- isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) &&
- "Invalid/unknown instruction");
+ return hash_combine(BinOp->getOpcode(), LHS, RHS);
}
+ if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ CmpInst::Predicate Pred = CI->getPredicate();
+ if (Inst->getOperand(0) > Inst->getOperand(1)) {
+ std::swap(LHS, RHS);
+ Pred = CI->getSwappedPredicate();
+ }
+ return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(Inst))
+ return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+ return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+ hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+ return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+ IVI->getOperand(1),
+ hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+ assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
+ isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst)) && "Invalid/unknown instruction");
+
// Mix in the opcode.
- return (Res << 1) ^ Inst->getOpcode();
+ return hash_combine(Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(),
+ Inst->value_op_end()));
}
bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
@@ -128,7 +150,41 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
return LHSI == RHSI;
if (LHSI->getOpcode() != RHSI->getOpcode()) return false;
- return LHSI->isIdenticalTo(RHSI);
+ if (LHSI->isIdenticalTo(RHSI)) return true;
+
+ // If not strictly identical, the instructions may still be equal up to commutativity.
+ if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+ if (!LHSBinOp->isCommutative())
+ return false;
+
+ assert(isa<BinaryOperator>(RHSI)
+ && "same opcode, but different instruction type?");
+ BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+ // Check overflow attributes
+ if (isa<OverflowingBinaryOperator>(LHSBinOp)) {
+ assert(isa<OverflowingBinaryOperator>(RHSBinOp)
+ && "same opcode, but different operator type?");
+ if (LHSBinOp->hasNoUnsignedWrap() != RHSBinOp->hasNoUnsignedWrap() ||
+ LHSBinOp->hasNoSignedWrap() != RHSBinOp->hasNoSignedWrap())
+ return false;
+ }
+
+ // Commuted equality
+ return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ }
+ if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+ assert(isa<CmpInst>(RHSI)
+ && "same opcode, but different instruction type?");
+ CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+ // Commuted equality
+ return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ }
+
+ return false;
}
//===----------------------------------------------------------------------===//
@@ -216,7 +272,7 @@ namespace {
/// cases.
class EarlyCSE : public FunctionPass {
public:
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLibraryInfo *TLI;
DominatorTree *DT;
typedef RecyclingAllocator<BumpPtrAllocator,
@@ -274,7 +330,8 @@ private:
CallScope(*availableCalls) {}
private:
- NodeScope(const NodeScope&); // DO NOT IMPLEMENT
+ NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION;
+ void operator=(const NodeScope&) LLVM_DELETED_FUNCTION;
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
@@ -313,7 +370,8 @@ private:
void process() { Processed = true; }
private:
- StackNode(const StackNode&); // DO NOT IMPLEMENT
+ StackNode(const StackNode&) LLVM_DELETED_FUNCTION;
+ void operator=(const StackNode&) LLVM_DELETED_FUNCTION;
// Members.
unsigned CurrentGeneration;
@@ -374,7 +432,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
Instruction *Inst = I++;
// Dead instructions should just be removed.
- if (isInstructionTriviallyDead(Inst)) {
+ if (isInstructionTriviallyDead(Inst, TLI)) {
DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
Inst->eraseFromParent();
Changed = true;
@@ -506,7 +564,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
bool EarlyCSE::runOnFunction(Function &F) {
std::deque<StackNode *> nodesToProcess;
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
DT = &getAnalysis<DominatorTree>();
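The rewritten getHashValue and isEqual agree on one rule: for commutative binary operators (and compares, via the swapped predicate) the operands are put into a canonical order before hashing and before comparing, so x+y and y+x land in the same DenseMap bucket. A small self-contained sketch of that canonicalize-then-hash idea, with placeholder names (hashCombine, BinExpr) standing in for llvm::hash_combine and SimpleValue:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>

// Placeholder combiner in the spirit of llvm::hash_combine (assumed name).
static std::size_t hashCombine(std::size_t Seed, std::size_t V) {
  return Seed ^ (V + 0x9e3779b97f4a7c15ULL + (Seed << 6) + (Seed >> 2));
}

// A binary expression reduced to what the hash needs: opcode, operand
// identities, and whether the operator commutes.
struct BinExpr {
  int Opcode;
  std::uintptr_t LHS, RHS;
  bool Commutative;
};

// Canonicalize operand order first, exactly as the patch does with
// std::swap(LHS, RHS), so (a op b) and (b op a) hash identically; the
// equality check must apply the same ordering rule or the map breaks.
static std::size_t hashBinExpr(BinExpr E) {
  if (E.Commutative && E.LHS > E.RHS)
    std::swap(E.LHS, E.RHS);
  std::size_t H = std::hash<int>()(E.Opcode);
  H = hashCombine(H, std::hash<std::uintptr_t>()(E.LHS));
  H = hashCombine(H, std::hash<std::uintptr_t>()(E.RHS));
  return H;
}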
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 4822fd09448c..f003e0669966 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -41,7 +41,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -271,16 +271,16 @@ void ValueTable::add(Value *V, uint32_t num) {
valueNumbering.insert(std::make_pair(V, num));
}
-uint32_t ValueTable::lookup_or_add_call(CallInst* C) {
+uint32_t ValueTable::lookup_or_add_call(CallInst *C) {
if (AA->doesNotAccessMemory(C)) {
Expression exp = create_expression(C);
- uint32_t& e = expressionNumbering[exp];
+ uint32_t &e = expressionNumbering[exp];
if (!e) e = nextValueNumber++;
valueNumbering[C] = e;
return e;
} else if (AA->onlyReadsMemory(C)) {
Expression exp = create_expression(C);
- uint32_t& e = expressionNumbering[exp];
+ uint32_t &e = expressionNumbering[exp];
if (!e) {
e = nextValueNumber++;
valueNumbering[C] = e;
@@ -413,7 +413,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) {
case Instruction::LShr:
case Instruction::AShr:
case Instruction::And:
- case Instruction::Or :
+ case Instruction::Or:
case Instruction::Xor:
case Instruction::ICmp:
case Instruction::FCmp:
@@ -503,7 +503,7 @@ namespace {
bool NoLoads;
MemoryDependenceAnalysis *MD;
DominatorTree *DT;
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLibraryInfo *TLI;
ValueTable VN;
@@ -535,7 +535,7 @@ namespace {
InstrsToErase.push_back(I);
}
- const TargetData *getTargetData() const { return TD; }
+ const DataLayout *getDataLayout() const { return TD; }
DominatorTree &getDominatorTree() const { return *DT; }
AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); }
MemoryDependenceAnalysis &getMemDep() const { return *MD; }
@@ -632,6 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void GVN::dump(DenseMap<uint32_t, Value*>& d) {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
@@ -641,6 +642,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) {
}
errs() << "}\n";
}
+#endif
/// IsValueFullyAvailableInBlock - Return true if we can prove that the value
/// we're analyzing is fully available in the specified block. As we go, keep
@@ -728,7 +730,7 @@ SpeculationFailure:
/// CoerceAvailableValueToLoadType will succeed.
static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
Type *LoadTy,
- const TargetData &TD) {
+ const DataLayout &TD) {
// If the loaded or stored value is a first class array or struct, don't try
// to transform them. We need to be able to bitcast to integer.
if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
@@ -744,7 +746,6 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
return true;
}
-
/// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and
/// then a load from a must-aliased pointer of a different type, try to coerce
/// the stored value. LoadedTy is the type of the load we want to replace and
@@ -754,7 +755,7 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
Type *LoadedTy,
Instruction *InsertPt,
- const TargetData &TD) {
+ const DataLayout &TD) {
if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD))
return 0;
@@ -767,24 +768,25 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
// If the store and reload are the same size, we can always reuse it.
if (StoreSize == LoadSize) {
// Pointer to Pointer -> use bitcast.
- if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy())
+ if (StoredValTy->getScalarType()->isPointerTy() &&
+ LoadedTy->getScalarType()->isPointerTy())
return new BitCastInst(StoredVal, LoadedTy, "", InsertPt);
// Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->isPointerTy()) {
- StoredValTy = TD.getIntPtrType(StoredValTy->getContext());
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = TD.getIntPtrType(StoredValTy);
StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
}
Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->isPointerTy())
- TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext());
+ if (TypeToCastTo->getScalarType()->isPointerTy())
+ TypeToCastTo = TD.getIntPtrType(TypeToCastTo);
if (StoredValTy != TypeToCastTo)
StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt);
// Cast to pointer if the load needs a pointer type.
- if (LoadedTy->isPointerTy())
+ if (LoadedTy->getScalarType()->isPointerTy())
StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt);
return StoredVal;
@@ -796,8 +798,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail");
// Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->isPointerTy()) {
- StoredValTy = TD.getIntPtrType(StoredValTy->getContext());
+ if (StoredValTy->getScalarType()->isPointerTy()) {
+ StoredValTy = TD.getIntPtrType(StoredValTy);
StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt);
}
@@ -822,7 +824,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
return StoredVal;
// If the result is a pointer, inttoptr.
- if (LoadedTy->isPointerTy())
+ if (LoadedTy->getScalarType()->isPointerTy())
return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt);
// Otherwise, bitcast.
@@ -840,7 +842,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal,
static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
Value *WritePtr,
uint64_t WriteSizeInBits,
- const TargetData &TD) {
+ const DataLayout &TD) {
// If the loaded or stored value is a first class array or struct, don't try
// to transform them. We need to be able to bitcast to integer.
if (LoadTy->isStructTy() || LoadTy->isArrayTy())
@@ -913,7 +915,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
/// memdep query of a load that ends up being a clobbering store.
static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
StoreInst *DepSI,
- const TargetData &TD) {
+ const DataLayout &TD) {
// Cannot handle reading from store of first-class aggregate yet.
if (DepSI->getValueOperand()->getType()->isStructTy() ||
DepSI->getValueOperand()->getType()->isArrayTy())
@@ -929,7 +931,7 @@ static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
/// memdep query of a load that ends up being clobbered by another load. See if
/// the other load can feed into the second load.
static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
- LoadInst *DepLI, const TargetData &TD){
+ LoadInst *DepLI, const DataLayout &TD){
// Cannot handle reading from store of first-class aggregate yet.
if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
return -1;
@@ -957,7 +959,7 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
MemIntrinsic *MI,
- const TargetData &TD) {
+ const DataLayout &TD) {
// If the mem operation is a non-constant size, we can't handle it.
ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
if (SizeCst == 0) return -1;
@@ -1007,7 +1009,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
/// before we give up.
static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
Type *LoadTy,
- Instruction *InsertPt, const TargetData &TD){
+ Instruction *InsertPt, const DataLayout &TD){
LLVMContext &Ctx = SrcVal->getType()->getContext();
uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
@@ -1017,8 +1019,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
// Compute which bits of the stored value are being used by the load. Convert
// to an integer type to start with.
- if (SrcVal->getType()->isPointerTy())
- SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx));
+ if (SrcVal->getType()->getScalarType()->isPointerTy())
+ SrcVal = Builder.CreatePtrToInt(SrcVal,
+ TD.getIntPtrType(SrcVal->getType()));
if (!SrcVal->getType()->isIntegerTy())
SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
@@ -1046,7 +1049,7 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
Type *LoadTy, Instruction *InsertPt,
GVN &gvn) {
- const TargetData &TD = *gvn.getTargetData();
+ const DataLayout &TD = *gvn.getDataLayout();
// If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
// widen SrcVal out to a larger load.
unsigned SrcValSize = TD.getTypeStoreSize(SrcVal->getType());
@@ -1105,7 +1108,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
/// memdep query of a load that ends up being a clobbering mem intrinsic.
static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
Type *LoadTy, Instruction *InsertPt,
- const TargetData &TD){
+ const DataLayout &TD){
LLVMContext &Ctx = LoadTy->getContext();
uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8;
@@ -1229,7 +1232,7 @@ struct AvailableValueInBlock {
if (isSimpleValue()) {
Res = getSimpleValue();
if (Res->getType() != LoadTy) {
- const TargetData *TD = gvn.getTargetData();
+ const DataLayout *TD = gvn.getDataLayout();
assert(TD && "Need target data to handle type mismatch case");
Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(),
*TD);
@@ -1251,7 +1254,7 @@ struct AvailableValueInBlock {
<< *Res << '\n' << "\n\n\n");
}
} else {
- const TargetData *TD = gvn.getTargetData();
+ const DataLayout *TD = gvn.getDataLayout();
assert(TD && "Need target data to handle type mismatch case");
Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset,
LoadTy, BB->getTerminator(), *TD);
@@ -1299,7 +1302,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI,
Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
// If new PHI nodes were created, notify alias analysis.
- if (V->getType()->isPointerTy()) {
+ if (V->getType()->getScalarType()->isPointerTy()) {
AliasAnalysis *AA = gvn.getAliasAnalysis();
for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i)
@@ -1436,7 +1439,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
Instruction *DepInst = DepInfo.getInst();
// Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) ||
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
// Loading immediately after lifetime begin -> undef.
isLifetimeStart(DepInst)) {
ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
@@ -1496,7 +1499,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
if (isa<PHINode>(V))
V->takeName(LI);
- if (V->getType()->isPointerTy())
+ if (V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
++NumGVNLoad;
@@ -1728,7 +1731,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
LI->replaceAllUsesWith(V);
if (isa<PHINode>(V))
V->takeName(LI);
- if (V->getType()->isPointerTy())
+ if (V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(LI);
++NumPRELoad;
@@ -1855,7 +1858,7 @@ bool GVN::processLoad(LoadInst *L) {
// Replace the load!
L->replaceAllUsesWith(AvailVal);
- if (AvailVal->getType()->isPointerTy())
+ if (AvailVal->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(AvailVal);
markInstructionForDeletion(L);
++NumGVNLoad;
@@ -1912,7 +1915,7 @@ bool GVN::processLoad(LoadInst *L) {
// Remove it!
L->replaceAllUsesWith(StoredVal);
- if (StoredVal->getType()->isPointerTy())
+ if (StoredVal->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(StoredVal);
markInstructionForDeletion(L);
++NumGVNLoad;
@@ -1941,7 +1944,7 @@ bool GVN::processLoad(LoadInst *L) {
// Remove it!
patchAndReplaceAllUsesWith(AvailableVal, L);
- if (DepLI->getType()->isPointerTy())
+ if (DepLI->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(DepLI);
markInstructionForDeletion(L);
++NumGVNLoad;
@@ -1951,7 +1954,7 @@ bool GVN::processLoad(LoadInst *L) {
// If this load really doesn't depend on anything, then we must be loading an
// undef value. This can happen when loading for a fresh allocation with no
// intervening stores, for example.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) {
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
L->replaceAllUsesWith(UndefValue::get(L->getType()));
markInstructionForDeletion(L);
++NumGVNLoad;
@@ -2182,7 +2185,7 @@ bool GVN::processInstruction(Instruction *I) {
// "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) {
I->replaceAllUsesWith(V);
- if (MD && V->getType()->isPointerTy())
+ if (MD && V->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(V);
markInstructionForDeletion(I);
++NumGVNSimpl;
@@ -2231,12 +2234,20 @@ bool GVN::processInstruction(Instruction *I) {
Value *SwitchCond = SI->getCondition();
BasicBlock *Parent = SI->getParent();
bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i) {
BasicBlock *Dst = i.getCaseSuccessor();
- BasicBlockEdge E(Parent, Dst);
- if (E.isSingleEdge())
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E);
+ }
}
return Changed;
}
@@ -2274,7 +2285,7 @@ bool GVN::processInstruction(Instruction *I) {
// Remove it!
patchAndReplaceAllUsesWith(repl, I);
- if (MD && repl->getType()->isPointerTy())
+ if (MD && repl->getType()->getScalarType()->isPointerTy())
MD->invalidateCachedPointerInfo(repl);
markInstructionForDeletion(I);
return true;
@@ -2285,7 +2296,7 @@ bool GVN::runOnFunction(Function& F) {
if (!NoLoads)
MD = &getAnalysis<MemoryDependenceAnalysis>();
DT = &getAnalysis<DominatorTree>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>());
VN.setMemDep(MD);
@@ -2522,7 +2533,7 @@ bool GVN::performPRE(Function &F) {
addToLeaderTable(ValNo, Phi, CurrentBlock);
Phi->setDebugLoc(CurInst->getDebugLoc());
CurInst->replaceAllUsesWith(Phi);
- if (Phi->getType()->isPointerTy()) {
+ if (Phi->getType()->getScalarType()->isPointerTy()) {
// Because we have added a PHI-use of the pointer value, it has now
// "escaped" from alias analysis' perspective. We need to inform
// AA of this.
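The switch-handling hunk above stops treating every case edge as unique: it first counts how many edges lead to each successor and only propagates the case value along destinations reached by exactly one edge, since a block reachable from two cases (or from a case and the default) cannot assume either value. A toy sketch of the counting step, with plain integers standing in for basic blocks:

#include <unordered_map>
#include <vector>

using Block = int;  // toy stand-in for a successor basic block

// Count how many switch edges reach each successor; only destinations hit by
// exactly one edge may assume the matching case value on entry.
static std::vector<Block> singleEdgeTargets(const std::vector<Block> &Succs) {
  std::unordered_map<Block, unsigned> EdgeCount;
  for (Block B : Succs)
    ++EdgeCount[B];
  std::vector<Block> Result;
  for (Block B : Succs)
    if (EdgeCount[B] == 1)
      Result.push_back(B);
  return Result;
}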
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index b36a3cb77653..6301aad6106b 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -62,7 +62,7 @@
#include "llvm/Intrinsics.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/ADT/Statistic.h"
@@ -98,9 +98,9 @@ namespace {
}
struct GlobalCmp {
- const TargetData *TD;
+ const DataLayout *TD;
- GlobalCmp(const TargetData *td) : TD(td) { }
+ GlobalCmp(const DataLayout *td) : TD(td) { }
bool operator()(const GlobalVariable *GV1, const GlobalVariable *GV2) {
Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
@@ -119,7 +119,7 @@ INITIALIZE_PASS(GlobalMerge, "global-merge",
bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
Module &M, bool isConst) const {
- const TargetData *TD = TLI->getTargetData();
+ const DataLayout *TD = TLI->getDataLayout();
// FIXME: Infer the maximum possible offset depending on the actual users
// (these max offsets are different for the users inside Thumb or ARM
@@ -170,7 +170,7 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
bool GlobalMerge::doInitialization(Module &M) {
SmallVector<GlobalVariable*, 16> Globals, ConstGlobals, BSSGlobals;
- const TargetData *TD = TLI->getTargetData();
+ const DataLayout *TD = TLI->getDataLayout();
unsigned MaxOffset = TLI->getMaximalGlobalOffset();
bool Changed = false;
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 37f8bdfbffed..310fd6147aa9 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -43,7 +43,8 @@
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -67,7 +68,8 @@ namespace {
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
- TargetData *TD;
+ DataLayout *TD;
+ TargetLibraryInfo *TLI;
SmallVector<WeakVH, 16> DeadInsts;
bool Changed;
@@ -218,8 +220,6 @@ static Instruction *getInsertPointForUses(Instruction *User, Value *Def,
/// ConvertToSInt - Convert APF to an integer, if possible.
static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
bool isExact = false;
- if (&APF.getSemantics() == &APFloat::PPCDoubleDouble)
- return false;
// See if we can convert this to an int64_t
uint64_t UIntVal;
if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
@@ -414,11 +414,11 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
// new comparison.
NewCompare->takeName(Compare);
Compare->replaceAllUsesWith(NewCompare);
- RecursivelyDeleteTriviallyDeadInstructions(Compare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
// Delete the old floating point increment.
Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
- RecursivelyDeleteTriviallyDeadInstructions(Incr);
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
// If the FP induction variable still has uses, this is because something else
// in the loop uses its value. In order to canonicalize the induction
@@ -431,7 +431,7 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PN) {
Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
PN->getParent()->getFirstInsertionPt());
PN->replaceAllUsesWith(Conv);
- RecursivelyDeleteTriviallyDeadInstructions(PN);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
}
Changed = true;
}
@@ -549,15 +549,17 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
PN->setIncomingValue(i, ExitVal);
- // If this instruction is dead now, delete it.
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ // If this instruction is dead now, queue it for deletion; deleting it
+ // here would invalidate iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
if (NumPreds == 1) {
// Completely replace a single-pred PHI. This is safe, because the
// NewVal won't be variant in the loop, so we don't need an LCSSA phi
// node anymore.
PN->replaceAllUsesWith(ExitVal);
- RecursivelyDeleteTriviallyDeadInstructions(PN);
+ PN->eraseFromParent();
}
}
if (NumPreds != 1) {
@@ -595,13 +597,13 @@ namespace {
class WideIVVisitor : public IVVisitor {
ScalarEvolution *SE;
- const TargetData *TD;
+ const DataLayout *TD;
public:
WideIVInfo WI;
WideIVVisitor(PHINode *NarrowIV, ScalarEvolution *SCEV,
- const TargetData *TData) :
+ const DataLayout *TData) :
SE(SCEV), TD(TData) { WI.NarrowIV = NarrowIV; }
// Implement the interface used by simplifyUsersOfIV.
@@ -1259,8 +1261,13 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) {
if (!Phi)
return true;
+ // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+ int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ if (Idx < 0)
+ return true;
+
// Do LFTR if the exit condition's IV is *not* a simple counter.
- Value *IncV = Phi->getIncomingValueForBlock(L->getLoopLatch());
+ Value *IncV = Phi->getIncomingValue(Idx);
return Phi != getLoopPhiForCounter(IncV, L, DT);
}
@@ -1339,7 +1346,7 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
/// could at least handle constant BECounts.
static PHINode *
FindLoopCounter(Loop *L, const SCEV *BECount,
- ScalarEvolution *SE, DominatorTree *DT, const TargetData *TD) {
+ ScalarEvolution *SE, DominatorTree *DT, const DataLayout *TD) {
uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
Value *Cond =
@@ -1696,7 +1703,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
LI = &getAnalysis<LoopInfo>();
SE = &getAnalysis<ScalarEvolution>();
DT = &getAnalysis<DominatorTree>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
+ TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
DeadInsts.clear();
Changed = false;
@@ -1763,7 +1771,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
while (!DeadInsts.empty())
if (Instruction *Inst =
dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
// The Rewriter may not be used from this point on.
@@ -1772,7 +1780,7 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
SinkUnusedInvariants(L);
// Clean up dead instructions.
- Changed |= DeleteDeadPHIs(L->getHeader());
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
// Check a post-condition.
assert(L->isLCSSAForm(*DT) &&
"Indvars did not leave the loop in lcssa form!");
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index dd42c59059ab..e7ffa09f1767 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -23,7 +23,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
@@ -75,7 +75,7 @@ namespace {
/// revectored to the false side of the second if.
///
class JumpThreading : public FunctionPass {
- TargetData *TD;
+ DataLayout *TD;
TargetLibraryInfo *TLI;
LazyValueInfo *LVI;
#ifdef NDEBUG
@@ -147,7 +147,7 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); }
///
bool JumpThreading::runOnFunction(Function &F) {
DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
LVI = &getAnalysis<LazyValueInfo>();
@@ -1455,7 +1455,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
// At this point, the IR is fully up to date and consistent. Do a quick scan
// over the new instructions and zap any that are constants or dead. This
// frequently happens because of phi translation.
- SimplifyInstructionsInBlock(NewBB, TD);
+ SimplifyInstructionsInBlock(NewBB, TD, TLI);
// Threaded an edge!
++NumThreads;
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 0192e928fe8e..4818437c243a 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -46,7 +46,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/CFG.h"
#include "llvm/Support/CommandLine.h"
@@ -100,7 +100,7 @@ namespace {
LoopInfo *LI; // Current LoopInfo
DominatorTree *DT; // Dominator Tree for the current Loop.
- TargetData *TD; // TargetData for constant folding.
+ DataLayout *TD; // DataLayout for constant folding.
TargetLibraryInfo *TLI; // TargetLibraryInfo for constant folding.
// State that is updated as we process loops.
@@ -108,6 +108,9 @@ namespace {
BasicBlock *Preheader; // The preheader block of the current loop...
Loop *CurLoop; // The current loop we are working on...
AliasSetTracker *CurAST; // AliasSet information for the current loop...
+ bool MayThrow; // The current loop contains an instruction which
+ // may throw, thus preventing code motion of
+ // instructions with side effects.
DenseMap<Loop*, AliasSetTracker*> LoopToAliasSetMap;
/// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
@@ -204,7 +207,7 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
AA = &getAnalysis<AliasAnalysis>();
DT = &getAnalysis<DominatorTree>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
CurAST = new AliasSetTracker(*AA);
@@ -240,6 +243,15 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
CurAST->add(*BB); // Incorporate the specified basic block
}
+ MayThrow = false;
+ // TODO: We've already searched for instructions which may throw in subloops.
+ // We may want to reuse this information.
+ for (Loop::block_iterator BB = L->block_begin(), BBE = L->block_end();
+ (BB != BBE) && !MayThrow ; ++BB)
+ for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end();
+ (I != E) && !MayThrow; ++I)
+ MayThrow |= I->mayThrow();
+
// We want to visit all of the instructions in this loop... that are not parts
// of our subloops (they have already had their invariants hoisted out of
// their loop, into this loop, so there is no need to process the BODIES of
@@ -307,7 +319,7 @@ void LICM::SinkRegion(DomTreeNode *N) {
// If the instruction is dead, we would try to sink it because it isn't used
// in the loop, instead, just delete it.
- if (isInstructionTriviallyDead(&I)) {
+ if (isInstructionTriviallyDead(&I, TLI)) {
DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
++II;
CurAST->deleteValue(&I);
@@ -418,17 +430,22 @@ bool LICM::canSinkOrHoistInst(Instruction &I) {
if (!FoundMod) return true;
}
- // FIXME: This should use mod/ref information to see if we can hoist or sink
- // the call.
+ // FIXME: This should use mod/ref information to see if we can hoist or
+ // sink the call.
return false;
}
- // Otherwise these instructions are hoistable/sinkable
- return isa<BinaryOperator>(I) || isa<CastInst>(I) ||
- isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
- isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I);
+ // Only these instructions are hoistable/sinkable.
+ bool HoistableKind = (isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+ isa<SelectInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<InsertElementInst>(I) ||
+ isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I));
+ if (!HoistableKind)
+ return false;
+
+ return isSafeToExecuteUnconditionally(I);
}
/// isNotUsedInLoop - Return true if the only users of this instruction are
@@ -604,6 +621,12 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
}
bool LICM::isGuaranteedToExecute(Instruction &Inst) {
+
+ // Somewhere in this loop there is an instruction which may throw and make us
+ // exit the loop.
+ if (MayThrow)
+ return false;
+
// Otherwise we have to check to make sure that the instruction dominates all
// of the exit blocks. If it doesn't, then there is a path out of the loop
// which does not execute this instruction, so we can't hoist it.
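The new MayThrow flag is computed once per loop: if any instruction in the loop may throw, the loop can be exited before reaching an otherwise-dominating instruction, so isGuaranteedToExecute must answer false and side-effecting hoists are blocked. A minimal sketch of that single up-front scan, with trivial stand-in types:

#include <vector>

struct Inst { bool MayThrowFlag = false; };
using Block = std::vector<Inst>;

// One up-front pass over the loop body: if anything may throw, the loop can
// exit early, so no later instruction is guaranteed to execute and hoisting
// of side-effecting code has to be refused.
static bool loopMayThrow(const std::vector<Block> &LoopBlocks) {
  for (const Block &BB : LoopBlocks)
    for (const Inst &I : BB)
      if (I.MayThrowFlag)
        return true;
  return false;
}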
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index ac1082cbfbdb..a44e798f121b 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -54,7 +54,7 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -65,7 +65,7 @@ STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
namespace {
class LoopIdiomRecognize : public LoopPass {
Loop *CurLoop;
- const TargetData *TD;
+ const DataLayout *TD;
DominatorTree *DT;
ScalarEvolution *SE;
TargetLibraryInfo *TLI;
@@ -132,7 +132,8 @@ Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); }
/// and zero out all the operands of this instruction. If any of them become
/// dead, delete them and the computation tree that feeds them.
///
-static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
+static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
+ const TargetLibraryInfo *TLI) {
SmallVector<Instruction*, 32> NowDeadInsts;
NowDeadInsts.push_back(I);
@@ -153,7 +154,7 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
if (!Op->use_empty()) continue;
if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI))
+ if (isInstructionTriviallyDead(OpI, TLI))
NowDeadInsts.push_back(OpI);
}
@@ -164,15 +165,21 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE) {
/// deleteIfDeadInstruction - If the specified value is a dead instruction,
/// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) {
+static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
+ const TargetLibraryInfo *TLI) {
if (Instruction *I = dyn_cast<Instruction>(V))
- if (isInstructionTriviallyDead(I))
- deleteDeadInstruction(I, SE);
+ if (isInstructionTriviallyDead(I, TLI))
+ deleteDeadInstruction(I, SE, TLI);
}
bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
CurLoop = L;
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it; just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
// Disable loop idiom recognition if the function's name is a common idiom.
StringRef Name = L->getHeader()->getParent()->getName();
if (Name == "memset" || Name == "memcpy")
@@ -192,7 +199,7 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
return false;
// We require target data for now.
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
if (TD == 0) return false;
DT = &getAnalysis<DominatorTree>();
@@ -401,7 +408,7 @@ static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,
///
/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
/// just replicate their input array and then pass on to memset_pattern16.
-static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) {
+static Constant *getMemSetPatternValue(Value *V, const DataLayout &TD) {
// If the value isn't a constant, we can't promote it to being in a constant
// array. We could theoretically do a store to an alloca or something, but
// that doesn't seem worthwhile.
@@ -490,7 +497,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(BasePtr, *SE);
+ deleteIfDeadInstruction(BasePtr, *SE, TLI);
return false;
}
@@ -538,7 +545,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(TheStore, *SE);
+ deleteDeadInstruction(TheStore, *SE, TLI);
++NumMemSet;
return true;
}
@@ -579,7 +586,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
getAnalysis<AliasAnalysis>(), SI)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(StoreBasePtr, *SE);
+ deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
return false;
}
@@ -594,8 +601,8 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
Expander.clear();
// If we generated new code for the base pointer, clean up.
- deleteIfDeadInstruction(LoadBasePtr, *SE);
- deleteIfDeadInstruction(StoreBasePtr, *SE);
+ deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
+ deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
return false;
}
@@ -628,7 +635,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- deleteDeadInstruction(SI, *SE);
+ deleteDeadInstruction(SI, *SE, TLI);
++NumMemCpy;
return true;
}
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 982400c5a387..558f62e6b439 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -18,7 +18,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -66,7 +66,7 @@ Pass *llvm::createLoopInstSimplifyPass() {
bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
LoopInfo *LI = &getAnalysis<LoopInfo>();
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -120,7 +120,7 @@ bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
++NumSimplified;
}
}
- LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I);
+ LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
if (IsSubloopHeader && !isa<PHINode>(I))
break;
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 7eeb1527ad40..abe07aa9d34d 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -24,6 +24,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Support/CFG.h"
#include "llvm/Support/Debug.h"
#include "llvm/ADT/Statistic.h"
using namespace llvm;
@@ -256,6 +257,7 @@ bool LoopRotate::rotateLoop(Loop *L) {
return false;
BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
if (BI == 0 || BI->isUnconditional())
@@ -267,13 +269,9 @@ bool LoopRotate::rotateLoop(Loop *L) {
if (!L->isLoopExiting(OrigHeader))
return false;
- // Updating PHInodes in loops with multiple exits adds complexity.
- // Keep it simple, and restrict loop rotation to loops with one exit only.
- // In future, lift this restriction and support for multiple exits if
- // required.
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getExitBlocks(ExitBlocks);
- if (ExitBlocks.size() > 1)
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (OrigLatch == 0 || L->isLoopExiting(OrigLatch))
return false;
// Check size of original header and reject loop if it is very big.
@@ -286,11 +284,10 @@ bool LoopRotate::rotateLoop(Loop *L) {
// Now, this loop is suitable for rotation.
BasicBlock *OrigPreheader = L->getLoopPreheader();
- BasicBlock *OrigLatch = L->getLoopLatch();
// If the loop could not be converted to canonical form, it must have an
// indirectbr in it; just give up.
- if (OrigPreheader == 0 || OrigLatch == 0)
+ if (OrigPreheader == 0)
return false;
// Anything ScalarEvolution may know about this loop or the PHI nodes
@@ -298,6 +295,8 @@ bool LoopRotate::rotateLoop(Loop *L) {
if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>())
SE->forgetLoop(L);
+ DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+
// Find new Loop header. NewHeader is a Header's one and only successor
// that is inside loop. Header's other successor is outside the
// loop. Otherwise loop is not suitable for rotation.
@@ -408,10 +407,19 @@ bool LoopRotate::rotateLoop(Loop *L) {
// Update DominatorTree to reflect the CFG change we just made. Then split
// edges as necessary to preserve LoopSimplify form.
if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) {
- // Since OrigPreheader now has the conditional branch to Exit block, it is
- // the dominator of Exit.
- DT->changeImmediateDominator(Exit, OrigPreheader);
- DT->changeImmediateDominator(NewHeader, OrigPreheader);
+ // Everything that was dominated by the old loop header is now dominated
+ // by the original loop preheader. Conceptually the header was merged
+ // into the preheader, even though we reuse the actual block as a new
+ // loop latch.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
+ DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
+
+ assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
+ assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
// Update OrigHeader to be dominated by the new header block.
DT->changeImmediateDominator(OrigHeader, OrigLatch);
@@ -440,6 +448,35 @@ bool LoopRotate::rotateLoop(Loop *L) {
// Update OrigHeader to be dominated by the new header block.
DT->changeImmediateDominator(NewHeader, OrigPreheader);
DT->changeImmediateDominator(OrigHeader, OrigLatch);
+
+ // Brute force incremental dominator tree update. Call
+ // findNearestCommonDominator on all CFG predecessors of each child of the
+ // original header.
+ DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
+ SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
+ OrigHeaderNode->end());
+ bool Changed;
+ do {
+ Changed = false;
+ for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
+ DomTreeNode *Node = HeaderChildren[I];
+ BasicBlock *BB = Node->getBlock();
+
+ pred_iterator PI = pred_begin(BB);
+ BasicBlock *NearestDom = *PI;
+ for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
+ NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
+
+ // Remember if this changes the DomTree.
+ if (Node->getIDom()->getBlock() != NearestDom) {
+ DT->changeImmediateDominator(BB, NearestDom);
+ Changed = true;
+ }
+ }
+
+ // If the dominator changed, this may have an effect on other
+ // predecessors; continue until we reach a fixpoint.
+ } while (Changed);
}
}
@@ -452,6 +489,8 @@ bool LoopRotate::rotateLoop(Loop *L) {
// emitted code isn't too gross in this common case.
MergeBlockIntoPredecessor(OrigHeader, this);
+ DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
++NumRotated;
return true;
}
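Loop rotation with multiple exits can leave blocks whose immediate dominator is stale, so the patch falls back to a brute-force update: for every former child of the old header, recompute the idom as the nearest common dominator of all CFG predecessors, and iterate until a fixpoint. A self-contained sketch of that fixpoint loop over a toy idom map (the ncd helper and integer node IDs are stand-ins for DominatorTree):

#include <cstddef>
#include <map>
#include <set>
#include <vector>

using Node = int;  // toy basic-block IDs; IDom maps each node to its idom

// Nearest common dominator by walking idom chains (the root maps to itself).
static Node ncd(const std::map<Node, Node> &IDom, Node A, Node B) {
  std::set<Node> Chain;
  for (Node X = A;; X = IDom.at(X)) {
    Chain.insert(X);
    if (IDom.at(X) == X) break;
  }
  for (Node X = B;; X = IDom.at(X)) {
    if (Chain.count(X)) return X;
    if (IDom.at(X) == X) break;
  }
  return A;  // unreachable for a rooted tree
}

// Recompute each affected node's idom as the nearest common dominator of all
// of its CFG predecessors; one change can expose another, hence the fixpoint.
static void updateDominators(std::map<Node, Node> &IDom,
                             const std::map<Node, std::vector<Node>> &Preds,
                             const std::vector<Node> &Affected) {
  bool Changed;
  do {
    Changed = false;
    for (Node N : Affected) {
      const std::vector<Node> &P = Preds.at(N);
      Node Nearest = P.front();
      for (std::size_t I = 1; I < P.size(); ++I)
        Nearest = ncd(IDom, Nearest, P[I]);
      if (IDom[N] != Nearest) {
        IDom[N] = Nearest;
        Changed = true;
      }
    }
  } while (Changed);
}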
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index b14a713ce47a..958348d9faad 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -54,7 +54,7 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "loop-reduce"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/AddressingMode.h"
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
@@ -64,6 +64,7 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Assembly/Writer.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/SmallBitVector.h"
@@ -121,9 +122,11 @@ void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -223,7 +226,7 @@ namespace {
struct Formula {
/// AM - This is used to represent complex addressing, as well as other kinds
/// of interesting uses.
- TargetLowering::AddrMode AM;
+ AddrMode AM;
/// BaseRegs - The list of "base" registers for this use. When this is
/// non-empty, AM.HasBaseReg should be set to true.
@@ -414,9 +417,11 @@ void Formula::print(raw_ostream &OS) const {
}
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Formula::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// isAddRecSExtable - Return true if the given addrec can be sign-extended
/// without changing its value.
@@ -738,7 +743,8 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
bool Changed = false;
while (!DeadInsts.empty()) {
- Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val());
+ Value *V = DeadInsts.pop_back_val();
+ Instruction *I = dyn_cast_or_null<Instruction>(V);
if (I == 0 || !isInstructionTriviallyDead(I))
continue;
@@ -973,9 +979,11 @@ void Cost::print(raw_ostream &OS) const {
OS << ", plus " << SetupCost << " setup cost";
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Cost::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -1059,9 +1067,11 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
@@ -1251,14 +1261,16 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
/// be completely folded into the user instruction at isel time. This includes
/// address-mode folding and special icmp tricks.
-static bool isLegalUse(const TargetLowering::AddrMode &AM,
+static bool isLegalUse(const AddrMode &AM,
LSRUse::KindType Kind, Type *AccessTy,
const TargetLowering *TLI) {
switch (Kind) {
@@ -1315,7 +1327,7 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM,
llvm_unreachable("Invalid LSRUse Kind!");
}
-static bool isLegalUse(TargetLowering::AddrMode AM,
+static bool isLegalUse(AddrMode AM,
int64_t MinOffset, int64_t MaxOffset,
LSRUse::KindType Kind, Type *AccessTy,
const TargetLowering *TLI) {
@@ -1346,7 +1358,7 @@ static bool isAlwaysFoldable(int64_t BaseOffs,
// Conservatively, create an address with an immediate and a
// base and a scale.
- TargetLowering::AddrMode AM;
+ AddrMode AM;
AM.BaseOffs = BaseOffs;
AM.BaseGV = BaseGV;
AM.HasBaseReg = HasBaseReg;
@@ -1384,7 +1396,7 @@ static bool isAlwaysFoldable(const SCEV *S,
// Conservatively, create an address with an immediate and a
// base and a scale.
- TargetLowering::AddrMode AM;
+ AddrMode AM;
AM.BaseOffs = BaseOffs;
AM.BaseGV = BaseGV;
AM.HasBaseReg = HasBaseReg;
@@ -2009,7 +2021,7 @@ LSRInstance::OptimizeLoopTermCond() {
goto decline_post_inc;
// Check for possible scaled-address reuse.
Type *AccessTy = getAccessType(UI->getUser());
- TargetLowering::AddrMode AM;
+ AddrMode AM;
AM.Scale = C->getSExtValue();
if (TLI->isLegalAddressingMode(AM, AccessTy))
goto decline_post_inc;
@@ -3435,9 +3447,11 @@ void WorkItem::print(raw_ostream &OS) const {
<< " , add offset " << Imm;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
+#endif
/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
/// distance apart and try to form reuse opportunities between them.
@@ -4451,17 +4465,21 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
NewBB = NewBBs[0];
}
-
- // If PN is outside of the loop and BB is in the loop, we want to
- // move the block to be immediately before the PHI block, not
- // immediately after BB.
- if (L->contains(BB) && !L->contains(PN))
- NewBB->moveBefore(PN->getParent());
-
- // Splitting the edge can reduce the number of PHI entries we have.
- e = PN->getNumIncomingValues();
- BB = NewBB;
- i = PN->getBasicBlockIndex(BB);
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB;
+ i = PN->getBasicBlockIndex(BB);
+ }
}
}
@@ -4730,9 +4748,11 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
+#endif
namespace {
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 09a186f7f940..0d781ac97725 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -22,7 +22,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include <climits>
using namespace llvm;
@@ -113,7 +113,7 @@ Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial) {
/// ApproximateLoopSize - Approximate the size of the loop.
static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
- const TargetData *TD) {
+ const DataLayout *TD) {
CodeMetrics Metrics;
for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
I != E; ++I)
@@ -145,7 +145,8 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// not user specified.
unsigned Threshold = CurrentThreshold;
if (!UserThreshold &&
- Header->getParent()->hasFnAttr(Attribute::OptimizeForSize))
+ Header->getParent()->getFnAttributes().
+ hasAttribute(Attributes::OptimizeForSize))
Threshold = OptSizeUnrollThreshold;
// Find trip count and trip multiple if count is not available
@@ -178,7 +179,7 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
// Enforce the threshold.
if (Threshold != NoThreshold) {
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
unsigned NumInlineCandidates;
unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates, TD);
DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 58f7739888fb..047b43eb84fc 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -638,7 +638,8 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) {
// Check to see if it would be profitable to unswitch current loop.
// Do not do non-trivial unswitch while optimizing for size.
- if (OptimizeForSize || F->hasFnAttr(Attribute::OptimizeForSize))
+ if (OptimizeForSize ||
+ F->getFnAttributes().hasAttribute(Attributes::OptimizeForSize))
return false;
UnswitchNontrivialCondition(LoopCond, Val, currentLoop);
@@ -906,13 +907,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
/// specified.
static void RemoveFromWorklist(Instruction *I,
std::vector<Instruction*> &Worklist) {
- std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(),
- Worklist.end(), I);
- while (WI != Worklist.end()) {
- unsigned Offset = WI-Worklist.begin();
- Worklist.erase(WI);
- WI = std::find(Worklist.begin()+Offset, Worklist.end(), I);
- }
+
+ Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
+ Worklist.end());
}
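The replacement body above is the standard erase-remove idiom; a standalone illustration on a plain std::vector follows (the helper is hypothetical, not part of the patch). std::remove compacts the surviving elements to the front and returns the new logical end, and erase() then trims the tail, so every occurrence is dropped in a single linear pass instead of repeated find/erase calls:

    #include <algorithm>
    #include <vector>

    static void eraseAll(std::vector<int> &V, int X) {
      V.erase(std::remove(V.begin(), V.end(), X), V.end());
    }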
/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 2a5ee33eb1ed..517657cf526c 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -27,7 +27,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include <list>
@@ -38,8 +38,8 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
-static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
- bool &VariableIdxFound, const TargetData &TD){
+static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
+ bool &VariableIdxFound, const DataLayout &TD){
// Skip over the first indices.
gep_type_iterator GTI = gep_type_begin(GEP);
for (unsigned i = 1; i != Idx; ++i, ++GTI)
@@ -72,11 +72,11 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
/// constant offset, and return that constant offset. For example, Ptr1 might
/// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8.
static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
- const TargetData &TD) {
+ const DataLayout &TD) {
Ptr1 = Ptr1->stripPointerCasts();
Ptr2 = Ptr2->stripPointerCasts();
- GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
- GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
+ GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+ GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
bool VariableIdxFound = false;
@@ -141,12 +141,12 @@ struct MemsetRange {
/// TheStores - The actual stores that make up this range.
SmallVector<Instruction*, 16> TheStores;
- bool isProfitableToUseMemset(const TargetData &TD) const;
+ bool isProfitableToUseMemset(const DataLayout &TD) const;
};
} // end anon namespace
-bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const {
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &TD) const {
// If we found more than 4 stores to merge or 16 bytes, use memset.
if (TheStores.size() >= 4 || End-Start >= 16) return true;
@@ -192,9 +192,9 @@ class MemsetRanges {
/// because each element is relatively large and expensive to copy.
std::list<MemsetRange> Ranges;
typedef std::list<MemsetRange>::iterator range_iterator;
- const TargetData &TD;
+ const DataLayout &TD;
public:
- MemsetRanges(const TargetData &td) : TD(td) {}
+ MemsetRanges(const DataLayout &td) : TD(td) {}
typedef std::list<MemsetRange>::const_iterator const_iterator;
const_iterator begin() const { return Ranges.begin(); }
@@ -302,7 +302,7 @@ namespace {
class MemCpyOpt : public FunctionPass {
MemoryDependenceAnalysis *MD;
TargetLibraryInfo *TLI;
- const TargetData *TD;
+ const DataLayout *TD;
public:
static char ID; // Pass identification, replacement for typeid
MemCpyOpt() : FunctionPass(ID) {
@@ -332,7 +332,7 @@ namespace {
bool processMemCpy(MemCpyInst *M);
bool processMemMove(MemMoveInst *M);
bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
- uint64_t cpyLen, CallInst *C);
+ uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
uint64_t MSize);
bool processByValArgument(CallSite CS, unsigned ArgNo);
@@ -509,10 +509,18 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
}
if (C) {
+ unsigned storeAlign = SI->getAlignment();
+ if (!storeAlign)
+ storeAlign = TD->getABITypeAlignment(SI->getOperand(0)->getType());
+ unsigned loadAlign = LI->getAlignment();
+ if (!loadAlign)
+ loadAlign = TD->getABITypeAlignment(LI->getType());
+
bool changed = performCallSlotOptzn(LI,
SI->getPointerOperand()->stripPointerCasts(),
LI->getPointerOperand()->stripPointerCasts(),
- TD->getTypeStoreSize(SI->getOperand(0)->getType()), C);
+ TD->getTypeStoreSize(SI->getOperand(0)->getType()),
+ std::min(storeAlign, loadAlign), C);
if (changed) {
MD->removeInstruction(SI);
SI->eraseFromParent();
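A sketch of the alignment fallback introduced above, assuming a DataLayout reference; the helper name is illustrative only. A load or store whose recorded alignment is zero is taken to have the ABI alignment of its type, and the value forwarded to performCallSlotOptzn is the smaller of the two effective alignments:

    static unsigned effectiveAlign(unsigned Explicit, Type *Ty,
                                   const DataLayout &TD) {
      return Explicit ? Explicit : TD.getABITypeAlignment(Ty);
    }

    // Mirroring the patch:
    //   unsigned Align = std::min(effectiveAlign(SI->getAlignment(), StoredTy, *TD),
    //                             effectiveAlign(LI->getAlignment(), LoadedTy, *TD));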
@@ -559,7 +567,8 @@ bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
Value *cpyDest, Value *cpySrc,
- uint64_t cpyLen, CallInst *C) {
+ uint64_t cpyLen, unsigned cpyAlign,
+ CallInst *C) {
// The general transformation to keep in mind is
//
// call @func(..., src, ...)
@@ -625,6 +634,16 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
return false;
}
+ // Check that dest points to memory that is at least as aligned as src.
+ unsigned srcAlign = srcAlloca->getAlignment();
+ if (!srcAlign)
+ srcAlign = TD->getABITypeAlignment(srcAlloca->getAllocatedType());
+ bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+ // If dest is not aligned enough and we can't increase its alignment then
+ // bail out.
+ if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+ return false;
+
// Check that src is not accessed except via the call and the memcpy. This
// guarantees that it holds only undefined values when passed in (so the final
// memcpy can be dropped), that it is not read or written between the call and
@@ -673,20 +692,26 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy,
bool changedArgument = false;
for (unsigned i = 0; i < CS.arg_size(); ++i)
if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
- if (cpySrc->getType() != cpyDest->getType())
- cpyDest = CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
- cpyDest->getName(), C);
+ Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
+ : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
changedArgument = true;
- if (CS.getArgument(i)->getType() == cpyDest->getType())
- CS.setArgument(i, cpyDest);
+ if (CS.getArgument(i)->getType() == Dest->getType())
+ CS.setArgument(i, Dest);
else
- CS.setArgument(i, CastInst::CreatePointerCast(cpyDest,
- CS.getArgument(i)->getType(), cpyDest->getName(), C));
+ CS.setArgument(i, CastInst::CreatePointerCast(Dest,
+ CS.getArgument(i)->getType(), Dest->getName(), C));
}
if (!changedArgument)
return false;
+ // If the destination wasn't sufficiently aligned then increase its alignment.
+ if (!isDestSufficientlyAligned) {
+ assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ }
+
// Drop any cached information about the call, because we may have changed
// its dependence information by changing its parameter.
MD->removeInstruction(C);
@@ -813,7 +838,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) {
if (DepInfo.isClobber()) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), C)) {
+ CopySize->getZExtValue(), M->getAlignment(),
+ C)) {
MD->removeInstruction(M);
M->eraseFromParent();
return true;
@@ -974,7 +1000,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) {
bool MemCpyOpt::runOnFunction(Function &F) {
bool MadeChange = false;
MD = &getAnalysis<MemoryDependenceAnalysis>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
TLI = &getAnalysis<TargetLibraryInfo>();
// If we don't have at least memset and memcpy, there is little point of doing
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index 3222f2083b99..dfdf50549da4 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -29,6 +29,7 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "objc-arc"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/DenseMap.h"
using namespace llvm;
@@ -1120,9 +1121,8 @@ namespace {
bool relatedSelect(const SelectInst *A, const Value *B);
bool relatedPHI(const PHINode *A, const Value *B);
- // Do not implement.
- void operator=(const ProvenanceAnalysis &);
- ProvenanceAnalysis(const ProvenanceAnalysis &);
+ void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+ ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
public:
ProvenanceAnalysis() {}
@@ -1236,16 +1236,19 @@ bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
// An ObjC-Identified object can't alias a load if it is never locally stored.
if (AIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(B))
+ return isStoredObjCPointer(A);
if (BIsIdentified) {
- // If both pointers have provenance, they can be directly compared.
- if (A != B)
- return false;
- } else {
- if (isa<LoadInst>(B))
- return isStoredObjCPointer(A);
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return isStoredObjCPointer(B);
+ // Both pointers are identified and escapes aren't an evident problem.
+ return false;
}
- } else {
- if (BIsIdentified && isa<LoadInst>(A))
+ } else if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
return isStoredObjCPointer(B);
}
@@ -1381,9 +1384,6 @@ namespace {
/// PtrState - This class summarizes several per-pointer runtime properties
/// which are propagated through the flow graph.
class PtrState {
- /// NestCount - The known minimum level of retain+release nesting.
- unsigned NestCount;
-
/// KnownPositiveRefCount - True if the reference count is known to
/// be incremented.
bool KnownPositiveRefCount;
@@ -1401,7 +1401,7 @@ namespace {
/// TODO: Encapsulate this better.
RRInfo RRI;
- PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false),
+ PtrState() : KnownPositiveRefCount(false), Partial(false),
Seq(S_None) {}
void SetKnownPositiveRefCount() {
@@ -1416,18 +1416,6 @@ namespace {
return KnownPositiveRefCount;
}
- void IncrementNestCount() {
- if (NestCount != UINT_MAX) ++NestCount;
- }
-
- void DecrementNestCount() {
- if (NestCount != 0) --NestCount;
- }
-
- bool IsKnownNested() const {
- return NestCount > 0;
- }
-
void SetSeq(Sequence NewSeq) {
Seq = NewSeq;
}
@@ -1454,7 +1442,6 @@ void
PtrState::Merge(const PtrState &Other, bool TopDown) {
Seq = MergeSeqs(Seq, Other.Seq, TopDown);
KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount;
- NestCount = std::min(NestCount, Other.NestCount);
// We can't merge a plain objc_retain with an objc_retainBlock.
if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock)
@@ -1610,6 +1597,12 @@ void BBState::MergePred(const BBState &Other) {
// loop backedge. Loop backedges are special.
TopDownPathCount += Other.TopDownPathCount;
+ // Check for overflow. If we have overflow, fall back to conservative behavior.
+ if (TopDownPathCount < Other.TopDownPathCount) {
+ clearTopDownPointers();
+ return;
+ }
+
// For each entry in the other set, if our set has an entry with the same key,
// merge the entries. Otherwise, copy the entry and merge it with an empty
// entry.
@@ -1635,6 +1628,12 @@ void BBState::MergeSucc(const BBState &Other) {
// loop backedge. Loop backedges are special.
BottomUpPathCount += Other.BottomUpPathCount;
+ // Check for overflow. If we have overflow, fall back to conservative behavior.
+ if (BottomUpPathCount < Other.BottomUpPathCount) {
+ clearBottomUpPointers();
+ return;
+ }
+
// For each entry in the other set, if our set has an entry with the
// same key, merge the entries. Otherwise, copy the entry and merge
// it with an empty entry.
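Both overflow checks above rely on unsigned wraparound; a minimal standalone illustration (hypothetical helper, not from the patch):

    // After an unsigned addition Sum = A + B, the result wrapped around
    // exactly when it ends up smaller than one of the addends.
    static bool addOverflows(unsigned A, unsigned B) {
      unsigned Sum = A + B;
      return Sum < A;
    }
    // e.g. addOverflows(UINT_MAX, 1) is true, which is the case where the
    // pass clears its pointer state and falls back to conservative behavior.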
@@ -1789,7 +1788,9 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *Params[] = { I8X };
FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
RetainRVCallee =
M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
Attributes);
@@ -1803,7 +1804,9 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *Params[] = { I8X };
FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
AutoreleaseRVCallee =
M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
Attributes);
@@ -1815,7 +1818,9 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
if (!ReleaseCallee) {
LLVMContext &C = M->getContext();
Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
ReleaseCallee =
M->getOrInsertFunction(
"objc_release",
@@ -1829,7 +1834,9 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) {
if (!RetainCallee) {
LLVMContext &C = M->getContext();
Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
RetainCallee =
M->getOrInsertFunction(
"objc_retain",
@@ -1858,7 +1865,9 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
if (!AutoreleaseCallee) {
LLVMContext &C = M->getContext();
Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
AutoreleaseCallee =
M->getOrInsertFunction(
"objc_autorelease",
@@ -1868,6 +1877,26 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
return AutoreleaseCallee;
}
+/// IsPotentialUse - Test whether the given value is possibly a
+/// reference-counted pointer, including tests which utilize AliasAnalysis.
+static bool IsPotentialUse(const Value *Op, AliasAnalysis &AA) {
+ // First make the rudimentary check.
+ if (!IsPotentialUse(Op))
+ return false;
+
+ // Objects in constant memory are not reference-counted.
+ if (AA.pointsToConstantMemory(Op))
+ return false;
+
+ // Pointers in constant memory are not pointing to reference-counted objects.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Op))
+ if (AA.pointsToConstantMemory(LI->getPointerOperand()))
+ return false;
+
+ // Otherwise assume the worst.
+ return true;
+}
+
/// CanAlterRefCount - Test whether the given instruction can result in a
/// reference count modification (positive or negative) for the pointer's
/// object.
@@ -1894,7 +1923,7 @@ CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
for (ImmutableCallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end();
I != E; ++I) {
const Value *Op = *I;
- if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+ if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
}
return false;
@@ -1919,14 +1948,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
// Comparing a pointer with null, or any other constant, isn't really a use,
// because we don't care what the pointer points to, or about the values
// of any other dynamic reference-counted pointers.
- if (!IsPotentialUse(ICI->getOperand(1)))
+ if (!IsPotentialUse(ICI->getOperand(1), *PA.getAA()))
return false;
} else if (ImmutableCallSite CS = static_cast<const Value *>(Inst)) {
// For calls, just check the arguments (and not the callee operand).
for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
OE = CS.arg_end(); OI != OE; ++OI) {
const Value *Op = *OI;
- if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+ if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
}
return false;
@@ -1936,14 +1965,14 @@ CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
// If we can't tell what the underlying object was, assume there is a
// dependence.
- return IsPotentialUse(Op) && PA.related(Op, Ptr);
+ return IsPotentialUse(Op, *PA.getAA()) && PA.related(Op, Ptr);
}
// Check each operand for a match.
for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
OI != OE; ++OI) {
const Value *Op = *OI;
- if (IsPotentialUse(Op) && PA.related(Ptr, Op))
+ if (IsPotentialUse(Op, *PA.getAA()) && PA.related(Ptr, Op))
return true;
}
return false;
@@ -2612,11 +2641,11 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind);
S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release);
S.RRI.ReleaseMetadata = ReleaseMetadata;
- S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented();
+ S.RRI.KnownSafe = S.IsKnownIncremented();
S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall();
S.RRI.Calls.insert(Inst);
- S.IncrementNestCount();
+ S.SetKnownPositiveRefCount();
break;
}
case IC_RetainBlock:
@@ -2631,7 +2660,6 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst,
PtrState &S = MyStates.getPtrBottomUpState(Arg);
S.SetKnownPositiveRefCount();
- S.DecrementNestCount();
switch (S.GetSeq()) {
case S_Stop:
@@ -2747,8 +2775,9 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
// Merge the states from each successor to compute the initial state
// for the current block.
- for (BBState::edge_iterator SI(MyStates.succ_begin()),
- SE(MyStates.succ_end()); SI != SE; ++SI) {
+ BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end());
+ if (SI != SE) {
const BasicBlock *Succ = *SI;
DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
assert(I != BBStates.end());
@@ -2760,7 +2789,6 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
assert(I != BBStates.end());
MyStates.MergeSucc(I->second);
}
- break;
}
// Visit all the instructions, bottom-up.
@@ -2823,12 +2851,11 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
S.ResetSequenceProgress(S_Retain);
S.RRI.IsRetainBlock = Class == IC_RetainBlock;
- // Don't check S.IsKnownIncremented() here because it's not sufficient.
- S.RRI.KnownSafe = S.IsKnownNested();
+ S.RRI.KnownSafe = S.IsKnownIncremented();
S.RRI.Calls.insert(Inst);
}
- S.IncrementNestCount();
+ S.SetKnownPositiveRefCount();
// A retain can be a potential use; proceed to the generic checking
// code below.
@@ -2838,7 +2865,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
Arg = GetObjCArg(Inst);
PtrState &S = MyStates.getPtrTopDownState(Arg);
- S.DecrementNestCount();
+ S.ClearRefCount();
switch (S.GetSeq()) {
case S_Retain:
@@ -2935,8 +2962,9 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
// Merge the states from each predecessor to compute the initial state
// for the current block.
- for (BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end());
+ if (PI != PE) {
const BasicBlock *Pred = *PI;
DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
assert(I != BBStates.end());
@@ -2948,7 +2976,6 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
assert(I != BBStates.end());
MyStates.MergePred(I->second);
}
- break;
}
// Visit all the instructions, top-down.
@@ -3532,19 +3559,19 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
}
/// OptimizeReturns - Look for this pattern:
-///
+/// \code
/// %call = call i8* @something(...)
/// %2 = call i8* @objc_retain(i8* %call)
/// %3 = call i8* @objc_autorelease(i8* %2)
/// ret i8* %3
-///
+/// \endcode
/// And delete the retain and autorelease.
///
/// Otherwise if it's just this:
-///
+/// \code
/// %3 = call i8* @objc_autorelease(i8* %2)
/// ret i8* %3
-///
+/// \endcode
/// convert the autorelease to autoreleaseRV.
void ObjCARCOpt::OptimizeReturns(Function &F) {
if (!F.getReturnType()->isPointerTy())
@@ -3814,8 +3841,9 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
Type *Params[] = { I8XX, I8X };
AttrListPtr Attributes = AttrListPtr()
- .addAttr(~0u, Attribute::NoUnwind)
- .addAttr(1, Attribute::NoCapture);
+ .addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind))
+ .addAttr(M->getContext(), 1, Attributes::get(C, Attributes::NoCapture));
StoreStrongCallee =
M->getOrInsertFunction(
@@ -3832,7 +3860,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *Params[] = { I8X };
FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
RetainAutoreleaseCallee =
M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes);
}
@@ -3845,7 +3875,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
Type *Params[] = { I8X };
FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
- AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
+ AttrListPtr Attributes =
+ AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(C, Attributes::NoUnwind));
RetainAutoreleaseRVCallee =
M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
Attributes);
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 09687d8909da..7a4079784bb7 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -339,36 +339,6 @@ static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
}
}
-/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C
-/// is repeated Weight times.
-static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C,
- APInt Weight) {
- // For addition the result can be efficiently computed as the product of the
- // constant and the weight.
- if (Opcode == Instruction::Add)
- return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight));
-
- // The weight might be huge, so compute by repeated squaring to ensure that
- // compile time is proportional to the logarithm of the weight.
- Constant *Result = 0;
- Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc.
- // Visit the bits in Weight.
- while (Weight != 0) {
- // If the current bit in Weight is non-zero do Result = Result op Power.
- if (Weight[0])
- Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power;
- // Move on to the next bit if any more are non-zero.
- Weight = Weight.lshr(1);
- if (Weight.isMinValue())
- break;
- // Square the power.
- Power = ConstantExpr::get(Opcode, Power, Power);
- }
-
- assert(Result && "Only positive weights supported!");
- return Result;
-}
-
typedef std::pair<Value*, APInt> RepeatedValue;
/// LinearizeExprTree - Given an associative binary expression, return the leaf
@@ -382,9 +352,7 @@ typedef std::pair<Value*, APInt> RepeatedValue;
/// op
/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
///
-/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and
-/// they are all non-constant except possibly for the last one, which if it is
-/// constant will have weight one (Ops[N].second === 1).
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
///
/// This routine may modify the function, in which case it returns 'true'. The
/// changes it makes may well be destructive, changing the value computed by 'I'
@@ -604,7 +572,6 @@ static bool LinearizeExprTree(BinaryOperator *I,
// The leaves, repeated according to their weights, represent the linearized
// form of the expression.
- Constant *Cst = 0; // Accumulate constants here.
for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
Value *V = LeafOrder[i];
LeafMap::iterator It = Leaves.find(V);
@@ -618,31 +585,14 @@ static bool LinearizeExprTree(BinaryOperator *I,
continue;
// Ensure the leaf is only output once.
It->second = 0;
- // Glob all constants together into Cst.
- if (Constant *C = dyn_cast<Constant>(V)) {
- C = EvaluateRepeatedConstant(Opcode, C, Weight);
- Cst = Cst ? ConstantExpr::get(Opcode, Cst, C) : C;
- continue;
- }
- // Add non-constant
Ops.push_back(std::make_pair(V, Weight));
}
- // Add any constants back into Ops, all globbed together and reduced to having
- // weight 1 for the convenience of users.
- Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
- if (Cst && Cst != Identity) {
- // If combining multiple constants resulted in the absorber then the entire
- // expression must evaluate to the absorber.
- if (Cst == Absorber)
- Ops.clear();
- Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1)));
- }
-
// For nilpotent operations or addition there may be no operands, for example
// because the expression was "X xor X" or consisted of 2^Bitwidth additions:
// in both cases the weight reduces to 0 causing the value to be skipped.
if (Ops.empty()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
assert(Identity && "Associative operation without identity!");
Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1)));
}
@@ -656,8 +606,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
assert(Ops.size() > 1 && "Single values should be used directly!");
- // Since our optimizations never increase the number of operations, the new
- // expression can always be written by reusing the existing binary operators
+ // Since our optimizations should never increase the number of operations, the
+ // new expression can usually be written reusing the existing binary operators
// from the original expression tree, without creating any new instructions,
// though the rewritten expression may have a completely different topology.
// We take care to not change anything if the new expression will be the same
@@ -671,6 +621,20 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
unsigned Opcode = I->getOpcode();
BinaryOperator *Op = I;
+ /// NotRewritable - The operands being written will be the leaves of the new
+ /// expression and must not be used as inner nodes (via NodesToRewrite) by
+ /// mistake. Inner nodes are always reassociable, and usually leaves are not
+ /// (if they were they would have been incorporated into the expression and so
+ /// would not be leaves), so most of the time there is no danger of this. But
+ /// in rare cases a leaf may become reassociable if an optimization kills uses
+ /// of it, or it may momentarily become reassociable during rewriting (below)
+ /// due it being removed as an operand of one of its uses. Ensure that misuse
+ /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+ /// leaves and refusing to reuse any of them as inner nodes.
+ SmallPtrSet<Value*, 8> NotRewritable;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ NotRewritable.insert(Ops[i].Op);
+
// ExpressionChanged - Non-null if the rewritten expression differs from the
// original in some non-trivial way, requiring the clearing of optional flags.
// Flags are cleared from the operator in ExpressionChanged up to I inclusive.
@@ -703,12 +667,14 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
// the old operands with the new ones.
DEBUG(dbgs() << "RA: " << *Op << '\n');
if (NewLHS != OldLHS) {
- if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode))
+ BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(0, NewLHS);
}
if (NewRHS != OldRHS) {
- if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode))
+ BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
}
@@ -732,7 +698,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
Op->swapOperands();
} else {
// Overwrite with the new right-hand side.
- if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode))
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+ if (BO && !NotRewritable.count(BO))
NodesToRewrite.push_back(BO);
Op->setOperand(1, NewRHS);
ExpressionChanged = Op;
@@ -745,7 +712,8 @@ void Reassociate::RewriteExprTree(BinaryOperator *I,
// Now deal with the left-hand side. If this is already an operation node
// from the original expression then just rewrite the rest of the expression
// into it.
- if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) {
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+ if (BO && !NotRewritable.count(BO)) {
Op = BO;
continue;
}
@@ -1446,9 +1414,26 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I,
SmallVectorImpl<ValueEntry> &Ops) {
// Now that we have the linearized expression tree, try to optimize it.
// Start by folding any constants that we found.
- if (Ops.size() == 1) return Ops[0].Op;
-
+ Constant *Cst = 0;
unsigned Opcode = I->getOpcode();
+ while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+ Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+ Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+ }
+ // If there was nothing but constants then we are done.
+ if (Ops.empty())
+ return Cst;
+
+ // Put the combined constant back at the end of the operand list, except if
+ // there is no point. For example, an add of 0 gets dropped here, while a
+ // multiplication by zero turns the whole expression into zero.
+ if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+ if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+ return Cst;
+ Ops.push_back(ValueEntry(0, Cst));
+ }
+
+ if (Ops.size() == 1) return Ops[0].Op;
// Handle destructive annihilation due to identities between elements in the
// argument list here.
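A hedged standalone sketch of the trailing-constant folding that OptimizeExpression now performs (since LinearizeExprTree no longer globs constants together, any constants arrive at the tail of the operand list); the helper name and the simplified Value*-based signature are illustrative only:

    static Constant *foldTrailingConstants(unsigned Opcode,
                                           SmallVectorImpl<Value *> &Ops) {
      Constant *Cst = 0;
      while (!Ops.empty() && isa<Constant>(Ops.back())) {
        Constant *C = cast<Constant>(Ops.pop_back_val());
        Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
      }
      return Cst; // null if there were no trailing constants
    }
    // The caller then drops Cst when it equals getBinOpIdentity (e.g. adding 0)
    // and returns it outright when it equals getBinOpAbsorber (e.g. multiplying by 0).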
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 2c39aab5cded..686520e724c4 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -26,7 +26,7 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/Debug.h"
@@ -153,7 +153,7 @@ namespace {
/// Constant Propagation.
///
class SCCPSolver : public InstVisitor<SCCPSolver> {
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLibraryInfo *TLI;
SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
DenseMap<Value*, LatticeVal> ValueState; // The state each value is in.
@@ -205,7 +205,7 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
typedef std::pair<BasicBlock*, BasicBlock*> Edge;
DenseSet<Edge> KnownFeasibleEdges;
public:
- SCCPSolver(const TargetData *td, const TargetLibraryInfo *tli)
+ SCCPSolver(const DataLayout *td, const TargetLibraryInfo *tli)
: TD(td), TLI(tli) {}
/// MarkBlockExecutable - This method can be used by clients to mark all of
@@ -1564,7 +1564,7 @@ static void DeleteInstructionInBlock(BasicBlock *BB) {
//
bool SCCP::runOnFunction(Function &F) {
DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
SCCPSolver Solver(TD, TLI);
@@ -1693,7 +1693,7 @@ static bool AddressIsTaken(const GlobalValue *GV) {
}
bool IPSCCP::runOnModule(Module &M) {
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
SCCPSolver Solver(TD, TLI);
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 000000000000..ccc2f7a77b3c
--- /dev/null
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,3697 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sroa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to force the pass to not use DomTree and mem2reg, instead
+/// forming SSA values through the SSAUpdater infrastructure.
+static cl::opt<bool>
+ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
+
+namespace {
+/// \brief Alloca partitioning representation.
+///
+/// This class represents a partitioning of an alloca into slices, and
+/// information about the nature of uses of each slice of the alloca. The goal
+/// is that this information is sufficient to decide if and how to split the
+/// alloca apart and replace slices with scalars. It is also intended that this
+/// structure can capture the relevant information needed both to decide about
+/// and to enact these transformations.
+class AllocaPartitioning {
+public:
+ /// \brief A common base class for representing a half-open byte range.
+ struct ByteRange {
+ /// \brief The beginning offset of the range.
+ uint64_t BeginOffset;
+
+ /// \brief The ending offset, not included in the range.
+ uint64_t EndOffset;
+
+ ByteRange() : BeginOffset(), EndOffset() {}
+ ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
+
+ /// \brief Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
+ bool operator<(const ByteRange &RHS) const {
+ if (BeginOffset < RHS.BeginOffset) return true;
+ if (BeginOffset > RHS.BeginOffset) return false;
+ if (EndOffset > RHS.EndOffset) return true;
+ return false;
+ }
+
+ /// \brief Support comparison with a single offset to allow binary searches.
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
+ return LHS.BeginOffset < RHSOffset;
+ }
+
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const ByteRange &RHS) {
+ return LHSOffset < RHS.BeginOffset;
+ }
+
+ bool operator==(const ByteRange &RHS) const {
+ return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
+ }
+ bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
+ };
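An illustration of the ordering defined above, using a hypothetical standalone struct: ranges sort by increasing begin offset and, among equal begins, by decreasing end offset, so a spanning range sorts ahead of the ranges it contains:

    #include <algorithm>
    #include <stdint.h>
    #include <vector>

    struct Range { uint64_t Begin, End; };

    static bool lessRange(const Range &A, const Range &B) {
      if (A.Begin != B.Begin) return A.Begin < B.Begin; // increasing starts
      return A.End > B.End;                             // wider range first on ties
    }

    // std::sort(V.begin(), V.end(), lessRange) turns {[4,6), [0,4), [0,8)}
    // into {[0,8), [0,4), [4,6)}: [0,8) spans [0,4) and therefore comes first.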
+
+ /// \brief A partition of an alloca.
+ ///
+ /// This structure represents a contiguous partition of the alloca. These are
+ /// formed by examining the uses of the alloca. During formation, they may
+ /// overlap but once an AllocaPartitioning is built, the Partitions within it
+ /// are all disjoint.
+ struct Partition : public ByteRange {
+ /// \brief Whether this partition is splittable into smaller partitions.
+ ///
+ /// We flag partitions as splittable when they are formed entirely due to
+ /// accesses by trivially splittable operations such as memset and memcpy.
+ bool IsSplittable;
+
+ /// \brief Test whether a partition has been marked as dead.
+ bool isDead() const {
+ if (BeginOffset == UINT64_MAX) {
+ assert(EndOffset == UINT64_MAX);
+ return true;
+ }
+ return false;
+ }
+
+ /// \brief Kill a partition.
+ /// This is accomplished by setting both its beginning and end offset to
+ /// the maximum possible value.
+ void kill() {
+ assert(!isDead() && "He's Dead, Jim!");
+ BeginOffset = EndOffset = UINT64_MAX;
+ }
+
+ Partition() : ByteRange(), IsSplittable() {}
+ Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
+ : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
+ };
+
+ /// \brief A particular use of a partition of the alloca.
+ ///
+ /// This structure is used to associate uses of a partition with it. They
+ /// mark the range of bytes which are referenced by a particular instruction,
+ /// and include a handle to the user itself and the pointer value in use.
+ /// The bounds of these uses are determined by intersecting the bounds of the
+ /// memory use itself with a particular partition. As a consequence there is
+ /// intentional overlap between various uses of the same partition.
+ struct PartitionUse : public ByteRange {
+ /// \brief The use in question. Provides access to both user and used value.
+ ///
+ /// Note that this may be null if the partition use is *dead*, that is, it
+ /// should be ignored.
+ Use *U;
+
+ PartitionUse() : ByteRange(), U() {}
+ PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U)
+ : ByteRange(BeginOffset, EndOffset), U(U) {}
+ };
+
+ /// \brief Construct a partitioning of a particular alloca.
+ ///
+ /// Construction does most of the work for partitioning the alloca. This
+ /// performs the necessary walks of users and builds a partitioning from it.
+ AllocaPartitioning(const DataLayout &TD, AllocaInst &AI);
+
+ /// \brief Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the partitioning is never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// \brief Support for iterating over the partitions.
+ /// @{
+ typedef SmallVectorImpl<Partition>::iterator iterator;
+ iterator begin() { return Partitions.begin(); }
+ iterator end() { return Partitions.end(); }
+
+ typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
+ const_iterator begin() const { return Partitions.begin(); }
+ const_iterator end() const { return Partitions.end(); }
+ /// @}
+
+ /// \brief Support for iterating over and manipulating a particular
+ /// partition's uses.
+ ///
+ /// The iteration support provided for uses is more limited, but also
+ /// includes some manipulation routines to support rewriting the uses of
+ /// partitions during SROA.
+ /// @{
+ typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
+ use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
+ use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
+ use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
+ use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
+
+ typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
+ const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
+ const_use_iterator use_begin(const_iterator I) const {
+ return Uses[I - begin()].begin();
+ }
+ const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
+ const_use_iterator use_end(const_iterator I) const {
+ return Uses[I - begin()].end();
+ }
+
+ unsigned use_size(unsigned Idx) const { return Uses[Idx].size(); }
+ unsigned use_size(const_iterator I) const { return Uses[I - begin()].size(); }
+ const PartitionUse &getUse(unsigned PIdx, unsigned UIdx) const {
+ return Uses[PIdx][UIdx];
+ }
+ const PartitionUse &getUse(const_iterator I, unsigned UIdx) const {
+ return Uses[I - begin()][UIdx];
+ }
+
+ void use_push_back(unsigned Idx, const PartitionUse &PU) {
+ Uses[Idx].push_back(PU);
+ }
+ void use_push_back(const_iterator I, const PartitionUse &PU) {
+ Uses[I - begin()].push_back(PU);
+ }
+ /// @}
+
+ /// \brief Allow iterating the dead users for this alloca.
+ ///
+ /// These are instructions which will never actually use the alloca as they
+ /// are outside the allocated range. They are safe to replace with undef and
+ /// delete.
+ /// @{
+ typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
+ dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
+ dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
+ /// @}
+
+ /// \brief Allow iterating the dead expressions referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ /// @{
+ typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
+ dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
+ dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
+ /// @}
+
+ /// \brief MemTransferInst auxiliary data.
+ /// This struct provides some auxiliary data about memory transfer
+ /// intrinsics such as memcpy and memmove. These intrinsics can use two
+ /// different ranges within the same alloca, and provide other challenges to
+ /// correctly represent. We stash extra data to help us untangle this
+ /// after the partitioning is complete.
+ struct MemTransferOffsets {
+ /// The destination begin and end offsets when the destination is within
+ /// this alloca. If the end offset is zero the destination is not within
+ /// this alloca.
+ uint64_t DestBegin, DestEnd;
+
+ /// The source begin and end offsets when the source is within this alloca.
+ /// If the end offset is zero, the source is not within this alloca.
+ uint64_t SourceBegin, SourceEnd;
+
+ /// Flag for whether an alloca is splittable.
+ bool IsSplittable;
+ };
+ MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
+ return MemTransferInstData.lookup(&II);
+ }
+
+ /// \brief Map from a PHI or select operand back to a partition.
+ ///
+ /// When manipulating PHI nodes or selects, they can use more than one
+ /// partition of an alloca. We store a special mapping to allow finding the
+ /// partition referenced by each of these operands, if any.
+ iterator findPartitionForPHIOrSelectOperand(Use *U) {
+ SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
+ = PHIOrSelectOpMap.find(U);
+ if (MapIt == PHIOrSelectOpMap.end())
+ return end();
+
+ return begin() + MapIt->second.first;
+ }
+
+ /// \brief Map from a PHI or select operand back to the specific use of
+ /// a partition.
+ ///
+ /// Similar to mapping these operands back to the partitions, this maps
+ /// directly to the use structure of that partition.
+ use_iterator findPartitionUseForPHIOrSelectOperand(Use *U) {
+ SmallDenseMap<Use *, std::pair<unsigned, unsigned> >::const_iterator MapIt
+ = PHIOrSelectOpMap.find(U);
+ assert(MapIt != PHIOrSelectOpMap.end());
+ return Uses[MapIt->second.first].begin() + MapIt->second.second;
+ }
+
+ /// \brief Compute a common type among the uses of a particular partition.
+ ///
+ /// This routine walks all of the uses of a particular partition and tries
+ /// to find a common type between them. Untyped operations such as memset and
+ /// memcpy are ignored.
+ Type *getCommonType(iterator I) const;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printUsers(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
+ void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class PartitionBuilder;
+ friend class AllocaPartitioning::PartitionBuilder;
+ class UseBuilder;
+ friend class AllocaPartitioning::UseBuilder;
+
+#ifndef NDEBUG
+ /// \brief Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// \brief The instruction responsible for this alloca having no partitioning.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to partition the alloca.
+ /// This will be null if the alloca is partitioned successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// \brief The partitions of the alloca.
+ ///
+ /// We store a vector of the partitions over the alloca here. This vector is
+ /// sorted by increasing begin offset, and then by decreasing end offset. See
+ /// the Partition inner class for more details. Initially (during
+ /// construction) there are overlaps, but we form a disjoint sequence of
+ /// partitions while finishing construction and a fully constructed object is
+ /// expected to always have this as a disjoint space.
+ SmallVector<Partition, 8> Partitions;
+
+ /// \brief The uses of the partitions.
+ ///
+ /// This is essentially a mapping from each partition to a list of uses of
+ /// that partition. The mapping is done with a Uses vector that has the exact
+ /// same number of entries as the partition vector. Each entry is itself
+ /// a vector of the uses.
+ SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
+
+ /// \brief Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by partition. This is because we expect
+ /// a partitioned alloca to be completely rewritten or not rewritten at all.
+ /// If rewritten, all these instructions can simply be removed and replaced
+ /// with undef as they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
+ /// \brief Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+
+ /// \brief The underlying storage for auxiliary memcpy and memset info.
+ SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
+
+ /// \brief A side data structure used when building up the partitions and uses.
+ ///
+ /// This mapping is only really used during the initial building of the
+ /// partitioning so that we can retain information about PHI and select nodes
+ /// processed.
+ SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
+
+ /// \brief Auxiliary information for particular PHI or select operands.
+ SmallDenseMap<Use *, std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
+
+ /// \brief A utility routine called from the constructor.
+ ///
+ /// This does what it says on the tin. It is the core of the alloca partition
+ /// splitting and merging. After it is called we have the desired disjoint
+ /// collection of partitions.
+ void splitAndMergePartitions();
+};
+}
+
+template <typename DerivedT, typename RetT>
+class AllocaPartitioning::BuilderBase
+ : public InstVisitor<DerivedT, RetT> {
+public:
+ BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
+ : TD(TD),
+ AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
+ P(P) {
+ enqueueUsers(AI, 0);
+ }
+
+protected:
+ const DataLayout &TD;
+ const uint64_t AllocSize;
+ AllocaPartitioning &P;
+
+ SmallPtrSet<Use *, 8> VisitedUses;
+
+ struct OffsetUse {
+ Use *U;
+ int64_t Offset;
+ };
+ SmallVector<OffsetUse, 8> Queue;
+
+ // The active offset and use while visiting.
+ Use *U;
+ int64_t Offset;
+
+ void enqueueUsers(Instruction &I, int64_t UserOffset) {
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE; ++UI) {
+ if (VisitedUses.insert(&UI.getUse())) {
+ OffsetUse OU = { &UI.getUse(), UserOffset };
+ Queue.push_back(OU);
+ }
+ }
+ }
+
+ bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) {
+ GEPOffset = Offset;
+ for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ return false;
+ if (OpC->isZero())
+ continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = TD.getStructLayout(STy);
+ uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
+ // Check that we can continue to model this GEP in a signed 64-bit offset.
+ if (ElementOffset > INT64_MAX ||
+ (GEPOffset >= 0 &&
+ ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) {
+ DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
+ << "what can be represented in an int64_t!\n"
+ << " alloca: " << P.AI << "\n");
+ return false;
+ }
+ if (GEPOffset < 0)
+ GEPOffset = ElementOffset + (uint64_t)-GEPOffset;
+ else
+ GEPOffset += ElementOffset;
+ continue;
+ }
+
+ APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits());
+ Index *= APInt(Index.getBitWidth(),
+ TD.getTypeAllocSize(GTI.getIndexedType()));
+ Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset,
+ /*isSigned*/true);
+ // Check if the result can be stored in our int64_t offset.
+ if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) {
+ DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
+ << "what can be represented in an int64_t!\n"
+ << " alloca: " << P.AI << "\n");
+ return false;
+ }
+
+ GEPOffset = Index.getSExtValue();
+ }
+ return true;
+ }
+
+ Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1+CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2)) {
+ assert(*U == SI.getOperand(1));
+ return SI.getOperand(1);
+ }
+ return 0;
+ }
+};
+
+/// \brief Builder for the alloca partitioning.
+///
+/// This class builds an alloca partitioning by recursively visiting the uses
+/// of an alloca and splitting the partitions for each load and store at each
+/// offset.
+class AllocaPartitioning::PartitionBuilder
+ : public BuilderBase<PartitionBuilder, bool> {
+ friend class InstVisitor<PartitionBuilder, bool>;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
+
+public:
+ PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
+ : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}
+
+ /// \brief Run the builder over the allocation.
+ bool operator()() {
+ // Note that we have to re-evaluate size on each trip through the loop as
+ // the queue grows at the tail.
+ for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
+ U = Queue[Idx].U;
+ Offset = Queue[Idx].Offset;
+ if (!visit(cast<Instruction>(U->getUser())))
+ return false;
+ }
+ return true;
+ }
+
+private:
+ bool markAsEscaping(Instruction &I) {
+ P.PointerEscapingInstr = &I;
+ return false;
+ }
+
+ void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which have a zero size or don't overlap the
+ // allocation.
+ if (Size == 0 ||
+ (Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
+ (Offset < 0 && (uint64_t)-Offset >= Size)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+ << " which starts past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ return;
+ }
+
+ // Clamp the start to the beginning of the allocation.
+ if (Offset < 0) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to start at the beginning of the alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ Size -= (uint64_t)-Offset;
+ Offset = 0;
+ }
+
+ uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ // NOTE! This may appear superficially to be something we could ignore
+ // entirely, but that is not so! There may be PHI-node uses where some
+ // instructions are dead but not others. We can't completely ignore the
+ // PHI node, and so have to record at least the information here.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to remain within the " << AllocSize << " byte alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ Partition New(BeginOffset, EndOffset, IsSplittable);
+ P.Partitions.push_back(New);
+ }
+
+ bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset,
+ bool IsVolatile) {
+ uint64_t Size = TD.getTypeStoreSize(Ty);
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
+ Size > (AllocSize - (uint64_t)Offset)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
+ << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
+ << " which extends past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ return true;
+ }
+
+ // We allow splitting of loads and stores whose type is an integer type
+ // and which cover the entire alloca. Such integer loads and stores
+ // often require decomposition into fine-grained loads and stores.
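+ // For example (illustrative), a non-volatile i64 load covering an entire
+ // 8-byte alloca is marked splittable so it can later be decomposed when
+ // other uses only touch part of those bytes.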
+ bool IsSplittable = false;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+ IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ return true;
+ }
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC, Offset);
+ return true;
+ }
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ int64_t GEPOffset;
+ if (!computeConstantGEPOffset(GEPI, GEPOffset))
+ return markAsEscaping(GEPI);
+
+ enqueueUsers(GEPI, GEPOffset);
+ return true;
+ }
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+ return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile());
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return markAsEscaping(SI);
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ return handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile());
+ }
+
+
+ bool visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ insertUse(II, Offset, Size, Length);
+ return true;
+ }
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ if (!Size)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return true;
+
+ MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+
+ // Only intrinsics with a constant length can be split.
+ Offsets.IsSplittable = Length;
+
+ if (*U == II.getRawDest()) {
+ Offsets.DestBegin = Offset;
+ Offsets.DestEnd = Offset + Size;
+ }
+ if (*U == II.getRawSource()) {
+ Offsets.SourceBegin = Offset;
+ Offsets.SourceEnd = Offset + Size;
+ }
+
+ // If we have set up end offsets for both the source and the destination,
+ // we have found both sides of this transfer pointing at the same alloca.
+ bool SeenBothEnds = Offsets.SourceEnd && Offsets.DestEnd;
+ if (SeenBothEnds && II.getRawDest() != II.getRawSource()) {
+ unsigned PrevIdx = MemTransferPartitionMap[&II];
+
+ // Check if the begin offsets match and this is a non-volatile transfer.
+ // In that case, we can completely elide the transfer.
+ if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
+ P.Partitions[PrevIdx].kill();
+ return true;
+ }
+
+ // Otherwise we have an offset transfer within the same alloca. We can't
+ // split those.
+ P.Partitions[PrevIdx].IsSplittable = Offsets.IsSplittable = false;
+ } else if (SeenBothEnds) {
+ // Handle the case where this exact use provides both ends of the
+ // operation.
+ assert(II.getRawDest() == II.getRawSource());
+
+ // For non-volatile transfers this is a no-op.
+ if (!II.isVolatile())
+ return true;
+
+ // Otherwise just suppress splitting.
+ Offsets.IsSplittable = false;
+ }
+
+
+ // Insert the use now that we've fixed up the splittable nature.
+ insertUse(II, Offset, Size, Offsets.IsSplittable);
+
+ // Set up the mapping from intrinsic to partition if we've not seen both
+ // ends of this transfer.
+ if (!SeenBothEnds) {
+ unsigned NewIdx = P.Partitions.size() - 1;
+ bool Inserted
+ = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx)).second;
+ assert(Inserted &&
+ "Already have intrinsic in map but haven't seen both ends");
+ (void)Inserted;
+ }
+
+ return true;
+ }
+
+ // Disable SRoA for any intrinsics except for lifetime invariants.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end) {
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
+ insertUse(II, Offset, Size, true);
+ return true;
+ }
+
+ return markAsEscaping(II);
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for partitioning purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ // If there are no loads or stores, the access is dead. We mark that as
+ // a size zero access.
+ Size = 0;
+ do {
+ Instruction *I, *UsedI;
+ llvm::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI;
+ Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP;
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I)) {
+ return I;
+ }
+
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+ ++UI)
+ if (Visited.insert(cast<Instruction>(*UI)))
+ Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
+ } while (!Uses.empty());
+
+ return 0;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ // See if we already have computed info on this node.
+ std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
+ if (PHIInfo.first) {
+ PHIInfo.second = true;
+ insertUse(PN, Offset, PHIInfo.first);
+ return true;
+ }
+
+ // Check for an unsafe use of the PHI node.
+ if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
+ return markAsEscaping(*EscapingI);
+
+ insertUse(PN, Offset, PHIInfo.first);
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ if (Value *Result = foldSelectInst(SI)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the select as if we had RAUW'ed it.
+ enqueueUsers(SI, Offset);
+
+ return true;
+ }
+
+ // See if we already have computed info on this node.
+ std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
+ if (SelectInfo.first) {
+ SelectInfo.second = true;
+ insertUse(SI, Offset, SelectInfo.first);
+ return true;
+ }
+
+ // Check for an unsafe use of the select.
+ if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
+ return markAsEscaping(*EscapingI);
+
+ insertUse(SI, Offset, SelectInfo.first);
+ return true;
+ }
+
+ /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
+};
+
+
+/// \brief Use adder for the alloca partitioning.
+///
+/// This class adds the uses of an alloca to all of the partitions which they
+/// use. For splittable partitions, this can end up doing essentially a linear
+/// walk of the partitions, but the number of steps remains bounded by the
+/// total result instruction size:
+/// - The number of partitions is a result of the number of unsplittable
+/// instructions using the alloca.
+/// - The number of users of each partition is at worst the total number of
+/// splittable instructions using the alloca.
+/// Thus we will produce N * M instructions in the end, where N is the number
+/// of unsplittable uses and M is the number of splittable ones. This visitor
+/// does the exact same number of updates to the partitioning.
+///
+/// In the more common case, this visitor will leverage the fact that the
+/// partition space is pre-sorted, and do a logarithmic search for the
+/// partition needed, making the total visit a classical ((N + M) * log(N))
+/// complexity operation.
+class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
+ friend class InstVisitor<UseBuilder>;
+
+ /// \brief Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
+ : BuilderBase<UseBuilder>(TD, AI, P) {}
+
+ /// \brief Run the builder over the allocation.
+ void operator()() {
+ // Note that we have to re-evaluate size on each trip through the loop as
+ // the queue grows at the tail.
+ for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
+ U = Queue[Idx].U;
+ Offset = Queue[Idx].Offset;
+ this->visit(cast<Instruction>(U->getUser()));
+ }
+ }
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I))
+ P.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
+ // If the use has a zero size or extends outside of the allocation, record
+ // it as a dead use for elimination later.
+ if (Size == 0 || (uint64_t)Offset >= AllocSize ||
+ (Offset < 0 && (uint64_t)-Offset >= Size))
+ return markAsDead(User);
+
+ // Clamp the start to the beginning of the allocation.
+ if (Offset < 0) {
+ Size -= (uint64_t)-Offset;
+ Offset = 0;
+ }
+
+ uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset)
+ EndOffset = AllocSize;
+
+ // NB: This only works if we have zero overlapping partitions.
+ iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
+ if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
+ B = llvm::prior(B);
+ for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
+ ++I) {
+ PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset),
+ std::min(I->EndOffset, EndOffset), U);
+ P.use_push_back(I, NewPU);
+ if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
+ P.PHIOrSelectOpMap[U]
+ = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
+ }
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
+ uint64_t Size = TD.getTypeStoreSize(Ty);
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse.
+ if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
+ Size > (AllocSize - (uint64_t)Offset))
+ return markAsDead(I);
+
+ insertUse(I, Offset, Size);
+ }
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ enqueueUsers(BC, Offset);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ int64_t GEPOffset;
+ if (!computeConstantGEPOffset(GEPI, GEPOffset))
+ llvm_unreachable("Unable to compute constant offset for use");
+
+ enqueueUsers(GEPI, GEPOffset);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ handleLoadOrStore(LI.getType(), LI, Offset);
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ insertUse(II, Offset, Size);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ if (!Size)
+ return markAsDead(II);
+
+ MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+ if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
+ Offsets.DestBegin == Offsets.SourceBegin)
+ return markAsDead(II); // Skip identity transfers without side-effects.
+
+ insertUse(II, Offset, Size);
+ }
+
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end);
+
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ insertUse(II, Offset,
+ std::min(AllocSize - Offset, Length->getLimitedValue()));
+ }
+
+ void insertPHIOrSelect(Instruction &User, uint64_t Offset) {
+ uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ if (Offset >= AllocSize) {
+ P.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(User, Offset, Size);
+ }
+ void visitPHINode(PHINode &PN) {
+ if (PN.use_empty())
+ return markAsDead(PN);
+
+ insertPHIOrSelect(PN, Offset);
+ }
+ void visitSelectInst(SelectInst &SI) {
+ if (SI.use_empty())
+ return markAsDead(SI);
+
+ if (Value *Result = foldSelectInst(SI)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the select as if we had RAUW'ed it.
+ enqueueUsers(SI, Offset);
+ else
+ // Otherwise the operand to the select is dead, and we can replace it
+ // with undef.
+ P.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ insertPHIOrSelect(SI, Offset);
+ }
+
+ /// \brief Unreachable, we've already visited the alloca once.
+ void visitInstruction(Instruction &I) {
+ llvm_unreachable("Unhandled instruction in use builder.");
+ }
+};
+
+void AllocaPartitioning::splitAndMergePartitions() {
+ size_t NumDeadPartitions = 0;
+
+ // Track the range of splittable partitions that we pass when accumulating
+ // overlapping unsplittable partitions.
+ uint64_t SplitEndOffset = 0ull;
+
+ Partition New(0ull, 0ull, false);
+
+ for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
+ ++j;
+
+ if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
+ assert(New.BeginOffset == New.EndOffset);
+ New = Partitions[i];
+ } else {
+ assert(New.IsSplittable);
+ New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
+ }
+ assert(New.BeginOffset != New.EndOffset);
+
+ // Scan the overlapping partitions.
+ while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
+ // If the new partition we are forming is splittable, stop at the first
+ // unsplittable partition.
+ if (New.IsSplittable && !Partitions[j].IsSplittable)
+ break;
+
+ // Grow the new partition to include any equally splittable range. 'j' is
+ // always equally splittable when New is splittable, but when New is not
+ // splittable, we may subsume some (or part of some) splittable partition
+ // without growing the new one.
+ if (New.IsSplittable == Partitions[j].IsSplittable) {
+ New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
+ } else {
+ assert(!New.IsSplittable);
+ assert(Partitions[j].IsSplittable);
+ SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
+ }
+
+ Partitions[j].kill();
+ ++NumDeadPartitions;
+ ++j;
+ }
+
+ // If the new partition is splittable, chop off the end as soon as the
+ // unsplittable subsequent partition starts and ensure we eventually cover
+ // the splittable area.
+ if (j != e && New.IsSplittable) {
+ SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
+ New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+ }
+
+ // Add the new partition if it differs from the original one and is
+ // non-empty. We can end up with an empty partition here if it was
+ // splittable but there is an unsplittable one that starts at the same
+ // offset.
+ if (New != Partitions[i]) {
+ if (New.BeginOffset != New.EndOffset)
+ Partitions.push_back(New);
+ // Mark the old one for removal.
+ Partitions[i].kill();
+ ++NumDeadPartitions;
+ }
+
+ New.BeginOffset = New.EndOffset;
+ if (!New.IsSplittable) {
+ New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
+ if (j != e && !Partitions[j].IsSplittable)
+ New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+ New.IsSplittable = true;
+ // If there is a trailing splittable partition which won't be fused into
+ // the next splittable partition, go ahead and add it onto the partitions
+ // list.
+ if (New.BeginOffset < New.EndOffset &&
+ (j == e || !Partitions[j].IsSplittable ||
+ New.EndOffset < Partitions[j].BeginOffset)) {
+ Partitions.push_back(New);
+ New.BeginOffset = New.EndOffset = 0ull;
+ }
+ }
+ }
+
+ // Re-sort the partitions now that they have been split and merged into a
+ // disjoint set of partitions. Also remove any of the dead partitions we've
+ // replaced in the process.
+ std::sort(Partitions.begin(), Partitions.end());
+ if (NumDeadPartitions) {
+ assert(Partitions.back().isDead());
+ assert((ptrdiff_t)NumDeadPartitions ==
+ std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
+ }
+ Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
+}
+
+AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
+ :
+#ifndef NDEBUG
+ AI(AI),
+#endif
+ PointerEscapingInstr(0) {
+ PartitionBuilder PB(TD, AI, *this);
+ if (!PB())
+ return;
+
+ // Sort the partitions. This arranges for the offsets to be in ascending
+ // order, and the sizes to be in descending order.
+ std::sort(Partitions.begin(), Partitions.end());
+
+ // Remove any partitions from the back which are marked as dead.
+ while (!Partitions.empty() && Partitions.back().isDead())
+ Partitions.pop_back();
+
+ if (Partitions.size() > 1) {
+ // Intersect splittability for all partitions with equal offsets and sizes.
+ // Then remove all but the first so that we have a sequence of non-equal but
+ // potentially overlapping partitions.
+ for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
+ I = J) {
+ ++J;
+ while (J != E && *I == *J) {
+ I->IsSplittable &= J->IsSplittable;
+ ++J;
+ }
+ }
+ Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
+ Partitions.end());
+
+ // Split splittable and merge unsplittable partitions into a disjoint set
+ // of partitions over the used space of the allocation.
+ splitAndMergePartitions();
+ }
+
+ // Now build up the user lists for each of these disjoint partitions by
+ // re-walking the recursive users of the alloca.
+ Uses.resize(Partitions.size());
+ UseBuilder UB(TD, AI, *this);
+ UB();
+}
+
+Type *AllocaPartitioning::getCommonType(iterator I) const {
+ Type *Ty = 0;
+ for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
+ if (!UI->U)
+ continue; // Skip dead uses.
+ if (isa<IntrinsicInst>(*UI->U->getUser()))
+ continue;
+ if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
+ continue;
+
+ Type *UserTy = 0;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) {
+ UserTy = SI->getValueOperand()->getType();
+ } else {
+ return 0; // Bail if we have weird uses.
+ }
+
+ if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) {
+ // If the type is larger than the partition, skip it. We only encounter
+ // this for split integer operations where we want to use the type of the
+ // entity causing the split.
+ if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8)
+ continue;
+
+ // If we have found an integer type use covering the alloca, use that
+ // regardless of the other types, as integers are often used for a "bucket
+ // of bits" type.
+ return ITy;
+ }
+
+ if (Ty && Ty != UserTy)
+ return 0;
+
+ Ty = UserTy;
+ }
+ return Ty;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << "partition #" << (I - begin())
+ << " [" << I->BeginOffset << "," << I->EndOffset << ")"
+ << (I->IsSplittable ? " (splittable)" : "")
+ << (Uses[I - begin()].empty() ? " (zero uses)" : "")
+ << "\n";
+}
+
+void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ for (const_use_iterator UI = use_begin(I), UE = use_end(I);
+ UI != UE; ++UI) {
+ if (!UI->U)
+ continue; // Skip dead uses.
+ OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") "
+ << "used by: " << *UI->U->getUser() << "\n";
+ if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) {
+ const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
+ bool IsDest;
+ if (!MTO.IsSplittable)
+ IsDest = UI->BeginOffset == MTO.DestBegin;
+ else
+ IsDest = MTO.DestBegin != 0u;
+ OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": "
+ << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
+ << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
+ }
+ }
+}
+
+void AllocaPartitioning::print(raw_ostream &OS) const {
+ if (PointerEscapingInstr) {
+ OS << "No partitioning for alloca: " << AI << "\n"
+ << " A pointer to this alloca escaped by:\n"
+ << " " << *PointerEscapingInstr << "\n";
+ return;
+ }
+
+ OS << "Partitioning of alloca: " << AI << "\n";
+ unsigned Num = 0;
+ for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) {
+ print(OS, I);
+ printUsers(OS, I);
+ }
+}
+
+void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
+void AllocaPartitioning::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+
+namespace {
+/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
+///
+/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
+/// the loads and stores of an alloca instruction, as well as updating its
+/// debug information. This is used when a domtree is unavailable and thus
+/// mem2reg in its full form can't be used to handle promotion of allocas to
+/// scalar values.
+class AllocaPromoter : public LoadAndStorePromoter {
+ AllocaInst &AI;
+ DIBuilder &DIB;
+
+ SmallVector<DbgDeclareInst *, 4> DDIs;
+ SmallVector<DbgValueInst *, 4> DVIs;
+
+public:
+ AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+ AllocaInst &AI, DIBuilder &DIB)
+ : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+
+ void run(const SmallVectorImpl<Instruction*> &Insts) {
+ // Remember which alloca we're promoting (for isInstInList).
+ if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
+ for (Value::use_iterator UI = DebugNode->use_begin(),
+ UE = DebugNode->use_end();
+ UI != UE; ++UI)
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+ DVIs.push_back(DVI);
+ }
+
+ LoadAndStorePromoter::run(Insts);
+ AI.eraseFromParent();
+ while (!DDIs.empty())
+ DDIs.pop_back_val()->eraseFromParent();
+ while (!DVIs.empty())
+ DVIs.pop_back_val()->eraseFromParent();
+ }
+
+ virtual bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction*> &Insts) const {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getOperand(0) == &AI;
+ return cast<StoreInst>(I)->getPointerOperand() == &AI;
+ }
+
+ virtual void updateDebugInfo(Instruction *Inst) const {
+ for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+ E = DDIs.end(); I != E; ++I) {
+ DbgDeclareInst *DDI = *I;
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ }
+ for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+ E = DVIs.end(); I != E; ++I) {
+ DbgValueInst *DVI = *I;
+ Value *Arg = NULL;
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // If an argument is zero extended then use the argument directly. The
+ // ZExt may be zapped by an optimization pass in the future.
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+ Arg = dyn_cast<Argument>(ZExt->getOperand(0));
+ if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+ Arg = dyn_cast<Argument>(SExt->getOperand(0));
+ if (!Arg)
+ Arg = SI->getOperand(0);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ Arg = LI->getOperand(0);
+ } else {
+ continue;
+ }
+ Instruction *DbgVal =
+ DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+ Inst);
+ DbgVal->setDebugLoc(DVI->getDebugLoc());
+ }
+ }
+};
+} // end anon namespace
+
+
+namespace {
+/// \brief An optimization pass providing Scalar Replacement of Aggregates.
+///
+/// This pass takes allocations which can be completely analyzed (that is, they
+/// don't escape) and tries to turn them into scalar SSA values. There are
+/// a few steps to this process.
+///
+/// 1) It takes allocations of aggregates and analyzes the ways in which they
+/// are used to try to split them into smaller allocations, ideally of
+/// a single scalar data type. It will split up memcpy and memset accesses
+/// as necessary and try to isolate individual scalar accesses.
+/// 2) It will transform accesses into forms which are suitable for SSA value
+/// promotion. This can be replacing a memset with a scalar store of an
+/// integer value, or it can involve speculating operations on a PHI or
+/// select to be a PHI or select of the results.
+/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
+/// onto insert and extract operations on a vector value, and convert them to
+/// this form. By doing so, it will enable promotion of vector aggregates to
+/// SSA vector values.
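+///
+/// As a purely illustrative example, an alloca such as
+///   %agg = alloca { i32, float }
+/// whose two fields are only ever loaded and stored individually can be
+/// replaced by two scalar SSA values with no memory traffic at all.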
+class SROA : public FunctionPass {
+ const bool RequiresDomTree;
+
+ LLVMContext *C;
+ const DataLayout *TD;
+ DominatorTree *DT;
+
+ /// \brief Worklist of alloca instructions to simplify.
+ ///
+ /// Each alloca in the function is added to this. Each new alloca formed gets
+ /// added to it as well to recursively simplify unless that alloca can be
+ /// directly promoted. Finally, each time we rewrite a use of an alloca other
+ /// than the one being actively rewritten, we add it back onto the list if not
+ /// already present to ensure it is re-visited.
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+
+ /// \brief A collection of instructions to delete.
+ /// We try to batch deletions to simplify code and make things a bit more
+ /// efficient.
+ SetVector<Instruction *, SmallVector<Instruction *, 8> > DeadInsts;
+
+ /// \brief Post-promotion worklist.
+ ///
+ /// Sometimes we discover an alloca which has a high probability of becoming
+ /// viable for SROA after a round of promotion takes place. In those cases,
+ /// the alloca is enqueued here for re-processing.
+ ///
+ /// Note that we have to be very careful to clear allocas out of this list in
+ /// the event they are deleted.
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > PostPromotionWorklist;
+
+ /// \brief A collection of alloca instructions we can directly promote.
+ std::vector<AllocaInst *> PromotableAllocas;
+
+public:
+ SROA(bool RequiresDomTree = true)
+ : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
+ C(0), TD(0), DT(0) {
+ initializeSROAPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F);
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ const char *getPassName() const { return "SROA"; }
+ static char ID;
+
+private:
+ friend class PHIOrSelectSpeculator;
+ friend class AllocaPartitionRewriter;
+ friend class AllocaPartitionVectorRewriter;
+
+ bool rewriteAllocaPartition(AllocaInst &AI,
+ AllocaPartitioning &P,
+ AllocaPartitioning::iterator PI);
+ bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
+ bool runOnAlloca(AllocaInst &AI);
+ void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
+ bool promoteAllocas(Function &F);
+};
+}
+
+char SROA::ID = 0;
+
+FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
+ return new SROA(RequiresDomTree);
+}
+
+INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
+
+namespace {
+/// \brief Visitor to speculate PHIs and Selects where possible.
+class PHIOrSelectSpeculator : public InstVisitor<PHIOrSelectSpeculator> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<PHIOrSelectSpeculator>;
+
+ const DataLayout &TD;
+ AllocaPartitioning &P;
+ SROA &Pass;
+
+public:
+ PHIOrSelectSpeculator(const DataLayout &TD, AllocaPartitioning &P, SROA &Pass)
+ : TD(TD), P(P), Pass(Pass) {}
+
+ /// \brief Visit the users of an alloca partition and rewrite them.
+ void visitUsers(AllocaPartitioning::const_iterator PI) {
+ // Note that we need to use an index here as the underlying vector of uses
+ // may be grown during speculation. However, we never need to re-visit the
+ // new uses, and so we can use the initial size bound.
+ for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) {
+ const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx);
+ if (!PU.U)
+ continue; // Skip dead use.
+
+ visit(cast<Instruction>(PU.U->getUser()));
+ }
+ }
+
+private:
+ // By default, skip this instruction.
+ void visitInstruction(Instruction &I) {}
+
+ /// PHI instructions that use an alloca and are subsequently loaded can be
+ /// rewritten to load both input pointers in the pred blocks and then PHI the
+ /// results, allowing the load of the alloca to be promoted.
+ /// From this:
+ /// %P2 = phi [i32* %Alloca, i32* %Other]
+ /// %V = load i32* %P2
+ /// to:
+ /// %V1 = load i32* %Alloca -> will be mem2reg'd
+ /// ...
+ /// %V2 = load i32* %Other
+ /// ...
+ /// %V = phi [i32 %V1, i32 %V2]
+ ///
+ /// We can do this to a select if its only uses are loads and if the operands
+ /// to the select can be loaded unconditionally.
+ ///
+ /// FIXME: This should be hoisted into a generic utility, likely in
+ /// Transforms/Util/Local.h
+ bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ unsigned MaxAlign = 0;
+ for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
+ UI != UE; ++UI) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI);
+ if (LI == 0 || !LI->isSimple()) return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB) return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ MaxAlign = std::max(MaxAlign, LI->getAlignment());
+ Loads.push_back(LI);
+ }
+
+ // We can only transform this if it is safe to push the loads into the
+ // predecessor blocks. The only thing to watch out for is that we can't put
+ // a possibly trapping load in the predecessor if it is a critical edge.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
+ ++Idx) {
+ TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (InVal->isDereferenceablePointer() ||
+ isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
+ continue;
+
+ return false;
+ }
+
+ return true;
+ }
+
+ void visitPHINode(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+
+ SmallVector<LoadInst *, 4> Loads;
+ if (!isSafePHIToSpeculate(PN, Loads))
+ return;
+
+ assert(!Loads.empty());
+
+ Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+ IRBuilder<> PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
+
+ // Get the TBAA tag and alignment to use from one of the loads. It doesn't
+ // matter which one we pick; any of them is acceptable even if they differ.
+ LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
+ MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+ unsigned Align = SomeLoad->getAlignment();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ do {
+ LoadInst *LI = Loads.pop_back_val();
+ LI->replaceAllUsesWith(NewPN);
+ Pass.DeadInsts.insert(LI);
+ } while (!Loads.empty());
+
+ // Inject loads into all of the pred blocks.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ TerminatorInst *TI = Pred->getTerminator();
+ Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx));
+ Value *InVal = PN.getIncomingValue(Idx);
+ IRBuilder<> PredBuilder(TI);
+
+ LoadInst *Load
+ = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." +
+ Pred->getName()));
+ ++NumLoadsSpeculated;
+ Load->setAlignment(Align);
+ if (TBAATag)
+ Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ NewPN->addIncoming(Load, Pred);
+
+ Instruction *Ptr = dyn_cast<Instruction>(InVal);
+ if (!Ptr)
+ // No uses to rewrite.
+ continue;
+
+ // Try to lookup and rewrite any partition uses corresponding to this phi
+ // input.
+ AllocaPartitioning::iterator PI
+ = P.findPartitionForPHIOrSelectOperand(InUse);
+ if (PI == P.end())
+ continue;
+
+ // Replace the Use in the PartitionUse for this operand with the Use
+ // inside the load.
+ AllocaPartitioning::use_iterator UI
+ = P.findPartitionUseForPHIOrSelectOperand(InUse);
+ assert(isa<PHINode>(*UI->U->getUser()));
+ UI->U = &Load->getOperandUse(Load->getPointerOperandIndex());
+ }
+ DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ }
+
+ /// Select instructions that use an alloca and are subsequently loaded can be
+ /// rewritten to load both input pointers and then select between the results,
+ /// allowing the load of the alloca to be promoted.
+ /// From this:
+ /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+ /// %V = load i32* %P2
+ /// to:
+ /// %V1 = load i32* %Alloca -> will be mem2reg'd
+ /// %V2 = load i32* %Other
+ /// %V = select i1 %cond, i32 %V1, i32 %V2
+ ///
+ /// We can do this to a select if its only uses are loads and if the operand
+ /// to the select can be loaded unconditionally.
+ bool isSafeSelectToSpeculate(SelectInst &SI,
+ SmallVectorImpl<LoadInst *> &Loads) {
+ Value *TValue = SI.getTrueValue();
+ Value *FValue = SI.getFalseValue();
+ bool TDerefable = TValue->isDereferenceablePointer();
+ bool FDerefable = FValue->isDereferenceablePointer();
+
+ for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
+ UI != UE; ++UI) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI);
+ if (LI == 0 || !LI->isSimple()) return false;
+
+ // Both operands to the select need to be dereferenceable, either
+ // absolutely (e.g. allocas) or at this point because we can see other
+ // accesses to them.
+ if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
+ LI->getAlignment(), &TD))
+ return false;
+ if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
+ LI->getAlignment(), &TD))
+ return false;
+ Loads.push_back(LI);
+ }
+
+ return true;
+ }
+
+ void visitSelectInst(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ IRBuilder<> IRB(&SI);
+
+ // If the select isn't safe to speculate, bail out without rewriting it.
+ SmallVector<LoadInst *, 4> Loads;
+ if (!isSafeSelectToSpeculate(SI, Loads))
+ return;
+
+ Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) };
+ AllocaPartitioning::iterator PIs[2];
+ AllocaPartitioning::PartitionUse PUs[2];
+ for (unsigned i = 0, e = 2; i != e; ++i) {
+ PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]);
+ if (PIs[i] != P.end()) {
+ // If the pointer is within the partitioning, remove the select from
+ // its uses. We'll add in the new loads below.
+ AllocaPartitioning::use_iterator UI
+ = P.findPartitionUseForPHIOrSelectOperand(Ops[i]);
+ PUs[i] = *UI;
+ // Clear out the use here so that the offsets into the use list remain
+ // stable but this use is ignored when rewriting.
+ UI->U = 0;
+ }
+ }
+
+ Value *TV = SI.getTrueValue();
+ Value *FV = SI.getFalseValue();
+ // Replace the loads of the select with a select of two loads.
+ while (!Loads.empty()) {
+ LoadInst *LI = Loads.pop_back_val();
+
+ IRB.SetInsertPoint(LI);
+ LoadInst *TL =
+ IRB.CreateLoad(TV, LI->getName() + ".sroa.speculate.load.true");
+ LoadInst *FL =
+ IRB.CreateLoad(FV, LI->getName() + ".sroa.speculate.load.false");
+ NumLoadsSpeculated += 2;
+
+ // Transfer alignment and TBAA info if present.
+ TL->setAlignment(LI->getAlignment());
+ FL->setAlignment(LI->getAlignment());
+ if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
+ TL->setMetadata(LLVMContext::MD_tbaa, Tag);
+ FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+ LI->getName() + ".sroa.speculated");
+
+ LoadInst *Loads[2] = { TL, FL };
+ for (unsigned i = 0, e = 2; i != e; ++i) {
+ if (PIs[i] != P.end()) {
+ Use *LoadUse = &Loads[i]->getOperandUse(0);
+ assert(PUs[i].U->get() == LoadUse->get());
+ PUs[i].U = LoadUse;
+ P.use_push_back(PIs[i], PUs[i]);
+ }
+ }
+
+ DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ Pass.DeadInsts.insert(LI);
+ }
+ }
+};
+}
+
+/// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
+///
+/// If the provided GEP is all-constant, the total byte offset formed by the
+/// GEP is computed and Offset is set to it. If the GEP has any non-constant
+/// operands, the function returns false and the value of Offset is unmodified.
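+///
+/// For example (illustrative, assuming a typical data layout), a GEP of the
+/// form
+///   getelementptr { i32, [4 x i16] }* %p, i64 0, i32 1, i64 2
+/// accumulates to a byte offset of 4 + 2 * 2 = 8.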
+static bool accumulateGEPOffsets(const DataLayout &TD, GEPOperator &GEP,
+ APInt &Offset) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ return false;
+ if (OpC->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = TD.getStructLayout(STy);
+ GEPOffset += APInt(Offset.getBitWidth(),
+ SL->getElementOffset(ElementIdx));
+ continue;
+ }
+
+ APInt TypeSize(Offset.getBitWidth(),
+ TD.getTypeAllocSize(GTI.getIndexedType()));
+ if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
+ assert((VTy->getScalarSizeInBits() % 8) == 0 &&
+ "vector element size is not a multiple of 8, cannot GEP over it");
+ TypeSize = VTy->getScalarSizeInBits() / 8;
+ }
+
+ GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
+ }
+ Offset = GEPOffset;
+ return true;
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ if (Indices.empty())
+ return BasePtr;
+
+ // A single zero index is a no-op, so check for this and avoid building a GEP
+ // in that case.
+ if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+ return BasePtr;
+
+ return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
+static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, Prefix);
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0;
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
+ ElementTy = SeqTy->getElementType();
+ // Note that we use the default address space as this index is over an
+ // array or a vector, not a pointer.
+ Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(0), 0)));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ if (STy->element_begin() == STy->element_end())
+ break; // Nothing left to descend into.
+ ElementTy = *STy->element_begin();
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+ return buildGEP(IRB, BasePtr, Indices, Prefix);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
+static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return 0;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
+ if (ElementSizeInBits % 8)
+ return 0; // GEPs over elements whose size isn't a multiple of 8 bits are invalid.
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(VecTy->getNumElements()))
+ return 0;
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, Prefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+ return 0;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+ Indices, Prefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return 0;
+
+ const StructLayout *SL = TD.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return 0;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
+ return 0; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+ Indices, Prefix);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
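+///
+/// For example (an illustrative sketch), with an i32* base pointer and a byte
+/// offset of 12, the first index emitted is 3 and the recursion continues
+/// with a remaining offset of 0.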
+static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+ // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+ // an i8.
+ if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
+ return 0;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return 0; // We can't GEP through an unsized element.
+ APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+ if (ElementSize == 0)
+ return 0; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+ Indices, Prefix);
+}
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
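+///
+/// For example (illustrative), adjusting a { i32, i32 }* by 4 bytes toward an
+/// i32* result would ideally yield a GEP to the second field rather than
+/// a raw byte-offset GEP through i8* followed by a bitcast.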
+static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD,
+ Value *Ptr, APInt Offset, Type *PointerTy,
+ const Twine &Prefix) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+ // are never able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it around here.
+ Value *OffsetPtr = 0;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = 0;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ Type *TargetTy = PointerTy->getPointerElementType();
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr))
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
+ Indices, Prefix)) {
+ if (P->getType() == PointerTy) {
+ // Zap any offset pointer that we ended up computing in previous rounds.
+ if (OffsetPtr && OffsetPtr->use_empty())
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
+ I->eraseFromParent();
+ return P;
+ }
+ if (!OffsetPtr) {
+ OffsetPtr = P;
+ }
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->mayBeOverridden())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr));
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
+ Prefix + ".raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
+ IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+ Prefix + ".raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (Ptr->getType() != PointerTy)
+ Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
+
+ return Ptr;
+}
+
+/// \brief Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+ if (OldTy == NewTy)
+ return true;
+ if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+ return false;
+ if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+ return false;
+
+ if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+ if (NewTy->isPointerTy() && OldTy->isPointerTy())
+ return true;
+ if (NewTy->isIntegerTy() || OldTy->isIntegerTy())
+ return true;
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
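+///
+/// For example (a sketch, assuming 64-bit pointers), an i64 converts to an
+/// i8* via inttoptr, the reverse via ptrtoint, and same-sized first-class
+/// values such as i32 and float fall back to a plain bitcast.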
+static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+ Type *Ty) {
+ assert(canConvertValue(DL, V->getType(), Ty) &&
+ "Value not convertable to type");
+ if (V->getType() == Ty)
+ return V;
+ if (V->getType()->isIntegerTy() && Ty->isPointerTy())
+ return IRB.CreateIntToPtr(V, Ty);
+ if (V->getType()->isPointerTy() && Ty->isIntegerTy())
+ return IRB.CreatePtrToInt(V, Ty);
+
+ return IRB.CreateBitCast(V, Ty);
+}
+
+/// \brief Test whether the given alloca partition can be promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+static bool isVectorPromotionViable(const DataLayout &TD,
+ Type *AllocaTy,
+ AllocaPartitioning &P,
+ uint64_t PartitionBeginOffset,
+ uint64_t PartitionEndOffset,
+ AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
+ if (!Ty)
+ return false;
+
+ uint64_t VecSize = TD.getTypeSizeInBits(Ty);
+ uint64_t ElementSize = Ty->getScalarSizeInBits();
+
+ // While the definition of LLVM vectors is bitpacked, we don't support sizes
+ // that aren't byte sized.
+ if (ElementSize % 8)
+ return false;
+ assert((VecSize % 8) == 0 && "vector size not a multiple of element size?");
+ VecSize /= 8;
+ ElementSize /= 8;
+
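+ // Each use must start and end on an element boundary and cover either
+ // exactly one element or the entire vector for us to rewrite it as vector
+ // operations.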
+ for (; I != E; ++I) {
+ if (!I->U)
+ continue; // Skip dead use.
+
+ uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= Ty->getNumElements())
+ return false;
+ uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset ||
+ EndIndex > Ty->getNumElements())
+ return false;
+
+ // FIXME: We should build shuffle vector instructions to handle
+ // non-element-sized accesses.
+ if ((EndOffset - BeginOffset) != ElementSize &&
+ (EndOffset - BeginOffset) != VecSize)
+ return false;
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(*MTI);
+ if (!MTO.IsSplittable)
+ return false;
+ }
+ } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ } else {
+ return false;
+ }
+ }
+ return true;
+}
+
+/// \brief Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(const DataLayout &TD,
+ Type *AllocaTy,
+ uint64_t AllocBeginOffset,
+ AllocaPartitioning &P,
+ AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(TD, AllocaTy, IntTy) ||
+ !canConvertValue(TD, IntTy, AllocaTy))
+ return false;
+
+ uint64_t Size = TD.getTypeStoreSize(AllocaTy);
+
+ // Check the uses to ensure they are (likely) promotable integer uses.
+ // Also ensure that the alloca has a covering load or store. We don't want
+ // to widen the integer operations only to fail to promote due to some other
+ // unsplittable entry (which we may make splittable later).
+ bool WholeAllocaOp = false;
+ for (; I != E; ++I) {
+ if (!I->U)
+ continue; // Skip dead use.
+
+ uint64_t RelBegin = I->BeginOffset - AllocBeginOffset;
+ uint64_t RelEnd = I->EndOffset - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ if (RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+ return false;
+ continue;
+ }
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
+ if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(TD, AllocaTy, LI->getType()))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) {
+ Type *ValueTy = SI->getValueOperand()->getType();
+ if (SI->isVolatile())
+ return false;
+ if (RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+ if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+ return false;
+ continue;
+ }
+ // Non-integer stores need to be convertible to the alloca type so that
+ // they are promotable.
+ if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(TD, ValueTy, AllocaTy))
+ return false;
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) {
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(*MTI);
+ if (!MTO.IsSplittable)
+ return false;
+ }
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) {
+ if (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end)
+ return false;
+ } else {
+ return false;
+ }
+ }
+ return WholeAllocaOp;
+}
+
+static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V,
+ IntegerType *Ty, uint64_t Offset,
+ const Twine &Name) {
+ DEBUG(dbgs() << " start: " << *V << "\n");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8*Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ if (ShAmt) {
+ V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+ DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (Ty != IntTy) {
+ V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+ DEBUG(dbgs() << " trunced: " << *V << "\n");
+ }
+ return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old,
+ Value *V, uint64_t Offset, const Twine &Name) {
+ IntegerType *IntTy = cast<IntegerType>(Old->getType());
+ IntegerType *Ty = cast<IntegerType>(V->getType());
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot insert a larger integer!");
+ DEBUG(dbgs() << " start: " << *V << "\n");
+ if (Ty != IntTy) {
+ V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+ DEBUG(dbgs() << " extended: " << *V << "\n");
+ }
+ assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ "Element store outside of alloca store");
+ uint64_t ShAmt = 8*Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8*(DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ if (ShAmt) {
+ V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+ DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+
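+ // Unless the new value covers the entire wide integer, clear the target
+ // bits in the old value and merge the shifted new bits in with an 'or'.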
+ if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+ APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+ Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+ DEBUG(dbgs() << " masked: " << *Old << "\n");
+ V = IRB.CreateOr(Old, V, Name + ".insert");
+ DEBUG(dbgs() << " inserted: " << *V << "\n");
+ }
+ return V;
+}
+
+namespace {
+/// \brief Visitor to rewrite instructions using a partition of an alloca to
+/// use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
+ bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
+
+ const DataLayout &TD;
+ AllocaPartitioning &P;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - Each accessing instruction is one of the kinds handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+ // This is a convenience and flag variable that will be null unless the new
+ // alloca's integer operations should be widened to this integer type due to
+ // passing isIntegerWideningViable above. If it is non-null, the desired
+ // integer type will be stored here for easy access during rewriting.
+ IntegerType *IntTy;
+
+ // The offset of the partition user currently being rewritten.
+ uint64_t BeginOffset, EndOffset;
+ Use *OldUse;
+ Instruction *OldPtr;
+
+ // The name prefix to use when rewriting instructions for this alloca.
+ std::string NamePrefix;
+
+public:
+ AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P,
+ AllocaPartitioning::iterator PI,
+ SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewBeginOffset, uint64_t NewEndOffset)
+ : TD(TD), P(P), Pass(Pass),
+ OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewBeginOffset),
+ NewAllocaEndOffset(NewEndOffset),
+ NewAllocaTy(NewAI.getAllocatedType()),
+ VecTy(), ElementTy(), ElementSize(), IntTy(),
+ BeginOffset(), EndOffset() {
+ }
+
+ /// \brief Visit the users of the alloca partition and rewrite them.
+ bool visitUsers(AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
+ NewAllocaBeginOffset, NewAllocaEndOffset,
+ I, E)) {
+ ++NumVectorized;
+ VecTy = cast<VectorType>(NewAI.getAllocatedType());
+ ElementTy = VecTy->getElementType();
+ assert((VecTy->getScalarSizeInBits() % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ElementSize = VecTy->getScalarSizeInBits() / 8;
+ } else if (isIntegerWideningViable(TD, NewAI.getAllocatedType(),
+ NewAllocaBeginOffset, P, I, E)) {
+ IntTy = Type::getIntNTy(NewAI.getContext(),
+ TD.getTypeSizeInBits(NewAI.getAllocatedType()));
+ }
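+ // Rewrite each live use of this partition against the new alloca, and
+ // track whether every rewritten use remains promotable.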
+ bool CanSROA = true;
+ for (; I != E; ++I) {
+ if (!I->U)
+ continue; // Skip dead uses.
+ BeginOffset = I->BeginOffset;
+ EndOffset = I->EndOffset;
+ OldUse = I->U;
+ OldPtr = cast<Instruction>(I->U->get());
+ NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
+ CanSROA &= visit(cast<Instruction>(I->U->getUser()));
+ }
+ if (VecTy) {
+ assert(CanSROA);
+ VecTy = 0;
+ ElementTy = 0;
+ ElementSize = 0;
+ }
+ if (IntTy) {
+ assert(CanSROA);
+ IntTy = 0;
+ }
+ return CanSROA;
+ }
+
+private:
+ // Every instruction which can end up as a user must have a rewrite rule.
+ bool visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ Twine getName(const Twine &Suffix) {
+ return NamePrefix + Suffix;
+ }
+
+ Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) {
+ assert(BeginOffset >= NewAllocaBeginOffset);
+ APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
+ return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
+ }
+
+ /// \brief Compute suitable alignment to access an offset into the new alloca.
+ unsigned getOffsetAlign(uint64_t Offset) {
+ unsigned NewAIAlign = NewAI.getAlignment();
+ if (!NewAIAlign)
+ NewAIAlign = TD.getABITypeAlignment(NewAI.getAllocatedType());
+ return MinAlign(NewAIAlign, Offset);
+ }
+
+ /// \brief Compute suitable alignment to access this partition of the new
+ /// alloca.
+ unsigned getPartitionAlign() {
+ return getOffsetAlign(BeginOffset - NewAllocaBeginOffset);
+ }
+
+ /// \brief Compute suitable alignment to access a type at an offset of the
+ /// new alloca.
+ ///
+ /// \returns zero if the type's ABI alignment is a suitable alignment,
+ /// otherwise returns the maximal suitable alignment.
+ unsigned getOffsetTypeAlign(Type *Ty, uint64_t Offset) {
+ unsigned Align = getOffsetAlign(Offset);
+ return Align == TD.getABITypeAlignment(Ty) ? 0 : Align;
+ }
+
+ /// \brief Compute suitable alignment to access a type at the beginning of
+ /// this partition of the new alloca.
+ ///
+ /// See \c getOffsetTypeAlign for details; this routine delegates to it.
+ unsigned getPartitionTypeAlign(Type *Ty) {
+ return getOffsetTypeAlign(Ty, BeginOffset - NewAllocaBeginOffset);
+ }
+
+ ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
+ assert(VecTy && "Can only call getIndex when rewriting a vector");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+ uint32_t Index = RelOffset / ElementSize;
+ assert(Index * ElementSize == RelOffset);
+ return IRB.getInt32(Index);
+ }
+
+ void deleteIfTriviallyDead(Value *V) {
+ Instruction *I = cast<Instruction>(V);
+ if (isInstructionTriviallyDead(I))
+ Pass.DeadInsts.insert(I);
+ }
+
+ Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".load"));
+ if (LI.getType() == VecTy->getElementType() ||
+ BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
+ V = IRB.CreateExtractElement(V, getIndex(IRB, BeginOffset),
+ getName(".extract"));
+ }
+ return V;
+ }
+
+ Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+ assert(IntTy && "We cannot insert an integer to the alloca");
+ assert(!LI.isVolatile());
+ Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".load"));
+ V = convertValue(TD, IRB, V, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ if (Offset > 0 || EndOffset < NewAllocaEndOffset)
+ V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset,
+ getName(".extract"));
+ return V;
+ }
+
+ bool visitLoadInst(LoadInst &LI) {
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+ IRBuilder<> IRB(&LI);
+
+ uint64_t Size = EndOffset - BeginOffset;
+ bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType());
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the original allocation its behavior is undefined. Rather
+ // than trying to transform it, just replace it with undef.
+ // FIXME: We should do something more clever for functions being
+ // instrumented by asan.
+ // FIXME: Eventually, once ASan and friends can flush out bugs here, this
+ // should be transformed to a load of null making it unreachable.
+ uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType());
+ if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) {
+ LI.replaceAllUsesWith(UndefValue::get(LI.getType()));
+ Pass.DeadInsts.insert(&LI);
+ deleteIfTriviallyDead(OldOp);
+ DEBUG(dbgs() << " to: undef!!\n");
+ return true;
+ }
+
+ Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8)
+ : LI.getType();
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst(IRB, LI, OldOp);
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(IRB, LI);
+ } else if (BeginOffset == NewAllocaBeginOffset &&
+ canConvertValue(TD, NewAllocaTy, LI.getType())) {
+ V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ LI.isVolatile(), getName(".load"));
+ } else {
+ Type *LTy = TargetTy->getPointerTo();
+ V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy),
+ getPartitionTypeAlign(TargetTy),
+ LI.isVolatile(), getName(".load"));
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(TD, IRB, V, TargetTy);
+
+ if (IsSplitIntLoad) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ TD.getTypeStoreSizeInBits(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ assert(LI.getType()->getIntegerBitWidth() ==
+ TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) &&
+ "Only alloca-wide loads can be split and recomposed");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
+ Value *Placeholder
+ = new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+ V = insertInteger(TD, IRB, Placeholder, V, BeginOffset,
+ getName(".insert"));
+ LI.replaceAllUsesWith(V);
+ Placeholder->replaceAllUsesWith(&LI);
+ delete Placeholder;
+ } else {
+ LI.replaceAllUsesWith(V);
+ }
+
+ Pass.DeadInsts.insert(&LI);
+ deleteIfTriviallyDead(OldOp);
+ DEBUG(dbgs() << " to: " << *V << "\n");
+ return !LI.isVolatile() && !IsPtrAdjusted;
+ }
+
+ bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V,
+ StoreInst &SI, Value *OldOp) {
+ if (V->getType() == ElementTy ||
+ BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
+ if (V->getType() != ElementTy)
+ V = convertValue(TD, IRB, V, ElementTy);
+ LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".load"));
+ V = IRB.CreateInsertElement(LI, V, getIndex(IRB, BeginOffset),
+ getName(".insert"));
+ } else if (V->getType() != VecTy) {
+ V = convertValue(TD, IRB, V, VecTy);
+ }
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset,
+ getName(".insert"));
+ }
+ V = convertValue(TD, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ Pass.DeadInsts.insert(&SI);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+ IRBuilder<> IRB(&SI);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ uint64_t Size = EndOffset - BeginOffset;
+ if (Size < TD.getTypeStoreSize(V->getType())) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(V->getType()->getIntegerBitWidth() ==
+ TD.getTypeStoreSizeInBits(V->getType()) &&
+ "Non-byte-multiple bit width");
+ assert(V->getType()->getIntegerBitWidth() ==
+ TD.getTypeSizeInBits(OldAI.getAllocatedType()) &&
+ "Only alloca-wide stores can be split and recomposed");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8);
+ V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset,
+ getName(".extract"));
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(IRB, V, SI, OldOp);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(IRB, V, SI);
+
+ StoreInst *NewSI;
+ if (BeginOffset == NewAllocaBeginOffset &&
+ canConvertValue(TD, V->getType(), NewAllocaTy)) {
+ V = convertValue(TD, IRB, V, NewAllocaTy);
+ NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ SI.isVolatile());
+ } else {
+ Value *NewPtr = getAdjustedAllocaPtr(IRB, V->getType()->getPointerTo());
+ NewSI = IRB.CreateAlignedStore(V, NewPtr,
+ getPartitionTypeAlign(V->getType()),
+ SI.isVolatile());
+ }
+ (void)NewSI;
+ Pass.DeadInsts.insert(&SI);
+ deleteIfTriviallyDead(OldOp);
+
+ DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+ assert(II.getRawDest() == OldPtr);
+
+ // If the memset has a variable size, it cannot be split, just adjust the
+ // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(ConstantInt::get(CstTy, getPartitionAlign()));
+
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!VecTy && !IntTy &&
+ (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset ||
+ !AllocaTy->isSingleValueType() ||
+ !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+ CallInst *New
+ = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
+ II.getRawDest()->getType()),
+ II.getValue(), Size, getPartitionAlign(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, bitcasting to the
+ // desired scalar type, and splatting it across any desired vector type.
+ uint64_t Size = EndOffset - BeginOffset;
+ Value *V = II.getValue();
+ IntegerType *VTy = cast<IntegerType>(V->getType());
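+ // If the store is wider than a single byte, splat the byte across the full
+ // width by multiplying the zero-extended byte by a constant of the form
+ // 0x0101...01, computed as the all-ones value of the wide type divided by
+ // 0xff.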
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8);
+ if (Size*8 > VTy->getBitWidth())
+ V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(
+ Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ getName(".isplat"));
+
+ // If this is an element-wide memset of a vectorizable alloca, insert it.
+ if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
+ EndOffset < NewAllocaEndOffset)) {
+ if (V->getType() != ScalarTy)
+ V = convertValue(TD, IRB, V, ScalarTy);
+ StoreInst *Store = IRB.CreateAlignedStore(
+ IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI,
+ NewAI.getAlignment(),
+ getName(".load")),
+ V, getIndex(IRB, BeginOffset),
+ getName(".insert")),
+ &NewAI, NewAI.getAlignment());
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ if (IntTy && (BeginOffset > NewAllocaBeginOffset ||
+ EndOffset < NewAllocaEndOffset)) {
+ assert(!II.isVolatile());
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert"));
+ }
+
+ if (V->getType() != AllocaTy)
+ V = convertValue(TD, IRB, V, AllocaTy);
+
+ Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+
+ assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
+ bool IsDest = II.getRawDest() == OldPtr;
+
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(II);
+
+ // Compute the relative offset within the transfer.
+ unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
+ : MTO.SourceBegin));
+
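+ // Clamp the alignment of the rewritten transfer to what both the original
+ // intrinsic and the offset into the new alloca partition can guarantee.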
+ unsigned Align = II.getAlignment();
+ if (Align > 1)
+ Align = MinAlign(RelOffset.zextOrTrunc(64).getZExtValue(),
+ MinAlign(II.getAlignment(), getPartitionAlign()));
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+ // memcpy, and so simply updating the pointers is necessary for us to
+ // update both source and dest of a single call.
+ if (!MTO.IsSplittable) {
+ Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
+ if (IsDest)
+ II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+ else
+ II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
+
+ Type *CstTy = II.getAlignmentCst()->getType();
+ II.setAlignment(ConstantInt::get(CstTy, Align));
+
+ DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldOp);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy
+ = !VecTy && !IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+ // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+ // size hasn't been shrunk based on analysis of the viable range, this is
+ // a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
+ uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
+ // Ensure the start lines up.
+ assert(BeginOffset == OrigBegin);
+ (void)OrigBegin;
+
+ // Rewrite the size as needed.
+ if (EndOffset != OrigEnd)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ EndOffset - BeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset &&
+ EndOffset == NewAllocaEndOffset;
+ bool IsVectorElement = VecTy && !IsWholeAlloca;
+ uint64_t Size = EndOffset - BeginOffset;
+ IntegerType *SubIntTy
+ = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0;
+
+ Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
+ : II.getRawDest()->getType();
+ if (!EmitMemCpy) {
+ if (IsVectorElement)
+ OtherPtrTy = VecTy->getElementType()->getPointerTo();
+ else if (IntTy && !IsWholeAlloca)
+ OtherPtrTy = SubIntTy->getPointerTo();
+ else
+ OtherPtrTy = NewAI.getType();
+ }
+
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
+ getName("." + OtherPtr->getName()));
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ if (AllocaInst *AI
+ = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
+ Pass.Worklist.insert(AI);
+
+ if (EmitMemCpy) {
+ Value *OurPtr
+ = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
+ : II.getRawSource()->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+
+ CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
+ IsDest ? OtherPtr : OurPtr,
+ Size, Align, II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // Note that we clamp the alignment to 1 here as a 0 alignment for a memcpy
+ // is equivalent to 1, but that isn't true if we end up rewriting this as
+ // a load or store.
+ if (!Align)
+ Align = 1;
+
+ Value *SrcPtr = OtherPtr;
+ Value *DstPtr = &NewAI;
+ if (!IsDest)
+ std::swap(SrcPtr, DstPtr);
+
+ Value *Src;
+ if (IsVectorElement && !IsDest) {
+ // We have to extract rather than load.
+ Src = IRB.CreateExtractElement(
+ IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")),
+ getIndex(IRB, BeginOffset),
+ getName(".copyextract"));
+ } else if (IntTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".load"));
+ Src = convertValue(TD, IRB, Src, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract"));
+ } else {
+ Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(),
+ getName(".copyload"));
+ }
+
+ if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(),
+ getName(".oldload"));
+ Old = convertValue(TD, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert"));
+ Src = convertValue(TD, IRB, Src, NewAllocaTy);
+ }
+
+ if (IsVectorElement && IsDest) {
+ // We have to insert into a loaded copy before storing.
+ Src = IRB.CreateInsertElement(
+ IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")),
+ Src, getIndex(IRB, BeginOffset),
+ getName(".insert"));
+ }
+
+ StoreInst *Store = cast<StoreInst>(
+ IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile()));
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+ assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end);
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+ assert(II.getArgOperand(1) == OldPtr);
+
+ // Record this instruction for deletion.
+ Pass.DeadInsts.insert(&II);
+
+ ConstantInt *Size
+ = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ EndOffset - BeginOffset);
+ Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
+ Value *New;
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+ New = IRB.CreateLifetimeStart(Ptr, Size);
+ else
+ New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return true;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+
+ // We would like to compute a new pointer in only one place, but have it be
+ // as local as possible to the PHI. To do that, we re-use the location of
+ // the old pointer, which necessarily must be in the right position to
+ // dominate the PHI.
+ IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));
+
+ Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+ DEBUG(dbgs() << " to: " << PN << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ IRBuilder<> IRB(&SI);
+
+ // Find the operand we need to rewrite here.
+ bool IsTrueVal = SI.getTrueValue() == OldPtr;
+ if (IsTrueVal)
+ assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
+ else
+ assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");
+
+ Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
+ SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
+ DEBUG(dbgs() << " to: " << SI << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+};
+}
+
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+ const DataLayout &TD;
+
+ /// Queue of pointer uses to analyze and potentially rewrite.
+ SmallVector<Use *, 8> Queue;
+
+ /// Set to prevent us from cycling with phi nodes and loops.
+ SmallPtrSet<User *, 8> Visited;
+
+ /// The current pointer use being rewritten. This is used to dig up the used
+ /// value (as opposed to the user).
+ Use *U;
+
+public:
+ AggLoadStoreRewriter(const DataLayout &TD) : TD(TD) {}
+
+ /// Rewrite loads and stores through a pointer and all pointers derived from
+ /// it.
+ bool rewrite(Instruction &I) {
+ DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ enqueueUsers(I);
+ bool Changed = false;
+ while (!Queue.empty()) {
+ U = Queue.pop_back_val();
+ Changed |= visit(cast<Instruction>(U->getUser()));
+ }
+ return Changed;
+ }
+
+private:
+ /// Enqueue all the users of the given instruction for further processing.
+ /// This uses a set to de-duplicate users.
+ void enqueueUsers(Instruction &I) {
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+ ++UI)
+ if (Visited.insert(*UI))
+ Queue.push_back(&UI.getUse());
+ }
+
+ // Conservative default is to not rewrite anything.
+ bool visitInstruction(Instruction &I) { return false; }
+
+ /// \brief Generic recursive split emission class.
+ template <typename Derived>
+ class OpSplitter {
+ protected:
+ /// The builder used to form new instructions.
+ IRBuilder<> IRB;
+ /// The indices to be used with insert- or extractvalue to select the
+ /// appropriate value within the aggregate.
+ SmallVector<unsigned, 4> Indices;
+ /// The indices to a GEP instruction which will move Ptr to the correct slot
+ /// within the aggregate.
+ SmallVector<Value *, 4> GEPIndices;
+ /// The base pointer of the original op, used as a base for GEPing the
+ /// split operations.
+ Value *Ptr;
+
+ /// Initialize the splitter with an insertion point, Ptr and start with a
+ /// single zero GEP index.
+ OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+ public:
+ /// \brief Generic recursive split emission routine.
+ ///
+ /// This method recursively splits an aggregate op (load or store) into
+ /// scalar or vector ops. It splits recursively until it hits a single value
+ /// and emits that single value operation via the template argument.
+ ///
+ /// The logic of this routine relies on GEPs and insertvalue and
+ /// extractvalue all operating with the same fundamental index list, merely
+ /// formatted differently (GEPs need actual values).
+ ///
+ /// \param Ty The type being split recursively into smaller ops.
+ /// \param Agg The aggregate value being built up or stored, depending on
+ /// whether this is splitting a load or a store respectively.
+ void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+ if (Ty->isSingleValueType())
+ return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ llvm_unreachable("Only arrays and structs are aggregate loadable types");
+ }
+ };
+
+ struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf load of a single value. This is called at the leaves of the
+ /// recursive emission to actually load values.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Load the single value and insert it using the indices.
+ Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
+ Name + ".gep"),
+ Name + ".load");
+ Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+ DEBUG(dbgs() << " to: " << *Load << "\n");
+ }
+ };
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert(LI.getPointerOperand() == *U);
+ if (!LI.isSimple() || LI.getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being loaded, split it apart.
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ LoadOpSplitter Splitter(&LI, *U);
+ Value *V = UndefValue::get(LI.getType());
+ Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ LI.replaceAllUsesWith(V);
+ LI.eraseFromParent();
+ return true;
+ }
+
+ struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf store of a single value. This is called at the leaves of the
+ /// recursive emission to actually produce stores.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Extract the single value and store it using the indices.
+ Value *Store = IRB.CreateStore(
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+ IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ }
+ };
+
+ bool visitStoreInst(StoreInst &SI) {
+ if (!SI.isSimple() || SI.getPointerOperand() != *U)
+ return false;
+ Value *V = SI.getValueOperand();
+ if (V->getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being stored, split it apart.
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ StoreOpSplitter Splitter(&SI, *U);
+ Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ SI.eraseFromParent();
+ return true;
+ }
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC);
+ return false;
+ }
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ enqueueUsers(GEPI);
+ return false;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ enqueueUsers(PN);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ enqueueUsers(SI);
+ return false;
+ }
+};
+}
+
+/// \brief Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+ if (Ty->isSingleValueType())
+ return Ty;
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+ uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+
+ Type *InnerTy;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ InnerTy = ArrTy->getElementType();
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Index = SL->getElementContainingOffset(0);
+ InnerTy = STy->getElementType(Index);
+ } else {
+ return Ty;
+ }
+
+ if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
+ TypeSize > DL.getTypeSizeInBits(InnerTy))
+ return Ty;
+
+ return stripAggregateTypeWrapping(DL, InnerTy);
+}
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+ /// when the size or offset causes either end of the type-based partition to be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+static Type *getTypePartition(const DataLayout &TD, Type *Ty,
+ uint64_t Offset, uint64_t Size) {
+ if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
+ return stripAggregateTypeWrapping(TD, Ty);
+ if (Offset > TD.getTypeAllocSize(Ty) ||
+ (TD.getTypeAllocSize(Ty) - Offset) < Size)
+ return 0;
+
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+ // We can't partition pointers...
+ if (SeqTy->isPointerTy())
+ return 0;
+
+ Type *ElementTy = SeqTy->getElementType();
+ uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
+ if (NumSkippedElements >= ArrTy->getNumElements())
+ return 0;
+ if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
+ if (NumSkippedElements >= VecTy->getNumElements())
+ return 0;
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return 0;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(TD, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(TD, ElementTy);
+ assert(Size > ElementSize);
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return 0;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return 0;
+
+ const StructLayout *SL = TD.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return 0;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return 0;
+
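+ // Find the struct element containing the start offset and rebase the
+ // offset so it is relative to the start of that element.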
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+ if (Offset >= ElementSize)
+ return 0; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return 0;
+ return getTypePartition(TD, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(TD, ElementTy);
+
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return 0; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return 0;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ StructType *SubTy = StructType::get(STy->getContext(), makeArrayRef(EI, EE),
+ STy->isPacked());
+ const StructLayout *SubSL = TD.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return 0; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+bool SROA::rewriteAllocaPartition(AllocaInst &AI,
+ AllocaPartitioning &P,
+ AllocaPartitioning::iterator PI) {
+ uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
+ bool IsLive = false;
+ for (AllocaPartitioning::use_iterator UI = P.use_begin(PI),
+ UE = P.use_end(PI);
+ UI != UE && !IsLive; ++UI)
+ if (UI->U)
+ IsLive = true;
+ if (!IsLive)
+ return false; // No live uses left of this partition.
+
+ DEBUG(dbgs() << "Speculating PHIs and selects in partition "
+ << "[" << PI->BeginOffset << "," << PI->EndOffset << ")\n");
+
+ PHIOrSelectSpeculator Speculator(*TD, P, *this);
+ DEBUG(dbgs() << " speculating ");
+ DEBUG(P.print(dbgs(), PI, ""));
+ Speculator.visitUsers(PI);
+
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *AllocaTy = 0;
+ if (Type *PartitionTy = P.getCommonType(PI))
+ if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
+ AllocaTy = PartitionTy;
+ if (!AllocaTy)
+ if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
+ PI->BeginOffset, AllocaSize))
+ AllocaTy = PartitionTy;
+ if ((!AllocaTy ||
+ (AllocaTy->isArrayTy() &&
+ AllocaTy->getArrayElementType()->isIntegerTy())) &&
+ TD->isLegalInteger(AllocaSize * 8))
+ AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
+ if (!AllocaTy)
+ AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
+ assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ AllocaInst *NewAI;
+ if (AllocaTy == AI.getAllocatedType()) {
+ assert(PI->BeginOffset == 0 &&
+ "Non-zero begin offset but same alloca type");
+ assert(PI == P.begin() && "Begin offset is zero on later partition");
+ NewAI = &AI;
+ } else {
+ unsigned Alignment = AI.getAlignment();
+ if (!Alignment) {
+ // The minimum alignment which users can rely on when the explicit
+ // alignment is omitted or zero is that required by the ABI for this
+ // type.
+ Alignment = TD->getABITypeAlignment(AI.getAllocatedType());
+ }
+ Alignment = MinAlign(Alignment, PI->BeginOffset);
+ // If we will get at least this much alignment from the type alone, leave
+ // the alloca's alignment unconstrained.
+ if (Alignment <= TD->getABITypeAlignment(AllocaTy))
+ Alignment = 0;
+ NewAI = new AllocaInst(AllocaTy, 0, Alignment,
+ AI.getName() + ".sroa." + Twine(PI - P.begin()),
+ &AI);
+ ++NumNewAllocas;
+ }
+
+ DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
+ << *NewAI << "\n");
+
+ // Track the high watermark of the post-promotion worklist. We will reset it
+ // to this point if the alloca is not in fact scheduled for promotion.
+ unsigned PPWOldSize = PostPromotionWorklist.size();
+
+ AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
+ PI->BeginOffset, PI->EndOffset);
+ DEBUG(dbgs() << " rewriting ");
+ DEBUG(P.print(dbgs(), PI, ""));
+ bool Promotable = Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI));
+ if (Promotable) {
+ DEBUG(dbgs() << " and queuing for promotion\n");
+ PromotableAllocas.push_back(NewAI);
+ } else if (NewAI != &AI) {
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
+ }
+
+ // Drop any post-promotion work items if promotion didn't happen.
+ if (!Promotable)
+ while (PostPromotionWorklist.size() > PPWOldSize)
+ PostPromotionWorklist.pop_back();
+
+ return true;
+}
+
+/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
+ bool Changed = false;
+ for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
+ ++PI)
+ Changed |= rewriteAllocaPartition(AI, P, PI);
+
+ return Changed;
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// a partitioning of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+
+ // Skip alloca forms that this analysis can't handle.
+ if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+ TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+ return false;
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to promote
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter(*TD);
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the partition set using a recursive instruction-visiting builder.
+ AllocaPartitioning P(*TD, AI);
+ DEBUG(P.print(dbgs()));
+ if (P.isEscaped())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
+ DE = P.dead_user_end();
+ DI != DE; ++DI) {
+ Changed = true;
+ (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+ DeadInsts.insert(*DI);
+ }
+ for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
+ DE = P.dead_op_end();
+ DO != DE; ++DO) {
+ Value *OldV = **DO;
+ // Clobber the use with an undef value.
+ **DO = UndefValue::get(OldV->getType());
+ if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+ if (isInstructionTriviallyDead(OldI)) {
+ Changed = true;
+ DeadInsts.insert(OldI);
+ }
+ }
+
+ // No partitions to split. Leave the dead alloca for a later pass to clean up.
+ if (P.begin() == P.end())
+ return Changed;
+
+ return splitAlloca(AI, P) || Changed;
+}
+
+/// \brief Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
+ while (!DeadInsts.empty()) {
+ Instruction *I = DeadInsts.pop_back_val();
+ DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+ for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+ if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+ // Zero out the operand and see if it becomes trivially dead.
+ *OI = 0;
+ if (isInstructionTriviallyDead(U))
+ DeadInsts.insert(U);
+ }
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ DeletedAllocas.insert(AI);
+
+ ++NumDeleted;
+ I->eraseFromParent();
+ }
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// If there is a domtree available, we attempt to promote using the full power
+/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
+/// based on the SSAUpdater utilities. This function returns whether any
+ /// promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+ if (PromotableAllocas.empty())
+ return false;
+
+ NumPromoted += PromotableAllocas.size();
+
+ if (DT && !ForceSSAUpdater) {
+ DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT);
+ PromotableAllocas.clear();
+ return true;
+ }
+
+ DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
+ SSAUpdater SSA;
+ DIBuilder DIB(*F.getParent());
+ SmallVector<Instruction*, 64> Insts;
+
+ for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
+ AllocaInst *AI = PromotableAllocas[Idx];
+ for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+ UI != UE;) {
+ Instruction *I = cast<Instruction>(*UI++);
+ // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
+ // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
+ // leading to them) here. Eventually it should use them to optimize the
+ // scalar values produced.
+ if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+ assert(onlyUsedByLifetimeMarkers(I) &&
+ "Found a bitcast used outside of a lifetime marker.");
+ while (!I->use_empty())
+ cast<Instruction>(*I->use_begin())->eraseFromParent();
+ I->eraseFromParent();
+ continue;
+ }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end);
+ II->eraseFromParent();
+ continue;
+ }
+
+ Insts.push_back(I);
+ }
+ AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
+ Insts.clear();
+ }
+
+ PromotableAllocas.clear();
+ return true;
+}
+
+namespace {
+ /// \brief A predicate to test whether an alloca belongs to a set.
+ class IsAllocaInSet {
+ typedef SmallPtrSet<AllocaInst *, 4> SetType;
+ const SetType &Set;
+
+ public:
+ typedef AllocaInst *argument_type;
+
+ IsAllocaInSet(const SetType &Set) : Set(Set) {}
+ bool operator()(AllocaInst *AI) const { return Set.count(AI); }
+ };
+}
+
+bool SROA::runOnFunction(Function &F) {
+ DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ C = &F.getContext();
+ TD = getAnalysisIfAvailable<DataLayout>();
+ if (!TD) {
+ DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
+ return false;
+ }
+ DT = getAnalysisIfAvailable<DominatorTree>();
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
+ I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ Worklist.insert(AI);
+
+ bool Changed = false;
+ // A set of deleted alloca instruction pointers which should be removed from
+ // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+ do {
+ while (!Worklist.empty()) {
+ Changed |= runOnAlloca(*Worklist.pop_back_val());
+ deleteDeadInstructions(DeletedAllocas);
+
+ // Remove the deleted allocas from various lists so that we don't try to
+ // continue processing them.
+ if (!DeletedAllocas.empty()) {
+ Worklist.remove_if(IsAllocaInSet(DeletedAllocas));
+ PostPromotionWorklist.remove_if(IsAllocaInSet(DeletedAllocas));
+ PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
+ PromotableAllocas.end(),
+ IsAllocaInSet(DeletedAllocas)),
+ PromotableAllocas.end());
+ DeletedAllocas.clear();
+ }
+ }
+
+ Changed |= promoteAllocas(F);
+
+ Worklist = PostPromotionWorklist;
+ PostPromotionWorklist.clear();
+ } while (!Worklist.empty());
+
+ return Changed;
+}
+
+void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
+ if (RequiresDomTree)
+ AU.addRequired<DominatorTree>();
+ AU.setPreservesCFG();
+}
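
A hedged usage sketch (not part of the patch): with the pass registration added below in Scalar.cpp, the new pass can be driven programmatically roughly as follows. createSROAPass and its RequiresDomTree parameter are assumed from the rest of this patch (declared in Scalar.h, outside this excerpt); runNewSROA is a hypothetical helper.

#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;

static void runNewSROA(Module &M) {
  FunctionPassManager FPM(&M);
  FPM.add(new DataLayout(&M));             // SROA bails out without DataLayout
  FPM.add(createSROAPass(/*RequiresDomTree=*/true));
  FPM.doInitialization();
  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
    if (!F->isDeclaration())
      FPM.run(*F);
  FPM.doFinalization();
}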
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 48318c8a55d0..39630fd027f0 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -19,7 +19,7 @@
#include "llvm/PassManager.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Scalar.h"
using namespace llvm;
@@ -59,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeRegToMemPass(Registry);
initializeSCCPPass(Registry);
initializeIPSCCPPass(Registry);
+ initializeSROAPass(Registry);
initializeSROA_DTPass(Registry);
initializeSROA_SSAUpPass(Registry);
initializeCFGSimplifyPassPass(Registry);
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index 6637126fcb7d..a46d09c32093 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -46,7 +46,7 @@
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -56,7 +56,6 @@ STATISTIC(NumReplaced, "Number of allocas broken up");
STATISTIC(NumPromoted, "Number of allocas promoted");
STATISTIC(NumAdjusted, "Number of scalar allocas adjusted to allow promotion");
STATISTIC(NumConverted, "Number of aggregates converted to scalar");
-STATISTIC(NumGlobals, "Number of allocas copied from constant global");
namespace {
struct SROA : public FunctionPass {
@@ -88,7 +87,7 @@ namespace {
private:
bool HasDomTree;
- TargetData *TD;
+ DataLayout *TD;
/// DeadInsts - Keep track of instructions we have made dead, so that
/// we can remove them after we are done working.
@@ -183,9 +182,6 @@ namespace {
void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
SmallVector<AllocaInst*, 32> &NewElts);
bool ShouldAttemptScalarRepl(AllocaInst *AI);
-
- static MemTransferInst *isOnlyCopiedFromConstantGlobal(
- AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete);
};
// SROA_DT - SROA that uses DominatorTree.
@@ -262,7 +258,7 @@ namespace {
class ConvertToScalarInfo {
/// AllocaSize - The size of the alloca being considered in bytes.
unsigned AllocaSize;
- const TargetData &TD;
+ const DataLayout &TD;
unsigned ScalarLoadThreshold;
/// IsNotTrivial - This is set to true if there is some access to the object
@@ -305,7 +301,7 @@ class ConvertToScalarInfo {
bool HadDynamicAccess;
public:
- explicit ConvertToScalarInfo(unsigned Size, const TargetData &td,
+ explicit ConvertToScalarInfo(unsigned Size, const DataLayout &td,
unsigned SLT)
: AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false),
ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false),
@@ -1024,11 +1020,11 @@ ConvertScalar_InsertValue(Value *SV, Value *Old,
bool SROA::runOnFunction(Function &F) {
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
bool Changed = performPromotion(F);
- // FIXME: ScalarRepl currently depends on TargetData more than it
+ // FIXME: ScalarRepl currently depends on DataLayout more than it
// theoretically needs to. It should be refactored in order to support
// target-independent IR. Until this is done, just skip the actual
// scalar-replacement portion of this pass.
@@ -1138,7 +1134,7 @@ public:
///
/// We can do this to a select if its only uses are loads and if the operand to
/// the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
+static bool isSafeSelectToSpeculate(SelectInst *SI, const DataLayout *TD) {
bool TDerefable = SI->getTrueValue()->isDereferenceablePointer();
bool FDerefable = SI->getFalseValue()->isDereferenceablePointer();
@@ -1176,7 +1172,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) {
///
/// We can do this to a select if its only uses are loads and if the operand to
/// the select can be loaded unconditionally.
-static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
+static bool isSafePHIToSpeculate(PHINode *PN, const DataLayout *TD) {
// For now, we can only do this promotion if the load is in the same block as
// the PHI, and if there are no stores between the phi and load.
// TODO: Allow recursive phi users.
@@ -1240,7 +1236,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) {
/// direct (non-volatile) loads and stores to it. If the alloca is close but
/// not quite there, this will transform the code to allow promotion. As such,
/// it is a non-pure predicate.
-static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) {
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const DataLayout *TD) {
SetVector<Instruction*, SmallVector<Instruction*, 4>,
SmallPtrSet<Instruction*, 4> > InstsToRewrite;
@@ -1465,26 +1461,6 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) {
return false;
}
-/// getPointeeAlignment - Compute the minimum alignment of the value pointed
-/// to by the given pointer.
-static unsigned getPointeeAlignment(Value *V, const TargetData &TD) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if (CE->getOpcode() == Instruction::BitCast ||
- (CE->getOpcode() == Instruction::GetElementPtr &&
- cast<GEPOperator>(CE)->hasAllZeroIndices()))
- return getPointeeAlignment(CE->getOperand(0), TD);
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- if (!GV->isDeclaration())
- return TD.getPreferredAlignment(GV);
-
- if (PointerType *PT = dyn_cast<PointerType>(V->getType()))
- return TD.getABITypeAlignment(PT->getElementType());
-
- return 0;
-}
-
-
// performScalarRepl - This algorithm is a simple worklist driven algorithm,
// which runs on all of the alloca instructions in the function, removing them
// if they are only used by getelementptr instructions.
@@ -1516,29 +1492,6 @@ bool SROA::performScalarRepl(Function &F) {
if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized())
continue;
- // Check to see if this allocation is only modified by a memcpy/memmove from
- // a constant global whose alignment is equal to or exceeds that of the
- // allocation. If this is the case, we can change all users to use
- // the constant global instead. This is commonly produced by the CFE by
- // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
- // is only subsequently read.
- SmallVector<Instruction *, 4> ToDelete;
- if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) {
- if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) {
- DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n');
- DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
- for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
- ToDelete[i]->eraseFromParent();
- Constant *TheSrc = cast<Constant>(Copy->getSource());
- AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
- Copy->eraseFromParent(); // Don't mutate the global.
- AI->eraseFromParent();
- ++NumGlobals;
- Changed = true;
- continue;
- }
- }
-
// Check to see if we can perform the core SROA transformation. We cannot
// transform the allocation instruction if it is an array allocation
// (allocations OF arrays are ok though), and an allocation of a scalar
@@ -2584,7 +2537,7 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI,
/// HasPadding - Return true if the specified type has any structure or
/// alignment padding in between the elements that would be split apart
/// by SROA; return false otherwise.
-static bool HasPadding(Type *Ty, const TargetData &TD) {
+static bool HasPadding(Type *Ty, const DataLayout &TD) {
if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
Ty = ATy->getElementType();
return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty);
@@ -2656,134 +2609,3 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) {
return true;
}
-
-
-
-/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
-/// some part of a constant global variable. This intentionally only accepts
-/// constant expressions because we don't can't rewrite arbitrary instructions.
-static bool PointsToConstantGlobal(Value *V) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- return GV->isConstant();
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if (CE->getOpcode() == Instruction::BitCast ||
- CE->getOpcode() == Instruction::GetElementPtr)
- return PointsToConstantGlobal(CE->getOperand(0));
- return false;
-}
-
-/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
-/// pointer to an alloca. Ignore any reads of the pointer, return false if we
-/// see any stores or other unknown uses. If we see pointer arithmetic, keep
-/// track of whether it moves the pointer (with isOffset) but otherwise traverse
-/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
-/// the alloca, and if the source pointer is a pointer to a constant global, we
-/// can optimize this.
-static bool
-isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
- bool isOffset,
- SmallVector<Instruction *, 4> &LifetimeMarkers) {
- // We track lifetime intrinsics as we encounter them. If we decide to go
- // ahead and replace the value with the global, this lets the caller quickly
- // eliminate the markers.
-
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
- User *U = cast<Instruction>(*UI);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- // Ignore non-volatile loads, they are always ok.
- if (!LI->isSimple()) return false;
- continue;
- }
-
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
- // If uses of the bitcast are ok, we are ok.
- if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset,
- LifetimeMarkers))
- return false;
- continue;
- }
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
- // If the GEP has all zero indices, it doesn't offset the pointer. If it
- // doesn't, it does.
- if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
- isOffset || !GEP->hasAllZeroIndices(),
- LifetimeMarkers))
- return false;
- continue;
- }
-
- if (CallSite CS = U) {
- // If this is the function being called then we treat it like a load and
- // ignore it.
- if (CS.isCallee(UI))
- continue;
-
- // If this is a readonly/readnone call site, then we know it is just a
- // load (but one that potentially returns the value itself), so we can
- // ignore it if we know that the value isn't captured.
- unsigned ArgNo = CS.getArgumentNo(UI);
- if (CS.onlyReadsMemory() &&
- (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo)))
- continue;
-
- // If this is being passed as a byval argument, the caller is making a
- // copy, so it is only a read of the alloca.
- if (CS.isByValArgument(ArgNo))
- continue;
- }
-
- // Lifetime intrinsics can be handled by the caller.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
- II->getIntrinsicID() == Intrinsic::lifetime_end) {
- assert(II->use_empty() && "Lifetime markers have no result to use!");
- LifetimeMarkers.push_back(II);
- continue;
- }
- }
-
- // If this is isn't our memcpy/memmove, reject it as something we can't
- // handle.
- MemTransferInst *MI = dyn_cast<MemTransferInst>(U);
- if (MI == 0)
- return false;
-
- // If the transfer is using the alloca as a source of the transfer, then
- // ignore it since it is a load (unless the transfer is volatile).
- if (UI.getOperandNo() == 1) {
- if (MI->isVolatile()) return false;
- continue;
- }
-
- // If we already have seen a copy, reject the second one.
- if (TheCopy) return false;
-
- // If the pointer has been offset from the start of the alloca, we can't
- // safely handle this.
- if (isOffset) return false;
-
- // If the memintrinsic isn't using the alloca as the dest, reject it.
- if (UI.getOperandNo() != 0) return false;
-
- // If the source of the memcpy/move is not a constant global, reject it.
- if (!PointsToConstantGlobal(MI->getSource()))
- return false;
-
- // Otherwise, the transform is safe. Remember the copy instruction.
- TheCopy = MI;
- }
- return true;
-}
-
-/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
-/// modified by a copy from a constant global. If we can prove this, we can
-/// replace any uses of the alloca with uses of the global directly.
-MemTransferInst *
-SROA::isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
- SmallVector<Instruction*, 4> &ToDelete) {
- MemTransferInst *TheCopy = 0;
- if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false, ToDelete))
- return TheCopy;
- return 0;
-}
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index d13e4abff9dc..9f24bb635e88 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -31,10 +31,11 @@
#include "llvm/Attributes.h"
#include "llvm/Support/CFG.h"
#include "llvm/Pass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/TargetTransformInfo.h"
using namespace llvm;
STATISTIC(NumSimpl, "Number of blocks simplified");
@@ -59,9 +60,9 @@ FunctionPass *llvm::createCFGSimplificationPass() {
return new CFGSimplifyPass();
}
-/// ChangeToUnreachable - Insert an unreachable instruction before the specified
+/// changeToUnreachable - Insert an unreachable instruction before the specified
/// instruction, making it and the rest of the code in the block dead.
-static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
+static void changeToUnreachable(Instruction *I, bool UseLLVMTrap) {
BasicBlock *BB = I->getParent();
// Loop over all of the successors, removing BB's entry from any PHI
// nodes.
@@ -87,8 +88,8 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) {
}
}
-/// ChangeToCall - Convert the specified invoke into a normal call.
-static void ChangeToCall(InvokeInst *II) {
+/// changeToCall - Convert the specified invoke into a normal call.
+static void changeToCall(InvokeInst *II) {
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3);
CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II);
NewCall->takeName(II);
@@ -105,7 +106,7 @@ static void ChangeToCall(InvokeInst *II) {
II->eraseFromParent();
}
-static bool MarkAliveBlocks(BasicBlock *BB,
+static bool markAliveBlocks(BasicBlock *BB,
SmallPtrSet<BasicBlock*, 128> &Reachable) {
SmallVector<BasicBlock*, 128> Worklist;
@@ -129,7 +130,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
++BBI;
if (!isa<UnreachableInst>(BBI)) {
// Don't insert a call to llvm.trap right before the unreachable.
- ChangeToUnreachable(BBI, false);
+ changeToUnreachable(BBI, false);
Changed = true;
}
break;
@@ -148,7 +149,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
if (isa<UndefValue>(Ptr) ||
(isa<ConstantPointerNull>(Ptr) &&
SI->getPointerAddressSpace() == 0)) {
- ChangeToUnreachable(SI, true);
+ changeToUnreachable(SI, true);
Changed = true;
break;
}
@@ -159,7 +160,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) {
Value *Callee = II->getCalledValue();
if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
- ChangeToUnreachable(II, true);
+ changeToUnreachable(II, true);
Changed = true;
} else if (II->doesNotThrow()) {
if (II->use_empty() && II->onlyReadsMemory()) {
@@ -168,7 +169,7 @@ static bool MarkAliveBlocks(BasicBlock *BB,
II->getUnwindDest()->removePredecessor(II->getParent());
II->eraseFromParent();
} else
- ChangeToCall(II);
+ changeToCall(II);
Changed = true;
}
}
@@ -180,12 +181,12 @@ static bool MarkAliveBlocks(BasicBlock *BB,
return Changed;
}
-/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even
+/// removeUnreachableBlocksFromFn - Remove blocks that are not reachable, even
/// if they are in a dead cycle. Return true if a change was made, false
/// otherwise.
-static bool RemoveUnreachableBlocksFromFn(Function &F) {
+static bool removeUnreachableBlocksFromFn(Function &F) {
SmallPtrSet<BasicBlock*, 128> Reachable;
- bool Changed = MarkAliveBlocks(F.begin(), Reachable);
+ bool Changed = markAliveBlocks(F.begin(), Reachable);
// If there are unreachable blocks in the CFG...
if (Reachable.size() == F.size())
@@ -215,9 +216,9 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) {
return true;
}
-/// MergeEmptyReturnBlocks - If we have more than one empty (other than phi
+/// mergeEmptyReturnBlocks - If we have more than one empty (other than phi
/// node) return blocks, merge them together to promote recursive block merging.
-static bool MergeEmptyReturnBlocks(Function &F) {
+static bool mergeEmptyReturnBlocks(Function &F) {
bool Changed = false;
BasicBlock *RetBlock = 0;
@@ -291,9 +292,10 @@ static bool MergeEmptyReturnBlocks(Function &F) {
return Changed;
}
-/// IterativeSimplifyCFG - Call SimplifyCFG on all the blocks in the function,
+/// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function,
/// iterating until no more changes are made.
-static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
+static bool iterativelySimplifyCFG(Function &F, const DataLayout *TD,
+ const TargetTransformInfo *TTI) {
bool Changed = false;
bool LocalChange = true;
while (LocalChange) {
@@ -302,7 +304,7 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
// Loop over all of the basic blocks and remove them if they are unneeded...
//
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
- if (SimplifyCFG(BBIt++, TD)) {
+ if (SimplifyCFG(BBIt++, TD, TTI)) {
LocalChange = true;
++NumSimpl;
}
@@ -316,25 +318,27 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) {
// simplify the CFG.
//
bool CFGSimplifyPass::runOnFunction(Function &F) {
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
- bool EverChanged = RemoveUnreachableBlocksFromFn(F);
- EverChanged |= MergeEmptyReturnBlocks(F);
- EverChanged |= IterativeSimplifyCFG(F, TD);
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
+ const TargetTransformInfo *TTI =
+ getAnalysisIfAvailable<TargetTransformInfo>();
+ bool EverChanged = removeUnreachableBlocksFromFn(F);
+ EverChanged |= mergeEmptyReturnBlocks(F);
+ EverChanged |= iterativelySimplifyCFG(F, TD, TTI);
// If neither pass changed anything, we're done.
if (!EverChanged) return false;
- // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens,
- // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should
+ // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
+ // removeUnreachableBlocksFromFn is needed to nuke them, which means we should
// iterate between the two optimizations. We structure the code like this to
- // avoid reruning IterativeSimplifyCFG if the second pass of
- // RemoveUnreachableBlocksFromFn doesn't do anything.
- if (!RemoveUnreachableBlocksFromFn(F))
+ // avoid rerunning iterativelySimplifyCFG if the second pass of
+ // removeUnreachableBlocksFromFn doesn't do anything.
+ if (!removeUnreachableBlocksFromFn(F))
return true;
do {
- EverChanged = IterativeSimplifyCFG(F, TD);
- EverChanged |= RemoveUnreachableBlocksFromFn(F);
+ EverChanged = iterativelySimplifyCFG(F, TD, TTI);
+ EverChanged |= removeUnreachableBlocksFromFn(F);
} while (EverChanged);
return true;
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index f110320c1bf9..17d07cdb2d4d 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -28,9 +28,10 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Config/config.h" // FIXME: Shouldn't depend on host!
using namespace llvm;
@@ -38,6 +39,10 @@ using namespace llvm;
STATISTIC(NumSimplified, "Number of library calls simplified");
STATISTIC(NumAnnotated, "Number of attributes added to library functions");
+static cl::opt<bool> UnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable unsafe double to float "
+ "shrinking for math lib calls"));
//===----------------------------------------------------------------------===//
// Optimizer Base Class
//===----------------------------------------------------------------------===//
@@ -48,7 +53,7 @@ namespace {
class LibCallOptimization {
protected:
Function *Caller;
- const TargetData *TD;
+ const DataLayout *TD;
const TargetLibraryInfo *TLI;
LLVMContext* Context;
public:
@@ -63,7 +68,7 @@ public:
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
=0;
- Value *OptimizeCall(CallInst *CI, const TargetData *TD,
+ Value *OptimizeCall(CallInst *CI, const DataLayout *TD,
const TargetLibraryInfo *TLI, IRBuilder<> &B) {
Caller = CI->getParent()->getParent();
this->TD = TD;
@@ -85,22 +90,6 @@ public:
// Helper Functions
//===----------------------------------------------------------------------===//
-/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
-/// value is equal or not-equal to zero.
-static bool IsOnlyUsedInZeroEqualityComparison(Value *V) {
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
- UI != E; ++UI) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
- if (IC->isEquality())
- if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
static bool CallHasFloatingPointArgument(const CallInst *CI) {
for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
it != e; ++it) {
@@ -110,799 +99,62 @@ static bool CallHasFloatingPointArgument(const CallInst *CI) {
return false;
}
-/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality
-/// comparisons with With.
-static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) {
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
- UI != E; ++UI) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
- if (IC->isEquality() && IC->getOperand(1) == With)
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
+namespace {
//===----------------------------------------------------------------------===//
-// String and Memory LibCall Optimizations
+// Math Library Optimizations
//===----------------------------------------------------------------------===//
//===---------------------------------------===//
-// 'strcat' Optimizations
-namespace {
-struct StrCatOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strcat" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- FT->getParamType(1) != FT->getReturnType())
- return 0;
-
- // Extract some information from the instruction
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len == 0) return 0;
- --Len; // Unbias length.
-
- // Handle the simple, do-nothing case: strcat(x, "") -> x
- if (Len == 0)
- return Dst;
-
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- return EmitStrLenMemCpy(Src, Dst, Len, B);
- }
-
- Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) {
- // We need to find the end of the destination string. That's where the
- // memory is to be moved to. We just generate a call to strlen.
- Value *DstLen = EmitStrLen(Dst, B, TD, TLI);
- if (!DstLen)
- return 0;
-
- // Now that we have the destination's length, we must index into the
- // destination's pointer to get the actual memcpy destination (end of
- // the string .. we're concatenating).
- Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
-
- // We have enough information to now generate the memcpy call to do the
- // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(CpyDst, Src,
- ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1);
- return Dst;
- }
-};
-
-//===---------------------------------------===//
-// 'strncat' Optimizations
-
-struct StrNCatOpt : public StrCatOpt {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strncat" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 ||
- FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- FT->getParamType(1) != FT->getReturnType() ||
- !FT->getParamType(2)->isIntegerTy())
- return 0;
-
- // Extract some information from the instruction
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- uint64_t Len;
-
- // We don't do anything if length is not constant
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
- Len = LengthArg->getZExtValue();
- else
- return 0;
-
- // See if we can get the length of the input string.
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen == 0) return 0;
- --SrcLen; // Unbias length.
-
- // Handle the simple, do-nothing cases:
- // strncat(x, "", c) -> x
- // strncat(x, c, 0) -> x
- if (SrcLen == 0 || Len == 0) return Dst;
-
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- // We don't optimize this case
- if (Len < SrcLen) return 0;
-
- // strncat(x, s, c) -> strcat(x, s)
- // s is constant so the strcat can be optimized further
- return EmitStrLenMemCpy(Src, Dst, SrcLen, B);
- }
-};
-
-//===---------------------------------------===//
-// 'strchr' Optimizations
-
-struct StrChrOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strchr" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- !FT->getParamType(1)->isIntegerTy(32))
- return 0;
-
- Value *SrcStr = CI->getArgOperand(0);
-
- // If the second operand is non-constant, see if we can compute the length
- // of the input string and turn this into memchr.
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (CharC == 0) {
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- uint64_t Len = GetStringLength(SrcStr);
- if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32.
- return 0;
-
- return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
- ConstantInt::get(TD->getIntPtrType(*Context), Len),
- B, TD, TLI);
- }
-
- // Otherwise, the character is a constant, see if the first argument is
- // a string literal. If so, we can constant fold.
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str))
- return 0;
-
- // Compute the offset, make sure to handle the case when we're searching for
- // zero (a weird way to spell strlen).
- size_t I = CharC->getSExtValue() == 0 ?
- Str.size() : Str.find(CharC->getSExtValue());
- if (I == StringRef::npos) // Didn't find the char. strchr returns null.
- return Constant::getNullValue(CI->getType());
-
- // strchr(s+n,c) -> gep(s+n+i,c)
- return B.CreateGEP(SrcStr, B.getInt64(I), "strchr");
- }
-};
-
-//===---------------------------------------===//
-// 'strrchr' Optimizations
-
-struct StrRChrOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strrchr" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getReturnType() != B.getInt8PtrTy() ||
- FT->getParamType(0) != FT->getReturnType() ||
- !FT->getParamType(1)->isIntegerTy(32))
- return 0;
-
- Value *SrcStr = CI->getArgOperand(0);
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-
- // Cannot fold anything if we're not looking for a constant.
- if (!CharC)
- return 0;
-
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str)) {
- // strrchr(s, 0) -> strchr(s, 0)
- if (TD && CharC->isZero())
- return EmitStrChr(SrcStr, '\0', B, TD, TLI);
- return 0;
- }
-
- // Compute the offset.
- size_t I = CharC->getSExtValue() == 0 ?
- Str.size() : Str.rfind(CharC->getSExtValue());
- if (I == StringRef::npos) // Didn't find the char. Return null.
- return Constant::getNullValue(CI->getType());
-
- // strrchr(s+n,c) -> gep(s+n+i,c)
- return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr");
- }
-};
-
-//===---------------------------------------===//
-// 'strcmp' Optimizations
-
-struct StrCmpOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strcmp" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- !FT->getReturnType()->isIntegerTy(32) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy())
- return 0;
-
- Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
- if (Str1P == Str2P) // strcmp(x,x) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- StringRef Str1, Str2;
- bool HasStr1 = getConstantStringInfo(Str1P, Str1);
- bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
- // strcmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2)
- return ConstantInt::get(CI->getType(), Str1.compare(Str2));
-
- if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
- return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
- CI->getType()));
-
- if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
- return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
-
- // strcmp(P, "x") -> memcmp(P, "x", 2)
- uint64_t Len1 = GetStringLength(Str1P);
- uint64_t Len2 = GetStringLength(Str2P);
- if (Len1 && Len2) {
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- return EmitMemCmp(Str1P, Str2P,
- ConstantInt::get(TD->getIntPtrType(*Context),
- std::min(Len1, Len2)), B, TD, TLI);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strncmp' Optimizations
-
-struct StrNCmpOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strncmp" function prototype.
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 ||
- !FT->getReturnType()->isIntegerTy(32) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- !FT->getParamType(2)->isIntegerTy())
- return 0;
-
- Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
- if (Str1P == Str2P) // strncmp(x,x,n) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- // Get the length argument if it is constant.
- uint64_t Length;
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
- Length = LengthArg->getZExtValue();
- else
- return 0;
-
- if (Length == 0) // strncmp(x,y,0) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
- return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI);
-
- StringRef Str1, Str2;
- bool HasStr1 = getConstantStringInfo(Str1P, Str1);
- bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
- // strncmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2) {
- StringRef SubStr1 = Str1.substr(0, Length);
- StringRef SubStr2 = Str2.substr(0, Length);
- return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
- }
-
- if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
- return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
- CI->getType()));
-
- if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
- return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
-
- return 0;
- }
-};
-
-
-//===---------------------------------------===//
-// 'strcpy' Optimizations
-
-struct StrCpyOpt : public LibCallOptimization {
- bool OptChkCall; // True if it's optimizing a __strcpy_chk libcall.
-
- StrCpyOpt(bool c) : OptChkCall(c) {}
-
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "strcpy" function prototype.
- unsigned NumParams = OptChkCall ? 3 : 2;
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != NumParams ||
- FT->getReturnType() != FT->getParamType(0) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy())
- return 0;
-
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
- if (Dst == Src) // strcpy(x,x) -> x
- return Src;
-
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len == 0) return 0;
-
- // We have enough information to now generate the memcpy call to do the
- // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- if (!OptChkCall ||
- !EmitMemCpyChk(Dst, Src,
- ConstantInt::get(TD->getIntPtrType(*Context), Len),
- CI->getArgOperand(2), B, TD, TLI))
- B.CreateMemCpy(Dst, Src,
- ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
- return Dst;
- }
-};
-
-//===---------------------------------------===//
-// 'stpcpy' Optimizations
-
-struct StpCpyOpt: public LibCallOptimization {
- bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall.
-
- StpCpyOpt(bool c) : OptChkCall(c) {}
-
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // Verify the "stpcpy" function prototype.
- unsigned NumParams = OptChkCall ? 3 : 2;
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != NumParams ||
- FT->getReturnType() != FT->getParamType(0) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy())
- return 0;
-
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
- if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
- Value *StrLen = EmitStrLen(Src, B, TD, TLI);
- return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
- }
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len == 0) return 0;
-
- Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len);
- Value *DstEnd = B.CreateGEP(Dst,
- ConstantInt::get(TD->getIntPtrType(*Context),
- Len - 1));
-
- // We have enough information to now generate the memcpy call to do the
- // copy for us. Make a memcpy to copy the nul byte with align = 1.
- if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B,
- TD, TLI))
- B.CreateMemCpy(Dst, Src, LenV, 1);
- return DstEnd;
- }
-};
-
-//===---------------------------------------===//
-// 'strncpy' Optimizations
-
-struct StrNCpyOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- FT->getParamType(0) != FT->getParamType(1) ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- !FT->getParamType(2)->isIntegerTy())
- return 0;
-
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- Value *LenOp = CI->getArgOperand(2);
-
- // See if we can get the length of the input string.
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen == 0) return 0;
- --SrcLen;
-
- if (SrcLen == 0) {
- // strncpy(x, "", y) -> memset(x, '\0', y, 1)
- B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
- return Dst;
- }
-
- uint64_t Len;
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
- Len = LengthArg->getZExtValue();
- else
- return 0;
-
- if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
-
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- // Let strncpy handle the zero padding
- if (Len > SrcLen+1) return 0;
-
- // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
- B.CreateMemCpy(Dst, Src,
- ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
-
- return Dst;
- }
-};
-
-//===---------------------------------------===//
-// 'strlen' Optimizations
-
-struct StrLenOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 1 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- Value *Src = CI->getArgOperand(0);
-
- // Constant folding: strlen("xyz") -> 3
- if (uint64_t Len = GetStringLength(Src))
- return ConstantInt::get(CI->getType(), Len-1);
-
- // strlen(x) != 0 --> *x != 0
- // strlen(x) == 0 --> *x == 0
- if (IsOnlyUsedInZeroEqualityComparison(CI))
- return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
- return 0;
- }
-};
-
-
-//===---------------------------------------===//
-// 'strpbrk' Optimizations
-
-struct StrPBrkOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- FT->getReturnType() != FT->getParamType(0))
- return 0;
-
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strpbrk(s, "") -> NULL
- // strpbrk("", s) -> NULL
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t I = S1.find_first_of(S2);
- if (I == std::string::npos) // No match.
- return Constant::getNullValue(CI->getType());
-
- return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
- }
-
- // strpbrk(s, "a") -> strchr(s, 'a')
- if (TD && HasS2 && S2.size() == 1)
- return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI);
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc.
-
-struct StrToOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy())
- return 0;
-
- Value *EndPtr = CI->getArgOperand(1);
- if (isa<ConstantPointerNull>(EndPtr)) {
- // With a null EndPtr, this function won't capture the main argument.
- // It would be readonly too, except that it still may write to errno.
- CI->addAttribute(1, Attribute::NoCapture);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strspn' Optimizations
-
-struct StrSpnOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strspn(s, "") -> 0
- // strspn("", s) -> 0
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_not_of(S2);
- if (Pos == StringRef::npos) Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strcspn' Optimizations
-
-struct StrCSpnOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strcspn("", s) -> 0
- if (HasS1 && S1.empty())
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_of(S2);
- if (Pos == StringRef::npos) Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- // strcspn(s, "") -> strlen(s)
- if (TD && HasS2 && S2.empty())
- return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strstr' Optimizations
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
-struct StrStrOpt : public LibCallOptimization {
+struct UnaryDoubleFPOpt : public LibCallOptimization {
+ bool CheckRetType;
+ UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isPointerTy())
+ if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+ !FT->getParamType(0)->isDoubleTy())
return 0;
- // fold strstr(x, x) -> x.
- if (CI->getArgOperand(0) == CI->getArgOperand(1))
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
- if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
- if (!StrLen)
- return 0;
- Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
- StrLen, B, TD, TLI);
- if (!StrNCmp)
- return 0;
- for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
- UI != UE; ) {
- ICmpInst *Old = cast<ICmpInst>(*UI++);
- Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
- ConstantInt::getNullValue(StrNCmp->getType()),
- "cmp");
- Old->replaceAllUsesWith(Cmp);
- Old->eraseFromParent();
+ if (CheckRetType) {
+ // Check if all the uses for a function like 'sin' are converted to float.
+ for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
+ ++UseI) {
+ FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
+ if (Cast == 0 || !Cast->getType()->isFloatTy())
+ return 0;
}
- return CI;
- }
-
- // See if either input string is a constant string.
- StringRef SearchStr, ToFindStr;
- bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
- bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
-
- // fold strstr(x, "") -> x.
- if (HasStr2 && ToFindStr.empty())
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // If both strings are known, constant fold it.
- if (HasStr1 && HasStr2) {
- std::string::size_type Offset = SearchStr.find(ToFindStr);
-
- if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
- return Constant::getNullValue(CI->getType());
-
- // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
- Value *Result = CastToCStr(CI->getArgOperand(0), B);
- Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
- return B.CreateBitCast(Result, CI->getType());
- }
-
- // fold strstr(x, "y") -> strchr(x, 'y').
- if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
- return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
}
- return 0;
- }
-};
-
-
-//===---------------------------------------===//
-// 'memcmp' Optimizations
-
-struct MemCmpOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy(32))
- return 0;
- Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
-
- if (LHS == RHS) // memcmp(s,s,x) -> 0
- return Constant::getNullValue(CI->getType());
-
- // Make sure we have a constant length.
- ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!LenC) return 0;
- uint64_t Len = LenC->getZExtValue();
-
- if (Len == 0) // memcmp(s1,s2,0) -> 0
- return Constant::getNullValue(CI->getType());
-
- // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
- if (Len == 1) {
- Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
- CI->getType(), "lhsv");
- Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
- CI->getType(), "rhsv");
- return B.CreateSub(LHSV, RHSV, "chardiff");
- }
-
- // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
- StringRef LHSStr, RHSStr;
- if (getConstantStringInfo(LHS, LHSStr) &&
- getConstantStringInfo(RHS, RHSStr)) {
- // Make sure we're not reading out-of-bounds memory.
- if (Len > LHSStr.size() || Len > RHSStr.size())
- return 0;
- uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
- return ConstantInt::get(CI->getType(), Ret);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'memcpy' Optimizations
-
-struct MemCpyOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
-//===---------------------------------------===//
-// 'memmove' Optimizations
-
-struct MemMoveOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
-//===---------------------------------------===//
-// 'memset' Optimizations
-
-struct MemSetOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require TargetData.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
+ // If this is something like 'floor((double)floatval)', convert to floorf.
+ FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+ if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
return 0;
- // memset(p, v, n) -> llvm.memset(p, v, n, 1)
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
+ // floor((double)floatval) -> (double)floorf(floatval)
+ Value *V = Cast->getOperand(0);
+ V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+ return B.CreateFPExt(V, B.getDoubleTy());
}
};
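
For illustration (this sketch is not from the patch): the IR pattern matched above corresponds to the following source-level shape, where an fpext of a float feeds the double-only libm call and the result is truncated back to float.

#include <math.h>

// Before: the value is widened to double and narrowed again.
float before_shrink(float x) {
  return (float)floor((double)x);   // fpext + call double @floor() + fptrunc
}

// After shrinking, the conversions and the double-precision call are gone.
float after_shrink(float x) {
  return floorf(x);                 // call float @floorf()
}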
-//===----------------------------------------------------------------------===//
-// Math Library Optimizations
-//===----------------------------------------------------------------------===//
-
//===---------------------------------------===//
// 'cos*' Optimizations
-
struct CosOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "cos" &&
+ TLI->has(LibFunc::cosf)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+ }
+
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 1 argument of FP type, which matches the
// result type.
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isFloatingPointTy())
- return 0;
+ return Ret;
// cos(-x) -> cos(x)
Value *Op1 = CI->getArgOperand(0);
@@ -910,7 +162,7 @@ struct CosOpt : public LibCallOptimization {
BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
}
- return 0;
+ return Ret;
}
};
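
For contrast with the exact functions like floor (editorial illustration, not from the patch): this is the case CosOpt now tries first, but only under -enable-double-float-shrink and when cosf is available. Unlike floor, shrinking cos can change the low bits of the result, hence the flag.

#include <math.h>

float shrinkable_cos(float x) {
  // Every use of the double result is a truncation back to float, so the
  // whole call is shrunk to cosf(x) when the flag is enabled.
  return (float)cos((double)x);
}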
@@ -919,13 +171,20 @@ struct CosOpt : public LibCallOptimization {
struct PowOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "pow" &&
+ TLI->has(LibFunc::powf)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+ }
+
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 2 arguments of the same FP type, which match the
// result type.
if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
FT->getParamType(0) != FT->getParamType(1) ||
!FT->getParamType(0)->isFloatingPointTy())
- return 0;
+ return Ret;
Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
@@ -936,7 +195,7 @@ struct PowOpt : public LibCallOptimization {
}
ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
- if (Op2C == 0) return 0;
+ if (Op2C == 0) return Ret;
if (Op2C->getValueAPF().isZero()) // pow(x, 0.0) -> 1.0
return ConstantFP::get(CI->getType(), 1.0);
@@ -974,12 +233,19 @@ struct PowOpt : public LibCallOptimization {
struct Exp2Opt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ Value *Ret = NULL;
+ if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+ TLI->has(LibFunc::exp2)) {
+ UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+ Ret = UnsafeUnaryDoubleFP.CallOptimizer(Callee, CI, B);
+ }
+
FunctionType *FT = Callee->getFunctionType();
// Just make sure this has 1 argument of FP type, which matches the
// result type.
if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
!FT->getParamType(0)->isFloatingPointTy())
- return 0;
+ return Ret;
Value *Op = CI->getArgOperand(0);
// Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
@@ -1016,29 +282,7 @@ struct Exp2Opt : public LibCallOptimization {
return CI;
}
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
-
-struct UnaryDoubleFPOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
- !FT->getParamType(0)->isDoubleTy())
- return 0;
-
- // If this is something like 'floor((double)floatval)', convert to floorf.
- FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
- if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
- return 0;
-
- // floor((double)floatval) -> (double)floorf(floatval)
- Value *V = Cast->getOperand(0);
- V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
- return B.CreateFPExt(V, B.getDoubleTy());
+ return Ret;
}
};
@@ -1063,8 +307,8 @@ struct FFSOpt : public LibCallOptimization {
// Constant fold.
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
- if (CI->getValue() == 0) // ffs(0) -> 0.
- return Constant::getNullValue(CI->getType());
+ if (CI->isZero()) // ffs(0) -> 0.
+ return B.getInt32(0);
// ffs(c) -> cttz(c)+1
return B.getInt32(CI->getValue().countTrailingZeros() + 1);
}
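
Note that returning B.getInt32(0) keeps the folded zero at the call's i32 result type, where the old code used the type of the (shadowing) ConstantInt argument, which differs for the wider ffsl/ffsll variants. A small worked check of the fold itself (illustrative only; uses POSIX ffs from strings.h):

#include <strings.h>
#include <assert.h>

int main() {
  // ffs returns the 1-based index of the least significant set bit,
  // which is cttz(c) + 1 for a non-zero constant: cttz(16) == 4.
  assert(ffs(16) == 5);
  assert(ffs(0) == 0);   // the zero case folded directly to i32 0 above
  return 0;
}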
@@ -1267,7 +511,7 @@ struct SPrintFOpt : public LibCallOptimization {
if (FormatStr[i] == '%')
return 0; // we found a format specifier, bail out.
- // These optimizations require TargetData.
+ // These optimizations require DataLayout.
if (!TD) return 0;
// sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
@@ -1297,7 +541,7 @@ struct SPrintFOpt : public LibCallOptimization {
}
if (FormatStr[1] == 's') {
- // These optimizations require TargetData.
+ // These optimizations require DataLayout.
if (!TD) return 0;
// sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
@@ -1385,7 +629,7 @@ struct FWriteOpt : public LibCallOptimization {
struct FPutsOpt : public LibCallOptimization {
virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require TargetData.
+ // These optimizations require DataLayout.
if (!TD) return 0;
// Require two pointers. Also, we can't optimize if return value is used.
@@ -1422,7 +666,7 @@ struct FPrintFOpt : public LibCallOptimization {
if (FormatStr[i] == '%') // Could handle %% -> % if we cared.
return 0; // We found a format specifier.
- // These optimizations require TargetData.
+ // These optimizations require DataLayout.
if (!TD) return 0;
Value *NewCI = EmitFWrite(CI->getArgOperand(1),
@@ -1524,17 +768,9 @@ namespace {
TargetLibraryInfo *TLI;
StringMap<LibCallOptimization*> Optimizations;
- // String and Memory LibCall Optimizations
- StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr;
- StrCmpOpt StrCmp; StrNCmpOpt StrNCmp;
- StrCpyOpt StrCpy; StrCpyOpt StrCpyChk;
- StpCpyOpt StpCpy; StpCpyOpt StpCpyChk;
- StrNCpyOpt StrNCpy;
- StrLenOpt StrLen; StrPBrkOpt StrPBrk;
- StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
- MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
// Math Library Optimizations
- CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP;
+ CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
+ UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
// Integer Optimizations
FFSOpt FFS; AbsOpt Abs; IsDigitOpt IsDigit; IsAsciiOpt IsAscii;
ToAsciiOpt ToAscii;
@@ -1546,11 +782,13 @@ namespace {
bool Modified; // This is only used by doInitialization.
public:
static char ID; // Pass identification
- SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true),
- StpCpy(false), StpCpyChk(true) {
+ SimplifyLibCalls() : FunctionPass(ID), UnaryDoubleFP(false),
+ UnsafeUnaryDoubleFP(true) {
initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
void AddOpt(LibFunc::Func F, LibCallOptimization* Opt);
+ void AddOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
+
void InitOptimizations();
bool runOnFunction(Function &F);
@@ -1586,40 +824,15 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) {
Optimizations[TLI->getName(F)] = Opt;
}
+void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
+ LibCallOptimization* Opt) {
+ if (TLI->has(F1) && TLI->has(F2))
+ Optimizations[TLI->getName(F1)] = Opt;
+}
+
/// Optimizations - Populate the Optimizations map with all the optimizations
/// we know.
void SimplifyLibCalls::InitOptimizations() {
- // String and Memory LibCall Optimizations
- Optimizations["strcat"] = &StrCat;
- Optimizations["strncat"] = &StrNCat;
- Optimizations["strchr"] = &StrChr;
- Optimizations["strrchr"] = &StrRChr;
- Optimizations["strcmp"] = &StrCmp;
- Optimizations["strncmp"] = &StrNCmp;
- Optimizations["strcpy"] = &StrCpy;
- Optimizations["strncpy"] = &StrNCpy;
- Optimizations["stpcpy"] = &StpCpy;
- Optimizations["strlen"] = &StrLen;
- Optimizations["strpbrk"] = &StrPBrk;
- Optimizations["strtol"] = &StrTo;
- Optimizations["strtod"] = &StrTo;
- Optimizations["strtof"] = &StrTo;
- Optimizations["strtoul"] = &StrTo;
- Optimizations["strtoll"] = &StrTo;
- Optimizations["strtold"] = &StrTo;
- Optimizations["strtoull"] = &StrTo;
- Optimizations["strspn"] = &StrSpn;
- Optimizations["strcspn"] = &StrCSpn;
- Optimizations["strstr"] = &StrStr;
- Optimizations["memcmp"] = &MemCmp;
- AddOpt(LibFunc::memcpy, &MemCpy);
- Optimizations["memmove"] = &MemMove;
- AddOpt(LibFunc::memset, &MemSet);
-
- // _chk variants of String and Memory LibCall Optimizations.
- Optimizations["__strcpy_chk"] = &StrCpyChk;
- Optimizations["__stpcpy_chk"] = &StpCpyChk;
-
// Math Library Optimizations
Optimizations["cosf"] = &Cos;
Optimizations["cos"] = &Cos;
@@ -1641,16 +854,37 @@ void SimplifyLibCalls::InitOptimizations() {
Optimizations["llvm.exp2.f64"] = &Exp2;
Optimizations["llvm.exp2.f32"] = &Exp2;
- if (TLI->has(LibFunc::floor) && TLI->has(LibFunc::floorf))
- Optimizations["floor"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::ceil) && TLI->has(LibFunc::ceilf))
- Optimizations["ceil"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::round) && TLI->has(LibFunc::roundf))
- Optimizations["round"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::rint) && TLI->has(LibFunc::rintf))
- Optimizations["rint"] = &UnaryDoubleFP;
- if (TLI->has(LibFunc::nearbyint) && TLI->has(LibFunc::nearbyintf))
- Optimizations["nearbyint"] = &UnaryDoubleFP;
+ AddOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP);
+ AddOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP);
+ AddOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP);
+ AddOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP);
+ AddOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP);
+ AddOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP);
+ AddOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP);
+
+ if(UnsafeFPShrink) {
+ AddOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP);
+ AddOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP);
+ }
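+ // Illustrative example (added commentary, not part of the original patch):
+ // with unsafe FP shrinking enabled, a call such as
+ //   %d = call double @sin(double %conv)   ; %conv = fpext float %x to double
+ // can be rewritten by UnsafeUnaryDoubleFP to roughly
+ //   %f = call float @sinf(float %x)
+ //   %d = fpext float %f to double
+ // which is only acceptable when the extra precision of the double-precision
+ // call is not required, hence the UnsafeFPShrink guard above.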
// Integer Optimizations
Optimizations["ffs"] = &FFS;
@@ -1681,7 +915,7 @@ bool SimplifyLibCalls::runOnFunction(Function &F) {
if (Optimizations.empty())
InitOptimizations();
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
IRBuilder<> Builder(F.getContext());
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
index d83145289ce2..6815e411b421 100644
--- a/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -16,7 +16,7 @@
#include "llvm/GlobalValue.h"
#include "llvm/Instruction.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Support/PatternMatch.h"
@@ -55,10 +55,12 @@ void ExtAddrMode::print(raw_ostream &OS) const {
OS << ']';
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ExtAddrMode::dump() const {
print(dbgs());
dbgs() << '\n';
}
+#endif
/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
@@ -219,7 +221,7 @@ bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
unsigned VariableScale = 0;
int64_t ConstantOffset = 0;
- const TargetData *TD = TLI.getTargetData();
+ const DataLayout *TD = TLI.getDataLayout();
gep_type_iterator GTI = gep_type_begin(AddrInst);
for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
if (StructType *STy = dyn_cast<StructType>(*GTI)) {
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 2679b933f6b0..9fea11391a1d 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -22,7 +22,7 @@
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Support/ErrorHandling.h"
@@ -94,7 +94,7 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB, Pass *P) {
/// is dead. Also recursively delete any operands that become dead as
/// a result. This includes tracing the def-use list from the PHI to see if
/// it is ultimately unused or if it reaches an unused cycle.
-bool llvm::DeleteDeadPHIs(BasicBlock *BB) {
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
// Recursively deleting a PHI may cause multiple PHIs to be deleted
// or RAUW'd undef, so use an array of WeakVH for the PHIs to delete.
SmallVector<WeakVH, 8> PHIs;
@@ -105,7 +105,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB) {
bool Changed = false;
for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
- Changed |= RecursivelyDeleteDeadPHINode(PN);
+ Changed |= RecursivelyDeleteDeadPHINode(PN, TLI);
return Changed;
}
@@ -687,3 +687,42 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
return cast<ReturnInst>(NewRet);
}
+/// SplitBlockAndInsertIfThen - Split the containing block at the
+/// specified instruction - everything before and including Cmp stays
+/// in the old basic block, and everything after Cmp is moved to a
+/// new block. The two blocks are connected by a conditional branch
+/// (with value of Cmp being the condition).
+/// Before:
+/// Head
+/// Cmp
+/// Tail
+/// After:
+/// Head
+/// Cmp
+/// if (Cmp)
+/// ThenBlock
+/// Tail
+///
+/// If Unreachable is true, then ThenBlock ends with
+/// UnreachableInst, otherwise it branches to Tail.
+/// Returns the NewBasicBlock's terminator.
+
+TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp,
+ bool Unreachable, MDNode *BranchWeights) {
+ Instruction *SplitBefore = Cmp->getNextNode();
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
+ TerminatorInst *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ TerminatorInst *CheckTerm;
+ if (Unreachable)
+ CheckTerm = new UnreachableInst(C, ThenBlock);
+ else
+ CheckTerm = BranchInst::Create(Tail, ThenBlock);
+ BranchInst *HeadNewTerm =
+ BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp);
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+ return CheckTerm;
+}
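As a hedged usage sketch of the new SplitBlockAndInsertIfThen helper: the wrapper function, SlowPathFn and Arg below are invented for illustration (they are not part of this patch), and the helper is assumed to be declared in llvm/Transforms/Utils/BasicBlockUtils.h.

#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;

// Split Cond's block after Cond and emit a call that only executes when Cond
// is true. Cond must already be inserted in a block and must not be its
// terminator; SlowPathFn and Arg are assumed to be supplied by the caller.
static void emitGuardedCall(Instruction *Cond, Value *Arg, Value *SlowPathFn) {
  TerminatorInst *ThenTerm =
      SplitBlockAndInsertIfThen(Cond, /*Unreachable=*/false,
                                /*BranchWeights=*/0);
  IRBuilder<> B(ThenTerm);        // insert just before the branch back to Tail
  B.CreateCall(SlowPathFn, Arg);  // runs only on the "then" path
}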
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index e13fd716fa85..74b2ee10e01d 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -22,7 +22,7 @@
#include "llvm/Module.h"
#include "llvm/Type.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
using namespace llvm;
@@ -34,19 +34,22 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
/// EmitStrLen - Emit a call to the strlen function to the builder, for the
/// specified pointer. This always returns an integer value of size intptr_t.
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
+Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strlen))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
- Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
+ AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ ArrayRef<Attributes::AttrVal>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI),
+ Constant *StrLen = M->getOrInsertFunction("strlen",
+ AttrListPtr::get(M->getContext(),
+ AWI),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
NULL);
@@ -61,18 +64,21 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
/// specified pointer. Ptr is required to be some pointer type, MaxLen must
/// be of size_t type, and the return value has 'intptr_t' type.
Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI) {
+ const DataLayout *TD, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strnlen))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
- Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
+ AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ ArrayRef<Attributes::AttrVal>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Constant *StrNLen = M->getOrInsertFunction("strnlen", AttrListPtr::get(AWI),
+ Constant *StrNLen = M->getOrInsertFunction("strnlen",
+ AttrListPtr::get(M->getContext(),
+ AWI),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
TD->getIntPtrType(Context),
@@ -88,17 +94,21 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
/// specified pointer and character. Ptr is required to be some pointer type,
/// and the return value has 'i8*' type.
Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI) {
+ const DataLayout *TD, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strchr))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
+ Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
AttributeWithIndex AWI =
- AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
+ AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ ArrayRef<Attributes::AttrVal>(AVs, 2));
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
- Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI),
+ Constant *StrChr = M->getOrInsertFunction("strchr",
+ AttrListPtr::get(M->getContext(),
+ AWI),
I8Ptr, I8Ptr, I32Ty, NULL);
CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
ConstantInt::get(I32Ty, C), "strchr");
@@ -109,20 +119,23 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
/// EmitStrNCmp - Emit a call to the strncmp function to the builder.
Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
- IRBuilder<> &B, const TargetData *TD,
+ IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::strncmp))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
- AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
- Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
+ Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
+ AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ ArrayRef<Attributes::AttrVal>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI),
+ Value *StrNCmp = M->getOrInsertFunction("strncmp",
+ AttrListPtr::get(M->getContext(),
+ AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -139,17 +152,19 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
/// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
/// specified pointer arguments.
Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI,
+ const DataLayout *TD, const TargetLibraryInfo *TLI,
StringRef Name) {
if (!TLI->has(LibFunc::strcpy))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI),
+ Value *StrCpy = M->getOrInsertFunction(Name,
+ AttrListPtr::get(M->getContext(), AWI),
I8Ptr, I8Ptr, I8Ptr, NULL);
CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
Name);
@@ -161,17 +176,20 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
/// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
/// specified pointer arguments.
Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
- IRBuilder<> &B, const TargetData *TD,
+ IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI, StringRef Name) {
if (!TLI->has(LibFunc::strncpy))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
Type *I8Ptr = B.getInt8PtrTy();
- Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI),
+ Value *StrNCpy = M->getOrInsertFunction(Name,
+ AttrListPtr::get(M->getContext(),
+ AWI),
I8Ptr, I8Ptr, I8Ptr,
Len->getType(), NULL);
CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
@@ -185,17 +203,18 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
/// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
/// are pointers.
Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilder<> &B, const TargetData *TD,
+ IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memcpy_chk))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI;
- AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
- AttrListPtr::get(AWI),
+ AttrListPtr::get(M->getContext(), AWI),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -212,16 +231,19 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
/// EmitMemChr - Emit a call to the memchr function. This assumes that Ptr is
/// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
- Value *Len, IRBuilder<> &B, const TargetData *TD,
+ Value *Len, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memchr))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI;
- AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
+ Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
+ AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ ArrayRef<Attributes::AttrVal>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI),
+ Value *MemChr = M->getOrInsertFunction("memchr",
+ AttrListPtr::get(M->getContext(), AWI),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
B.getInt32Ty(),
@@ -237,20 +259,22 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
/// EmitMemCmp - Emit a call to the memcmp function.
Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
- Value *Len, IRBuilder<> &B, const TargetData *TD,
+ Value *Len, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::memcmp))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
- AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
- Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
+ Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
+ AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ ArrayRef<Attributes::AttrVal>(AVs, 2));
LLVMContext &Context = B.GetInsertBlock()->getContext();
- Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI),
+ Value *MemCmp = M->getOrInsertFunction("memcmp",
+ AttrListPtr::get(M->getContext(), AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
B.getInt8PtrTy(),
@@ -294,7 +318,7 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
/// EmitPutChar - Emit a call to the putchar function. This assumes that Char
/// is an integer.
-Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
+Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::putchar))
return 0;
@@ -316,17 +340,19 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
/// EmitPutS - Emit a call to the puts function. This assumes that Str is
/// some pointer.
-Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
+Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::puts))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
- Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI),
+ Value *PutS = M->getOrInsertFunction("puts",
+ AttrListPtr::get(M->getContext(), AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
NULL);
@@ -339,17 +365,19 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
/// EmitFPutC - Emit a call to the fputc function. This assumes that Char is
/// an integer and File is a pointer to FILE.
Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI) {
+ const DataLayout *TD, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fputc))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[2];
- AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI),
+ F = M->getOrInsertFunction("fputc",
+ AttrListPtr::get(M->getContext(), AWI),
B.getInt32Ty(),
B.getInt32Ty(), File->getType(),
NULL);
@@ -370,19 +398,21 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
/// EmitFPutS - Emit a call to the puts function. Str is required to be a
/// pointer and File is a pointer to FILE.
Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
- const TargetData *TD, const TargetLibraryInfo *TLI) {
+ const DataLayout *TD, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fputs))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
- AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
+ AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
StringRef FPutsName = TLI->getName(LibFunc::fputs);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI),
+ F = M->getOrInsertFunction(FPutsName,
+ AttrListPtr::get(M->getContext(), AWI),
B.getInt32Ty(),
B.getInt8PtrTy(),
File->getType(), NULL);
@@ -400,21 +430,23 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
/// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is
/// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
- IRBuilder<> &B, const TargetData *TD,
+ IRBuilder<> &B, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc::fwrite))
return 0;
Module *M = B.GetInsertBlock()->getParent()->getParent();
AttributeWithIndex AWI[3];
- AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
- AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture);
- AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+ AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
+ AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attributes::NoCapture);
+ AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::NoUnwind);
LLVMContext &Context = B.GetInsertBlock()->getContext();
StringRef FWriteName = TLI->getName(LibFunc::fwrite);
Constant *F;
if (File->getType()->isPointerTy())
- F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI),
+ F = M->getOrInsertFunction(FWriteName,
+ AttrListPtr::get(M->getContext(), AWI),
TD->getIntPtrType(Context),
B.getInt8PtrTy(),
TD->getIntPtrType(Context),
@@ -436,9 +468,9 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { }
-bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD,
+bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD,
const TargetLibraryInfo *TLI) {
- // We really need TargetData for later.
+ // We really need DataLayout for later.
if (!TD) return false;
this->CI = CI;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
new file mode 100644
index 000000000000..bee2f7bcb6ea
--- /dev/null
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -0,0 +1,262 @@
+//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom, 32-bit divides are slow enough that it is
+// profitable to check the values of the operands at runtime and, if both are
+// positive and less than 256, use an unsigned 8-bit divide instead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "bypass-slow-division"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
+using namespace llvm;
+
+namespace {
+ struct DivOpInfo {
+ bool SignedOp;
+ Value *Dividend;
+ Value *Divisor;
+
+ DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
+ : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
+ };
+
+ struct DivPhiNodes {
+ PHINode *Quotient;
+ PHINode *Remainder;
+
+ DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+}
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<DivOpInfo> {
+ static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
+ return Val1.SignedOp == Val2.SignedOp &&
+ Val1.Dividend == Val2.Dividend &&
+ Val1.Divisor == Val2.Divisor;
+ }
+
+ static DivOpInfo getEmptyKey() {
+ return DivOpInfo(false, 0, 0);
+ }
+
+ static DivOpInfo getTombstoneKey() {
+ return DivOpInfo(true, 0, 0);
+ }
+
+ static unsigned getHashValue(const DivOpInfo &Val) {
+ return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
+ reinterpret_cast<uintptr_t>(Val.Divisor)) ^
+ (unsigned)Val.SignedOp;
+ }
+ };
+
+ typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+}
+
+// insertFastDiv - Substitutes the div/rem instruction with code that checks
+// the value of the operands and uses a shorter, faster div/rem instruction
+// when possible, and the longer, slower div/rem instruction otherwise.
+static bool insertFastDiv(Function &F,
+ Function::iterator &I,
+ BasicBlock::iterator &J,
+ IntegerType *BypassType,
+ bool UseDivOp,
+ bool UseSignedOp,
+ DivCacheTy &PerBBDivCache) {
+ // Get instruction operands
+ Instruction *Instr = J;
+ Value *Dividend = Instr->getOperand(0);
+ Value *Divisor = Instr->getOperand(1);
+
+ if (isa<ConstantInt>(Divisor) ||
+ (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) {
+ // Operations with immediate values should have
+ // been solved and replaced during compile time.
+ return false;
+ }
+
+ // Basic Block is split before divide
+ BasicBlock *MainBB = I;
+ BasicBlock *SuccessorBB = I->splitBasicBlock(J);
+ ++I; //advance iterator I to successorBB
+
+ // Add new basic block for slow divide operation
+ BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ SlowBB->moveBefore(SuccessorBB);
+ IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
+ Value *SlowQuotientV;
+ Value *SlowRemainderV;
+ if (UseSignedOp) {
+ SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
+ SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+ } else {
+ SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
+ SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+ }
+ SlowBuilder.CreateBr(SuccessorBB);
+
+ // Add new basic block for fast divide operation
+ BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ FastBB->moveBefore(SlowBB);
+ IRBuilder<> FastBuilder(FastBB, FastBB->begin());
+ Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
+ BypassType);
+ Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
+ BypassType);
+
+ // udiv/urem because optimization only handles positive numbers
+ Value *ShortQuotientV = FastBuilder.CreateExactUDiv(ShortDividendV,
+ ShortDivisorV);
+ Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
+ ShortDivisorV);
+ Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
+ ShortQuotientV,
+ Dividend->getType());
+ Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
+ ShortRemainderV,
+ Dividend->getType());
+ FastBuilder.CreateBr(SuccessorBB);
+
+ // Phi nodes for result of div and rem
+ IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
+ PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ QuoPhi->addIncoming(SlowQuotientV, SlowBB);
+ QuoPhi->addIncoming(FastQuotientV, FastBB);
+ PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
+ RemPhi->addIncoming(SlowRemainderV, SlowBB);
+ RemPhi->addIncoming(FastRemainderV, FastBB);
+
+ // Replace Instr with appropriate phi node
+ if (UseDivOp)
+ Instr->replaceAllUsesWith(QuoPhi);
+ else
+ Instr->replaceAllUsesWith(RemPhi);
+ Instr->eraseFromParent();
+
+ // Combine operands into a single value with OR for value testing below
+ MainBB->getInstList().back().eraseFromParent();
+ IRBuilder<> MainBuilder(MainBB, MainBB->end());
+ Value *OrV = MainBuilder.CreateOr(Dividend, Divisor);
+
+ // BitMask is inverted to check if the operands are
+ // larger than the bypass type
+ uint64_t BitMask = ~BypassType->getBitMask();
+ Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values and branch
+ Value *ZeroV = MainBuilder.getInt32(0);
+ Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
+ MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
+
+ // point iterator J at first instruction of successorBB
+ J = I->begin();
+
+ // Cache phi nodes to be used later in place of other instances
+ // of div or rem with the same sign, dividend, and divisor
+ DivOpInfo Key(UseSignedOp, Dividend, Divisor);
+ DivPhiNodes Value(QuoPhi, RemPhi);
+ PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
+ return true;
+}
+
+// reuseOrInsertFastDiv - Reuses a previously computed quotient or remainder if
+// the operands and operation are identical. Otherwise calls insertFastDiv to
+// perform the optimization and caches the resulting quotient and remainder.
+static bool reuseOrInsertFastDiv(Function &F,
+ Function::iterator &I,
+ BasicBlock::iterator &J,
+ IntegerType *BypassType,
+ bool UseDivOp,
+ bool UseSignedOp,
+ DivCacheTy &PerBBDivCache) {
+ // Get instruction operands
+ Instruction *Instr = J;
+ DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));
+ DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
+
+ if (CacheI == PerBBDivCache.end()) {
+ // If previous instance does not exist, insert fast div
+ return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp,
+ PerBBDivCache);
+ }
+
+ // Replace operation value with previously generated phi node
+ DivPhiNodes &Value = CacheI->second;
+ if (UseDivOp) {
+ // Replace all uses of div instruction with quotient phi node
+ J->replaceAllUsesWith(Value.Quotient);
+ } else {
+ // Replace all uses of rem instruction with remainder phi node
+ J->replaceAllUsesWith(Value.Remainder);
+ }
+
+ // Advance to next operation
+ ++J;
+
+ // Remove redundant operation
+ Instr->eraseFromParent();
+ return true;
+}
+
+// bypassSlowDivision - This optimization identifies DIV instructions that can
+// be profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(Function &F,
+ Function::iterator &I,
+ const DenseMap<unsigned int, unsigned int> &BypassWidths) {
+ DivCacheTy DivCache;
+
+ bool MadeChange = false;
+ for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) {
+
+ // Get instruction details
+ unsigned Opcode = J->getOpcode();
+ bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
+ bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
+ bool UseSignedOp = Opcode == Instruction::SDiv ||
+ Opcode == Instruction::SRem;
+
+ // Only optimize div or rem ops
+ if (!UseDivOp && !UseRemOp)
+ continue;
+
+ // Skip division on vector types, only optimize integer instructions
+ if (!J->getType()->isIntegerTy())
+ continue;
+
+ // Get bitwidth of div/rem instruction
+ IntegerType *T = cast<IntegerType>(J->getType());
+ int bitwidth = T->getBitWidth();
+
+ // Continue if bitwidth is not bypassed
+ DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
+ if (BI == BypassWidths.end())
+ continue;
+
+ // Get type for div/rem instruction with bypass bitwidth
+ IntegerType *BT = IntegerType::get(J->getContext(), BI->second);
+
+ MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp,
+ UseSignedOp, DivCache);
+ }
+
+ return MadeChange;
+}
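For intuition, here is a rough source-level picture of what the new pass does for a 32-bit unsigned divide with an assumed 8-bit bypass width (an illustrative sketch only; the pass rewrites LLVM IR directly, and the function below is invented for the example):

#include <stdint.h>

// Behaviour of a bypassed 32-bit unsigned divide after the rewrite: when both
// operands fit in the bypass type, take the short, fast divide.
uint32_t div_bypassed(uint32_t a, uint32_t b) {
  if (((a | b) & ~0xFFu) == 0)                   // both operands fit in 8 bits
    return (uint32_t)((uint8_t)a / (uint8_t)b);  // fast path (8-bit divide)
  return a / b;                                  // slow, full-width path
}

The actual pass also produces the remainder on both paths and caches the resulting phi nodes so that later div/rem instructions with the same operands can reuse them.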
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 4ff31cae62cd..620209bccbc8 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -3,6 +3,7 @@ add_llvm_library(LLVMTransformUtils
BasicBlockUtils.cpp
BreakCriticalEdges.cpp
BuildLibCalls.cpp
+ BypassSlowDivision.cpp
CloneFunction.cpp
CloneModule.cpp
CmpInstAnalysis.cpp
@@ -10,6 +11,7 @@ add_llvm_library(LLVMTransformUtils
DemoteRegToStack.cpp
InlineFunction.cpp
InstructionNamer.cpp
+ IntegerDivision.cpp
LCSSA.cpp
Local.cpp
LoopSimplify.cpp
@@ -19,12 +21,14 @@ add_llvm_library(LLVMTransformUtils
LowerInvoke.cpp
LowerSwitch.cpp
Mem2Reg.cpp
+ MetaRenamer.cpp
ModuleUtils.cpp
PromoteMemoryToRegister.cpp
SSAUpdater.cpp
SimplifyCFG.cpp
SimplifyIndVar.cpp
SimplifyInstructions.cpp
+ SimplifyLibCalls.cpp
UnifyFunctionExitNodes.cpp
Utils.cpp
ValueMapper.cpp
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 99237b8390ec..7ba9f6d9d25d 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -98,10 +98,14 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
Anew->addAttr( OldFunc->getAttributes()
.getParamAttributes(I->getArgNo() + 1));
NewFunc->setAttributes(NewFunc->getAttributes()
- .addAttr(0, OldFunc->getAttributes()
+ .addAttr(NewFunc->getContext(),
+ AttrListPtr::ReturnIndex,
+ OldFunc->getAttributes()
.getRetAttributes()));
NewFunc->setAttributes(NewFunc->getAttributes()
- .addAttr(~0, OldFunc->getAttributes()
+ .addAttr(NewFunc->getContext(),
+ AttrListPtr::FunctionIndex,
+ OldFunc->getAttributes()
.getFnAttributes()));
}
@@ -202,14 +206,14 @@ namespace {
bool ModuleLevelChanges;
const char *NameSuffix;
ClonedCodeInfo *CodeInfo;
- const TargetData *TD;
+ const DataLayout *TD;
public:
PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
ValueToValueMapTy &valueMap,
bool moduleLevelChanges,
const char *nameSuffix,
ClonedCodeInfo *codeInfo,
- const TargetData *td)
+ const DataLayout *td)
: NewFunc(newFunc), OldFunc(oldFunc),
VMap(valueMap), ModuleLevelChanges(moduleLevelChanges),
NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) {
@@ -365,7 +369,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
SmallVectorImpl<ReturnInst*> &Returns,
const char *NameSuffix,
ClonedCodeInfo *CodeInfo,
- const TargetData *TD,
+ const DataLayout *TD,
Instruction *TheCall) {
assert(NameSuffix && "NameSuffix cannot be null!");
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index c545cd68c987..281714f4c100 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -346,7 +346,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
header->getName(), M);
// If the old function is no-throw, so is the new one.
if (oldFunction->doesNotThrow())
- newFunction->setDoesNotThrow(true);
+ newFunction->setDoesNotThrow();
newFunction->getBasicBlockList().push_back(newRootNode);
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 89e89e7acf3d..009847f87bce 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -27,7 +27,7 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Support/CallSite.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -357,7 +357,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
Type *VoidPtrTy = Type::getInt8PtrTy(Context);
- // Create the alloca. If we have TargetData, use nice alignment.
+ // Create the alloca. If we have DataLayout, use nice alignment.
unsigned Align = 1;
if (IFI.TD)
Align = IFI.TD->getPrefTypeAlignment(AggTy);
diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
new file mode 100644
index 000000000000..55227e2714e6
--- /dev/null
+++ b/lib/Transforms/Utils/IntegerDivision.cpp
@@ -0,0 +1,420 @@
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32-bit scalar integer division for
+// targets that don't have native support. It's largely derived from
+// compiler-rt's implementation of __udivsi3, but hand-tuned to reduce the
+// amount of control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "integer-division"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+
+using namespace llvm;
+
+/// Generate code to compute the remainder of two signed integers. Returns the
+/// remainder, which will have the sign of the dividend. Builder's insert point
+/// should be pointing where the caller wants code generated, e.g. at the srem
+/// instruction. This will generate a urem in the process, and Builder's insert
+/// point will be pointing at the urem (if present, i.e. not folded), ready to
+/// be expanded if the user wishes.
+static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+ // ; %dividend_sgn = ashr i32 %dividend, 31
+ // ; %divisor_sgn = ashr i32 %divisor, 31
+ // ; %dvd_xor = xor i32 %dividend, %dividend_sgn
+ // ; %dvs_xor = xor i32 %divisor, %divisor_sgn
+ // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn
+ // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn
+ // ; %urem = urem i32 %dividend, %divisor
+ // ; %xored = xor i32 %urem, %dividend_sgn
+ // ; %srem = sub i32 %xored, %dividend_sgn
+ Value *DividendSign = Builder.CreateAShr(Dividend, ThirtyOne);
+ Value *DivisorSign = Builder.CreateAShr(Divisor, ThirtyOne);
+ Value *DvdXor = Builder.CreateXor(Dividend, DividendSign);
+ Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign);
+ Value *UDividend = Builder.CreateSub(DvdXor, DividendSign);
+ Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign);
+ Value *URem = Builder.CreateURem(UDividend, UDivisor);
+ Value *Xored = Builder.CreateXor(URem, DividendSign);
+ Value *SRem = Builder.CreateSub(Xored, DividendSign);
+
+ if (Instruction *URemInst = dyn_cast<Instruction>(URem))
+ Builder.SetInsertPoint(URemInst);
+
+ return SRem;
+}
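+// Worked example (added commentary, not part of the original patch): for
+// %dividend = -7 and %divisor = 3, %dividend_sgn = -1 and %divisor_sgn = 0,
+// so %u_dividend = 7, %u_divisor = 3 and %urem = 1; xor'ing and subtracting
+// the dividend's sign then yields %srem = -1, i.e. the remainder takes the
+// sign of the dividend, matching C's srem semantics.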
+
+
+/// Generate code to compute the remainder of two unsigned integers. Returns the
+/// remainder. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the urem instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Remainder = Dividend - Quotient*Divisor
+
+ // ; %quotient = udiv i32 %dividend, %divisor
+ // ; %product = mul i32 %divisor, %quotient
+ // ; %remainder = sub i32 %dividend, %product
+ Value *Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ Value *Product = Builder.CreateMul(Divisor, Quotient);
+ Value *Remainder = Builder.CreateSub(Dividend, Product);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
+ Builder.SetInsertPoint(UDiv);
+
+ return Remainder;
+}
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Implementation taken from compiler-rt's __divsi3
+
+ ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+ // ; %tmp = ashr i32 %dividend, 31
+ // ; %tmp1 = ashr i32 %divisor, 31
+ // ; %tmp2 = xor i32 %tmp, %dividend
+ // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
+ // ; %tmp3 = xor i32 %tmp1, %divisor
+ // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
+ // ; %q_sgn = xor i32 %tmp1, %tmp
+ // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
+ // ; %tmp4 = xor i32 %q_mag, %q_sgn
+ // ; %q = sub i32 %tmp4, %q_sgn
+ Value *Tmp = Builder.CreateAShr(Dividend, ThirtyOne);
+ Value *Tmp1 = Builder.CreateAShr(Divisor, ThirtyOne);
+ Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
+ Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+ Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
+ Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+ Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
+ Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+ Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
+ Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+ Builder.SetInsertPoint(UDiv);
+
+ return Q;
+}
+
+/// Generates code to divide two unsigned scalar 32-bit integers. Returns the
+/// quotient, rounded towards 0. Builder's insert point should be pointing where
+/// the caller wants code generated, e.g. at the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // The basic algorithm can be found in the compiler-rt project's
+ // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+ // that's been hand-tuned to lessen the amount of control flow involved.
+
+ // Some helper values
+ IntegerType *I32Ty = Builder.getInt32Ty();
+
+ ConstantInt *Zero = Builder.getInt32(0);
+ ConstantInt *One = Builder.getInt32(1);
+ ConstantInt *ThirtyOne = Builder.getInt32(31);
+ ConstantInt *NegOne = ConstantInt::getSigned(I32Ty, -1);
+ ConstantInt *True = Builder.getTrue();
+
+ BasicBlock *IBB = Builder.GetInsertBlock();
+ Function *F = IBB->getParent();
+ Function *CTLZi32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ I32Ty);
+
+ // Our CFG is going to look like:
+ // +---------------------+
+ // | special-cases |
+ // | ... |
+ // +---------------------+
+ // | |
+ // | +----------+
+ // | | bb1 |
+ // | | ... |
+ // | +----------+
+ // | | |
+ // | | +------------+
+ // | | | preheader |
+ // | | | ... |
+ // | | +------------+
+ // | | |
+ // | | | +---+
+ // | | | | |
+ // | | +------------+ |
+ // | | | do-while | |
+ // | | | ... | |
+ // | | +------------+ |
+ // | | | | |
+ // | +-----------+ +---+
+ // | | loop-exit |
+ // | | ... |
+ // | +-----------+
+ // | |
+ // +-------+
+ // | ... |
+ // | end |
+ // +-------+
+ BasicBlock *SpecialCases = Builder.GetInsertBlock();
+ SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+ BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+ "udiv-end");
+ BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
+ "udiv-loop-exit", F, End);
+ BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
+ "udiv-do-while", F, End);
+ BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+ "udiv-preheader", F, End);
+ BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
+ "udiv-bb1", F, End);
+
+ // We'll be overwriting the terminator to insert our extra blocks
+ SpecialCases->getTerminator()->eraseFromParent();
+
+ // First off, check for special cases: dividend or divisor is zero, divisor
+ // is greater than dividend, and divisor is 1.
+ // ; special-cases:
+ // ; %ret0_1 = icmp eq i32 %divisor, 0
+ // ; %ret0_2 = icmp eq i32 %dividend, 0
+ // ; %ret0_3 = or i1 %ret0_1, %ret0_2
+ // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+ // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+ // ; %sr = sub nsw i32 %tmp0, %tmp1
+ // ; %ret0_4 = icmp ugt i32 %sr, 31
+ // ; %ret0 = or i1 %ret0_3, %ret0_4
+ // ; %retDividend = icmp eq i32 %sr, 31
+ // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
+ // ; %earlyRet = or i1 %ret0, %retDividend
+ // ; br i1 %earlyRet, label %end, label %bb1
+ Builder.SetInsertPoint(SpecialCases);
+ Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
+ Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
+ Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
+ Value *Tmp0 = Builder.CreateCall2(CTLZi32, Divisor, True);
+ Value *Tmp1 = Builder.CreateCall2(CTLZi32, Dividend, True);
+ Value *SR = Builder.CreateSub(Tmp0, Tmp1);
+ Value *Ret0_4 = Builder.CreateICmpUGT(SR, ThirtyOne);
+ Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
+ Value *RetDividend = Builder.CreateICmpEQ(SR, ThirtyOne);
+ Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
+ Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
+ Builder.CreateCondBr(EarlyRet, End, BB1);
+
+ // ; bb1: ; preds = %special-cases
+ // ; %sr_1 = add i32 %sr, 1
+ // ; %tmp2 = sub i32 31, %sr
+ // ; %q = shl i32 %dividend, %tmp2
+ // ; %skipLoop = icmp eq i32 %sr_1, 0
+ // ; br i1 %skipLoop, label %loop-exit, label %preheader
+ Builder.SetInsertPoint(BB1);
+ Value *SR_1 = Builder.CreateAdd(SR, One);
+ Value *Tmp2 = Builder.CreateSub(ThirtyOne, SR);
+ Value *Q = Builder.CreateShl(Dividend, Tmp2);
+ Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+ Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+ // ; preheader: ; preds = %bb1
+ // ; %tmp3 = lshr i32 %dividend, %sr_1
+ // ; %tmp4 = add i32 %divisor, -1
+ // ; br label %do-while
+ Builder.SetInsertPoint(Preheader);
+ Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+ Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+ Builder.CreateBr(DoWhile);
+
+ // ; do-while: ; preds = %do-while, %preheader
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ // ; %tmp5 = shl i32 %r_1, 1
+ // ; %tmp6 = lshr i32 %q_2, 31
+ // ; %tmp7 = or i32 %tmp5, %tmp6
+ // ; %tmp8 = shl i32 %q_2, 1
+ // ; %q_1 = or i32 %carry_1, %tmp8
+ // ; %tmp9 = sub i32 %tmp4, %tmp7
+ // ; %tmp10 = ashr i32 %tmp9, 31
+ // ; %carry = and i32 %tmp10, 1
+ // ; %tmp11 = and i32 %tmp10, %divisor
+ // ; %r = sub i32 %tmp7, %tmp11
+ // ; %sr_2 = add i32 %sr_3, -1
+ // ; %tmp12 = icmp eq i32 %sr_2, 0
+ // ; br i1 %tmp12, label %loop-exit, label %do-while
+ Builder.SetInsertPoint(DoWhile);
+ PHINode *Carry_1 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *SR_3 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *R_1 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *Q_2 = Builder.CreatePHI(I32Ty, 2);
+ Value *Tmp5 = Builder.CreateShl(R_1, One);
+ Value *Tmp6 = Builder.CreateLShr(Q_2, ThirtyOne);
+ Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
+ Value *Tmp8 = Builder.CreateShl(Q_2, One);
+ Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
+ Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
+ Value *Tmp10 = Builder.CreateAShr(Tmp9, 31);
+ Value *Carry = Builder.CreateAnd(Tmp10, One);
+ Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+ Value *R = Builder.CreateSub(Tmp7, Tmp11);
+ Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
+ Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+ Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+ // ; loop-exit: ; preds = %do-while, %bb1
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ // ; %tmp13 = shl i32 %q_3, 1
+ // ; %q_4 = or i32 %carry_2, %tmp13
+ // ; br label %end
+ Builder.SetInsertPoint(LoopExit);
+ PHINode *Carry_2 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *Q_3 = Builder.CreatePHI(I32Ty, 2);
+ Value *Tmp13 = Builder.CreateShl(Q_3, One);
+ Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
+ Builder.CreateBr(End);
+
+ // ; end: ; preds = %loop-exit, %special-cases
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ // ; ret i32 %q_5
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Q_5 = Builder.CreatePHI(I32Ty, 2);
+
+ // Populate the Phis, since all values have now been created. Our Phis were:
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ Carry_1->addIncoming(Zero, Preheader);
+ Carry_1->addIncoming(Carry, DoWhile);
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ SR_3->addIncoming(SR_1, Preheader);
+ SR_3->addIncoming(SR_2, DoWhile);
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ R_1->addIncoming(Tmp3, Preheader);
+ R_1->addIncoming(R, DoWhile);
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ Q_2->addIncoming(Q, Preheader);
+ Q_2->addIncoming(Q_1, DoWhile);
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ Carry_2->addIncoming(Zero, BB1);
+ Carry_2->addIncoming(Carry, DoWhile);
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ Q_3->addIncoming(Q, BB1);
+ Q_3->addIncoming(Q_1, DoWhile);
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ Q_5->addIncoming(Q_4, LoopExit);
+ Q_5->addIncoming(RetVal, SpecialCases);
+
+ return Q_5;
+}
+
+/// Generate code to calculate the remainder of two integers, replacing Rem with
+/// the generated code. This currently generates code using the udiv expansion,
+/// but future work includes generating more specialized code, e.g. when more
+/// information about the operands is known. Currently only implements 32-bit
+/// scalar division (due to udiv's limitation), but future work will remove
+/// this limitation.
+///
+/// @brief Replace Rem with generated code.
+bool llvm::expandRemainder(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ IRBuilder<> Builder(Rem);
+
+ // First prepare the sign if it's a signed remainder
+ if (Rem->getOpcode() == Instruction::SRem) {
+ Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1), Builder);
+
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // If we didn't actually generate a udiv instruction, we're done
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ if (!BO || BO->getOpcode() != Instruction::URem)
+ return true;
+
+ Rem = BO;
+ }
+
+ Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1),
+ Builder);
+
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // Expand the udiv
+ if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
+ assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
+ expandDivision(UDiv);
+ }
+
+ return true;
+}
+
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similar to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands is known. Currently only
+/// implements 32-bit scalar division, but future work will remove this
+/// limitation.
+///
+/// @brief Replace Div with generated code.
+bool llvm::expandDivision(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ IRBuilder<> Builder(Div);
+
+ if (Div->getType()->isVectorTy())
+ llvm_unreachable("Div over vectors not supported");
+
+ // First prepare the sign if it's a signed division
+ if (Div->getOpcode() == Instruction::SDiv) {
+ // Lower the code to unsigned division, and reset Div to point to the udiv.
+ Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1), Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ // If we didn't actually generate a udiv instruction, we're done
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ if (!BO || BO->getOpcode() != Instruction::UDiv)
+ return true;
+
+ Div = BO;
+ }
+
+ // Insert the unsigned division code
+ Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1),
+ Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return true;
+}
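The expansion above follows compiler-rt's shift-and-subtract scheme. As a plain C++ reference for the underlying algorithm (a sketch of the general idea, not a line-for-line translation of the emitted blocks):

#include <stdint.h>

// Classic restoring (shift-and-subtract) unsigned division: one quotient bit
// per iteration, which is what the udiv-do-while loop computes in IR form.
static uint32_t udiv32_sketch(uint32_t n, uint32_t d) {
  if (d == 0)
    return 0;                      // the expanded IR returns 0 for this case
  uint32_t q = 0, r = 0;
  for (int i = 31; i >= 0; --i) {
    r = (r << 1) | ((n >> i) & 1); // bring down the next dividend bit
    if (r >= d) {                  // divisor fits: subtract it, set this bit
      r -= d;
      q |= 1u << i;
    }
  }
  return q;                        // the remainder is left in r
}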
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index b654111eba74..5e05c83c3566 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -53,6 +53,8 @@ namespace {
// Cached analysis information for the current function.
DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
std::vector<BasicBlock*> LoopBlocks;
PredIteratorCache PredCache;
Loop *L;
@@ -117,6 +119,8 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
L = TheLoop;
DT = &getAnalysis<DominatorTree>();
+ LI = &getAnalysis<LoopInfo>();
+ SE = getAnalysisIfAvailable<ScalarEvolution>();
// Get the set of exiting blocks.
SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -156,6 +160,12 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
MadeChange |= ProcessInstruction(I, ExitBlocks);
}
}
+
+ // If we modified the code, remove any caches about the loop from SCEV to
+ // avoid dangling entries.
+ // FIXME: This is a big hammer, can we clear the cache more selectively?
+ if (SE && MadeChange)
+ SE->forgetLoop(L);
assert(L->isLCSSAForm(*DT));
PredCache.clear();
@@ -245,7 +255,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
// Remember that this phi makes the value alive in this block.
SSAUpdate.AddAvailableValue(ExitBB, PN);
}
-
+
// Rewrite all uses outside the loop in terms of the new PHIs we just
// inserted.
for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
@@ -260,6 +270,9 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
if (isa<PHINode>(UserBB->begin()) &&
isExitBlock(UserBB, ExitBlocks)) {
+ // Tell the VHs that the uses changed. This updates SCEV's caches.
+ if (UsesToRewrite[i]->get()->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
UsesToRewrite[i]->set(UserBB->begin());
continue;
}
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index bed7d72fffc6..a954d82c05bf 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -23,6 +23,7 @@
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Intrinsics.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Metadata.h"
#include "llvm/Operator.h"
#include "llvm/ADT/DenseMap.h"
@@ -38,7 +39,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -52,7 +53,8 @@ using namespace llvm;
/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
/// conditions and indirectbr addresses this might make dead if
/// DeleteDeadConditions is true.
-bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
+ const TargetLibraryInfo *TLI) {
TerminatorInst *T = BB->getTerminator();
IRBuilder<> Builder(T);
@@ -96,7 +98,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
Value *Cond = BI->getCondition();
BI->eraseFromParent();
if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond);
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
return true;
}
return false;
@@ -121,6 +123,27 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
// Check to see if this branch is going to the same place as the default
// dest. If so, eliminate it as an explicit compare.
if (i.getCaseSuccessor() == DefaultDest) {
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ // MD should have 2 + NumCases operands.
+ if (MD && MD->getNumOperands() == 2 + SI->getNumCases()) {
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+ assert(CI);
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+ // Merge weight of this case to the default weight.
+ unsigned idx = i.getCaseIndex();
+ Weights[0] += Weights[idx+1];
+ // Remove weight for this case.
+ std::swap(Weights[idx+1], Weights.back());
+ Weights.pop_back();
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(Weights));
+ }
// Remove this entry.
DefaultDest->removePredecessor(SI->getParent());
SI->removeCase(i);
@@ -161,7 +184,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
Value *Cond = SI->getCondition();
SI->eraseFromParent();
if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond);
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
return true;
}
@@ -177,8 +200,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
"cond");
// Insert the new branch.
- Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
+ BranchInst *NewBr = Builder.CreateCondBr(Cond,
+ FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ if (MD && MD->getNumOperands() == 3) {
+ ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+ ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+ assert(SICase && SIDef);
+ // The TrueWeight should be the weight for the single case of SI.
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(SICase->getValue().getZExtValue(),
+ SIDef->getValue().getZExtValue()));
+ }
// Delete the old switch.
SI->eraseFromParent();
@@ -205,7 +240,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
Value *Address = IBI->getAddress();
IBI->eraseFromParent();
if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Address);
+ RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
// If we didn't find our destination in the IBI successor list, then we
// have undefined behavior. Replace the unconditional branch with an
@@ -230,7 +265,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) {
/// isInstructionTriviallyDead - Return true if the result produced by the
/// instruction is not used, and the instruction has no side effects.
///
-bool llvm::isInstructionTriviallyDead(Instruction *I) {
+bool llvm::isInstructionTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
// We don't want the landingpad instruction removed by anything this general.
@@ -265,9 +301,9 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) {
return isa<UndefValue>(II->getArgOperand(1));
}
- if (isAllocLikeFn(I)) return true;
+ if (isAllocLikeFn(I, TLI)) return true;
- if (CallInst *CI = isFreeCall(I))
+ if (CallInst *CI = isFreeCall(I, TLI))
if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
return C->isNullValue() || isa<UndefValue>(C);
@@ -278,9 +314,11 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) {
/// trivially dead instruction, delete it. If that makes any of its operands
/// trivially dead, delete them too, recursively. Return true if any
/// instructions were deleted.
-bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) {
+bool
+llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V,
+ const TargetLibraryInfo *TLI) {
Instruction *I = dyn_cast<Instruction>(V);
- if (!I || !I->use_empty() || !isInstructionTriviallyDead(I))
+ if (!I || !I->use_empty() || !isInstructionTriviallyDead(I, TLI))
return false;
SmallVector<Instruction*, 16> DeadInsts;
@@ -301,7 +339,7 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructions(Value *V) {
// operand, and if it is 'trivially' dead, delete it in a future loop
// iteration.
if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI))
+ if (isInstructionTriviallyDead(OpI, TLI))
DeadInsts.push_back(OpI);
}
@@ -334,19 +372,20 @@ static bool areAllUsesEqual(Instruction *I) {
/// either forms a cycle or is terminated by a trivially dead instruction,
/// delete it. If that makes any of its operands trivially dead, delete them
/// too, recursively. Return true if a change was made.
-bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
+ const TargetLibraryInfo *TLI) {
SmallPtrSet<Instruction*, 4> Visited;
for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
I = cast<Instruction>(*I->use_begin())) {
if (I->use_empty())
- return RecursivelyDeleteTriviallyDeadInstructions(I);
+ return RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
// If we find an instruction more than once, we're on a cycle that
// won't prove fruitful.
if (!Visited.insert(I)) {
// Break the cycle and delete the instruction and its operands.
I->replaceAllUsesWith(UndefValue::get(I->getType()));
- (void)RecursivelyDeleteTriviallyDeadInstructions(I);
+ (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
return true;
}
}
@@ -358,7 +397,8 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN) {
///
/// This returns true if it changed the code, note that it can delete
/// instructions in other blocks as well in this block.
-bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) {
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
+ const TargetLibraryInfo *TLI) {
bool MadeChange = false;
#ifndef NDEBUG
@@ -381,7 +421,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) {
continue;
}
- MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ MadeChange |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
if (BIHandle != BI)
BI = BB->begin();
}
@@ -405,7 +445,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD) {
/// .. and delete the predecessor corresponding to the '1', this will attempt to
/// recursively fold the and to 0.
void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
- TargetData *TD) {
+ DataLayout *TD) {
// This only adjusts blocks with PHI nodes.
if (!isa<PHINode>(BB->begin()))
return;
@@ -720,7 +760,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
/// their preferred alignment from the beginning.
///
static unsigned enforceKnownAlignment(Value *V, unsigned Align,
- unsigned PrefAlign, const TargetData *TD) {
+ unsigned PrefAlign, const DataLayout *TD) {
V = V->stripPointerCasts();
if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
@@ -763,7 +803,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
/// and it is more than the alignment of the ultimate object, see if we can
/// increase the alignment of the ultimate object, making this check succeed.
unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
- const TargetData *TD) {
+ const DataLayout *TD) {
assert(V->getType()->isPointerTy() &&
"getOrEnforceKnownAlignment expects a pointer!");
unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
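
Aside, not part of the patch: the prof-metadata handling added to ConstantFoldTerminator above keeps branch weights consistent when a switch case that already targets the default destination is dropped. The case's weight is folded into the default weight (index 0) and its entry is swap-and-popped, mirroring removeCase. A minimal standalone sketch of that bookkeeping, assuming the weights have already been pulled out of the MDNode into a vector as in the patch:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Weights[0] is the default destination's weight; Weights[CaseIdx + 1]
    // belongs to the CaseIdx'th case, matching the ordering used above.
    void mergeCaseIntoDefault(std::vector<uint32_t> &Weights, unsigned CaseIdx) {
      Weights[0] += Weights[CaseIdx + 1];             // default absorbs the case
      std::swap(Weights[CaseIdx + 1], Weights.back());
      Weights.pop_back();                             // keep indices dense
    }
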
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 0bc185d8b722..9d9e20166564 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -46,6 +46,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/Type.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopPass.h"
@@ -89,6 +90,7 @@ namespace {
AU.addPreserved<AliasAnalysis>();
AU.addPreserved<ScalarEvolution>();
+ AU.addPreserved<DependenceAnalysis>();
AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
}
@@ -194,6 +196,11 @@ ReprocessLoop:
BI->setCondition(ConstantInt::get(Cond->getType(),
!L->contains(BI->getSuccessor(0))));
+
+ // This may make the loop analyzable, force SCEV recomputation.
+ if (SE)
+ SE->forgetLoop(L);
+
Changed = true;
}
}
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
new file mode 100644
index 000000000000..233bc12d3cfd
--- /dev/null
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -0,0 +1,132 @@
+//===- MetaRenamer.cpp - Rename everything with metasyntactic names -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass renames everything with metasyntactic names. The intent is to use
+// this pass after bugpoint reduction to conceal the nature of the original
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/TypeFinder.h"
+
+using namespace llvm;
+
+namespace {
+
+ // This PRNG is from the ISO C spec. It is intentionally simple and
+ // unsuitable for cryptographic use. We're just looking for enough
+ // variety to surprise and delight users.
+ struct PRNG {
+ unsigned long next;
+
+ void srand(unsigned int seed) {
+ next = seed;
+ }
+
+ int rand(void) {
+ next = next * 1103515245 + 12345;
+ return (unsigned int)(next / 65536) % 32768;
+ }
+ };
+
+ struct MetaRenamer : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ MetaRenamer() : ModulePass(ID) {
+ initializeMetaRenamerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ }
+
+ bool runOnModule(Module &M) {
+ static const char *metaNames[] = {
+ // See http://en.wikipedia.org/wiki/Metasyntactic_variable
+ "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
+ "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
+ };
+
+ // Seed our PRNG with a simple additive sum of the ModuleID. We simply want
+ // to avoid always producing the same function names, while remaining
+ // deterministic.
+ unsigned int randSeed = 0;
+ for (std::string::const_iterator I = M.getModuleIdentifier().begin(),
+ E = M.getModuleIdentifier().end(); I != E; ++I)
+ randSeed += *I;
+
+ PRNG prng;
+ prng.srand(randSeed);
+
+ // Rename all aliases
+ for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end();
+ AI != AE; ++AI)
+ AI->setName("alias");
+
+ // Rename all global variables
+ for (Module::global_iterator GI = M.global_begin(), GE = M.global_end();
+ GI != GE; ++GI)
+ GI->setName("global");
+
+ // Rename all struct types
+ TypeFinder StructTypes;
+ StructTypes.run(M, true);
+ for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+ StructType *STy = StructTypes[i];
+ if (STy->isLiteral() || STy->getName().empty()) continue;
+
+ SmallString<128> NameStorage;
+ STy->setName((Twine("struct.") + metaNames[prng.rand() %
+ array_lengthof(metaNames)]).toStringRef(NameStorage));
+ }
+
+ // Rename all functions
+ for (Module::iterator FI = M.begin(), FE = M.end();
+ FI != FE; ++FI) {
+ FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]);
+ runOnFunction(*FI);
+ }
+ return true;
+ }
+
+ bool runOnFunction(Function &F) {
+ for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+ AI != AE; ++AI)
+ if (!AI->getType()->isVoidTy())
+ AI->setName("arg");
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ BB->setName("bb");
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+ if (!I->getType()->isVoidTy())
+ I->setName("tmp");
+ }
+ return true;
+ }
+ };
+}
+
+char MetaRenamer::ID = 0;
+INITIALIZE_PASS(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+//===----------------------------------------------------------------------===//
+//
+// MetaRenamer - Rename everything with metasyntactic names.
+//
+ModulePass *llvm::createMetaRenamerPass() {
+ return new MetaRenamer();
+}
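
Aside, not part of the patch: because the pass registers itself under the argument "metarenamer", it should be invocable through opt once built in, roughly opt -metarenamer reduced.ll -S (the input file name here is a made-up example). The renaming is deterministic per module, since the seed is just the additive byte sum of the module identifier. A self-contained sketch of the seeding and name selection, with the PRNG constants copied from the pass above and a hypothetical module identifier:

    #include <cstdio>
    #include <string>

    // Same linear-congruential generator as the PRNG struct above.
    struct SketchPRNG {
      unsigned long next;
      void srand(unsigned seed) { next = seed; }
      int rand() {
        next = next * 1103515245 + 12345;
        return (unsigned)(next / 65536) % 32768;
      }
    };

    int main() {
      static const char *names[] = {"foo", "bar", "baz", "quux"};
      std::string moduleId = "reduced.bc";  // stands in for M.getModuleIdentifier()
      unsigned seed = 0;
      for (char c : moduleId)               // additive sum, as in runOnModule
        seed += c;
      SketchPRNG prng;
      prng.srand(seed);
      for (int i = 0; i < 3; ++i)           // same module id -> same names each run
        std::printf("%s\n", names[prng.rand() % 4]);
      return 0;
    }
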
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index dd5e20ed50a7..558de9d12e6c 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -212,9 +212,13 @@ namespace {
///
DenseMap<AllocaInst*, unsigned> AllocaLookup;
- /// NewPhiNodes - The PhiNodes we're adding.
+ /// NewPhiNodes - The PhiNodes we're adding. That map is used to simplify
+ /// some Phi nodes as we iterate over it, so it should have deterministic
+ /// iterators. We could use a MapVector, but since we already maintain a
+ /// map from BasicBlock* to a stable numbering (BBNumbers), the DenseMap is
+ /// more efficient (also supports removal).
///
- DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*> NewPhiNodes;
+ DenseMap<std::pair<unsigned, unsigned>, PHINode*> NewPhiNodes;
/// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
/// it corresponds to.
@@ -588,7 +592,11 @@ void PromoteMem2Reg::run() {
while (EliminatedAPHI) {
EliminatedAPHI = false;
- for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+ // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+ // simplify and RAUW them as we go. If it were not, we could add uses to
+ // the values we replace with in a non-deterministic order, thus creating
+ // non-deterministic def->use chains.
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) {
PHINode *PN = I->second;
@@ -612,7 +620,7 @@ void PromoteMem2Reg::run() {
// have incoming values for all predecessors. Loop over all PHI nodes we have
// created, inserting undef values if they are missing any incoming values.
//
- for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) {
// We want to do this once per basic block. As such, only process a block
// when we find the PHI that is the first entry in the block.
@@ -992,7 +1000,7 @@ void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
unsigned &Version) {
// Look up the basic-block in question.
- PHINode *&PN = NewPhiNodes[std::make_pair(BB, AllocaNo)];
+ PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
// If the BB already has a phi node added for the i'th alloca then we're done!
if (PN) return false;
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index e568a616b6f0..72d4199a2a69 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -39,7 +39,7 @@ SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode*> *NewPHI)
: AV(0), ProtoType(0), ProtoName(), InsertedPHIs(NewPHI) {}
SSAUpdater::~SSAUpdater() {
- delete &getAvailableVals(AV);
+ delete static_cast<AvailableValsTy*>(AV);
}
/// Initialize - Reset this object to get ready for a new set of SSA
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 518df7cddab4..c767da624e19 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -14,6 +14,7 @@
#define DEBUG_TYPE "simplifycfg"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
#include "llvm/DerivedTypes.h"
#include "llvm/GlobalVariable.h"
#include "llvm/IRBuilder.h"
@@ -22,6 +23,7 @@
#include "llvm/LLVMContext.h"
#include "llvm/MDBuilder.h"
#include "llvm/Metadata.h"
+#include "llvm/Module.h"
#include "llvm/Operator.h"
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
@@ -38,7 +40,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/NoFolder.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include <algorithm>
#include <set>
@@ -53,6 +55,13 @@ static cl::opt<bool>
DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
cl::desc("Duplicate return instructions into unconditional branches"));
+static cl::opt<bool>
+SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+ cl::desc("Sink common instructions down to the end block"));
+
+STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
+STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
+STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
STATISTIC(NumSpeculations, "Number of speculative executed instructions");
namespace {
@@ -68,10 +77,13 @@ namespace {
// Comparing pointers is ok as we only rely on the order for uniquing.
return Value < RHS.Value;
}
+
+ bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
};
class SimplifyCFGOpt {
- const TargetData *const TD;
+ const DataLayout *const TD;
+ const TargetTransformInfo *const TTI;
Value *isValueEqualityComparison(TerminatorInst *TI);
BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
@@ -91,7 +103,8 @@ class SimplifyCFGOpt {
bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
public:
- explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {}
+ SimplifyCFGOpt(const DataLayout *td, const TargetTransformInfo *tti)
+ : TD(td), TTI(tti) {}
bool run(BasicBlock *BB);
};
}
@@ -101,14 +114,14 @@ public:
///
static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
if (SI1 == SI2) return false; // Can't merge with self!
-
+
// It is not safe to merge these two switch instructions if they have a common
// successor, and if that successor has a PHI node, and if *that* PHI node has
// conflicting incoming values from the two switch blocks.
BasicBlock *SI1BB = SI1->getParent();
BasicBlock *SI2BB = SI2->getParent();
SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
-
+
for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I)
if (SI1Succs.count(*I))
for (BasicBlock::iterator BBI = (*I)->begin();
@@ -118,7 +131,7 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) {
PN->getIncomingValueForBlock(SI2BB))
return false;
}
-
+
return true;
}
@@ -135,7 +148,7 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1,
assert(SI1->isUnconditional() && SI2->isConditional());
// We fold the unconditional branch if we can easily update all PHI nodes in
- // common successors:
+ // common successors:
// 1> We have a constant incoming value for the conditional branch;
// 2> We have "Cond" as the incoming value for the unconditional branch;
// 3> SI2->getCondition() and Cond have same operands.
@@ -170,7 +183,7 @@ static bool isProfitableToFoldUnconditional(BranchInst *SI1,
static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
BasicBlock *ExistPred) {
if (!isa<PHINode>(Succ->begin())) return; // Quick exit if nothing to do
-
+
PHINode *PN;
for (BasicBlock::iterator I = Succ->begin();
(PN = dyn_cast<PHINode>(I)); ++I)
@@ -222,7 +235,7 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
// doesn't dominate BB.
if (Pred2->getSinglePredecessor() == 0)
return 0;
-
+
// If we found a conditional branch predecessor, make sure that it branches
// to BB and Pred2Br. If it doesn't, this isn't an "if statement".
if (Pred1Br->getSuccessor(0) == BB &&
@@ -252,7 +265,7 @@ static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
// Otherwise, if this is a conditional branch, then we can use it!
BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
if (BI == 0) return 0;
-
+
assert(BI->isConditional() && "Two successors but not conditional?");
if (BI->getSuccessor(0) == Pred1) {
IfTrue = Pred1;
@@ -345,7 +358,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
// If we aren't allowing aggressive promotion anymore, then don't consider
// instructions in the 'if region'.
if (AggressiveInsts == 0) return false;
-
+
// If we have seen this instruction before, don't count it again.
if (AggressiveInsts->count(I)) return true;
@@ -374,7 +387,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
/// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr
/// and PointerNullValue. Return NULL if value is not a constant int.
-static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
+static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) {
// Normal constant int.
ConstantInt *CI = dyn_cast<ConstantInt>(V);
if (CI || !TD || !isa<Constant>(V) || !V->getType()->isPointerTy())
@@ -382,7 +395,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
// This is some kind of pointer constant. Turn it into a pointer-sized
// ConstantInt if possible.
- IntegerType *PtrTy = TD->getIntPtrType(V->getContext());
+ IntegerType *PtrTy = cast<IntegerType>(TD->getIntPtrType(V->getType()));
// Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
if (isa<ConstantPointerNull>(V))
@@ -408,10 +421,10 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
/// Values vector.
static Value *
GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
- const TargetData *TD, bool isEQ, unsigned &UsedICmps) {
+ const DataLayout *TD, bool isEQ, unsigned &UsedICmps) {
Instruction *I = dyn_cast<Instruction>(V);
if (I == 0) return 0;
-
+
// If this is an icmp against a constant, handle this as one of the cases.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(I)) {
if (ConstantInt *C = GetConstantInt(I->getOperand(1), TD)) {
@@ -420,21 +433,21 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
Vals.push_back(C);
return I->getOperand(0);
}
-
+
// If we have "x ult 3" comparison, for example, then we can add 0,1,2 to
// the set.
ConstantRange Span =
ConstantRange::makeICmpRegion(ICI->getPredicate(), C->getValue());
-
+
// If this is an and/!= check then we want to optimize "x ugt 2" into
// x != 0 && x != 1.
if (!isEQ)
Span = Span.inverse();
-
+
// If there are a ton of values, we don't want to make a ginormous switch.
if (Span.getSetSize().ugt(8) || Span.isEmptySet())
return 0;
-
+
for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
Vals.push_back(ConstantInt::get(V->getContext(), Tmp));
UsedICmps++;
@@ -442,11 +455,11 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
}
return 0;
}
-
+
// Otherwise, we can only handle an | or &, depending on isEQ.
if (I->getOpcode() != (isEQ ? Instruction::Or : Instruction::And))
return 0;
-
+
unsigned NumValsBeforeLHS = Vals.size();
unsigned UsedICmpsBeforeLHS = UsedICmps;
if (Value *LHS = GatherConstantCompares(I->getOperand(0), Vals, Extra, TD,
@@ -467,12 +480,12 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
Extra = I->getOperand(1);
return LHS;
}
-
+
Vals.resize(NumValsBeforeLHS);
UsedICmps = UsedICmpsBeforeLHS;
return 0;
}
-
+
// If the LHS can't be folded in, but Extra is available and RHS can, try to
// use LHS as Extra.
if (Extra == 0 || Extra == I->getOperand(0)) {
@@ -484,7 +497,7 @@ GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
assert(Vals.size() == NumValsBeforeLHS);
Extra = OldExtra;
}
-
+
return 0;
}
@@ -556,11 +569,7 @@ GetValueEqualityComparisonCases(TerminatorInst *TI,
/// in the list that match the specified block.
static void EliminateBlockCases(BasicBlock *BB,
std::vector<ValueEqualityComparisonCase> &Cases) {
- for (unsigned i = 0, e = Cases.size(); i != e; ++i)
- if (Cases[i].Dest == BB) {
- Cases.erase(Cases.begin()+i);
- --i; --e;
- }
+ Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end());
}
/// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
@@ -615,6 +624,9 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
assert(ThisVal && "This isn't a value comparison!!");
if (ThisVal != PredVal) return false; // Different predicates.
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
// Find out information about when control will move from Pred to TI's block.
std::vector<ValueEqualityComparisonCase> PredCases;
BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(),
@@ -634,7 +646,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
// can simplify TI.
if (!ValuesOverlap(PredCases, ThisCases))
return false;
-
+
if (isa<BranchInst>(TI)) {
// Okay, one of the successors of this condbr is dead. Convert it to a
// uncond br.
@@ -652,7 +664,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
EraseTerminatorInstAndDCECond(TI);
return true;
}
-
+
SwitchInst *SI = cast<SwitchInst>(TI);
// Okay, TI has cases that are statically dead, prune them away.
SmallPtrSet<Constant*, 16> DeadCases;
@@ -662,18 +674,37 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
<< "Through successor TI: " << *TI);
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases());
+ if (HasWeight)
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+ assert(CI);
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
--i;
if (DeadCases.count(i.getCaseValue())) {
+ if (HasWeight) {
+ std::swap(Weights[i.getCaseIndex()+1], Weights.back());
+ Weights.pop_back();
+ }
i.getCaseSuccessor()->removePredecessor(TI->getParent());
SI->removeCase(i);
}
}
+ if (HasWeight && Weights.size() >= 2)
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getParent()->getContext()).
+ createBranchWeights(Weights));
DEBUG(dbgs() << "Leaving: " << *TI << "\n");
return true;
}
-
+
// Otherwise, TI's block must correspond to some matched value. Find out
// which value (or set of values) this is.
ConstantInt *TIV = 0;
@@ -729,8 +760,8 @@ namespace {
}
static int ConstantIntSortPredicate(const void *P1, const void *P2) {
- const ConstantInt *LHS = *(const ConstantInt**)P1;
- const ConstantInt *RHS = *(const ConstantInt**)P2;
+ const ConstantInt *LHS = *(const ConstantInt*const*)P1;
+ const ConstantInt *RHS = *(const ConstantInt*const*)P2;
if (LHS->getValue().ult(RHS->getValue()))
return 1;
if (LHS->getValue() == RHS->getValue())
@@ -738,6 +769,56 @@ static int ConstantIntSortPredicate(const void *P1, const void *P2) {
return -1;
}
+static inline bool HasBranchWeights(const Instruction* I) {
+ MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof);
+ if (ProfMD && ProfMD->getOperand(0))
+ if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+ return MDS->getString().equals("branch_weights");
+
+ return false;
+}
+
+/// Get the weights of a given TerminatorInst. The default weight is placed at
+/// the front of the vector; if TI is a conditional eq branch, the two weights
+/// are swapped so that the default (false-edge) weight comes first.
+static void GetBranchWeights(TerminatorInst *TI,
+ SmallVectorImpl<uint64_t> &Weights) {
+ MDNode* MD = TI->getMetadata(LLVMContext::MD_prof);
+ assert(MD);
+ for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i));
+ assert(CI);
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+
+ // If TI is a conditional eq, the default case is the false case,
+ // and the corresponding branch-weight data is at index 2. We swap the
+ // default weight to be the first entry.
+ if (BranchInst* BI = dyn_cast<BranchInst>(TI)) {
+ assert(Weights.size() == 2);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(Weights.front(), Weights.back());
+ }
+}
+
+/// Sees if any of the weights are too big for a uint32_t, and halves all the
+/// weights if any are.
+static void FitWeights(MutableArrayRef<uint64_t> Weights) {
+ bool Halve = false;
+ for (unsigned i = 0; i < Weights.size(); ++i)
+ if (Weights[i] > UINT_MAX) {
+ Halve = true;
+ break;
+ }
+
+ if (! Halve)
+ return;
+
+ for (unsigned i = 0; i < Weights.size(); ++i)
+ Weights[i] /= 2;
+}
+
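
For reference, not part of the patch: the prof metadata read here has the shape !{!"branch_weights", i32 <w0>, i32 <w1>, ...}, where for a SwitchInst the first weight belongs to the default destination and the rest follow the case order, while for a conditional branch the first weight belongs to the true edge. A branch produced from an x == c comparison takes its false edge to the default destination, which is why GetBranchWeights swaps the two entries in the ICMP_EQ case so that, as with switches, the default weight ends up at the front of the vector that FitWeights and the callers below operate on.
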
/// FoldValueComparisonIntoPredecessors - The specified terminator is a value
/// equality comparison instruction (either a switch or a branch on "X == c").
/// See if any of the predecessors of the terminator block are value comparisons
@@ -770,6 +851,31 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// build.
SmallVector<BasicBlock*, 8> NewSuccessors;
+ // Update the branch weight metadata along the way
+ SmallVector<uint64_t, 8> Weights;
+ bool PredHasWeights = HasBranchWeights(PTI);
+ bool SuccHasWeights = HasBranchWeights(TI);
+
+ if (PredHasWeights) {
+ GetBranchWeights(PTI, Weights);
+ // branch-weight metadata is inconsistent here.
+ if (Weights.size() != 1 + PredCases.size())
+ PredHasWeights = SuccHasWeights = false;
+ } else if (SuccHasWeights)
+ // If there are no predecessor weights but there are successor weights,
+ // populate Weights with 1, which will later be scaled to the sum of
+ // successor's weights
+ Weights.assign(1 + PredCases.size(), 1);
+
+ SmallVector<uint64_t, 8> SuccWeights;
+ if (SuccHasWeights) {
+ GetBranchWeights(TI, SuccWeights);
+ // branch-weight metadata is inconsistent here.
+ if (SuccWeights.size() != 1 + BBCases.size())
+ PredHasWeights = SuccHasWeights = false;
+ } else if (PredHasWeights)
+ SuccWeights.assign(1 + BBCases.size(), 1);
+
if (PredDefault == BB) {
// If this is the default destination from PTI, only the edges in TI
// that don't occur in PTI, or that branch to BB will be activated.
@@ -780,6 +886,14 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
else {
// The default destination is BB, we don't need explicit targets.
std::swap(PredCases[i], PredCases.back());
+
+ if (PredHasWeights || SuccHasWeights) {
+ // Increase weight for the default case.
+ Weights[0] += Weights[i+1];
+ std::swap(Weights[i+1], Weights.back());
+ Weights.pop_back();
+ }
+
PredCases.pop_back();
--i; --e;
}
@@ -790,21 +904,47 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
PredDefault = BBDefault;
NewSuccessors.push_back(BBDefault);
}
+
+ unsigned CasesFromPred = Weights.size();
+ uint64_t ValidTotalSuccWeight = 0;
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
if (!PTIHandled.count(BBCases[i].Value) &&
BBCases[i].Dest != BBDefault) {
PredCases.push_back(BBCases[i]);
NewSuccessors.push_back(BBCases[i].Dest);
+ if (SuccHasWeights || PredHasWeights) {
+ // The default weight is at index 0, so weight for the ith case
+ // should be at index i+1. Scale the cases from successor by
+ // PredDefaultWeight (Weights[0]).
+ Weights.push_back(Weights[0] * SuccWeights[i+1]);
+ ValidTotalSuccWeight += SuccWeights[i+1];
+ }
}
+ if (SuccHasWeights || PredHasWeights) {
+ ValidTotalSuccWeight += SuccWeights[0];
+ // Scale the cases from predecessor by ValidTotalSuccWeight.
+ for (unsigned i = 1; i < CasesFromPred; ++i)
+ Weights[i] *= ValidTotalSuccWeight;
+ // Scale the default weight by SuccDefaultWeight (SuccWeights[0]).
+ Weights[0] *= SuccWeights[0];
+ }
} else {
// If this is not the default destination from PSI, only the edges
// in SI that occur in PSI with a destination of BB will be
// activated.
std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+ std::map<ConstantInt*, uint64_t> WeightsForHandled;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest == BB) {
PTIHandled.insert(PredCases[i].Value);
+
+ if (PredHasWeights || SuccHasWeights) {
+ WeightsForHandled[PredCases[i].Value] = Weights[i+1];
+ std::swap(Weights[i+1], Weights.back());
+ Weights.pop_back();
+ }
+
std::swap(PredCases[i], PredCases.back());
PredCases.pop_back();
--i; --e;
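
A worked example of the scaling above, with made-up weights: suppose the predecessor switch has one case A with weight 3 and a default edge into BB with weight 5, while BB's own terminator has one case B with weight 2 and a default with weight 1. After folding, case A keeps 3 * (2 + 1) = 9, the case B pulled in from BB gets 5 * 2 = 10, and the new default gets 5 * 1 = 5. Out of the total of 24 these reproduce the original path probabilities, 3/8 for A, 5/8 * 2/3 for B and 5/8 * 1/3 for the default, so relative frequencies are preserved even though the absolute magnitudes grow (FitWeights later halves them if any no longer fit in 32 bits).
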
@@ -815,6 +955,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
if (PTIHandled.count(BBCases[i].Value)) {
// If this is one we are capable of getting...
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[BBCases[i].Value]);
PredCases.push_back(BBCases[i]);
NewSuccessors.push_back(BBCases[i].Dest);
PTIHandled.erase(BBCases[i].Value);// This constant is taken care of
@@ -822,9 +964,11 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
// If there are any constants vectored to BB that TI doesn't handle,
// they must go to the default destination of TI.
- for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
+ for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
PTIHandled.begin(),
E = PTIHandled.end(); I != E; ++I) {
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[*I]);
PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
NewSuccessors.push_back(BBDefault);
}
@@ -839,7 +983,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
Builder.SetInsertPoint(PTI);
// Convert pointer to int before we switch.
if (CV->getType()->isPointerTy()) {
- assert(TD && "Cannot switch on pointer without TargetData");
+ assert(TD && "Cannot switch on pointer without DataLayout");
CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
"magicptr");
}
@@ -851,6 +995,17 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
NewSI->addCase(PredCases[i].Value, PredCases[i].Dest);
+ if (PredHasWeights || SuccHasWeights) {
+ // Halve the weights if any of them cannot fit in a uint32_t
+ FitWeights(Weights);
+
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+
+ NewSI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(MDWeights));
+ }
+
EraseTerminatorInstAndDCECond(PTI);
// Okay, last check. If BB is still a successor of PSI, then we must
@@ -984,11 +1139,11 @@ HoistTerminator:
Value *BB1V = PN->getIncomingValueForBlock(BB1);
Value *BB2V = PN->getIncomingValueForBlock(BB2);
if (BB1V == BB2V) continue;
-
+
// These values do not agree. Insert a select instruction before NT
// that determines the right value.
SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
- if (SI == 0)
+ if (SI == 0)
SI = cast<SelectInst>
(Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
BB1V->getName()+"."+BB2V->getName()));
@@ -1008,6 +1163,175 @@ HoistTerminator:
return true;
}
+/// SinkThenElseCodeToEnd - Given an unconditional branch that goes to BBEnd,
+/// check whether BBEnd has only two predecessors and the other predecessor
+/// ends with an unconditional branch. If so, sink any common code
+/// in the two predecessors to BBEnd.
+static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
+ assert(BI1->isUnconditional());
+ BasicBlock *BB1 = BI1->getParent();
+ BasicBlock *BBEnd = BI1->getSuccessor(0);
+
+ // Check that BBEnd has two predecessors and the other predecessor ends with
+ // an unconditional branch.
+ pred_iterator PI = pred_begin(BBEnd), PE = pred_end(BBEnd);
+ BasicBlock *Pred0 = *PI++;
+ if (PI == PE) // Only one predecessor.
+ return false;
+ BasicBlock *Pred1 = *PI++;
+ if (PI != PE) // More than two predecessors.
+ return false;
+ BasicBlock *BB2 = (Pred0 == BB1) ? Pred1 : Pred0;
+ BranchInst *BI2 = dyn_cast<BranchInst>(BB2->getTerminator());
+ if (!BI2 || !BI2->isUnconditional())
+ return false;
+
+ // Gather the PHI nodes in BBEnd.
+ std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2;
+ Instruction *FirstNonPhiInBBEnd = 0;
+ for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ Value *BB1V = PN->getIncomingValueForBlock(BB1);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN);
+ } else {
+ FirstNonPhiInBBEnd = &*I;
+ break;
+ }
+ }
+ if (!FirstNonPhiInBBEnd)
+ return false;
+
+
+ // This does very trivial matching, with limited scanning, to find identical
+ // instructions in the two blocks. We scan backward for obviously identical
+ // instructions in an identical order.
+ BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(),
+ RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(),
+ RE2 = BB2->getInstList().rend();
+ // Skip debug info.
+ while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+ if (RI1 == RE1)
+ return false;
+ while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+ if (RI2 == RE2)
+ return false;
+ // Skip the unconditional branches.
+ ++RI1;
+ ++RI2;
+
+ bool Changed = false;
+ while (RI1 != RE1 && RI2 != RE2) {
+ // Skip debug info.
+ while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+ if (RI1 == RE1)
+ return Changed;
+ while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+ if (RI2 == RE2)
+ return Changed;
+
+ Instruction *I1 = &*RI1, *I2 = &*RI2;
+ // I1 and I2 should have a single use in the same PHI node, and they
+ // perform the same operation.
+ // Cannot move control-flow-involving, volatile loads, vaarg, etc.
+ if (isa<PHINode>(I1) || isa<PHINode>(I2) ||
+ isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) ||
+ isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) ||
+ isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||
+ I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
+ I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
+ !I1->hasOneUse() || !I2->hasOneUse() ||
+ MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() ||
+ MapValueFromBB1ToBB2[I1].first != I2)
+ return Changed;
+
+ // Check whether we should swap the operands of ICmpInst.
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
+ bool SwapOpnds = false;
+ if (ICmp1 && ICmp2 &&
+ ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
+ ICmp1->getOperand(1) != ICmp2->getOperand(1) &&
+ (ICmp1->getOperand(0) == ICmp2->getOperand(1) ||
+ ICmp1->getOperand(1) == ICmp2->getOperand(0))) {
+ ICmp2->swapOperands();
+ SwapOpnds = true;
+ }
+ if (!I1->isSameOperationAs(I2)) {
+ if (SwapOpnds)
+ ICmp2->swapOperands();
+ return Changed;
+ }
+
+ // The operands should be either the same or they need to be generated
+ // with a PHI node after sinking. We only handle the case where there is
+ // a single pair of different operands.
+ Value *DifferentOp1 = 0, *DifferentOp2 = 0;
+ unsigned Op1Idx = 0;
+ for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) {
+ if (I1->getOperand(I) == I2->getOperand(I))
+ continue;
+ // Early exit if we have more than one pair of different operands or
+ // the different operand is already in MapValueFromBB1ToBB2.
+ // Early exit if we need a PHI node to replace a constant.
+ if (DifferentOp1 ||
+ MapValueFromBB1ToBB2.find(I1->getOperand(I)) !=
+ MapValueFromBB1ToBB2.end() ||
+ isa<Constant>(I1->getOperand(I)) ||
+ isa<Constant>(I2->getOperand(I))) {
+ // If we can't sink the instructions, undo the swapping.
+ if (SwapOpnds)
+ ICmp2->swapOperands();
+ return Changed;
+ }
+ DifferentOp1 = I1->getOperand(I);
+ Op1Idx = I;
+ DifferentOp2 = I2->getOperand(I);
+ }
+
+ // We insert the pair of different operands to MapValueFromBB1ToBB2 and
+ // remove (I1, I2) from MapValueFromBB1ToBB2.
+ if (DifferentOp1) {
+ PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2,
+ DifferentOp1->getName() + ".sink",
+ BBEnd->begin());
+ MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN);
+ // I1 should use NewPN instead of DifferentOp1.
+ I1->setOperand(Op1Idx, NewPN);
+ NewPN->addIncoming(DifferentOp1, BB1);
+ NewPN->addIncoming(DifferentOp2, BB2);
+ DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
+ }
+ PHINode *OldPN = MapValueFromBB1ToBB2[I1].second;
+ MapValueFromBB1ToBB2.erase(I1);
+
+ DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";);
+ DEBUG(dbgs() << " " << *I2 << "\n";);
+ // We need to update RE1 and RE2 if we are going to sink the first
+ // instruction in the basic block down.
+ bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());
+ // Sink the instruction.
+ BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1);
+ if (!OldPN->use_empty())
+ OldPN->replaceAllUsesWith(I1);
+ OldPN->eraseFromParent();
+
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->intersectOptionalDataWith(I2);
+ I2->eraseFromParent();
+
+ if (UpdateRE1)
+ RE1 = BB1->getInstList().rend();
+ if (UpdateRE2)
+ RE2 = BB2->getInstList().rend();
+ FirstNonPhiInBBEnd = I1;
+ NumSinkCommons++;
+ Changed = true;
+ }
+ return Changed;
+}
+
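
Aside, not part of the patch: SinkThenElseCodeToEnd walks the two predecessors bottom-up, merging pairs of identical instructions into the join block and allowing at most one differing operand per pair, for which it creates a PHI node. At the source level the effect is roughly the following (illustrative C++ sketch, not taken from the patch's tests):

    // Before sinking: both arms duplicate the same multiply/add tail.
    int before(bool c, int a, int b) {
      int r;
      if (c)
        r = a * 5 + 1;
      else
        r = b * 5 + 1;
      return r;
    }

    // After sinking: the one differing operand is funneled through a single
    // value (a PHI node in IR) and the common tail is computed once in the
    // join block.
    int after(bool c, int a, int b) {
      int t = c ? a : b;   // the PHI created for the differing operand
      return t * 5 + 1;    // common instructions sunk to the end block
    }
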
/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1
/// and an BB2 and the only successor of BB1 is BB2, hoist simple code
/// (for now, restricted to a single instruction that's side effect free) from
@@ -1056,7 +1380,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
// Do not hoist the instruction if any of its operands are defined but not
// used in this BB. The transformation will prevent the operand from
// being sunk into the use block.
- for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
+ for (User::op_iterator i = HInst->op_begin(), e = HInst->op_end();
i != e; ++i) {
Instruction *OpI = dyn_cast<Instruction>(*i);
if (OpI && OpI->getParent() == BIParent &&
@@ -1112,7 +1436,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
// as well.
if (PHIs.empty())
return false;
-
+
// If we get here, we can hoist the instruction and if-convert.
DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *BB1 << "\n";);
@@ -1162,13 +1486,13 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
BranchInst *BI = cast<BranchInst>(BB->getTerminator());
unsigned Size = 0;
-
+
for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
if (isa<DbgInfoIntrinsic>(BBI))
continue;
if (Size > 10) return false; // Don't clone large BB's.
++Size;
-
+
// We can only support instructions that do not define values that are
// live outside of the current basic block.
for (Value::use_iterator UI = BBI->use_begin(), E = BBI->use_end();
@@ -1176,7 +1500,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
Instruction *U = cast<Instruction>(*UI);
if (U->getParent() != BB || isa<PHINode>(U)) return false;
}
-
+
// Looks ok, continue checking.
}
@@ -1187,38 +1511,38 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
/// that is defined in the same block as the branch and if any PHI entries are
/// constants, thread edges corresponding to that entry to be branches to their
/// ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) {
BasicBlock *BB = BI->getParent();
PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
// NOTE: we currently cannot transform this case if the PHI node is used
// outside of the block.
if (!PN || PN->getParent() != BB || !PN->hasOneUse())
return false;
-
+
// Degenerate case of a single entry PHI.
if (PN->getNumIncomingValues() == 1) {
FoldSingleEntryPHINodes(PN->getParent());
- return true;
+ return true;
}
// Now we know that this block has multiple preds and two succs.
if (!BlockIsSimpleEnoughToThreadThrough(BB)) return false;
-
+
// Okay, this is a simple enough basic block. See if any phi values are
// constants.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
if (CB == 0 || !CB->getType()->isIntegerTy(1)) continue;
-
+
// Okay, we now know that all edges from PredBB should be revectored to
// branch to RealDest.
BasicBlock *PredBB = PN->getIncomingBlock(i);
BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
-
+
if (RealDest == BB) continue; // Skip self loops.
// Skip if the predecessor's terminator is an indirect branch.
if (isa<IndirectBrInst>(PredBB->getTerminator())) continue;
-
+
// The dest block might have PHI nodes, other predecessors and other
// difficult cases. Instead of being smart about this, just insert a new
// block that jumps to the destination block, effectively splitting
@@ -1227,7 +1551,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
RealDest->getName()+".critedge",
RealDest->getParent(), RealDest);
BranchInst::Create(RealDest, EdgeBB);
-
+
// Update PHI nodes.
AddPredecessorToBlock(RealDest, EdgeBB, BB);
@@ -1244,7 +1568,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
// Clone the instruction.
Instruction *N = BBI->clone();
if (BBI->hasName()) N->setName(BBI->getName()+".c");
-
+
// Update operands due to translation.
for (User::op_iterator i = N->op_begin(), e = N->op_end();
i != e; ++i) {
@@ -1252,7 +1576,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
if (PI != TranslateMap.end())
*i = PI->second;
}
-
+
// Check for trivial simplification.
if (Value *V = SimplifyInstruction(N, TD)) {
TranslateMap[BBI] = V;
@@ -1283,7 +1607,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
/// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
/// PHI node, see if we can eliminate it.
-static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
+static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) {
// Ok, this is a two entry PHI node. Check to see if this is a simple "if
// statement", which has a very simple dominance structure. Basically, we
// are trying to find the condition that is being branched on, which
@@ -1297,7 +1621,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
// Don't bother if the branch will be constant folded trivially.
isa<ConstantInt>(IfCond))
return false;
-
+
// Okay, we found that we can merge this two-entry phi node into a select.
// Doing so would require us to fold *all* two entry phi nodes in this block.
// At some point this becomes non-profitable (particularly if the target
@@ -1307,14 +1631,14 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
if (NumPhis > 2)
return false;
-
+
// Loop over the PHI's seeing if we can promote them all to select
// instructions. While we are at it, keep track of the instructions
// that need to be moved to the dominating block.
SmallPtrSet<Instruction*, 4> AggressiveInsts;
unsigned MaxCostVal0 = PHINodeFoldingThreshold,
MaxCostVal1 = PHINodeFoldingThreshold;
-
+
for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
PHINode *PN = cast<PHINode>(II++);
if (Value *V = SimplifyInstruction(PN, TD)) {
@@ -1322,19 +1646,19 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
PN->eraseFromParent();
continue;
}
-
+
if (!DominatesMergePoint(PN->getIncomingValue(0), BB, &AggressiveInsts,
MaxCostVal0) ||
!DominatesMergePoint(PN->getIncomingValue(1), BB, &AggressiveInsts,
MaxCostVal1))
return false;
}
-
+
// If we folded the first phi, PN dangles at this point. Refresh it. If
// we ran out of PHIs then we simplified them all.
PN = dyn_cast<PHINode>(BB->begin());
if (PN == 0) return true;
-
+
// Don't fold i1 branches on PHIs which contain binary operators. These can
// often be turned into switches and other things.
if (PN->getType()->isIntegerTy(1) &&
@@ -1342,7 +1666,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
isa<BinaryOperator>(PN->getIncomingValue(1)) ||
isa<BinaryOperator>(IfCond)))
return false;
-
+
// If all PHI nodes are promotable, check to make sure that all
// instructions in the predecessor blocks can be promoted as well. If
// not, we won't be able to get rid of the control flow, so it's not
@@ -1362,7 +1686,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
return false;
}
}
-
+
if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
IfBlock2 = 0;
} else {
@@ -1375,15 +1699,15 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
return false;
}
}
-
+
DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond << " T: "
<< IfTrue->getName() << " F: " << IfFalse->getName() << "\n");
-
+
// If we can still promote the PHI nodes after this gauntlet of tests,
// do all of the PHI's now.
Instruction *InsertPt = DomBlock->getTerminator();
IRBuilder<true, NoFolder> Builder(InsertPt);
-
+
// Move all 'aggressive' instructions, which are defined in the
// conditional parts of the if's up to the dominating block.
if (IfBlock1)
@@ -1394,19 +1718,19 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
DomBlock->getInstList().splice(InsertPt,
IfBlock2->getInstList(), IfBlock2->begin(),
IfBlock2->getTerminator());
-
+
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
// Change the PHI node into a select instruction.
Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
-
- SelectInst *NV =
+
+ SelectInst *NV =
cast<SelectInst>(Builder.CreateSelect(IfCond, TrueVal, FalseVal, ""));
PN->replaceAllUsesWith(NV);
NV->takeName(PN);
PN->eraseFromParent();
}
-
+
// At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
// has been flattened. Change DomBlock to jump directly to our new block to
// avoid other simplifycfg's kicking in on the diamond.
@@ -1420,14 +1744,14 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
/// SimplifyCondBranchToTwoReturns - If we found a conditional branch that goes
/// to two returning blocks, try to merge them together into one return,
/// introducing a select if the return values disagree.
-static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
+static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
IRBuilder<> &Builder) {
assert(BI->isConditional() && "Must be a conditional branch");
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
-
+
// Check to ensure both blocks are empty (just a return) or optionally empty
// with PHI nodes. If there are other instructions, merging would cause extra
// computation on one path or the other.
@@ -1447,12 +1771,12 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
EraseTerminatorInstAndDCECond(BI);
return true;
}
-
+
// Otherwise, figure out what the true and false return values are
// so we can insert a new select instruction.
Value *TrueValue = TrueRet->getReturnValue();
Value *FalseValue = FalseRet->getReturnValue();
-
+
// Unwrap any PHI nodes in the return blocks.
if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
if (TVPN->getParent() == TrueSucc)
@@ -1460,7 +1784,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
if (FVPN->getParent() == FalseSucc)
FalseValue = FVPN->getIncomingValueForBlock(BI->getParent());
-
+
// In order for this transformation to be safe, we must be able to
// unconditionally execute both operands to the return. This is
// normally the case, but we could have a potentially-trapping
@@ -1472,12 +1796,12 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
if (FCV->canTrap())
return false;
-
+
// Okay, we collected all the mapped values and checked them for sanity, and
// defined to really do this transformation. First, update the CFG.
TrueSucc->removePredecessor(BI->getParent());
FalseSucc->removePredecessor(BI->getParent());
-
+
// Insert select instructions where needed.
Value *BrCond = BI->getCondition();
if (TrueValue) {
@@ -1491,15 +1815,15 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
}
}
- Value *RI = !TrueValue ?
+ Value *RI = !TrueValue ?
Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
(void) RI;
-
+
DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
<< "\n " << *BI << "NewRet = " << *RI
<< "TRUEBLOCK: " << *TrueSucc << "FALSEBLOCK: "<< *FalseSucc);
-
+
EraseTerminatorInstAndDCECond(BI);
return true;
@@ -1510,7 +1834,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
/// parameters and return true, or returns false if no or invalid metadata was
/// found.
static bool ExtractBranchMetadata(BranchInst *BI,
- APInt &ProbTrue, APInt &ProbFalse) {
+ uint64_t &ProbTrue, uint64_t &ProbFalse) {
assert(BI->isConditional() &&
"Looking for probabilities on unconditional branch?");
MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
@@ -1518,35 +1842,11 @@ static bool ExtractBranchMetadata(BranchInst *BI,
ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1));
ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2));
if (!CITrue || !CIFalse) return false;
- ProbTrue = CITrue->getValue();
- ProbFalse = CIFalse->getValue();
- assert(ProbTrue.getBitWidth() == 32 && ProbFalse.getBitWidth() == 32 &&
- "Branch probability metadata must be 32-bit integers");
+ ProbTrue = CITrue->getValue().getZExtValue();
+ ProbFalse = CIFalse->getValue().getZExtValue();
return true;
}
-/// MultiplyAndLosePrecision - Multiplies A and B, then returns the result. In
-/// the event of overflow, logically-shifts all four inputs right until the
-/// multiply fits.
-static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D,
- unsigned &BitsLost) {
- BitsLost = 0;
- bool Overflow = false;
- APInt Result = A.umul_ov(B, Overflow);
- if (Overflow) {
- APInt MaxB = APInt::getMaxValue(A.getBitWidth()).udiv(A);
- do {
- B = B.lshr(1);
- ++BitsLost;
- } while (B.ugt(MaxB));
- A = A.lshr(BitsLost);
- C = C.lshr(BitsLost);
- D = D.lshr(BitsLost);
- Result = A * B;
- }
- return Result;
-}
-
/// checkCSEInPredecessor - Return true if the given instruction is available
/// in its predecessor block. If yes, the instruction will be removed.
///
@@ -1600,7 +1900,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
if (Cond == 0)
return false;
}
-
+
if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
Cond->getParent() != BB || !Cond->hasOneUse())
return false;
@@ -1623,7 +1923,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
isSafeToSpeculativelyExecute(FrontIt)) {
BonusInst = &*FrontIt;
++FrontIt;
-
+
// Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(FrontIt)) ++FrontIt;
}
@@ -1631,13 +1931,13 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Only a single bonus inst is allowed.
if (&*FrontIt != Cond)
return false;
-
+
// Make sure the instruction after the condition is the cond branch.
BasicBlock::iterator CondIt = Cond; ++CondIt;
   // Ignore dbg intrinsics.
while (isa<DbgInfoIntrinsic>(CondIt)) ++CondIt;
-
+
if (&*CondIt != BI)
return false;
@@ -1649,7 +1949,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
if (CE->canTrap())
return false;
-
+
// Finally, don't infinitely unroll conditional loops.
BasicBlock *TrueDest = BI->getSuccessor(0);
BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0;
@@ -1659,22 +1959,22 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
BasicBlock *PredBlock = *PI;
BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
-
+
// Check that we have two conditional branches. If there is a PHI node in
// the common successor, verify that the same value flows in from both
// blocks.
SmallVector<PHINode*, 4> PHIs;
if (PBI == 0 || PBI->isUnconditional() ||
- (BI->isConditional() &&
+ (BI->isConditional() &&
!SafeToMergeTerminators(BI, PBI)) ||
(!BI->isConditional() &&
!isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs)))
continue;
-
+
// Determine if the two branches share a common destination.
- Instruction::BinaryOps Opc;
+ Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd;
bool InvertPredCond = false;
-
+
if (BI->isConditional()) {
if (PBI->getSuccessor(0) == TrueDest)
Opc = Instruction::Or;
@@ -1693,7 +1993,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Ensure that any values used in the bonus instruction are also used
// by the terminator of the predecessor. This means that those values
- // must already have been resolved, so we won't be inhibiting the
+ // must already have been resolved, so we won't be inhibiting the
// out-of-order core by speculating them earlier.
if (BonusInst) {
// Collect the values used by the bonus inst
@@ -1707,47 +2007,47 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
SmallVector<std::pair<Value*, unsigned>, 4> Worklist;
Worklist.push_back(std::make_pair(PBI->getOperand(0), 0));
-
+
// Walk up to four levels back up the use-def chain of the predecessor's
// terminator to see if all those values were used. The choice of four
// levels is arbitrary, to provide a compile-time-cost bound.
while (!Worklist.empty()) {
std::pair<Value*, unsigned> Pair = Worklist.back();
Worklist.pop_back();
-
+
if (Pair.second >= 4) continue;
UsedValues.erase(Pair.first);
if (UsedValues.empty()) break;
-
+
if (Instruction *I = dyn_cast<Instruction>(Pair.first)) {
for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
OI != OE; ++OI)
Worklist.push_back(std::make_pair(OI->get(), Pair.second+1));
- }
+ }
}
-
+
if (!UsedValues.empty()) return false;
}
DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
- IRBuilder<> Builder(PBI);
+ IRBuilder<> Builder(PBI);
// If we need to invert the condition in the pred block to match, do so now.
if (InvertPredCond) {
Value *NewCond = PBI->getCondition();
-
+
if (NewCond->hasOneUse() && isa<CmpInst>(NewCond)) {
CmpInst *CI = cast<CmpInst>(NewCond);
CI->setPredicate(CI->getInversePredicate());
} else {
- NewCond = Builder.CreateNot(NewCond,
+ NewCond = Builder.CreateNot(NewCond,
PBI->getCondition()->getName()+".not");
}
-
+
PBI->setCondition(NewCond);
PBI->swapSuccessors();
}
-
+
// If we have a bonus inst, clone it into the predecessor block.
Instruction *NewBonus = 0;
if (BonusInst) {
@@ -1756,7 +2056,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
NewBonus->takeName(BonusInst);
BonusInst->setName(BonusInst->getName()+".old");
}
-
+
// Clone Cond into the predecessor basic block, and or/and the
// two conditions together.
Instruction *New = Cond->clone();
@@ -1764,21 +2064,60 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
PredBlock->getInstList().insert(PBI, New);
New->takeName(Cond);
Cond->setName(New->getName()+".old");
-
+
if (BI->isConditional()) {
- Instruction *NewCond =
+ Instruction *NewCond =
cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(),
New, "or.cond"));
PBI->setCondition(NewCond);
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+ PredFalseWeight);
+ bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+ SuccFalseWeight);
+ SmallVector<uint64_t, 8> NewWeights;
+
if (PBI->getSuccessor(0) == BB) {
+ if (PredHasWeights && SuccHasWeights) {
+ // PBI: br i1 %x, BB, FalseDest
+ // BI: br i1 %y, TrueDest, FalseDest
+        // TrueWeight is TrueWeight for PBI * TrueWeight for BI.
+ NewWeights.push_back(PredTrueWeight * SuccTrueWeight);
+        // FalseWeight is FalseWeight for PBI * TotalWeight for BI +
+        //   TrueWeight for PBI * FalseWeight for BI.
+ // We assume that total weights of a BranchInst can fit into 32 bits.
+ // Therefore, we will not have overflow using 64-bit arithmetic.
+ NewWeights.push_back(PredFalseWeight * (SuccFalseWeight +
+ SuccTrueWeight) + PredTrueWeight * SuccFalseWeight);
+ }
AddPredecessorToBlock(TrueDest, PredBlock, BB);
PBI->setSuccessor(0, TrueDest);
}
if (PBI->getSuccessor(1) == BB) {
+ if (PredHasWeights && SuccHasWeights) {
+ // PBI: br i1 %x, TrueDest, BB
+ // BI: br i1 %y, TrueDest, FalseDest
+        // TrueWeight is TrueWeight for PBI * TotalWeight for BI +
+        //   FalseWeight for PBI * TrueWeight for BI.
+ NewWeights.push_back(PredTrueWeight * (SuccFalseWeight +
+ SuccTrueWeight) + PredFalseWeight * SuccTrueWeight);
+        // FalseWeight is FalseWeight for PBI * FalseWeight for BI.
+ NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
+ }
AddPredecessorToBlock(FalseDest, PredBlock, BB);
PBI->setSuccessor(1, FalseDest);
}
+ if (NewWeights.size() == 2) {
+      // Halve the weights if any of them cannot fit in a uint32_t
+ FitWeights(NewWeights);
+
+ SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),NewWeights.end());
+ PBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext()).
+ createBranchWeights(MDWeights));
+ } else
+ PBI->setMetadata(LLVMContext::MD_prof, NULL);
} else {
// Update PHI nodes in the common successors.
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
@@ -1806,7 +2145,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C)
// PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond)
// is false: PBI_Cond and BI_Value
- MergedCond =
+ MergedCond =
cast<Instruction>(Builder.CreateBinOp(Instruction::And,
PBI->getCondition(), New,
"and.cond"));
@@ -1814,7 +2153,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
Instruction *NotCond =
cast<Instruction>(Builder.CreateNot(PBI->getCondition(),
"not.cond"));
- MergedCond =
+ MergedCond =
cast<Instruction>(Builder.CreateBinOp(Instruction::Or,
NotCond, MergedCond,
"or.cond"));
@@ -1833,95 +2172,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// TODO: If BB is reachable from all paths through PredBlock, then we
// could replace PBI's branch probabilities with BI's.
- // Merge probability data into PredBlock's branch.
- APInt A, B, C, D;
- if (PBI->isConditional() && BI->isConditional() &&
- ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) {
- // Given IR which does:
- // bbA:
- // br i1 %x, label %bbB, label %bbC
- // bbB:
- // br i1 %y, label %bbD, label %bbC
- // Let's call the probability that we take the edge from %bbA to %bbB
- // 'a', from %bbA to %bbC, 'b', from %bbB to %bbD 'c' and from %bbB to
- // %bbC probability 'd'.
- //
- // We transform the IR into:
- // bbA:
- // br i1 %z, label %bbD, label %bbC
- // where the probability of going to %bbD is (a*c) and going to bbC is
- // (b+a*d).
- //
- // Probabilities aren't stored as ratios directly. Using branch weights,
- // we get:
- // (a*c)% = A*C, (b+(a*d))% = A*D+B*C+B*D.
-
- // In the event of overflow, we want to drop the LSB of the input
- // probabilities.
- unsigned BitsLost;
-
- // Ignore overflow result on ProbTrue.
- APInt ProbTrue = MultiplyAndLosePrecision(A, C, B, D, BitsLost);
-
- APInt Tmp1 = MultiplyAndLosePrecision(B, D, A, C, BitsLost);
- if (BitsLost) {
- ProbTrue = ProbTrue.lshr(BitsLost*2);
- }
-
- APInt Tmp2 = MultiplyAndLosePrecision(A, D, C, B, BitsLost);
- if (BitsLost) {
- ProbTrue = ProbTrue.lshr(BitsLost*2);
- Tmp1 = Tmp1.lshr(BitsLost*2);
- }
-
- APInt Tmp3 = MultiplyAndLosePrecision(B, C, A, D, BitsLost);
- if (BitsLost) {
- ProbTrue = ProbTrue.lshr(BitsLost*2);
- Tmp1 = Tmp1.lshr(BitsLost*2);
- Tmp2 = Tmp2.lshr(BitsLost*2);
- }
-
- bool Overflow1 = false, Overflow2 = false;
- APInt Tmp4 = Tmp2.uadd_ov(Tmp3, Overflow1);
- APInt ProbFalse = Tmp4.uadd_ov(Tmp1, Overflow2);
-
- if (Overflow1 || Overflow2) {
- ProbTrue = ProbTrue.lshr(1);
- Tmp1 = Tmp1.lshr(1);
- Tmp2 = Tmp2.lshr(1);
- Tmp3 = Tmp3.lshr(1);
- Tmp4 = Tmp2 + Tmp3;
- ProbFalse = Tmp4 + Tmp1;
- }
-
- // The sum of branch weights must fit in 32-bits.
- if (ProbTrue.isNegative() && ProbFalse.isNegative()) {
- ProbTrue = ProbTrue.lshr(1);
- ProbFalse = ProbFalse.lshr(1);
- }
-
- if (ProbTrue != ProbFalse) {
- // Normalize the result.
- APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse);
- ProbTrue = ProbTrue.udiv(GCD);
- ProbFalse = ProbFalse.udiv(GCD);
-
- MDBuilder MDB(BI->getContext());
- MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(),
- ProbFalse.getZExtValue());
- PBI->setMetadata(LLVMContext::MD_prof, N);
- } else {
- PBI->setMetadata(LLVMContext::MD_prof, NULL);
- }
- } else {
- PBI->setMetadata(LLVMContext::MD_prof, NULL);
- }
-
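The new comments above give the merged weights directly rather than deriving probabilities the way the deleted block did. With PBI's weights (PredTrue, PredFalse), BI's weights (SuccTrue, SuccFalse), and PBI's true edge entering BB, the folded branch gets PredTrue * SuccTrue on the common true edge and PredFalse * (SuccTrue + SuccFalse) + PredTrue * SuccFalse on the false edge. A quick check of that arithmetic with invented 32-bit weights, in plain C++ outside LLVM:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Invented example weights; 32-bit values, so the products fit in 64 bits.
      uint64_t PredTrue = 3, PredFalse = 1;   // PBI: br i1 %x, BB, FalseDest
      uint64_t SuccTrue = 2, SuccFalse = 5;   // BI:  br i1 %y, TrueDest, FalseDest
      uint64_t NewTrue  = PredTrue * SuccTrue;
      uint64_t NewFalse = PredFalse * (SuccTrue + SuccFalse) +
                          PredTrue * SuccFalse;
      // Expected: NewTrue = 6, NewFalse = 1*7 + 3*5 = 22.
      std::printf("merged weights: true=%llu false=%llu\n",
                  (unsigned long long)NewTrue, (unsigned long long)NewFalse);
      return 0;
    }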
// Copy any debug value intrinsics into the end of PredBlock.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
if (isa<DbgInfoIntrinsic>(*I))
I->clone()->insertBefore(PBI);
-
+
return true;
}
return false;
@@ -1936,7 +2191,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
BasicBlock *BB = BI->getParent();
// If this block ends with a branch instruction, and if there is a
- // predecessor that ends on a branch of the same condition, make
+ // predecessor that ends on a branch of the same condition, make
// this conditional branch redundant.
if (PBI->getCondition() == BI->getCondition() &&
PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
@@ -1945,11 +2200,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
if (BB->getSinglePredecessor()) {
// Turn this into a branch on constant.
bool CondIsTrue = PBI->getSuccessor(0) == BB;
- BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+ BI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
CondIsTrue));
return true; // Nuke the branch on constant.
}
-
+
// Otherwise, if there are multiple predecessors, insert a PHI that merges
// in the constant and simplify the block result. Subsequent passes of
// simplifycfg will thread the block.
@@ -1969,18 +2224,18 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
PBI->getCondition() == BI->getCondition() &&
PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
bool CondIsTrue = PBI->getSuccessor(0) == BB;
- NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
+ NewPN->addIncoming(ConstantInt::get(Type::getInt1Ty(BB->getContext()),
CondIsTrue), P);
} else {
NewPN->addIncoming(BI->getCondition(), P);
}
}
-
+
BI->setCondition(NewPN);
return true;
}
}
-
+
// If this is a conditional branch in an empty block, and if any
// predecessors is a conditional branch to one of our destinations,
// fold the conditions into logical ops and one cond br.
@@ -1991,11 +2246,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
if (&*BBI != BI)
return false;
-
+
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
if (CE->canTrap())
return false;
-
+
int PBIOp, BIOp;
if (PBI->getSuccessor(0) == BI->getSuccessor(0))
PBIOp = BIOp = 0;
@@ -2007,31 +2262,31 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
PBIOp = BIOp = 1;
else
return false;
-
+
// Check to make sure that the other destination of this branch
// isn't BB itself. If so, this is an infinite loop that will
// keep getting unwound.
if (PBI->getSuccessor(PBIOp) == BB)
return false;
-
- // Do not perform this transformation if it would require
+
+ // Do not perform this transformation if it would require
// insertion of a large number of select instructions. For targets
// without predication/cmovs, this is a big pessimization.
BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
-
+
unsigned NumPhis = 0;
for (BasicBlock::iterator II = CommonDest->begin();
isa<PHINode>(II); ++II, ++NumPhis)
if (NumPhis > 2) // Disable this xform.
return false;
-
+
// Finally, if everything is ok, fold the branches to logical ops.
BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
-
+
DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
<< "AND: " << *BI->getParent());
-
-
+
+
// If OtherDest *is* BB, then BB is a basic block with a single conditional
// branch in it, where one edge (OtherDest) goes back to itself but the other
// exits. We don't *know* that the program avoids the infinite loop
@@ -2046,13 +2301,13 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
"infloop", BB->getParent());
BranchInst::Create(InfLoopBlock, InfLoopBlock);
OtherDest = InfLoopBlock;
- }
-
+ }
+
DEBUG(dbgs() << *PBI->getParent()->getParent());
// BI may have other predecessors. Because of this, we leave
// it alone, but modify PBI.
-
+
// Make sure we get to CommonDest on True&True directions.
Value *PBICond = PBI->getCondition();
IRBuilder<true, NoFolder> Builder(PBI);
@@ -2065,16 +2320,43 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
// Merge the conditions.
Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
-
+
// Modify PBI to branch on the new condition to the new dests.
PBI->setCondition(Cond);
PBI->setSuccessor(0, CommonDest);
PBI->setSuccessor(1, OtherDest);
-
+
+ // Update branch weight for PBI.
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+ PredFalseWeight);
+ bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+ SuccFalseWeight);
+ if (PredHasWeights && SuccHasWeights) {
+ uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+    uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to CommonDest should be PredCommon * SuccTotal +
+ // PredOther * SuccCommon.
+ // The weight to OtherDest should be PredOther * SuccOther.
+ SmallVector<uint64_t, 2> NewWeights;
+ NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) +
+ PredOther * SuccCommon);
+ NewWeights.push_back(PredOther * SuccOther);
+    // Halve the weights if any of them cannot fit in a uint32_t
+ FitWeights(NewWeights);
+
+ SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end());
+ PBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext()).
+ createBranchWeights(MDWeights));
+ }
+
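FitWeights itself is defined earlier in SimplifyCFG.cpp and is not part of this excerpt; from its uses here it only needs to scale every weight down by the same factor until each one fits in a uint32_t, preserving their ratios. A hypothetical stand-in with that behaviour, shown purely as an assumption about the helper rather than its actual implementation:

    #include <cstdint>
    #include <limits>
    #include <vector>

    // Hypothetical stand-in for the FitWeights helper used above: halve all
    // weights together until every one fits in 32 bits, keeping their ratios.
    static void FitWeightsSketch(std::vector<uint64_t> &Weights) {
      uint64_t Max = 0;
      for (size_t I = 0; I != Weights.size(); ++I)
        Max = Weights[I] > Max ? Weights[I] : Max;
      while (Max > std::numeric_limits<uint32_t>::max()) {
        for (size_t I = 0; I != Weights.size(); ++I)
          Weights[I] >>= 1;
        Max >>= 1;
      }
    }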
// OtherDest may have phi nodes. If so, add an entry from PBI's
// block that are identical to the entries for BI's block.
AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
-
+
// We know that the CommonDest already had an edge from PBI to
// it. If it has PHIs though, the PHIs may have different
// entries for BB and PBI's BB. If so, insert a select to make
@@ -2092,10 +2374,10 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
PN->setIncomingValue(PBBIdx, NV);
}
}
-
+
DEBUG(dbgs() << "INTO: " << *PBI->getParent());
DEBUG(dbgs() << *PBI->getParent()->getParent());
-
+
// This basic block is probably dead. We know it has at least
// one fewer predecessor.
return true;
@@ -2107,7 +2389,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
// Also makes sure not to introduce new successors by assuming that edges to
// non-successor TrueBBs and FalseBBs aren't reachable.
static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
- BasicBlock *TrueBB, BasicBlock *FalseBB){
+ BasicBlock *TrueBB, BasicBlock *FalseBB,
+ uint32_t TrueWeight,
+ uint32_t FalseWeight){
// Remove any superfluous successor edges from the CFG.
// First, figure out which successors to preserve.
// If TrueBB and FalseBB are equal, only try to preserve one copy of that
@@ -2136,10 +2420,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
// We were only looking for one successor, and it was present.
// Create an unconditional branch to it.
Builder.CreateBr(TrueBB);
- else
+ else {
// We found both of the successors we were looking for.
// Create a conditional branch sharing the condition of the select.
- Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ if (TrueWeight != FalseWeight)
+ NewBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(OldTerm->getContext()).
+ createBranchWeights(TrueWeight, FalseWeight));
+ }
} else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
// Neither of the selected blocks were successors, so this
// terminator must be unreachable.
@@ -2176,8 +2465,23 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
+ // Get weight for TrueBB and FalseBB.
+ uint32_t TrueWeight = 0, FalseWeight = 0;
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal).
+ getSuccessorIndex()];
+ FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal).
+ getSuccessorIndex()];
+ }
+ }
+
// Perform the actual simplification.
- return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB);
+ return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB,
+ TrueWeight, FalseWeight);
}
// SimplifyIndirectBrOnSelect - Replaces
@@ -2197,7 +2501,8 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
BasicBlock *FalseBB = FBA->getBasicBlock();
// Perform the actual simplification.
- return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB);
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
+ 0, 0);
}
/// TryToSimplifyUncondBranchWithICmpInIt - This is called when we find an icmp
@@ -2214,11 +2519,11 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
/// br label %end
/// end:
/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
-///
+///
/// We prefer to split the edge to 'end' so that there is a true/false entry to
/// the PHI, merging the third icmp into the switch.
static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
- const TargetData *TD,
+ const DataLayout *TD,
IRBuilder<> &Builder) {
BasicBlock *BB = ICI->getParent();
@@ -2228,17 +2533,17 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
Value *V = ICI->getOperand(0);
ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
-
+
// The pattern we're looking for is where our only predecessor is a switch on
// 'V' and this block is the default case for the switch. In this case we can
// fold the compared value into the switch to simplify things.
BasicBlock *Pred = BB->getSinglePredecessor();
if (Pred == 0 || !isa<SwitchInst>(Pred->getTerminator())) return false;
-
+
SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
if (SI->getCondition() != V)
return false;
-
+
// If BB is reachable on a non-default case, then we simply know the value of
// V in this block. Substitute it and constant fold the icmp instruction
// away.
@@ -2246,7 +2551,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
ConstantInt *VVal = SI->findCaseDest(BB);
assert(VVal && "Should have a unique destination value");
ICI->setOperand(0, VVal);
-
+
if (Value *V = SimplifyInstruction(ICI, TD)) {
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
@@ -2254,7 +2559,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
// BB is now empty, so it is likely to simplify away.
return SimplifyCFG(BB) | true;
}
-
+
// Ok, the block is reachable from the default dest. If the constant we're
// comparing exists in one of the other edges, then we can constant fold ICI
// and zap it.
@@ -2264,13 +2569,13 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
V = ConstantInt::getFalse(BB->getContext());
else
V = ConstantInt::getTrue(BB->getContext());
-
+
ICI->replaceAllUsesWith(V);
ICI->eraseFromParent();
// BB is now empty, so it is likely to simplify away.
return SimplifyCFG(BB) | true;
}
-
+
// The use of the icmp has to be in the 'end' block, by the only PHI node in
// the block.
BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
@@ -2296,8 +2601,23 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
// the switch to the merge point on the compared value.
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge",
BB->getParent(), BB);
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ // Split weight for default case to case for "Cst".
+ Weights[0] = (Weights[0]+1) >> 1;
+ Weights.push_back(Weights[0]);
+
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getContext()).
+ createBranchWeights(MDWeights));
+ }
+ }
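The update above gives half of the old default weight (rounded up, so a weight of 1 does not collapse to 0) to the newly added case and keeps the other half on the default edge. The rounding in isolation, with an invented starting weight:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Invented default-edge weight, as Weights[0] would hold it above.
      uint64_t DefaultWeight = 7;
      uint64_t Half = (DefaultWeight + 1) >> 1;   // rounds up: 7 -> 4, 1 -> 1
      std::printf("old default weight %llu -> split %llu / %llu\n",
                  (unsigned long long)DefaultWeight,
                  (unsigned long long)Half, (unsigned long long)Half);
      return 0;
    }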
SI->addCase(Cst, NewBB);
-
+
// NewBB branches to the phi block, add the uncond branch and the phi entry.
Builder.SetInsertPoint(NewBB);
Builder.SetCurrentDebugLocation(SI->getDebugLoc());
@@ -2309,12 +2629,12 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
/// SimplifyBranchOnICmpChain - The specified branch is a conditional branch.
/// Check to see if it is branching on an or/and chain of icmp instructions, and
/// fold it into a switch instruction if so.
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD,
IRBuilder<> &Builder) {
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
if (Cond == 0) return false;
-
-
+
+
// Change br (X == 0 | X == 1), T, F into a switch instruction.
// If this is a bunch of seteq's or'd together, or if it's a bunch of
// 'setne's and'ed together, collect them.
@@ -2323,7 +2643,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
bool TrueWhenEqual = true;
Value *ExtraCase = 0;
unsigned UsedICmps = 0;
-
+
if (Cond->getOpcode() == Instruction::Or) {
CompVal = GatherConstantCompares(Cond, Values, ExtraCase, TD, true,
UsedICmps);
@@ -2332,7 +2652,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
UsedICmps);
TrueWhenEqual = false;
}
-
+
// If we didn't have a multiply compared value, fail.
if (CompVal == 0) return false;
@@ -2344,21 +2664,24 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
// instruction can't handle, remove them now.
array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
-
+
// If Extra was used, we require at least two switch values to do the
  // transformation. A switch with one value is just a cond branch.
if (ExtraCase && Values.size() < 2) return false;
-
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
// Figure out which block is which destination.
BasicBlock *DefaultBB = BI->getSuccessor(1);
BasicBlock *EdgeBB = BI->getSuccessor(0);
if (!TrueWhenEqual) std::swap(DefaultBB, EdgeBB);
-
+
BasicBlock *BB = BI->getParent();
-
+
DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
<< " cases into SWITCH. BB is:\n" << *BB);
-
+
// If there are any extra values that couldn't be folded into the switch
// then we evaluate them with an explicit branch first. Split the block
// right before the condbr to handle it.
@@ -2372,13 +2695,13 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
else
Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
-
+
OldTI->eraseFromParent();
-
+
// If there are PHI nodes in EdgeBB, then we need to add a new entry to them
// for the edge we just added.
AddPredecessorToBlock(EdgeBB, BB, NewBB);
-
+
DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
<< "\nEXTRABB = " << *BB);
BB = NewBB;
@@ -2387,19 +2710,19 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
Builder.SetInsertPoint(BI);
// Convert pointer to int before we switch.
if (CompVal->getType()->isPointerTy()) {
- assert(TD && "Cannot switch on pointer without TargetData");
+ assert(TD && "Cannot switch on pointer without DataLayout");
CompVal = Builder.CreatePtrToInt(CompVal,
TD->getIntPtrType(CompVal->getContext()),
"magicptr");
}
-
+
// Create the new switch instruction now.
SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
// Add all of the 'cases' to the switch instruction.
for (unsigned i = 0, e = Values.size(); i != e; ++i)
New->addCase(Values[i], EdgeBB);
-
+
// We added edges from PI to the EdgeBB. As such, if there were any
// PHI nodes in EdgeBB, they need entries to be added corresponding to
// the number of edges added.
@@ -2410,10 +2733,10 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
for (unsigned i = 0, e = Values.size()-1; i != e; ++i)
PN->addIncoming(InVal, BB);
}
-
+
// Erase the old branch instruction.
EraseTerminatorInstAndDCECond(BI);
-
+
DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
return true;
}
@@ -2467,7 +2790,7 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
BasicBlock *BB = RI->getParent();
if (!BB->getFirstNonPHIOrDbg()->isTerminator()) return false;
-
+
// Find predecessors that end with branches.
SmallVector<BasicBlock*, 8> UncondBranchPreds;
SmallVector<BranchInst*, 8> CondBranchPreds;
@@ -2481,7 +2804,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
CondBranchPreds.push_back(BI);
}
}
-
+
// If we found some, do the transformation!
if (!UncondBranchPreds.empty() && DupRet) {
while (!UncondBranchPreds.empty()) {
@@ -2490,21 +2813,21 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
<< "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred);
}
-
+
// If we eliminated all predecessors of the block, delete the block now.
if (pred_begin(BB) == pred_end(BB))
// We know there are no successors, so just nuke the block.
BB->eraseFromParent();
-
+
return true;
}
-
+
// Check out all of the conditional branches going to this return
// instruction. If any of them just select between returns, change the
// branch itself into a select/return pair.
while (!CondBranchPreds.empty()) {
BranchInst *BI = CondBranchPreds.pop_back_val();
-
+
// Check to see if the non-BB successor is also a return block.
if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
@@ -2516,9 +2839,9 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
BasicBlock *BB = UI->getParent();
-
+
bool Changed = false;
-
+
// If there are any instructions immediately before the unreachable that can
// be removed, do so.
while (UI != BB->begin()) {
@@ -2558,11 +2881,11 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
BBI->eraseFromParent();
Changed = true;
}
-
+
// If the unreachable instruction is the first in the block, take a gander
// at all of the predecessors of this instruction, and simplify them.
if (&BB->front() != UI) return Changed;
-
+
SmallVector<BasicBlock*, 8> Preds(pred_begin(BB), pred_end(BB));
for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
TerminatorInst *TI = Preds[i]->getTerminator();
@@ -2615,7 +2938,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
BasicBlock *MaxBlock = 0;
for (std::map<BasicBlock*, std::pair<unsigned, unsigned> >::iterator
I = Popularity.begin(), E = Popularity.end(); I != E; ++I) {
- if (I->second.first > MaxPop ||
+ if (I->second.first > MaxPop ||
(I->second.first == MaxPop && MaxIndex > I->second.second)) {
MaxPop = I->second.first;
MaxIndex = I->second.second;
@@ -2627,13 +2950,13 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
// edges to it.
SI->setDefaultDest(MaxBlock);
Changed = true;
-
+
// If MaxBlock has phinodes in it, remove MaxPop-1 entries from
// it.
if (isa<PHINode>(MaxBlock->begin()))
for (unsigned i = 0; i != MaxPop-1; ++i)
MaxBlock->removePredecessor(SI->getParent());
-
+
for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
i != e; ++i)
if (i.getCaseSuccessor() == MaxBlock) {
@@ -2648,7 +2971,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
// place to note that the call does not throw though.
BranchInst *BI = Builder.CreateBr(II->getNormalDest());
II->removeFromParent(); // Take out of symbol table
-
+
// Insert the call now...
SmallVector<Value*, 8> Args(II->op_begin(), II->op_end()-3);
Builder.SetInsertPoint(BI);
@@ -2663,7 +2986,7 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
}
}
}
-
+
// If this block is now dead, remove it.
if (pred_begin(BB) == pred_end(BB) &&
BB != &BB->getParent()->getEntryBlock()) {
@@ -2706,9 +3029,28 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
if (!Offset->isNullValue())
Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
- Builder.CreateCondBr(
+ BranchInst *NewBI = Builder.CreateCondBr(
Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
+ // Update weight for the newly-created conditional branch.
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ // Combine all weights for the cases to be the true weight of NewBI.
+ // We assume that the sum of all weights for a Terminator can fit into 32
+ // bits.
+ uint32_t NewTrueWeight = 0;
+ for (unsigned I = 1, E = Weights.size(); I != E; ++I)
+ NewTrueWeight += (uint32_t)Weights[I];
+ NewBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getContext()).
+ createBranchWeights(NewTrueWeight,
+ (uint32_t)Weights[0]));
+ }
+ }
+
// Prune obsolete incoming values off the successor's PHI nodes.
for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
isa<PHINode>(BBI); ++BBI) {
@@ -2739,15 +3081,33 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) {
}
}
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeight = HasBranchWeights(SI);
+ if (HasWeight) {
+ GetBranchWeights(SI, Weights);
+ HasWeight = (Weights.size() == 1 + SI->getNumCases());
+ }
+
// Remove dead cases from the switch.
for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]);
assert(Case != SI->case_default() &&
"Case was not found. Probably mistake in DeadCases forming.");
+ if (HasWeight) {
+ std::swap(Weights[Case.getCaseIndex()+1], Weights.back());
+ Weights.pop_back();
+ }
+
// Prune unused values from PHI nodes.
Case.getCaseSuccessor()->removePredecessor(SI->getParent());
SI->removeCase(Case);
}
+ if (HasWeight) {
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getParent()->getContext()).
+ createBranchWeights(MDWeights));
+ }
return !DeadCases.empty();
}
@@ -2823,33 +3183,512 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
return Changed;
}
-bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
- // If this switch is too complex to want to look at, ignore it.
- if (!isValueEqualityComparison(SI))
+/// ValidLookupTableConstant - Return true if the backend will be able to handle
+/// initializing an array of constants like C.
+static bool ValidLookupTableConstant(Constant *C) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ return CE->isGEPWithNoNotionalOverIndexing();
+
+ return isa<ConstantFP>(C) ||
+ isa<ConstantInt>(C) ||
+ isa<ConstantPointerNull>(C) ||
+ isa<GlobalValue>(C) ||
+ isa<UndefValue>(C);
+}
+
+/// LookupConstant - If V is a Constant, return it. Otherwise, try to look up
+/// its constant value in ConstantPool, returning 0 if it's not there.
+static Constant *LookupConstant(Value *V,
+ const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C;
+ return ConstantPool.lookup(V);
+}
+
+/// ConstantFold - Try to fold instruction I into a constant. This works for
+/// simple instructions such as binary operations where both operands are
+/// constant or can be replaced by constants from the ConstantPool. Returns the
+/// resulting constant on success, 0 otherwise.
+static Constant *ConstantFold(Instruction *I,
+ const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ Constant *A = LookupConstant(BO->getOperand(0), ConstantPool);
+ if (!A)
+ return 0;
+ Constant *B = LookupConstant(BO->getOperand(1), ConstantPool);
+ if (!B)
+ return 0;
+ return ConstantExpr::get(BO->getOpcode(), A, B);
+ }
+
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
+ if (!A)
+ return 0;
+ Constant *B = LookupConstant(I->getOperand(1), ConstantPool);
+ if (!B)
+ return 0;
+ return ConstantExpr::getCompare(Cmp->getPredicate(), A, B);
+ }
+
+ if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
+ Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
+ if (!A)
+ return 0;
+ if (A->isAllOnesValue())
+ return LookupConstant(Select->getTrueValue(), ConstantPool);
+ if (A->isNullValue())
+ return LookupConstant(Select->getFalseValue(), ConstantPool);
+ return 0;
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(I)) {
+ Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
+ if (!A)
+ return 0;
+ return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy());
+ }
+
+ return 0;
+}
+
+/// GetCaseResults - Try to determine the resulting constant values in phi nodes
+/// at the common destination basic block, *CommonDest, for one of the case
+/// destinations CaseDest corresponding to value CaseVal (0 for the default
+/// case), of a switch instruction SI.
+static bool GetCaseResults(SwitchInst *SI,
+ ConstantInt *CaseVal,
+ BasicBlock *CaseDest,
+ BasicBlock **CommonDest,
+ SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) {
+ // The block from which we enter the common destination.
+ BasicBlock *Pred = SI->getParent();
+
+ // If CaseDest is empty except for some side-effect free instructions through
+ // which we can constant-propagate the CaseVal, continue to its successor.
+ SmallDenseMap<Value*, Constant*> ConstantPool;
+ ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
+ for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
+ ++I) {
+ if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
+ // If the terminator is a simple branch, continue to the next block.
+ if (T->getNumSuccessors() != 1)
+ return false;
+ Pred = CaseDest;
+ CaseDest = T->getSuccessor(0);
+ } else if (isa<DbgInfoIntrinsic>(I)) {
+ // Skip debug intrinsic.
+ continue;
+ } else if (Constant *C = ConstantFold(I, ConstantPool)) {
+ // Instruction is side-effect free and constant.
+ ConstantPool.insert(std::make_pair(I, C));
+ } else {
+ break;
+ }
+ }
+
+ // If we did not have a CommonDest before, use the current one.
+ if (!*CommonDest)
+ *CommonDest = CaseDest;
+ // If the destination isn't the common one, abort.
+ if (CaseDest != *CommonDest)
+ return false;
+
+ // Get the values for this case from phi nodes in the destination block.
+ BasicBlock::iterator I = (*CommonDest)->begin();
+ while (PHINode *PHI = dyn_cast<PHINode>(I++)) {
+ int Idx = PHI->getBasicBlockIndex(Pred);
+ if (Idx == -1)
+ continue;
+
+ Constant *ConstVal = LookupConstant(PHI->getIncomingValue(Idx),
+ ConstantPool);
+ if (!ConstVal)
+ return false;
+
+ // Note: If the constant comes from constant-propagating the case value
+ // through the CaseDest basic block, it will be safe to remove the
+ // instructions in that block. They cannot be used (except in the phi nodes
+ // we visit) outside CaseDest, because that block does not dominate its
+ // successor. If it did, we would not be in this phi node.
+
+ // Be conservative about which kinds of constants we support.
+ if (!ValidLookupTableConstant(ConstVal))
+ return false;
+
+ Res.push_back(std::make_pair(PHI, ConstVal));
+ }
+
+ return true;
+}
+
+namespace {
+ /// SwitchLookupTable - This class represents a lookup table that can be used
+ /// to replace a switch.
+ class SwitchLookupTable {
+ public:
+ /// SwitchLookupTable - Create a lookup table to use as a switch replacement
+ /// with the contents of Values, using DefaultValue to fill any holes in the
+ /// table.
+ SwitchLookupTable(Module &M,
+ uint64_t TableSize,
+ ConstantInt *Offset,
+ const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+ Constant *DefaultValue,
+ const DataLayout *TD);
+
+ /// BuildLookup - Build instructions with Builder to retrieve the value at
+ /// the position given by Index in the lookup table.
+ Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+ /// WouldFitInRegister - Return true if a table with TableSize elements of
+ /// type ElementType would fit in a target-legal register.
+ static bool WouldFitInRegister(const DataLayout *TD,
+ uint64_t TableSize,
+ const Type *ElementType);
+
+ private:
+ // Depending on the contents of the table, it can be represented in
+ // different ways.
+ enum {
+ // For tables where each element contains the same value, we just have to
+ // store that single value and return it for each lookup.
+ SingleValueKind,
+
+ // For small tables with integer elements, we can pack them into a bitmap
+ // that fits into a target-legal register. Values are retrieved by
+ // shift and mask operations.
+ BitMapKind,
+
+ // The table is stored as an array of values. Values are retrieved by load
+ // instructions from the table.
+ ArrayKind
+ } Kind;
+
+ // For SingleValueKind, this is the single value.
+ Constant *SingleValue;
+
+ // For BitMapKind, this is the bitmap.
+ ConstantInt *BitMap;
+ IntegerType *BitMapElementTy;
+
+ // For ArrayKind, this is the array.
+ GlobalVariable *Array;
+ };
+}
+
+SwitchLookupTable::SwitchLookupTable(Module &M,
+ uint64_t TableSize,
+ ConstantInt *Offset,
+ const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+ Constant *DefaultValue,
+ const DataLayout *TD) {
+ assert(Values.size() && "Can't build lookup table without values!");
+ assert(TableSize >= Values.size() && "Can't fit values in table!");
+
+ // If all values in the table are equal, this is that value.
+ SingleValue = Values.begin()->second;
+
+ // Build up the table contents.
+ SmallVector<Constant*, 64> TableContents(TableSize);
+ for (size_t I = 0, E = Values.size(); I != E; ++I) {
+ ConstantInt *CaseVal = Values[I].first;
+ Constant *CaseRes = Values[I].second;
+ assert(CaseRes->getType() == DefaultValue->getType());
+
+ uint64_t Idx = (CaseVal->getValue() - Offset->getValue())
+ .getLimitedValue();
+ TableContents[Idx] = CaseRes;
+
+ if (CaseRes != SingleValue)
+ SingleValue = 0;
+ }
+
+ // Fill in any holes in the table with the default result.
+ if (Values.size() < TableSize) {
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ if (!TableContents[I])
+ TableContents[I] = DefaultValue;
+ }
+
+ if (DefaultValue != SingleValue)
+ SingleValue = 0;
+ }
+
+ // If each element in the table contains the same value, we only need to store
+ // that single value.
+ if (SingleValue) {
+ Kind = SingleValueKind;
+ return;
+ }
+
+ // If the type is integer and the table fits in a register, build a bitmap.
+ if (WouldFitInRegister(TD, TableSize, DefaultValue->getType())) {
+ IntegerType *IT = cast<IntegerType>(DefaultValue->getType());
+ APInt TableInt(TableSize * IT->getBitWidth(), 0);
+ for (uint64_t I = TableSize; I > 0; --I) {
+ TableInt <<= IT->getBitWidth();
+ // Insert values into the bitmap. Undef values are set to zero.
+ if (!isa<UndefValue>(TableContents[I - 1])) {
+ ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
+ TableInt |= Val->getValue().zext(TableInt.getBitWidth());
+ }
+ }
+ BitMap = ConstantInt::get(M.getContext(), TableInt);
+ BitMapElementTy = IT;
+ Kind = BitMapKind;
+ ++NumBitMaps;
+ return;
+ }
+
+ // Store the table in an array.
+ ArrayType *ArrayTy = ArrayType::get(DefaultValue->getType(), TableSize);
+ Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
+
+ Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true,
+ GlobalVariable::PrivateLinkage,
+ Initializer,
+ "switch.table");
+ Array->setUnnamedAddr(true);
+ Kind = ArrayKind;
+}
+
+Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
+ switch (Kind) {
+ case SingleValueKind:
+ return SingleValue;
+ case BitMapKind: {
+ // Type of the bitmap (e.g. i59).
+ IntegerType *MapTy = BitMap->getType();
+
+ // Cast Index to the same type as the bitmap.
+ // Note: The Index is <= the number of elements in the table, so
+ // truncating it to the width of the bitmask is safe.
+ Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+ // Multiply the shift amount by the element width.
+ ShiftAmt = Builder.CreateMul(ShiftAmt,
+ ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+ "switch.shiftamt");
+
+ // Shift down.
+ Value *DownShifted = Builder.CreateLShr(BitMap, ShiftAmt,
+ "switch.downshift");
+ // Mask off.
+ return Builder.CreateTrunc(DownShifted, BitMapElementTy,
+ "switch.masked");
+ }
+ case ArrayKind: {
+ Value *GEPIndices[] = { Builder.getInt32(0), Index };
+ Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices,
+ "switch.gep");
+ return Builder.CreateLoad(GEP, "switch.load");
+ }
+ }
+ llvm_unreachable("Unknown lookup table kind!");
+}
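For BitMapKind the whole table lives in one wide integer, and BuildLookup above lowers each lookup to a multiply, a logical shift right, and a truncation. The same arithmetic in plain C++, with an invented four-entry table of 8-bit results:

    #include <cstdint>
    #include <cstdio>

    int main() {
      // Invented results for cases 0..3, each 8 bits wide.
      const uint8_t Results[4] = {10, 20, 30, 40};
      // Pack them into one 64-bit "bitmap", lowest index in the lowest bits.
      uint64_t BitMap = 0;
      for (int I = 3; I >= 0; --I)
        BitMap = (BitMap << 8) | Results[I];
      // Look up index 2: shift down by Index * ElementWidth, then truncate.
      uint64_t Index = 2;
      uint8_t Value = (uint8_t)(BitMap >> (Index * 8));
      std::printf("table[2] = %u\n", (unsigned)Value);  // prints 30
      return 0;
    }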
+
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD,
+ uint64_t TableSize,
+ const Type *ElementType) {
+ if (!TD)
+ return false;
+ const IntegerType *IT = dyn_cast<IntegerType>(ElementType);
+ if (!IT)
+ return false;
+ // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
+ // are <= 15, we could try to narrow the type.
+
+  // Avoid overflow; fitsInLegalInteger uses unsigned int for the width.
+ if (TableSize >= UINT_MAX/IT->getBitWidth())
+ return false;
+ return TD->fitsInLegalInteger(TableSize * IT->getBitWidth());
+}
+
+/// ShouldBuildLookupTable - Determine whether a lookup table should be built
+/// for this switch, based on the number of caes, size of the table and the
+/// types of the results.
+static bool ShouldBuildLookupTable(SwitchInst *SI,
+ uint64_t TableSize,
+ const DataLayout *TD,
+ const SmallDenseMap<PHINode*, Type*>& ResultTypes) {
+ // The table density should be at least 40%. This is the same criterion as for
+ // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Find the best cut-off.
+ if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
+ return false; // TableSize overflowed, or mul below might overflow.
+ if (SI->getNumCases() * 10 >= TableSize * 4)
+ return true;
+
+ // If each table would fit in a register, we should build it anyway.
+ for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(),
+ E = ResultTypes.end(); I != E; ++I) {
+ if (!SwitchLookupTable::WouldFitInRegister(TD, TableSize, I->second))
+ return false;
+ }
+ return true;
+}
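The density test above works entirely in integers: requiring NumCases * 10 >= TableSize * 4 is the 40% threshold without floating point. A small standalone check with made-up case counts:

    #include <cstdint>
    #include <cstdio>

    static bool DenseEnough(uint64_t NumCases, uint64_t TableSize) {
      // Same criterion as above: density of at least 40%, in integer math.
      return NumCases * 10 >= TableSize * 4;
    }

    int main() {
      std::printf("%d\n", DenseEnough(4, 8));    // 50% dense -> 1 (build it)
      std::printf("%d\n", DenseEnough(4, 100));  //  4% dense -> 0 (too sparse)
      return 0;
    }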
+
+/// SwitchToLookupTable - If the switch is only used to initialize one or more
+/// phi nodes in a common successor block with different constant values,
+/// replace the switch with lookup tables.
+static bool SwitchToLookupTable(SwitchInst *SI,
+ IRBuilder<> &Builder,
+ const DataLayout* TD,
+ const TargetTransformInfo *TTI) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+ // Only build lookup table when we have a target that supports it.
+ if (!TTI || !TTI->getScalarTargetTransformInfo() ||
+ !TTI->getScalarTargetTransformInfo()->shouldBuildLookupTables())
return false;
+ // FIXME: If the switch is too sparse for a lookup table, perhaps we could
+ // split off a dense part and build a lookup table for that.
+
+ // FIXME: This creates arrays of GEPs to constant strings, which means each
+ // GEP needs a runtime relocation in PIC code. We should just build one big
+  // string and look up indices into that.
+
+ // Ignore the switch if the number of cases is too small.
+ // This is similar to the check when building jump tables in
+ // SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Determine the best cut-off.
+ if (SI->getNumCases() < 4)
+ return false;
+
+ // Figure out the corresponding result for each case value and phi node in the
+  // common destination, as well as the min and max case values.
+ assert(SI->case_begin() != SI->case_end());
+ SwitchInst::CaseIt CI = SI->case_begin();
+ ConstantInt *MinCaseVal = CI.getCaseValue();
+ ConstantInt *MaxCaseVal = CI.getCaseValue();
+
+ BasicBlock *CommonDest = 0;
+ typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy;
+ SmallDenseMap<PHINode*, ResultListTy> ResultLists;
+ SmallDenseMap<PHINode*, Constant*> DefaultResults;
+ SmallDenseMap<PHINode*, Type*> ResultTypes;
+ SmallVector<PHINode*, 4> PHIs;
+
+ for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
+ ConstantInt *CaseVal = CI.getCaseValue();
+ if (CaseVal->getValue().slt(MinCaseVal->getValue()))
+ MinCaseVal = CaseVal;
+ if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
+ MaxCaseVal = CaseVal;
+
+ // Resulting value at phi nodes for this case value.
+ typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
+ ResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
+ Results))
+ return false;
+
+ // Append the result from this case to the list for each phi.
+ for (ResultsTy::iterator I = Results.begin(), E = Results.end(); I!=E; ++I) {
+ if (!ResultLists.count(I->first))
+ PHIs.push_back(I->first);
+ ResultLists[I->first].push_back(std::make_pair(CaseVal, I->second));
+ }
+ }
+
+ // Get the resulting values for the default case.
+ SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
+ if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest,
+ DefaultResultsList))
+ return false;
+ for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
+ PHINode *PHI = DefaultResultsList[I].first;
+ Constant *Result = DefaultResultsList[I].second;
+ DefaultResults[PHI] = Result;
+ ResultTypes[PHI] = Result->getType();
+ }
+
+ APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
+ uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
+ if (!ShouldBuildLookupTable(SI, TableSize, TD, ResultTypes))
+ return false;
+
+ // Create the BB that does the lookups.
+ Module &Mod = *CommonDest->getParent()->getParent();
+ BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(),
+ "switch.lookup",
+ CommonDest->getParent(),
+ CommonDest);
+
+ // Check whether the condition value is within the case range, and branch to
+ // the new BB.
+ Builder.SetInsertPoint(SI);
+ Value *TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
+ "switch.tableidx");
+ Value *Cmp = Builder.CreateICmpULT(TableIndex, ConstantInt::get(
+ MinCaseVal->getType(), TableSize));
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+
+ // Populate the BB that does the lookups.
+ Builder.SetInsertPoint(LookupBB);
+ bool ReturnedEarly = false;
+ for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
+ PHINode *PHI = PHIs[I];
+
+ SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI],
+ DefaultResults[PHI], TD);
+
+ Value *Result = Table.BuildLookup(TableIndex, Builder);
+
+ // If the result is used to return immediately from the function, we want to
+ // do that right here.
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin()) &&
+ *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg()) {
+ Builder.CreateRet(Result);
+ ReturnedEarly = true;
+ break;
+ }
+
+ PHI->addIncoming(Result, LookupBB);
+ }
+
+ if (!ReturnedEarly)
+ Builder.CreateBr(CommonDest);
+
+ // Remove the switch.
+ for (unsigned i = 0; i < SI->getNumSuccessors(); ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+ if (Succ == SI->getDefaultDest()) continue;
+ Succ->removePredecessor(SI->getParent());
+ }
+ SI->eraseFromParent();
+
+ ++NumLookupTables;
+ return true;
+}
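A source-level picture of the switches this targets: every case only feeds constants into phi nodes of the common successor, so the whole switch collapses into a range check plus a table lookup. The functions below are an invented example (not taken from the patch or its tests), sketching the conceptual before and after:

    // Before the transform: four cases, each merely producing a constant.
    int DaysInMonthKind(unsigned Kind) {
      switch (Kind) {
      case 0: return 31;
      case 1: return 30;
      case 2: return 28;
      case 3: return 29;
      default: return 0;
      }
    }

    // Conceptually after the transform: a bounds check plus a constant table.
    int DaysInMonthKindLowered(unsigned Kind) {
      static const int Table[4] = {31, 30, 28, 29};
      return Kind < 4 ? Table[Kind] : 0;
    }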
+
+bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
BasicBlock *BB = SI->getParent();
- // If we only have one predecessor, and if it is a branch on this value,
- // see if that predecessor totally determines the outcome of this switch.
- if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
- if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
- return SimplifyCFG(BB) | true;
+ if (isValueEqualityComparison(SI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
+ return SimplifyCFG(BB) | true;
- Value *Cond = SI->getCondition();
- if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
- if (SimplifySwitchOnSelect(SI, Select))
- return SimplifyCFG(BB) | true;
+ Value *Cond = SI->getCondition();
+ if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
+ if (SimplifySwitchOnSelect(SI, Select))
+ return SimplifyCFG(BB) | true;
- // If the block only contains the switch, see if we can fold the block
- // away into any preds.
- BasicBlock::iterator BBI = BB->begin();
- // Ignore dbg intrinsics.
- while (isa<DbgInfoIntrinsic>(BBI))
- ++BBI;
- if (SI == &*BBI)
- if (FoldValueComparisonIntoPredecessors(SI, Builder))
- return SimplifyCFG(BB) | true;
+ // If the block only contains the switch, see if we can fold the block
+ // away into any preds.
+ BasicBlock::iterator BBI = BB->begin();
+ // Ignore dbg intrinsics.
+ while (isa<DbgInfoIntrinsic>(BBI))
+ ++BBI;
+ if (SI == &*BBI)
+ if (FoldValueComparisonIntoPredecessors(SI, Builder))
+ return SimplifyCFG(BB) | true;
+ }
// Try to transform the switch into an icmp and a branch.
if (TurnSwitchRangeIntoICmp(SI, Builder))
@@ -2862,13 +3701,16 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
if (ForwardSwitchConditionToPHI(SI))
return SimplifyCFG(BB) | true;
+ if (SwitchToLookupTable(SI, Builder, TD, TTI))
+ return SimplifyCFG(BB) | true;
+
return false;
}
bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
BasicBlock *BB = IBI->getParent();
bool Changed = false;
-
+
// Eliminate redundant destinations.
SmallPtrSet<Value *, 8> Succs;
for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
@@ -2879,7 +3721,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
--i; --e;
Changed = true;
}
- }
+ }
if (IBI->getNumDestinations() == 0) {
// If the indirectbr has no successors, change it to unreachable.
@@ -2887,14 +3729,14 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
EraseTerminatorInstAndDCECond(IBI);
return true;
}
-
+
if (IBI->getNumDestinations() == 1) {
// If the indirectbr has one successor, change it to a direct branch.
BranchInst::Create(IBI->getDestination(0), IBI);
EraseTerminatorInstAndDCECond(IBI);
return true;
}
-
+
if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
if (SimplifyIndirectBrOnSelect(IBI, SI))
return SimplifyCFG(BB) | true;
@@ -2904,13 +3746,16 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
BasicBlock *BB = BI->getParent();
-
+
+ if (SinkCommon && SinkThenElseCodeToEnd(BI))
+ return true;
+
// If the Terminator is the only non-phi instruction, simplify the block.
BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
TryToSimplifyUncondBranchFromEmptyBlock(BB))
return true;
-
+
// If the only instruction in the block is a seteq/setne comparison
// against a constant, try to simplify the block.
if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
@@ -2921,7 +3766,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder))
return true;
}
-
+
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and our successor, fold the comparison into the
// predecessor and use logical operations to update the incoming value
@@ -2934,7 +3779,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
-
+
// Conditional branch
if (isValueEqualityComparison(BI)) {
// If we only have one predecessor, and if it is a branch on this value,
@@ -2943,7 +3788,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
return SimplifyCFG(BB) | true;
-
+
// This block must be empty, except for the setcond inst, if it exists.
// Ignore dbg intrinsics.
BasicBlock::iterator I = BB->begin();
@@ -2962,17 +3807,17 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return SimplifyCFG(BB) | true;
}
}
-
+
// Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
if (SimplifyBranchOnICmpChain(BI, TD, Builder))
return true;
-
+
// If this basic block is ONLY a compare and a branch, and if a predecessor
// branches to us and one of our successors, fold the comparison into the
// predecessor and use logical operations to pick the right destination.
if (FoldBranchToCommonDest(BI))
return SimplifyCFG(BB) | true;
-
+
// We have a conditional branch to two blocks that are only reachable
// from BI. We know that the condbr dominates the two blocks, so see if
// there is any identical code in the "then" and "else" blocks. If so, we
@@ -2999,14 +3844,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
return SimplifyCFG(BB) | true;
}
-
+
// If this is a branch on a phi node in the current block, thread control
// through this block if any PHI node entries are constants.
if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
if (PN->getParent() == BI->getParent())
if (FoldCondBranchOnPHI(BI, TD))
return SimplifyCFG(BB) | true;
-
+
// Scan predecessor blocks for conditional branches.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
@@ -3023,11 +3868,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
if (!C)
return false;
- if (!I->hasOneUse()) // Only look at single-use instructions, for compile time
+ if (I->use_empty())
return false;
if (C->isNullValue()) {
- Instruction *Use = I->use_back();
+ // Only look at the first use, avoid hurting compile time with long uselists
+ User *Use = *I->use_begin();
// Now make sure that there are no instructions in between that can alter
// control flow (eg. calls)
@@ -3114,7 +3960,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
//
if (MergeBlockIntoPredecessor(BB))
return true;
-
+
IRBuilder<> Builder(BB);
// If there is a trivial two-entry PHI node in this basic block, and we can
@@ -3152,6 +3998,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
/// eliminates unreachable basic blocks, and does other "peephole" optimization
/// of the CFG. It returns true if a modification was made.
///
-bool llvm::SimplifyCFG(BasicBlock *BB, const TargetData *TD) {
- return SimplifyCFGOpt(TD).run(BB);
+bool llvm::SimplifyCFG(BasicBlock *BB, const DataLayout *TD,
+ const TargetTransformInfo *TTI) {
+ return SimplifyCFGOpt(TD, TTI).run(BB);
}
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 5d673f182411..110f3808573e 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -24,7 +24,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -44,7 +44,7 @@ namespace {
Loop *L;
LoopInfo *LI;
ScalarEvolution *SE;
- const TargetData *TD; // May be NULL
+ const DataLayout *TD; // May be NULL
SmallVectorImpl<WeakVH> &DeadInsts;
@@ -56,7 +56,7 @@ namespace {
L(Loop),
LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
SE(SE),
- TD(LPM->getAnalysisIfAvailable<TargetData>()),
+ TD(LPM->getAnalysisIfAvailable<DataLayout>()),
DeadInsts(Dead),
Changed(false) {
assert(LI && "IV simplification requires LoopInfo");
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 81eb9e0f8ae1..65353dc46037 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -23,7 +23,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -46,7 +46,7 @@ namespace {
/// runOnFunction - Remove instructions that simplify.
bool runOnFunction(Function &F) {
const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
- const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+ const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
bool Changed = false;
@@ -72,7 +72,7 @@ namespace {
++NumSimplified;
Changed = true;
}
- Changed |= RecursivelyDeleteTriviallyDeadInstructions(I);
+ Changed |= RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
}
// Place the list of instructions to simplify on the next loop iteration
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
new file mode 100644
index 000000000000..c3ea63852fed
--- /dev/null
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -0,0 +1,1149 @@
+//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the library call simplifier. It does not implement
+// any pass; it provides a utility class that other passes can use to fold
+// calls to known library functions (such as the str* and mem* routines)
+// into simpler, equivalent IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/DataLayout.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class LibCallOptimization {
+protected:
+ Function *Caller;
+ const DataLayout *TD;
+ const TargetLibraryInfo *TLI;
+ const LibCallSimplifier *LCS;
+ LLVMContext* Context;
+public:
+ LibCallOptimization() { }
+ virtual ~LibCallOptimization() {}
+
+ /// callOptimizer - This pure virtual method is implemented by derived classes to
+ /// do various optimizations. If this returns null then no transformation was
+ /// performed. If it returns CI, then it transformed the call and CI is to be
+ /// deleted. If it returns something else, replace CI with the new value and
+ /// delete CI.
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
+ =0;
+
+ Value *optimizeCall(CallInst *CI, const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
+ const LibCallSimplifier *LCS, IRBuilder<> &B) {
+ Caller = CI->getParent()->getParent();
+ this->TD = TD;
+ this->TLI = TLI;
+ this->LCS = LCS;
+ if (CI->getCalledFunction())
+ Context = &CI->getCalledFunction()->getContext();
+
+ // We never change the calling convention.
+ if (CI->getCallingConv() != llvm::CallingConv::C)
+ return NULL;
+
+ return callOptimizer(CI->getCalledFunction(), CI, B);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
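+/// For example, a strlen call whose only user is
+///   %cmp = icmp eq i64 %len, 0
+/// satisfies this predicate, which is what allows StrLenOpt below to rewrite
+/// the comparison into a test of only the first byte of the string.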
+static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+ if (IC->isEquality())
+ if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality
+/// comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+ if (IC->isEquality() && IC->getOperand(1) == With)
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct FortifiedLibCallOptimization : public LibCallOptimization {
+protected:
+ virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp,
+ bool isString) const = 0;
+};
+
+struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization {
+ CallInst *CI;
+
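+ // For example, MemCpyChkOpt below calls isFoldable(3, 2, false) on
+ //   __memcpy_chk(dst, src, n, objsize)
+ // which returns true when objsize equals n, when objsize is -1 (checking
+ // disabled), or when both are constants with objsize >= n; in each case the
+ // runtime object-size check can safely be dropped.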
+ bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const {
+ if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
+ return true;
+ if (ConstantInt *SizeCI =
+ dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
+ if (SizeCI->isAllOnesValue())
+ return true;
+ if (isString) {
+ uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
+ // If the length is 0 we don't know how long it is and so we can't
+ // remove the check.
+ if (Len == 0) return false;
+ return SizeCI->getZExtValue() >= Len;
+ }
+ if (ConstantInt *Arg = dyn_cast<ConstantInt>(
+ CI->getArgOperand(SizeArgOp)))
+ return SizeCI->getZExtValue() >= Arg->getZExtValue();
+ }
+ return false;
+ }
+};
+
+struct MemCpyChkOpt : public InstFortifiedLibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ this->CI = CI;
+ FunctionType *FT = Callee->getFunctionType();
+ LLVMContext &Context = CI->getParent()->getContext();
+
+ // Check if this has the right signature.
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(Context) ||
+ FT->getParamType(3) != TD->getIntPtrType(Context))
+ return 0;
+
+ if (isFoldable(3, 2, false)) {
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+ return 0;
+ }
+};
+
+struct MemMoveChkOpt : public InstFortifiedLibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ this->CI = CI;
+ FunctionType *FT = Callee->getFunctionType();
+ LLVMContext &Context = CI->getParent()->getContext();
+
+ // Check if this has the right signature.
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(Context) ||
+ FT->getParamType(3) != TD->getIntPtrType(Context))
+ return 0;
+
+ if (isFoldable(3, 2, false)) {
+ B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+ return 0;
+ }
+};
+
+struct MemSetChkOpt : public InstFortifiedLibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ this->CI = CI;
+ FunctionType *FT = Callee->getFunctionType();
+ LLVMContext &Context = CI->getParent()->getContext();
+
+ // Check if this has the right signature.
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isIntegerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(Context) ||
+ FT->getParamType(3) != TD->getIntPtrType(Context))
+ return 0;
+
+ if (isFoldable(3, 2, false)) {
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(),
+ false);
+ B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+ return 0;
+ }
+};
+
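+ // For illustration, the two folds implemented below are roughly:
+ //   __strcpy_chk(dst, "abc", os) -> strcpy(dst, "abc")   if os >= 4
+ //   __strcpy_chk(dst, src, os)   -> __memcpy_chk(dst, src, strlen(src)+1, os)
+ //                                   when the length of src is known at
+ //                                   compile time but the fit is not proven.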
+struct StrCpyChkOpt : public InstFortifiedLibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ this->CI = CI;
+ StringRef Name = Callee->getName();
+ FunctionType *FT = Callee->getFunctionType();
+ LLVMContext &Context = CI->getParent()->getContext();
+
+ // Check if this has the right signature.
+ if (FT->getNumParams() != 3 ||
+ FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+ FT->getParamType(2) != TD->getIntPtrType(Context))
+ return 0;
+
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) // __strcpy_chk(x,x) -> x
+ return Src;
+
+ // If a) we don't have any length information, or b) we know this will
+ // fit then just lower to a plain strcpy. Otherwise we'll keep our
+ // strcpy_chk call which may fail at runtime if the size is too long.
+ // TODO: It might be nice to compute a maximum length from the set of
+ // possible string lengths when the exact length varies.
+ if (isFoldable(2, 1, true)) {
+ Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
+ return Ret;
+ } else {
+ // Maybe we can still fold __strcpy_chk to __memcpy_chk.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+
+ // This optimization requires DataLayout.
+ if (!TD) return 0;
+
+ Value *Ret =
+ EmitMemCpyChk(Dst, Src,
+ ConstantInt::get(TD->getIntPtrType(Context), Len),
+ CI->getArgOperand(2), B, TD, TLI);
+ return Ret;
+ }
+ return 0;
+ }
+};
+
+struct StpCpyChkOpt : public InstFortifiedLibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ this->CI = CI;
+ StringRef Name = Callee->getName();
+ FunctionType *FT = Callee->getFunctionType();
+ LLVMContext &Context = CI->getParent()->getContext();
+
+ // Check if this has the right signature.
+ if (FT->getNumParams() != 3 ||
+ FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+ FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)))
+ return 0;
+
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
+ Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+ return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+ }
+
+ // If a) we don't have any length information, or b) we know this will
+ // fit then just lower to a plain stpcpy. Otherwise we'll keep our
+ // stpcpy_chk call which may fail at runtime if the size is too long.
+ // TODO: It might be nice to compute a maximum length from the set of
+ // possible string lengths when the exact length varies.
+ if (isFoldable(2, 1, true)) {
+ Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
+ return Ret;
+ } else {
+ // Maybe we can still fold __stpcpy_chk to __memcpy_chk.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+
+ // This optimization requires DataLayout.
+ if (!TD) return 0;
+
+ Type *PT = FT->getParamType(0);
+ Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len);
+ Value *DstEnd = B.CreateGEP(Dst,
+ ConstantInt::get(TD->getIntPtrType(PT),
+ Len - 1));
+ if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD, TLI))
+ return 0;
+ return DstEnd;
+ }
+ return 0;
+ }
+};
+
+struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ this->CI = CI;
+ StringRef Name = Callee->getName();
+ FunctionType *FT = Callee->getFunctionType();
+ LLVMContext &Context = CI->getParent()->getContext();
+
+ // Check if this has the right signature.
+ if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+ !FT->getParamType(2)->isIntegerTy() ||
+ FT->getParamType(3) != TD->getIntPtrType(Context))
+ return 0;
+
+ if (isFoldable(3, 2, false)) {
+ Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TD, TLI,
+ Name.substr(2, 7));
+ return Ret;
+ }
+ return 0;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// String and Memory Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct StrCatOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strcat" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != B.getInt8PtrTy() ||
+ FT->getParamType(0) != FT->getReturnType() ||
+ FT->getParamType(1) != FT->getReturnType())
+ return 0;
+
+ // Extract some information from the instruction
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+ --Len; // Unbias length.
+
+ // Handle the simple, do-nothing case: strcat(x, "") -> x
+ if (Len == 0)
+ return Dst;
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ return emitStrLenMemCpy(Src, Dst, Len, B);
+ }
+
+ Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+ IRBuilder<> &B) {
+ // We need to find the end of the destination string. That's where the
+ // memory is to be moved to. We just generate a call to strlen.
+ Value *DstLen = EmitStrLen(Dst, B, TD, TLI);
+ if (!DstLen)
+ return 0;
+
+ // Now that we have the destination's length, we must index into the
+ // destination's pointer to get the actual memcpy destination (the end of
+ // the string we're concatenating onto).
+ Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+
+ // We have enough information to now generate the memcpy call to do the
+ // concatenation for us. Copy the source string and its nul terminator, align = 1.
+ B.CreateMemCpy(CpyDst, Src,
+ ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1);
+ return Dst;
+ }
+};
+
+struct StrNCatOpt : public StrCatOpt {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strncat" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 ||
+ FT->getReturnType() != B.getInt8PtrTy() ||
+ FT->getParamType(0) != FT->getReturnType() ||
+ FT->getParamType(1) != FT->getReturnType() ||
+ !FT->getParamType(2)->isIntegerTy())
+ return 0;
+
+ // Extract some information from the instruction
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ uint64_t Len;
+
+ // We don't do anything if length is not constant
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+ Len = LengthArg->getZExtValue();
+ else
+ return 0;
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen == 0) return 0;
+ --SrcLen; // Unbias length.
+
+ // Handle the simple, do-nothing cases:
+ // strncat(x, "", c) -> x
+ // strncat(x, c, 0) -> x
+ if (SrcLen == 0 || Len == 0) return Dst;
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ // We don't optimize this case
+ if (Len < SrcLen) return 0;
+
+ // strncat(x, s, c) -> strcat(x, s)
+ // s is constant so the strcat can be optimized further
+ return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+ }
+};
+
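+ // For illustration: strchr("hello", 'l') folds to a GEP two bytes into the
+ // string constant, and strchr(s, c) with a non-constant c but a string s of
+ // known length becomes memchr(s, c, strlen(s) + 1).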
+struct StrChrOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strchr" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != B.getInt8PtrTy() ||
+ FT->getParamType(0) != FT->getReturnType() ||
+ !FT->getParamType(1)->isIntegerTy(32))
+ return 0;
+
+ Value *SrcStr = CI->getArgOperand(0);
+
+ // If the second operand is non-constant, see if we can compute the length
+ // of the input string and turn this into memchr.
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (CharC == 0) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ uint64_t Len = GetStringLength(SrcStr);
+ if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32.
+ return 0;
+
+ return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ ConstantInt::get(TD->getIntPtrType(*Context), Len),
+ B, TD, TLI);
+ }
+
+ // Otherwise, the character is a constant; see if the first argument is
+ // a string literal. If so, we can constant fold.
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str))
+ return 0;
+
+ // Compute the offset; make sure to handle the case where we're searching for
+ // zero (a weird way to spell strlen).
+ size_t I = CharC->getSExtValue() == 0 ?
+ Str.size() : Str.find(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. strchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // strchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(SrcStr, B.getInt64(I), "strchr");
+ }
+};
+
+struct StrRChrOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strrchr" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != B.getInt8PtrTy() ||
+ FT->getParamType(0) != FT->getReturnType() ||
+ !FT->getParamType(1)->isIntegerTy(32))
+ return 0;
+
+ Value *SrcStr = CI->getArgOperand(0);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+
+ // Cannot fold anything if we're not looking for a constant.
+ if (!CharC)
+ return 0;
+
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ // strrchr(s, 0) -> strchr(s, 0)
+ if (TD && CharC->isZero())
+ return EmitStrChr(SrcStr, '\0', B, TD, TLI);
+ return 0;
+ }
+
+ // Compute the offset.
+ size_t I = CharC->getSExtValue() == 0 ?
+ Str.size() : Str.rfind(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. Return null.
+ return Constant::getNullValue(CI->getType());
+
+ // strrchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr");
+ }
+};
+
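+ // For illustration: strcmp("foo", "bar") folds to a constant, strcmp("", x)
+ // becomes the negated zero-extended load of *x, and comparing two strings
+ // whose lengths are both known becomes a bounded memcmp.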
+struct StrCmpOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strcmp" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ !FT->getReturnType()->isIntegerTy(32) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != B.getInt8PtrTy())
+ return 0;
+
+ Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+ if (Str1P == Str2P) // strcmp(x,x) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strcmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2)
+ return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+ if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+ return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+ CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+ // strcmp(P, "x") -> memcmp(P, "x", 2)
+ uint64_t Len1 = GetStringLength(Str1P);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len1 && Len2) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ return EmitMemCmp(Str1P, Str2P,
+ ConstantInt::get(TD->getIntPtrType(*Context),
+ std::min(Len1, Len2)), B, TD, TLI);
+ }
+
+ return 0;
+ }
+};
+
+struct StrNCmpOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strncmp" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 ||
+ !FT->getReturnType()->isIntegerTy(32) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ !FT->getParamType(2)->isIntegerTy())
+ return 0;
+
+ Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+ if (Str1P == Str2P) // strncmp(x,x,n) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ // Get the length argument if it is constant.
+ uint64_t Length;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+ Length = LengthArg->getZExtValue();
+ else
+ return 0;
+
+ if (Length == 0) // strncmp(x,y,0) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+ return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strncmp(x, y, n) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2) {
+ StringRef SubStr1 = Str1.substr(0, Length);
+ StringRef SubStr2 = Str2.substr(0, Length);
+ return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+ }
+
+ if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
+ return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+ CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
+ return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+ return 0;
+ }
+};
+
+struct StrCpyOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "strcpy" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != B.getInt8PtrTy())
+ return 0;
+
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) // strcpy(x,x) -> x
+ return Src;
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Copy the string, including the nul terminator, with align = 1.
+ B.CreateMemCpy(Dst, Src,
+ ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
+ return Dst;
+ }
+};
+
+struct StpCpyOpt: public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // Verify the "stpcpy" function prototype.
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != B.getInt8PtrTy())
+ return 0;
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
+ Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+ return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len == 0) return 0;
+
+ Type *PT = FT->getParamType(0);
+ Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len);
+ Value *DstEnd = B.CreateGEP(Dst,
+ ConstantInt::get(TD->getIntPtrType(PT),
+ Len - 1));
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Copy the string, including the nul terminator, with align = 1.
+ B.CreateMemCpy(Dst, Src, LenV, 1);
+ return DstEnd;
+ }
+};
+
+struct StrNCpyOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ FT->getParamType(0) != FT->getParamType(1) ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ !FT->getParamType(2)->isIntegerTy())
+ return 0;
+
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ Value *LenOp = CI->getArgOperand(2);
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen == 0) return 0;
+ --SrcLen;
+
+ if (SrcLen == 0) {
+ // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+ B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
+ return Dst;
+ }
+
+ uint64_t Len;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+ Len = LengthArg->getZExtValue();
+ else
+ return 0;
+
+ if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
+
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ // Let strncpy handle the zero padding
+ if (Len > SrcLen+1) return 0;
+
+ Type *PT = FT->getParamType(0);
+ // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+ B.CreateMemCpy(Dst, Src,
+ ConstantInt::get(TD->getIntPtrType(PT), Len), 1);
+
+ return Dst;
+ }
+};
+
+struct StrLenOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 1 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ Value *Src = CI->getArgOperand(0);
+
+ // Constant folding: strlen("xyz") -> 3
+ if (uint64_t Len = GetStringLength(Src))
+ return ConstantInt::get(CI->getType(), Len-1);
+
+ // strlen(x) != 0 --> *x != 0
+ // strlen(x) == 0 --> *x == 0
+ if (isOnlyUsedInZeroEqualityComparison(CI))
+ return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+ return 0;
+ }
+};
+
+struct StrPBrkOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ FT->getParamType(1) != FT->getParamType(0) ||
+ FT->getReturnType() != FT->getParamType(0))
+ return 0;
+
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strpbrk(s, "") -> NULL
+ // strpbrk("", s) -> NULL
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t I = S1.find_first_of(S2);
+ if (I == std::string::npos) // No match.
+ return Constant::getNullValue(CI->getType());
+
+ return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
+ }
+
+ // strpbrk(s, "a") -> strchr(s, 'a')
+ if (TD && HasS2 && S2.size() == 1)
+ return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI);
+
+ return 0;
+ }
+};
+
+struct StrToOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy())
+ return 0;
+
+ Value *EndPtr = CI->getArgOperand(1);
+ if (isa<ConstantPointerNull>(EndPtr)) {
+ // With a null EndPtr, this function won't capture the main argument.
+ // It would be readonly too, except that it still may write to errno.
+ CI->addAttribute(1, Attributes::get(Callee->getContext(),
+ Attributes::NoCapture));
+ }
+
+ return 0;
+ }
+};
+
+struct StrSpnOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ FT->getParamType(1) != FT->getParamType(0) ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strspn(s, "") -> 0
+ // strspn("", s) -> 0
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_not_of(S2);
+ if (Pos == StringRef::npos) Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ return 0;
+ }
+};
+
+struct StrCSpnOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ FT->getParamType(1) != FT->getParamType(0) ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strcspn("", s) -> 0
+ if (HasS1 && S1.empty())
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_of(S2);
+ if (Pos == StringRef::npos) Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ // strcspn(s, "") -> strlen(s)
+ if (TD && HasS2 && S2.empty())
+ return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
+
+ return 0;
+ }
+};
+
+struct StrStrOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isPointerTy())
+ return 0;
+
+ // fold strstr(x, x) -> x.
+ if (CI->getArgOperand(0) == CI->getArgOperand(1))
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+ if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
+ if (!StrLen)
+ return 0;
+ Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ StrLen, B, TD, TLI);
+ if (!StrNCmp)
+ return 0;
+ for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
+ UI != UE; ) {
+ ICmpInst *Old = cast<ICmpInst>(*UI++);
+ Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
+ ConstantInt::getNullValue(StrNCmp->getType()),
+ "cmp");
+ LCS->replaceAllUsesWith(Old, Cmp);
+ }
+ return CI;
+ }
+
+ // See if either input string is a constant string.
+ StringRef SearchStr, ToFindStr;
+ bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+ bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+ // fold strstr(x, "") -> x.
+ if (HasStr2 && ToFindStr.empty())
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // If both strings are known, constant fold it.
+ if (HasStr1 && HasStr2) {
+ std::string::size_type Offset = SearchStr.find(ToFindStr);
+
+ if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+ return Constant::getNullValue(CI->getType());
+
+ // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+ Value *Result = CastToCStr(CI->getArgOperand(0), B);
+ Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+ return B.CreateBitCast(Result, CI->getType());
+ }
+
+ // fold strstr(x, "y") -> strchr(x, 'y').
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
+ }
+ return 0;
+ }
+};
+
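+ // For illustration: memcmp(a, b, 1) becomes the difference of the two
+ // zero-extended bytes, and memcmp of two constant strings with a constant
+ // in-bounds length folds to its compile-time result.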
+struct MemCmpOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isIntegerTy(32))
+ return 0;
+
+ Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // Make sure we have a constant length.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!LenC) return 0;
+ uint64_t Len = LenC->getZExtValue();
+
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+ if (Len == 1) {
+ Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+ CI->getType(), "lhsv");
+ Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+ CI->getType(), "rhsv");
+ return B.CreateSub(LHSV, RHSV, "chardiff");
+ }
+
+ // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+ StringRef LHSStr, RHSStr;
+ if (getConstantStringInfo(LHS, LHSStr) &&
+ getConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.size() || Len > RHSStr.size())
+ return 0;
+ uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
+ return 0;
+ }
+};
+
+struct MemCpyOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+struct MemMoveOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+ B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+struct MemSetOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isIntegerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+} // End anonymous namespace.
+
+namespace llvm {
+
+class LibCallSimplifierImpl {
+ const DataLayout *TD;
+ const TargetLibraryInfo *TLI;
+ const LibCallSimplifier *LCS;
+ StringMap<LibCallOptimization*> Optimizations;
+
+ // Fortified library call optimizations.
+ MemCpyChkOpt MemCpyChk;
+ MemMoveChkOpt MemMoveChk;
+ MemSetChkOpt MemSetChk;
+ StrCpyChkOpt StrCpyChk;
+ StpCpyChkOpt StpCpyChk;
+ StrNCpyChkOpt StrNCpyChk;
+
+ // String library call optimizations.
+ StrCatOpt StrCat;
+ StrNCatOpt StrNCat;
+ StrChrOpt StrChr;
+ StrRChrOpt StrRChr;
+ StrCmpOpt StrCmp;
+ StrNCmpOpt StrNCmp;
+ StrCpyOpt StrCpy;
+ StpCpyOpt StpCpy;
+ StrNCpyOpt StrNCpy;
+ StrLenOpt StrLen;
+ StrPBrkOpt StrPBrk;
+ StrToOpt StrTo;
+ StrSpnOpt StrSpn;
+ StrCSpnOpt StrCSpn;
+ StrStrOpt StrStr;
+
+ // Memory library call optimizations.
+ MemCmpOpt MemCmp;
+ MemCpyOpt MemCpy;
+ MemMoveOpt MemMove;
+ MemSetOpt MemSet;
+
+ void initOptimizations();
+ void addOpt(LibFunc::Func F, LibCallOptimization* Opt);
+public:
+ LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI,
+ const LibCallSimplifier *LCS) {
+ this->TD = TD;
+ this->TLI = TLI;
+ this->LCS = LCS;
+ }
+
+ Value *optimizeCall(CallInst *CI);
+};
+
+void LibCallSimplifierImpl::initOptimizations() {
+ // Fortified library call optimizations.
+ Optimizations["__memcpy_chk"] = &MemCpyChk;
+ Optimizations["__memmove_chk"] = &MemMoveChk;
+ Optimizations["__memset_chk"] = &MemSetChk;
+ Optimizations["__strcpy_chk"] = &StrCpyChk;
+ Optimizations["__stpcpy_chk"] = &StpCpyChk;
+ Optimizations["__strncpy_chk"] = &StrNCpyChk;
+ Optimizations["__stpncpy_chk"] = &StrNCpyChk;
+
+ // String library call optimizations.
+ addOpt(LibFunc::strcat, &StrCat);
+ addOpt(LibFunc::strncat, &StrNCat);
+ addOpt(LibFunc::strchr, &StrChr);
+ addOpt(LibFunc::strrchr, &StrRChr);
+ addOpt(LibFunc::strcmp, &StrCmp);
+ addOpt(LibFunc::strncmp, &StrNCmp);
+ addOpt(LibFunc::strcpy, &StrCpy);
+ addOpt(LibFunc::stpcpy, &StpCpy);
+ addOpt(LibFunc::strncpy, &StrNCpy);
+ addOpt(LibFunc::strlen, &StrLen);
+ addOpt(LibFunc::strpbrk, &StrPBrk);
+ addOpt(LibFunc::strtol, &StrTo);
+ addOpt(LibFunc::strtod, &StrTo);
+ addOpt(LibFunc::strtof, &StrTo);
+ addOpt(LibFunc::strtoul, &StrTo);
+ addOpt(LibFunc::strtoll, &StrTo);
+ addOpt(LibFunc::strtold, &StrTo);
+ addOpt(LibFunc::strtoull, &StrTo);
+ addOpt(LibFunc::strspn, &StrSpn);
+ addOpt(LibFunc::strcspn, &StrCSpn);
+ addOpt(LibFunc::strstr, &StrStr);
+
+ // Memory library call optimizations.
+ addOpt(LibFunc::memcmp, &MemCmp);
+ addOpt(LibFunc::memcpy, &MemCpy);
+ addOpt(LibFunc::memmove, &MemMove);
+ addOpt(LibFunc::memset, &MemSet);
+}
+
+Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
+ if (Optimizations.empty())
+ initOptimizations();
+
+ Function *Callee = CI->getCalledFunction();
+ LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
+ if (LCO) {
+ IRBuilder<> Builder(CI);
+ return LCO->optimizeCall(CI, TD, TLI, LCS, Builder);
+ }
+ return 0;
+}
+
+void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) {
+ if (TLI->has(F))
+ Optimizations[TLI->getName(F)] = Opt;
+}
+
+LibCallSimplifier::LibCallSimplifier(const DataLayout *TD,
+ const TargetLibraryInfo *TLI) {
+ Impl = new LibCallSimplifierImpl(TD, TLI, this);
+}
+
+LibCallSimplifier::~LibCallSimplifier() {
+ delete Impl;
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+ return Impl->optimizeCall(CI);
+}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
+ I->replaceAllUsesWith(With);
+ I->eraseFromParent();
+}
+
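+// A minimal usage sketch (assumed caller, not part of this patch): a pass
+// that owns a LibCallSimplifier can run it over the calls it visits, e.g.
+//
+//   LibCallSimplifier Simplifier(TD, TLI);
+//   if (Value *V = Simplifier.optimizeCall(CI))
+//     if (V != CI) {
+//       CI->replaceAllUsesWith(V);
+//       CI->eraseFromParent();
+//     }
+//
+// optimizeCall() returns null when no simplification applies; a return value
+// of CI itself (see StrStrOpt) means the call's users have already been
+// rewritten through replaceAllUsesWith() above.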
+}
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index 24e8c8ff5c5f..5812d4607dfc 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -29,6 +29,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializePromotePassPass(Registry);
initializeUnifyFunctionExitNodesPass(Registry);
initializeInstSimplifierPass(Registry);
+ initializeMetaRenamerPass(Registry);
}
/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index fc2538db6482..a30b09321b5e 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -21,7 +21,7 @@
using namespace llvm;
// Out of line method to get vtable etc for class.
-void ValueMapTypeRemapper::Anchor() {}
+void ValueMapTypeRemapper::anchor() {}
Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
ValueMapTypeRemapper *TypeMapper) {
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 62d23cb948f1..f7be3e312407 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -28,12 +28,14 @@
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -41,17 +43,27 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/ValueHandle.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
+#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <map>
using namespace llvm;
+static cl::opt<bool>
+IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
+ cl::Hidden, cl::desc("Ignore target information"));
+
static cl::opt<unsigned>
ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
cl::desc("The required chain depth for vectorization"));
+static cl::opt<bool>
+UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
+ cl::Hidden, cl::desc("Use the chain depth requirement with"
+ " target information"));
+
static cl::opt<unsigned>
SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
cl::desc("The maximum search distance for instruction pairs"));
@@ -93,8 +105,9 @@ static cl::opt<bool>
NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
cl::desc("Don't try to vectorize floating-point values"));
+// FIXME: This should default to false once pointer vector support works.
static cl::opt<bool>
-NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden,
+NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
cl::desc("Don't try to vectorize pointer values"));
static cl::opt<bool>
@@ -159,6 +172,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check",
cl::init(false), cl::Hidden,
cl::desc("When debugging is enabled, output information on the"
" cycle-checking process"));
+
+static cl::opt<bool>
+PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, dump the basic block after"
+ " every pair is fused"));
#endif
STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
@@ -177,13 +196,19 @@ namespace {
BBVectorize(Pass *P, const VectorizeConfig &C)
: BasicBlockPass(ID), Config(C) {
AA = &P->getAnalysis<AliasAnalysis>();
+ DT = &P->getAnalysis<DominatorTree>();
SE = &P->getAnalysis<ScalarEvolution>();
- TD = P->getAnalysisIfAvailable<TargetData>();
+ TD = P->getAnalysisIfAvailable<DataLayout>();
+ TTI = IgnoreTargetInfo ? 0 :
+ P->getAnalysisIfAvailable<TargetTransformInfo>();
+ VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
}
typedef std::pair<Value *, Value *> ValuePair;
+ typedef std::pair<ValuePair, int> ValuePairWithCost;
typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+ typedef std::pair<VPPair, unsigned> VPPairWithType;
typedef std::pair<std::multimap<Value *, Value *>::iterator,
std::multimap<Value *, Value *>::iterator> VPIteratorPair;
typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
@@ -191,8 +216,11 @@ namespace {
VPPIteratorPair;
AliasAnalysis *AA;
+ DominatorTree *DT;
ScalarEvolution *SE;
- TargetData *TD;
+ DataLayout *TD;
+ TargetTransformInfo *TTI;
+ const VectorTargetTransformInfo *VTTI;
// FIXME: const correct?
@@ -201,11 +229,23 @@ namespace {
bool getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len);
+ // FIXME: The current implementation does not account for pairs that
+ // are connected in multiple ways. For example:
+ // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
+ enum PairConnectionType {
+ PairConnectionDirect,
+ PairConnectionSwap,
+ PairConnectionSplat
+ };
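+
+ // For illustration (assumed semantics of the enum above): if pair (A1,A2)
+ // feeds pair (C1,C2), the connection is "direct" when C1 uses A1 and C2
+ // uses A2, a "swap" when C1 uses A2 and C2 uses A1, and a "splat" when both
+ // members of the second pair use the same member of the first pair.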
+
void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes);
void buildDepMap(BasicBlock &BB,
std::multimap<Value *, Value *> &CandidatePairs,
@@ -213,19 +253,29 @@ namespace {
DenseSet<ValuePair> &PairableInstUsers);
void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
DenseMap<Value *, Value *>& ChosenPairs);
void fuseChosenPairs(BasicBlock &BB,
std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *>& ChosenPairs);
+ DenseMap<Value *, Value *>& ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps);
+
bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
bool areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len);
+ bool IsSimpleLoadStore, bool NonPow2Len,
+ int &CostSavings, int &FixedOrder);
bool trackUsesOfI(DenseSet<Value *> &Users,
AliasSetTracker &WriteSet, Instruction *I,
@@ -236,6 +286,7 @@ namespace {
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
ValuePair P);
bool pairsConflict(ValuePair P, ValuePair Q,
@@ -267,17 +318,21 @@ namespace {
void findBestTreeFor(
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
DenseMap<Value *, Value *> &ChosenPairs,
DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
- size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ int &BestEffSize, VPIteratorPair ChoiceRange,
bool UseCycleCheck);
Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool FlipMemInputs);
+ Instruction *J, unsigned o);
void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
unsigned MaskOffset, unsigned NumInElem,
@@ -289,20 +344,20 @@ namespace {
bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
unsigned o, Value *&LOp, unsigned numElemL,
- Type *ArgTypeL, Type *ArgTypeR,
+ Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
unsigned IdxOff = 0);
Value *getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool FlipMemInputs);
+ Instruction *J, unsigned o, bool IBeforeJ);
void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
- bool FlipMemInputs);
+ bool IBeforeJ);
void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
Instruction *J, Instruction *K,
Instruction *&InsertionPt, Instruction *&K1,
- Instruction *&K2, bool FlipMemInputs);
+ Instruction *&K2);
void collectPairLoadMoveSet(BasicBlock &BB,
DenseMap<Value *, Value *> &ChosenPairs,
@@ -314,10 +369,6 @@ namespace {
DenseMap<Value *, Value *> &ChosenPairs,
std::multimap<Value *, Value *> &LoadMoveSet);
- void collectPtrInfo(std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<Value *> &LowPtrInsts);
-
bool canMoveUsesOfIAfterJ(BasicBlock &BB,
std::multimap<Value *, Value *> &LoadMoveSet,
Instruction *I, Instruction *J);
@@ -330,13 +381,22 @@ namespace {
void combineMetadata(Instruction *K, const Instruction *J);
bool vectorizeBB(BasicBlock &BB) {
+ if (!DT->isReachableFromEntry(&BB)) {
+ DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
+ " in " << BB.getParent()->getName() << "\n");
+ return false;
+ }
+
+ DEBUG(if (VTTI) dbgs() << "BBV: using target information\n");
+
bool changed = false;
// Iterate a sufficient number of times to merge types of size 1 bit,
// then 2 bits, then 4, etc. up to half of the width of the target vector
// register.
unsigned n = 1;
for (unsigned v = 2;
- v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter);
+ (VTTI || v <= Config.VectorBits) &&
+ (!Config.MaxIter || n <= Config.MaxIter);
v *= 2, ++n) {
DEBUG(dbgs() << "BBV: fusing loop #" << n <<
" for " << BB.getName() << " in " <<
@@ -363,8 +423,12 @@ namespace {
virtual bool runOnBasicBlock(BasicBlock &BB) {
AA = &getAnalysis<AliasAnalysis>();
+ DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
- TD = getAnalysisIfAvailable<TargetData>();
+ TD = getAnalysisIfAvailable<DataLayout>();
+ TTI = IgnoreTargetInfo ? 0 :
+ getAnalysisIfAvailable<TargetTransformInfo>();
+ VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0;
return vectorizeBB(BB);
}
@@ -372,8 +436,10 @@ namespace {
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
BasicBlockPass::getAnalysisUsage(AU);
AU.addRequired<AliasAnalysis>();
+ AU.addRequired<DominatorTree>();
AU.addRequired<ScalarEvolution>();
AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<DominatorTree>();
AU.addPreserved<ScalarEvolution>();
AU.setPreservesCFG();
}
@@ -415,6 +481,14 @@ namespace {
T2 = cast<CastInst>(I)->getSrcTy();
else
T2 = T1;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ T2 = SI->getCondition()->getType();
+ } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ T2 = SI->getOperand(0)->getType();
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+ T2 = CI->getOperand(0)->getType();
+ }
}
// Returns the weight associated with the provided value. A chain of
@@ -446,6 +520,62 @@ namespace {
return 1;
}
+ // Returns the cost of the provided instruction using VTTI.
+ // This does not handle loads and stores.
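+ // For illustration (simplified, assumed usage): a candidate pair is only
+ // kept when the fused vector operation is no more expensive than the two
+ // scalar operations it replaces, roughly
+ //   getInstrCost(Op, VT1, VT2) <= getInstrCost(Op, T1, T2) * 2
+ // for matching instructions, with the difference recorded per pair in
+ // CandidatePairCostSavings.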
+ unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) {
+ switch (Opcode) {
+ default: break;
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because scalar GEPs are usually
+ // folded into the instruction's addressing mode. At the moment we don't
+ // generate vector GEPs.
+ return 0;
+ case Instruction::Br:
+ return VTTI->getCFInstrCost(Opcode);
+ case Instruction::PHI:
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return VTTI->getArithmeticInstrCost(Opcode, T1);
+ case Instruction::Select:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ return VTTI->getCmpSelInstrCost(Opcode, T1, T2);
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast:
+ case Instruction::ShuffleVector:
+ return VTTI->getCastInstrCost(Opcode, T1, T2);
+ }
+
+ return 1;
+ }
+
// This determines the relative offset of two loads or stores, returning
// true if the offset could be determined to be some constant value.
// For example, if OffsetInElmts == 1, then J accesses the memory directly
@@ -453,20 +583,30 @@ namespace {
// directly after J.
bool getPairPtrInfo(Instruction *I, Instruction *J,
Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
- int64_t &OffsetInElmts) {
+ unsigned &IAddressSpace, unsigned &JAddressSpace,
+ int64_t &OffsetInElmts, bool ComputeOffset = true) {
OffsetInElmts = 0;
- if (isa<LoadInst>(I)) {
- IPtr = cast<LoadInst>(I)->getPointerOperand();
- JPtr = cast<LoadInst>(J)->getPointerOperand();
- IAlignment = cast<LoadInst>(I)->getAlignment();
- JAlignment = cast<LoadInst>(J)->getAlignment();
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LoadInst *LJ = cast<LoadInst>(J);
+ IPtr = LI->getPointerOperand();
+ JPtr = LJ->getPointerOperand();
+ IAlignment = LI->getAlignment();
+ JAlignment = LJ->getAlignment();
+ IAddressSpace = LI->getPointerAddressSpace();
+ JAddressSpace = LJ->getPointerAddressSpace();
} else {
- IPtr = cast<StoreInst>(I)->getPointerOperand();
- JPtr = cast<StoreInst>(J)->getPointerOperand();
- IAlignment = cast<StoreInst>(I)->getAlignment();
- JAlignment = cast<StoreInst>(J)->getAlignment();
+ StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J);
+ IPtr = SI->getPointerOperand();
+ JPtr = SJ->getPointerOperand();
+ IAlignment = SI->getAlignment();
+ JAlignment = SJ->getAlignment();
+ IAddressSpace = SI->getPointerAddressSpace();
+ JAddressSpace = SJ->getPointerAddressSpace();
}
+ if (!ComputeOffset)
+ return true;
+
const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
@@ -536,6 +676,19 @@ namespace {
return false;
}
+
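+ // A "pure" chain here is one built entirely from insertelement instructions
+ // rooted at undef, e.g. (illustrative IR):
+ //   %v0 = insertelement <2 x float> undef, float %a, i32 0
+ //   %v1 = insertelement <2 x float> %v0, float %b, i32 1
+ // i.e. a vector constructed from scratch, which is what this predicate
+ // detects.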
+ bool isPureIEChain(InsertElementInst *IE) {
+ InsertElementInst *IENext = IE;
+ do {
+ if (!isa<UndefValue>(IENext->getOperand(0)) &&
+ !isa<InsertElementInst>(IENext->getOperand(0))) {
+ return false;
+ }
+ } while ((IENext =
+ dyn_cast<InsertElementInst>(IENext->getOperand(0))));
+
+ return true;
+ }
};
// This function implements one vectorization iteration on the provided
@@ -546,11 +699,18 @@ namespace {
std::vector<Value *> AllPairableInsts;
DenseMap<Value *, Value *> AllChosenPairs;
+ DenseSet<ValuePair> AllFixedOrderPairs;
+ DenseMap<VPPair, unsigned> AllPairConnectionTypes;
+ std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps;
do {
std::vector<Value *> PairableInsts;
std::multimap<Value *, Value *> CandidatePairs;
+ DenseSet<ValuePair> FixedOrderPairs;
+ DenseMap<ValuePair, int> CandidatePairCostSavings;
ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
+ FixedOrderPairs,
+ CandidatePairCostSavings,
PairableInsts, NonPow2Len);
if (PairableInsts.empty()) continue;
@@ -563,10 +723,18 @@ namespace {
// Note that it only matters that both members of the second pair use some
// element of the first pair (to allow for splatting).
- std::multimap<ValuePair, ValuePair> ConnectedPairs;
- computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+ std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps;
+ DenseMap<VPPair, unsigned> PairConnectionTypes;
+ computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairConnectionTypes);
if (ConnectedPairs.empty()) continue;
+ for (std::multimap<ValuePair, ValuePair>::iterator
+ I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+ I != IE; ++I) {
+ ConnectedPairDeps.insert(VPPair(I->second, I->first));
+ }
+
// Build the pairable-instruction dependency map
DenseSet<ValuePair> PairableInstUsers;
buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
@@ -578,13 +746,48 @@ namespace {
// variables.
DenseMap<Value *, Value *> ChosenPairs;
- choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+ choosePairs(CandidatePairs, CandidatePairCostSavings,
+ PairableInsts, FixedOrderPairs, PairConnectionTypes,
+ ConnectedPairs, ConnectedPairDeps,
PairableInstUsers, ChosenPairs);
if (ChosenPairs.empty()) continue;
AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
PairableInsts.end());
AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
+
+ // Only for the chosen pairs, propagate information on fixed-order pairs,
+ // pair connections, and their types to the data structures used by the
+ // pair fusion procedures.
+ for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
+ IE = ChosenPairs.end(); I != IE; ++I) {
+ if (FixedOrderPairs.count(*I))
+ AllFixedOrderPairs.insert(*I);
+ else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
+ AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
+
+ for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
+ J != IE; ++J) {
+ DenseMap<VPPair, unsigned>::iterator K =
+ PairConnectionTypes.find(VPPair(*I, *J));
+ if (K != PairConnectionTypes.end()) {
+ AllPairConnectionTypes.insert(*K);
+ } else {
+ K = PairConnectionTypes.find(VPPair(*J, *I));
+ if (K != PairConnectionTypes.end())
+ AllPairConnectionTypes.insert(*K);
+ }
+ }
+ }
+
+ for (std::multimap<ValuePair, ValuePair>::iterator
+ I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+ I != IE; ++I) {
+ if (AllPairConnectionTypes.count(*I)) {
+ AllConnectedPairs.insert(*I);
+ AllConnectedPairDeps.insert(VPPair(I->second, I->first));
+ }
+ }
} while (ShouldContinue);
if (AllChosenPairs.empty()) return false;
@@ -597,11 +800,13 @@ namespace {
// replaced with a vector_extract on the result. Subsequent optimization
// passes should coalesce the build/extract combinations.
- fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs);
+ fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
+ AllPairConnectionTypes,
+ AllConnectedPairs, AllConnectedPairDeps);
// It is important to cleanup here so that future iterations of this
// function have less work to do.
- (void) SimplifyInstructionsInBlock(&BB, TD);
+ (void) SimplifyInstructionsInBlock(&BB, TD, AA->getTargetLibraryInfo());
return true;
}
@@ -667,15 +872,22 @@ namespace {
!(VectorType::isValidElementType(T2) || T2->isVectorTy()))
return false;
- if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) {
+ if (T1->getScalarSizeInBits() == 1) {
if (!Config.VectorizeBools)
return false;
} else {
- if (!Config.VectorizeInts
- && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
+ if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
return false;
}
-
+
+ if (T2->getScalarSizeInBits() == 1) {
+ if (!Config.VectorizeBools)
+ return false;
+ } else {
+ if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
+ return false;
+ }
+
if (!Config.VectorizeFloats
&& (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
return false;
@@ -691,8 +903,8 @@ namespace {
T2->getScalarType()->isPointerTy()))
return false;
- if (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
- T2->getPrimitiveSizeInBits() >= Config.VectorBits)
+ if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
+ T2->getPrimitiveSizeInBits() >= Config.VectorBits))
return false;
return true;
@@ -703,10 +915,14 @@ namespace {
// that I has already been determined to be vectorizable and that J is not
// in the use tree of I.
bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
- bool IsSimpleLoadStore, bool NonPow2Len) {
+ bool IsSimpleLoadStore, bool NonPow2Len,
+ int &CostSavings, int &FixedOrder) {
DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
" <-> " << *J << "\n");
+ CostSavings = 0;
+ FixedOrder = 0;
+
// Loads and stores can be merged if they have different alignments,
// but are otherwise the same.
if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
@@ -719,38 +935,84 @@ namespace {
unsigned MaxTypeBits = std::max(
IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
- if (MaxTypeBits > Config.VectorBits)
+ if (!VTTI && MaxTypeBits > Config.VectorBits)
return false;
// FIXME: handle addsub-type operations!
if (IsSimpleLoadStore) {
Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts = 0;
if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ IAddressSpace, JAddressSpace,
OffsetInElmts) && abs64(OffsetInElmts) == 1) {
- if (Config.AlignedOnly) {
- Type *aTypeI = isa<StoreInst>(I) ?
- cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
- Type *aTypeJ = isa<StoreInst>(J) ?
- cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ FixedOrder = (int) OffsetInElmts;
+ unsigned BottomAlignment = IAlignment;
+ if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+ Type *aTypeI = isa<StoreInst>(I) ?
+ cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ Type *aTypeJ = isa<StoreInst>(J) ?
+ cast<StoreInst>(J)->getValueOperand()->getType() : J->getType();
+ Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
+ if (Config.AlignedOnly) {
// An aligned load or store is possible only if the instruction
// with the lower offset has an alignment suitable for the
// vector type.
- unsigned BottomAlignment = IAlignment;
- if (OffsetInElmts < 0) BottomAlignment = JAlignment;
-
- Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
if (BottomAlignment < VecAlignment)
return false;
}
+
+ if (VTTI) {
+ unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(),
+ IAlignment, IAddressSpace);
+ unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(),
+ JAlignment, JAddressSpace);
+ unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType,
+ BottomAlignment,
+ IAddressSpace);
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts = VTTI->getNumberOfParts(VType);
+ if (VParts > 1)
+ return false;
+ else if (!VParts && VCost == ICost + JCost)
+ return false;
+
+ CostSavings = ICost + JCost - VCost;
+ }
} else {
return false;
}
+ } else if (VTTI) {
+ unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
+ unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
+ Type *VT1 = getVecTypeForPair(IT1, JT1),
+ *VT2 = getVecTypeForPair(IT2, JT2);
+ unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2);
+
+ if (VCost > ICost + JCost)
+ return false;
+
+ // We don't want to fuse to a type that will be split, even
+ // if the two input types will also be split and there is no other
+ // associated cost.
+ unsigned VParts1 = VTTI->getNumberOfParts(VT1),
+ VParts2 = VTTI->getNumberOfParts(VT2);
+ if (VParts1 > 1 || VParts2 > 1)
+ return false;
+ else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
+ return false;
+
+ CostSavings = ICost + JCost - VCost;
}
// The powi intrinsic is special because only the first argument is
@@ -833,6 +1095,8 @@ namespace {
bool BBVectorize::getCandidatePairs(BasicBlock &BB,
BasicBlock::iterator &Start,
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts, bool NonPow2Len) {
BasicBlock::iterator E = BB.end();
if (Start == E) return false;
@@ -869,7 +1133,9 @@ namespace {
// J does not use I, and comes before the first use of I, so it can be
// merged with I if the instructions are compatible.
- if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue;
+ int CostSavings, FixedOrder;
+ if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len,
+ CostSavings, FixedOrder)) continue;
// J is a candidate for merging with I.
if (!PairableInsts.size() ||
@@ -878,6 +1144,14 @@ namespace {
}
CandidatePairs.insert(ValuePair(I, J));
+ if (VTTI)
+ CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J),
+ CostSavings));
+
+ if (FixedOrder == 1)
+ FixedOrderPairs.insert(ValuePair(I, J));
+ else if (FixedOrder == -1)
+ FixedOrderPairs.insert(ValuePair(J, I));
// The next call to this function must start after the last instruction
// selected during this invocation.
@@ -887,7 +1161,8 @@ namespace {
}
DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
- << *I << " <-> " << *J << "\n");
+ << *I << " <-> " << *J << " (cost savings: " <<
+ CostSavings << ")\n");
// If we have already found too many pairs, break here and this function
// will be called again starting after the last instruction selected
@@ -915,6 +1190,7 @@ namespace {
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
ValuePair P) {
StoreInst *SI, *SJ;
@@ -946,12 +1222,18 @@ namespace {
VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
// Look for <I, J>:
- if (isSecondInIteratorPair<Value*>(*J, IPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ VPPair VP(P, ValuePair(*I, *J));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
+ }
// Look for <J, I>:
- if (isSecondInIteratorPair<Value*>(*I, JPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+ if (isSecondInIteratorPair<Value*>(*I, JPairRange)) {
+ VPPair VP(P, ValuePair(*J, *I));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
+ }
}
if (Config.SplatBreaksChain) continue;
@@ -962,8 +1244,11 @@ namespace {
P.first == SJ->getPointerOperand())
continue;
- if (isSecondInIteratorPair<Value*>(*J, IPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ VPPair VP(P, ValuePair(*I, *J));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+ }
}
}
@@ -985,8 +1270,11 @@ namespace {
P.second == SJ->getPointerOperand())
continue;
- if (isSecondInIteratorPair<Value*>(*J, IPairRange))
- ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+ VPPair VP(P, ValuePair(*I, *J));
+ ConnectedPairs.insert(VP);
+ PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+ }
}
}
}
@@ -997,7 +1285,8 @@ namespace {
void BBVectorize::computeConnectedPairs(
std::multimap<Value *, Value *> &CandidatePairs,
std::vector<Value *> &PairableInsts,
- std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes) {
for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
PE = PairableInsts.end(); PI != PE; ++PI) {
@@ -1006,7 +1295,7 @@ namespace {
for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
P != choiceRange.second; ++P)
computePairsConnectedTo(CandidatePairs, PairableInsts,
- ConnectedPairs, *P);
+ ConnectedPairs, PairConnectionTypes, *P);
}
DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
@@ -1196,7 +1485,7 @@ namespace {
PrunedTree.insert(QTop.first);
// Visit each child, pruning as necessary...
- DenseMap<ValuePair, size_t> BestChildren;
+ SmallVector<ValuePairWithDepth, 8> BestChildren;
VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
for (std::multimap<ValuePair, ValuePair>::iterator K = QTopRange.first;
K != QTopRange.second; ++K) {
@@ -1228,7 +1517,7 @@ namespace {
DenseSet<ValuePair> CurrentPairs;
bool CanAdd = true;
- for (DenseMap<ValuePair, size_t>::iterator C2
+ for (SmallVector<ValuePairWithDepth, 8>::iterator C2
= BestChildren.begin(), E2 = BestChildren.end();
C2 != E2; ++C2) {
if (C2->first.first == C->first.first ||
@@ -1313,22 +1602,22 @@ namespace {
// to an already-selected child. Check for this here, and if a
// conflict is found, then remove the previously-selected child
// before adding this one in its place.
- for (DenseMap<ValuePair, size_t>::iterator C2
+ for (SmallVector<ValuePairWithDepth, 8>::iterator C2
= BestChildren.begin(); C2 != BestChildren.end();) {
if (C2->first.first == C->first.first ||
C2->first.first == C->first.second ||
C2->first.second == C->first.first ||
C2->first.second == C->first.second ||
pairsConflict(C2->first, C->first, PairableInstUsers))
- BestChildren.erase(C2++);
+ C2 = BestChildren.erase(C2);
else
++C2;
}
- BestChildren.insert(ValuePairWithDepth(C->first, C->second));
+ BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
}
- for (DenseMap<ValuePair, size_t>::iterator C
+ for (SmallVector<ValuePairWithDepth, 8>::iterator C
= BestChildren.begin(), E2 = BestChildren.end();
C != E2; ++C) {
size_t DepthF = getDepthFactor(C->first.first);
@@ -1341,13 +1630,17 @@ namespace {
// pairs, given the choice of root pairs as an iterator range.
void BBVectorize::findBestTreeFor(
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
DenseMap<Value *, Value *> &ChosenPairs,
DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
- size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ int &BestEffSize, VPIteratorPair ChoiceRange,
bool UseCycleCheck) {
for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
J != ChoiceRange.second; ++J) {
@@ -1397,17 +1690,289 @@ namespace {
PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
PrunedTree, *J, UseCycleCheck);
- size_t EffSize = 0;
- for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
- E = PrunedTree.end(); S != E; ++S)
- EffSize += getDepthFactor(S->first);
+ int EffSize = 0;
+ if (VTTI) {
+ DenseSet<Value *> PrunedTreeInstrs;
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S) {
+ PrunedTreeInstrs.insert(S->first);
+ PrunedTreeInstrs.insert(S->second);
+ }
+
+ // The set of pairs that have already contributed to the total cost.
+ DenseSet<ValuePair> IncomingPairs;
+
+ // If the cost model were perfect, this might not be necessary; but we
+ // need to make sure that we don't get stuck vectorizing our own
+ // shuffle chains.
+ bool HasNontrivialInsts = false;
+
+ // The node weights represent the cost savings associated with
+ // fusing the pair of instructions.
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S) {
+ if (!isa<ShuffleVectorInst>(S->first) &&
+ !isa<InsertElementInst>(S->first) &&
+ !isa<ExtractElementInst>(S->first))
+ HasNontrivialInsts = true;
+
+ bool FlipOrder = false;
+
+ if (getDepthFactor(S->first)) {
+ int ESContrib = CandidatePairCostSavings.find(*S)->second;
+ DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
+ << *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize += ESContrib;
+ }
+
+ // The edge weights contribute in a negative sense: they represent
+ // the cost of shuffles.
+ VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S);
+ if (IP.first != ConnectedPairDeps.end()) {
+ unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ if (!PrunedTree.count(Q->second))
+ continue;
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ ++NumDepsDirect;
+ else if (R->second == PairConnectionSwap)
+ ++NumDepsSwap;
+ }
+
+ // If there are more swaps than direct connections, then
+ // the pair order will be flipped during fusion. So the real
+ // number of swaps is the minimum number.
+ FlipOrder = !FixedOrderPairs.count(*S) &&
+ ((NumDepsSwap > NumDepsDirect) ||
+ FixedOrderPairs.count(ValuePair(S->second, S->first)));
+
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ if (!PrunedTree.count(Q->second))
+ continue;
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ Type *Ty1 = Q->second.first->getType(),
+ *Ty2 = Q->second.second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+ if ((R->second == PairConnectionDirect && FlipOrder) ||
+ (R->second == PairConnectionSwap && !FlipOrder) ||
+ R->second == PairConnectionSplat) {
+ int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *Q->second.first << " <-> " << *Q->second.second <<
+ "} -> {" <<
+ *S->first << " <-> " << *S->second << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+ }
+
+ // Compute the cost of outgoing edges. We assume that edges outgoing
+ // to shuffles, inserts or extracts can be merged, and so contribute
+ // no additional cost.
+ if (!S->first->getType()->isVoidTy()) {
+ Type *Ty1 = S->first->getType(),
+ *Ty2 = S->second->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ bool NeedsExtraction = false;
+ for (Value::use_iterator I = S->first->use_begin(),
+ IE = S->first->use_end(); I != IE; ++I) {
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(*I))
+ continue;
+ if (PrunedTreeInstrs.count(*I))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty1->isVectorTy())
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty1, VTy);
+ else
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 0);
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->first << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+
+ NeedsExtraction = false;
+ for (Value::use_iterator I = S->second->use_begin(),
+ IE = S->second->use_end(); I != IE; ++I) {
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(*I))
+ continue;
+ if (PrunedTreeInstrs.count(*I))
+ continue;
+ NeedsExtraction = true;
+ break;
+ }
+
+ if (NeedsExtraction) {
+ int ESContrib;
+ if (Ty2->isVectorTy())
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ Ty2, VTy);
+ else
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::ExtractElement, VTy, 1);
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
+ *S->second << "} = " << ESContrib << "\n");
+ EffSize -= ESContrib;
+ }
+ }
+
+ // Compute the cost of incoming edges.
+ if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) {
+ Instruction *S1 = cast<Instruction>(S->first),
+ *S2 = cast<Instruction>(S->second);
+ for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
+ Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
+
+ // Combining constants into vector constants (or small vector
+ // constants into larger ones) is assumed free.
+ if (isa<Constant>(O1) && isa<Constant>(O2))
+ continue;
+
+ if (FlipOrder)
+ std::swap(O1, O2);
+
+ ValuePair VP = ValuePair(O1, O2);
+ ValuePair VPR = ValuePair(O2, O1);
+
+ // Internal edges are not handled here.
+ if (PrunedTree.count(VP) || PrunedTree.count(VPR))
+ continue;
+
+ Type *Ty1 = O1->getType(),
+ *Ty2 = O2->getType();
+ Type *VTy = getVecTypeForPair(Ty1, Ty2);
+
+ // Combining vector operations of the same type is also assumed
+ // folded with other operations.
+ if (Ty1 == Ty2) {
+ // If both are insert elements, then both can be widened.
+ InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
+ *IEO2 = dyn_cast<InsertElementInst>(O2);
+ if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
+ continue;
+ // If both are extract elements, and both have the same input
+ // type, then they can be replaced with a shuffle
+ ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
+ *EIO2 = dyn_cast<ExtractElementInst>(O2);
+ if (EIO1 && EIO2 &&
+ EIO1->getOperand(0)->getType() ==
+ EIO2->getOperand(0)->getType())
+ continue;
+ // If both are shuffles with equal operand types and only two
+ // unique operands, then they can be replaced with a single
+ // shuffle.
+ ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
+ *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
+ if (SIO1 && SIO2 &&
+ SIO1->getOperand(0)->getType() ==
+ SIO2->getOperand(0)->getType()) {
+ SmallSet<Value *, 4> SIOps;
+ SIOps.insert(SIO1->getOperand(0));
+ SIOps.insert(SIO1->getOperand(1));
+ SIOps.insert(SIO2->getOperand(0));
+ SIOps.insert(SIO2->getOperand(1));
+ if (SIOps.size() <= 2)
+ continue;
+ }
+ }
+
+ int ESContrib;
+ // This pair has already been formed.
+ if (IncomingPairs.count(VP)) {
+ continue;
+ } else if (IncomingPairs.count(VPR)) {
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, VTy);
+ } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 0);
+ ESContrib += (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, VTy, 1);
+ } else if (!Ty1->isVectorTy()) {
+ // O1 needs to be inserted into a vector of size O2, and then
+ // both need to be shuffled together.
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty2, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty2);
+ } else if (!Ty2->isVectorTy()) {
+ // O2 needs to be inserted into a vector of size O1, and then
+ // both need to be shuffled together.
+ ESContrib = (int) VTTI->getVectorInstrCost(
+ Instruction::InsertElement, Ty1, 0);
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, Ty1);
+ } else {
+ Type *TyBig = Ty1, *TySmall = Ty2;
+ if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
+ std::swap(TyBig, TySmall);
+
+ ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
+ VTy, TyBig);
+ if (TyBig != TySmall)
+ ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
+ TyBig, TySmall);
+ }
+
+ DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
+ << *O1 << " <-> " << *O2 << "} = " <<
+ ESContrib << "\n");
+ EffSize -= ESContrib;
+ IncomingPairs.insert(VP);
+ }
+ }
+ }
+
+ if (!HasNontrivialInsts) {
+ DEBUG(if (DebugPairSelection) dbgs() <<
+ "\tNo non-trivial instructions in tree;"
+ " override to zero effective size\n");
+ EffSize = 0;
+ }
+ } else {
+ for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+ E = PrunedTree.end(); S != E; ++S)
+ EffSize += (int) getDepthFactor(S->first);
+ }
DEBUG(if (DebugPairSelection)
dbgs() << "BBV: found pruned Tree for pair {"
<< *J->first << " <-> " << *J->second << "} of depth " <<
MaxDepth << " and size " << PrunedTree.size() <<
" (effective size: " << EffSize << ")\n");
- if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) {
+ if (((VTTI && !UseChainDepthWithTI) ||
+ MaxDepth >= Config.ReqChainDepth) &&
+ EffSize > 0 && EffSize > BestEffSize) {
BestMaxDepth = MaxDepth;
BestEffSize = EffSize;
BestTree = PrunedTree;
@@ -1419,8 +1984,12 @@ namespace {
// that will be fused into vector instructions.
void BBVectorize::choosePairs(
std::multimap<Value *, Value *> &CandidatePairs,
+ DenseMap<ValuePair, int> &CandidatePairCostSavings,
std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps,
DenseSet<ValuePair> &PairableInstUsers,
DenseMap<Value *, Value *>& ChosenPairs) {
bool UseCycleCheck =
@@ -1435,9 +2004,12 @@ namespace {
VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
// The best pair to choose and its tree:
- size_t BestMaxDepth = 0, BestEffSize = 0;
+ size_t BestMaxDepth = 0;
+ int BestEffSize = 0;
DenseSet<ValuePair> BestTree;
- findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+ findBestTreeFor(CandidatePairs, CandidatePairCostSavings,
+ PairableInsts, FixedOrderPairs, PairConnectionTypes,
+ ConnectedPairs, ConnectedPairDeps,
PairableInstUsers, PairableInstUserMap, ChosenPairs,
BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
UseCycleCheck);
@@ -1490,24 +2062,19 @@ namespace {
// Returns the value that is to be used as the pointer input to the vector
// instruction that fuses I with J.
Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
- Instruction *I, Instruction *J, unsigned o,
- bool FlipMemInputs) {
+ Instruction *I, Instruction *J, unsigned o) {
Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment;
+ unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
int64_t OffsetInElmts;
- // Note: the analysis might fail here, that is why FlipMemInputs has
+ // Note: the analysis might fail here, that is why the pair order has
// been precomputed (OffsetInElmts must be unused here).
(void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- OffsetInElmts);
+ IAddressSpace, JAddressSpace,
+ OffsetInElmts, false);
// The pointer value is taken to be the one with the lowest offset.
- Value *VPtr;
- if (!FlipMemInputs) {
- VPtr = IPtr;
- } else {
- VPtr = JPtr;
- }
+ Value *VPtr = IPtr;
Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType();
Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType();
@@ -1515,7 +2082,7 @@ namespace {
Type *VArgPtrType = PointerType::get(VArgType,
cast<PointerType>(IPtr->getType())->getAddressSpace());
return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
- /* insert before */ FlipMemInputs ? J : I);
+ /* insert before */ I);
}
void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
@@ -1585,23 +2152,12 @@ namespace {
Instruction *J, unsigned o, Value *&LOp,
unsigned numElemL,
Type *ArgTypeL, Type *ArgTypeH,
- unsigned IdxOff) {
+ bool IBeforeJ, unsigned IdxOff) {
bool ExpandedIEChain = false;
if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
// If we have a pure insertelement chain, then this can be rewritten
// into a chain that directly builds the larger type.
- bool PureChain = true;
- InsertElementInst *LIENext = LIE;
- do {
- if (!isa<UndefValue>(LIENext->getOperand(0)) &&
- !isa<InsertElementInst>(LIENext->getOperand(0))) {
- PureChain = false;
- break;
- }
- } while ((LIENext =
- dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
-
- if (PureChain) {
+ if (isPureIEChain(LIE)) {
SmallVector<Value *, 8> VectElemts(numElemL,
UndefValue::get(ArgTypeL->getScalarType()));
InsertElementInst *LIENext = LIE;
@@ -1619,8 +2175,9 @@ namespace {
LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
ConstantInt::get(Type::getInt32Ty(Context),
i + IdxOff),
- getReplacementName(I, true, o, i+1));
- LIENext->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, i+1));
+ LIENext->insertBefore(IBeforeJ ? J : I);
LIEPrev = LIENext;
}
@@ -1635,7 +2192,7 @@ namespace {
// Returns the value to be used as the specified operand of the vector
// instruction that fuses I with J.
Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
- Instruction *J, unsigned o, bool FlipMemInputs) {
+ Instruction *J, unsigned o, bool IBeforeJ) {
Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
@@ -1646,12 +2203,6 @@ namespace {
Instruction *L = I, *H = J;
Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
- if (FlipMemInputs) {
- L = J;
- H = I;
- ArgTypeL = ArgTypeJ;
- ArgTypeH = ArgTypeI;
- }
unsigned numElemL;
if (ArgTypeL->isVectorTy())
@@ -1804,8 +2355,9 @@ namespace {
Instruction *S =
new ShuffleVectorInst(I1, UndefValue::get(I1T),
ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- S->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o));
+ S->insertBefore(IBeforeJ ? J : I);
return S;
}
@@ -1826,8 +2378,9 @@ namespace {
Instruction *NewI1 =
new ShuffleVectorInst(I1, UndefValue::get(I1T),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
- NewI1->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ NewI1->insertBefore(IBeforeJ ? J : I);
I1 = NewI1;
I1T = I2T;
I1Elem = I2Elem;
@@ -1842,8 +2395,9 @@ namespace {
Instruction *NewI2 =
new ShuffleVectorInst(I2, UndefValue::get(I2T),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
- NewI2->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ NewI2->insertBefore(IBeforeJ ? J : I);
I2 = NewI2;
I2T = I1T;
I2Elem = I1Elem;
@@ -1863,8 +2417,8 @@ namespace {
Instruction *NewOp =
new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- NewOp->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ NewOp->insertBefore(IBeforeJ ? J : I);
return NewOp;
}
}
@@ -1872,17 +2426,17 @@ namespace {
Type *ArgType = ArgTypeL;
if (numElemL < numElemH) {
if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
- ArgTypeL, VArgType, 1)) {
+ ArgTypeL, VArgType, IBeforeJ, 1)) {
// This is another short-circuit case: we're combining a scalar into
// a vector that is formed by an IE chain. We've just expanded the IE
// chain, now insert the scalar and we're done.
Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
- getReplacementName(I, true, o));
- S->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ S->insertBefore(IBeforeJ ? J : I);
return S;
} else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
- ArgTypeH)) {
+ ArgTypeH, IBeforeJ)) {
// The two vector inputs to the shuffle must be the same length,
// so extend the smaller vector to be the same length as the larger one.
Instruction *NLOp;
@@ -1897,29 +2451,32 @@ namespace {
NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
} else {
NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
}
- NLOp->insertBefore(J);
+ NLOp->insertBefore(IBeforeJ ? J : I);
LOp = NLOp;
}
ArgType = ArgTypeH;
} else if (numElemL > numElemH) {
if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
- ArgTypeH, VArgType)) {
+ ArgTypeH, VArgType, IBeforeJ)) {
Instruction *S =
InsertElementInst::Create(LOp, HOp,
ConstantInt::get(Type::getInt32Ty(Context),
numElemL),
- getReplacementName(I, true, o));
- S->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o));
+ S->insertBefore(IBeforeJ ? J : I);
return S;
} else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
- ArgTypeL)) {
+ ArgTypeL, IBeforeJ)) {
Instruction *NHOp;
if (numElemH > 1) {
std::vector<Constant *> Mask(numElemL);
@@ -1931,13 +2488,15 @@ namespace {
NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
ConstantVector::get(Mask),
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
} else {
NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
- getReplacementName(I, true, o, 1));
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
}
- NHOp->insertBefore(J);
+ NHOp->insertBefore(IBeforeJ ? J : I);
HOp = NHOp;
}
}
@@ -1955,19 +2514,21 @@ namespace {
}
Instruction *BV = new ShuffleVectorInst(LOp, HOp,
- ConstantVector::get(Mask),
- getReplacementName(I, true, o));
- BV->insertBefore(J);
+ ConstantVector::get(Mask),
+ getReplacementName(IBeforeJ ? I : J, true, o));
+ BV->insertBefore(IBeforeJ ? J : I);
return BV;
}
Instruction *BV1 = InsertElementInst::Create(
UndefValue::get(VArgType), LOp, CV0,
- getReplacementName(I, true, o, 1));
- BV1->insertBefore(I);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 1));
+ BV1->insertBefore(IBeforeJ ? J : I);
Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
- getReplacementName(I, true, o, 2));
- BV2->insertBefore(J);
+ getReplacementName(IBeforeJ ? I : J,
+ true, o, 2));
+ BV2->insertBefore(IBeforeJ ? J : I);
return BV2;
}
@@ -1976,7 +2537,7 @@ namespace {
void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
Instruction *I, Instruction *J,
SmallVector<Value *, 3> &ReplacedOperands,
- bool FlipMemInputs) {
+ bool IBeforeJ) {
unsigned NumOperands = I->getNumOperands();
for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
@@ -1985,8 +2546,7 @@ namespace {
if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
// This is the pointer for a load/store instruction.
- ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o,
- FlipMemInputs);
+ ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
continue;
} else if (isa<CallInst>(I)) {
Function *F = cast<CallInst>(I)->getCalledFunction();
@@ -2014,8 +2574,7 @@ namespace {
continue;
}
- ReplacedOperands[o] =
- getReplacementInput(Context, I, J, o, FlipMemInputs);
+ ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
}
}
@@ -2026,8 +2585,7 @@ namespace {
void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
Instruction *J, Instruction *K,
Instruction *&InsertionPt,
- Instruction *&K1, Instruction *&K2,
- bool FlipMemInputs) {
+ Instruction *&K1, Instruction *&K2) {
if (isa<StoreInst>(I)) {
AA->replaceWithNewValue(I, K);
AA->replaceWithNewValue(J, K);
@@ -2057,13 +2615,11 @@ namespace {
}
K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(
- FlipMemInputs ? Mask2 : Mask1),
+ ConstantVector::get(Mask1),
getReplacementName(K, false, 1));
} else {
Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
- Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
- K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
+ K1 = ExtractElementInst::Create(K, CV0,
getReplacementName(K, false, 1));
}
@@ -2075,13 +2631,11 @@ namespace {
}
K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
- ConstantVector::get(
- FlipMemInputs ? Mask1 : Mask2),
+ ConstantVector::get(Mask2),
getReplacementName(K, false, 2));
} else {
- Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1);
- K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
+ K2 = ExtractElementInst::Create(K, CV1,
getReplacementName(K, false, 2));
}
@@ -2181,36 +2735,6 @@ namespace {
}
}
- // As with the aliasing information, SCEV can also change because of
- // vectorization. This information is used to compute relative pointer
- // offsets; the necessary information will be cached here prior to
- // fusion.
- void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs,
- DenseSet<Value *> &LowPtrInsts) {
- for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
- PIE = PairableInsts.end(); PI != PIE; ++PI) {
- DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
- if (P == ChosenPairs.end()) continue;
-
- Instruction *I = cast<Instruction>(P->first);
- Instruction *J = cast<Instruction>(P->second);
-
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
- continue;
-
- Value *IPtr, *JPtr;
- unsigned IAlignment, JAlignment;
- int64_t OffsetInElmts;
- if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
- OffsetInElmts) || abs64(OffsetInElmts) != 1)
- llvm_unreachable("Pre-fusion pointer analysis failed");
-
- Value *LowPI = (OffsetInElmts > 0) ? I : J;
- LowPtrInsts.insert(LowPI);
- }
- }
-
// When the first instruction in each pair is cloned, it will inherit its
// parent's metadata. This metadata must be combined with that of the other
// instruction in a safe way.
@@ -2244,27 +2768,27 @@ namespace {
// second member).
void BBVectorize::fuseChosenPairs(BasicBlock &BB,
std::vector<Value *> &PairableInsts,
- DenseMap<Value *, Value *> &ChosenPairs) {
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &FixedOrderPairs,
+ DenseMap<VPPair, unsigned> &PairConnectionTypes,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) {
LLVMContext& Context = BB.getContext();
// During the vectorization process, the order of the pairs to be fused
// could be flipped. So we'll add each pair, flipped, into the ChosenPairs
// list. After a pair is fused, the flipped pair is removed from the list.
- std::vector<ValuePair> FlippedPairs;
- FlippedPairs.reserve(ChosenPairs.size());
+ DenseSet<ValuePair> FlippedPairs;
for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
E = ChosenPairs.end(); P != E; ++P)
- FlippedPairs.push_back(ValuePair(P->second, P->first));
- for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+ FlippedPairs.insert(ValuePair(P->second, P->first));
+ for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
E = FlippedPairs.end(); P != E; ++P)
ChosenPairs.insert(*P);
std::multimap<Value *, Value *> LoadMoveSet;
collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
- DenseSet<Value *> LowPtrInsts;
- collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts);
-
DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
@@ -2304,44 +2828,92 @@ namespace {
continue;
}
- bool FlipMemInputs = false;
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end());
+ // If the pair must have the other order, then flip it.
+ bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
+ if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
+ // This pair does not have a fixed order, and so we might want to
+ // flip it if that will yield fewer shuffles. We count the number
+ // of dependencies connected via swaps, and those directly connected,
+ // and flip the order if the number of swaps is greater.
+ bool OrigOrder = true;
+ VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J));
+ if (IP.first == ConnectedPairDeps.end()) {
+ IP = ConnectedPairDeps.equal_range(ValuePair(J, I));
+ OrigOrder = false;
+ }
+ if (IP.first != ConnectedPairDeps.end()) {
+ unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ DenseMap<VPPair, unsigned>::iterator R =
+ PairConnectionTypes.find(VPPair(Q->second, Q->first));
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ ++NumDepsDirect;
+ else if (R->second == PairConnectionSwap)
+ ++NumDepsSwap;
+ }
+
+ if (!OrigOrder)
+ std::swap(NumDepsDirect, NumDepsSwap);
+
+ if (NumDepsSwap > NumDepsDirect) {
+ FlipPairOrder = true;
+ DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
+ " <-> " << *J << "\n");
+ }
+ }
+ }
+
+ Instruction *L = I, *H = J;
+ if (FlipPairOrder)
+ std::swap(H, L);
+
+ // If the pair being fused uses the opposite order from that in the pair
+ // connection map, then we need to flip the types.
+ VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L));
+ for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+ Q != IP.second; ++Q) {
+ DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q);
+ assert(R != PairConnectionTypes.end() &&
+ "Cannot find pair connection type");
+ if (R->second == PairConnectionDirect)
+ R->second = PairConnectionSwap;
+ else if (R->second == PairConnectionSwap)
+ R->second = PairConnectionDirect;
+ }
+
+ bool LBeforeH = !FlipPairOrder;
unsigned NumOperands = I->getNumOperands();
SmallVector<Value *, 3> ReplacedOperands(NumOperands);
- getReplacementInputsForPair(Context, I, J, ReplacedOperands,
- FlipMemInputs);
+ getReplacementInputsForPair(Context, L, H, ReplacedOperands,
+ LBeforeH);
// Make a copy of the original operation, change its type to the vector
// type and replace its operands with the vector operands.
- Instruction *K = I->clone();
- if (I->hasName()) K->takeName(I);
+ Instruction *K = L->clone();
+ if (L->hasName())
+ K->takeName(L);
+ else if (H->hasName())
+ K->takeName(H);
if (!isa<StoreInst>(K))
- K->mutateType(getVecTypeForPair(I->getType(), J->getType()));
+ K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
- combineMetadata(K, J);
+ combineMetadata(K, H);
+ K->intersectOptionalDataWith(H);
for (unsigned o = 0; o < NumOperands; ++o)
K->setOperand(o, ReplacedOperands[o]);
- // If we've flipped the memory inputs, make sure that we take the correct
- // alignment.
- if (FlipMemInputs) {
- if (isa<StoreInst>(K))
- cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
- else
- cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
- }
-
K->insertAfter(J);
// Instruction insertion point:
Instruction *InsertionPt = K;
Instruction *K1 = 0, *K2 = 0;
- replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
- FlipMemInputs);
+ replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
// The use tree of the first original instruction must be moved to after
// the location of the second instruction. The entire use tree of the
@@ -2351,10 +2923,10 @@ namespace {
moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
if (!isa<StoreInst>(I)) {
- I->replaceAllUsesWith(K1);
- J->replaceAllUsesWith(K2);
- AA->replaceWithNewValue(I, K1);
- AA->replaceWithNewValue(J, K2);
+ L->replaceAllUsesWith(K1);
+ H->replaceAllUsesWith(K2);
+ AA->replaceWithNewValue(L, K1);
+ AA->replaceWithNewValue(H, K2);
}
// Instructions that may read from memory may be in the load move set.
@@ -2387,6 +2959,9 @@ namespace {
SE->forgetValue(J);
I->eraseFromParent();
J->eraseFromParent();
+
+ DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
+ BB << "\n");
}
DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
@@ -2397,6 +2972,7 @@ char BBVectorize::ID = 0;
static const char bb_vectorize_name[] = "Basic-Block Vectorization";
INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
index 06cf1e4e5327..e64034ab26b4 100644
--- a/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,6 +1,7 @@
add_llvm_library(LLVMVectorize
BBVectorize.cpp
Vectorize.cpp
+ LoopVectorize.cpp
)
add_dependencies(LLVMVectorize intrinsics_gen)
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
new file mode 100644
index 000000000000..a7ef248e6e3d
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -0,0 +1,1941 @@
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizer uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has four parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. SingleBlockLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+//===----------------------------------------------------------------------===//
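+//
+// As a rough sketch (the IR below is illustrative only, not taken from this
+// pass), widening a scalar loop body such as
+//   %a = load i32* %p
+//   %s = add i32 %a, %x
+//   store i32 %s, i32* %q
+// by a vectorization factor of 4 turns it into
+//   %va = load <4 x i32>* %vp
+//   %vs = add <4 x i32> %va, %vx
+//   store <4 x i32> %vs, <4 x i32>* %vq
+// and the induction variable is then incremented by 4 instead of by 1.
+//
+//===----------------------------------------------------------------------===//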
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Value.h"
+#include "llvm/Function.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+using namespace llvm;
+
+static cl::opt<unsigned>
+VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
+ cl::desc("Set the default vectorization width. Zero is autoselect."));
+
+/// We don't vectorize loops with a known constant trip count below this number.
+const unsigned TinyTripCountThreshold = 16;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
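+// (For N pointers this amounts to roughly N*(N-1)/2 pairwise comparisons,
+// which is why the threshold above is kept small.)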
+
+namespace {
+
+// Forward declarations.
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+
+/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// SingleBlockLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The SingleBlockLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
+class SingleBlockLoopVectorizer {
+public:
+ /// Ctor.
+ SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
+ DominatorTree *dt, LPPassManager *Lpm,
+ unsigned VecWidth):
+ OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
+ Builder(Se->getContext()), Induction(0), OldInduction(0) { }
+
+ // Perform the actual loop widening (vectorization).
+ void vectorize(LoopVectorizationLegality *Legal) {
+ /// Create a new empty loop. Unlink the old loop and connect the new one.
+ createEmptyLoop(Legal);
+ /// Widen each instruction in the old loop to a new one in the new loop.
+ /// Use the Legality module to find the induction and reduction variables.
+ vectorizeLoop(Legal);
+ // Register the new loop and update the analysis passes.
+ updateAnalysis();
+ }
+
+private:
+ /// Create an empty loop, based on the loop ranges of the old loop.
+ void createEmptyLoop(LoopVectorizationLegality *Legal);
+ /// Copy and widen the instructions from the old loop.
+ void vectorizeLoop(LoopVectorizationLegality *Legal);
+ /// Insert the new loop into the loop hierarchy and pass manager
+ /// and update the analysis passes.
+ void updateAnalysis();
+
+ /// This instruction is un-vectorizable. Implement it as a sequence
+ /// of scalars.
+ void scalarizeInstruction(Instruction *Instr);
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+ /// value. If this is the induction variable then we extend it to N, N+1, ...
+ /// this is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ Value *getBroadcastInstrs(Value *V);
+
+ /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2, ...
+ /// to the consecutive elements of the vector, starting from zero.
+ Value *getConsecutiveVector(Value* Val);
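+
+ // As a hypothetical example (the names here are illustrative): with VF = 4
+ // and a broadcast induction value <i, i, i, i>, getConsecutiveVector yields
+ // <i+0, i+1, i+2, i+3>, one lane for each consecutive scalar iteration.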
+
+ /// When we go over instructions in the basic block we rely on previous
+ /// values within the current basic block or on loop invariant values.
+ /// When we widen (vectorize) values we place them in the map. If the values
+ /// are not within the map, they have to be loop invariant, so we simply
+ /// broadcast them into a vector.
+ Value *getVectorValue(Value *V);
+
+ /// Get a uniform vector of constant integers. We use this to get
+ /// vectors of ones and zeros for the reduction code.
+ Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+
+ typedef DenseMap<Value*, Value*> ValueMap;
+
+ /// The original loop.
+ Loop *OrigLoop;
+ // Scev analysis to use.
+ ScalarEvolution *SE;
+ // Loop Info.
+ LoopInfo *LI;
+ // Dominator Tree.
+ DominatorTree *DT;
+ // Loop Pass Manager.
+ LPPassManager *LPM;
+ // The vectorization factor to use.
+ unsigned VF;
+
+ // The builder that we use
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+ /// The ExitBlock of the scalar loop.
+ BasicBlock *LoopExitBlock;
+ /// The vector loop body.
+ BasicBlock *LoopVectorBody;
+ /// The scalar loop body.
+ BasicBlock *LoopScalarBody;
+ /// The first bypass block.
+ BasicBlock *LoopBypassBlock;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction;
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction;
+ // Maps scalars to widened vectors.
+ ValueMap WidenMap;
+};
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+/// will change the order of memory accesses in a way that will change the
+/// correctness of the program.
+/// * Scalars checks - The code in canVectorizeBlock checks for a number
+/// of different conditions, such as the availability of a single induction
+/// variable, that all types are supported and vectorize-able, etc.
+/// This code reflects the capabilities of SingleBlockLoopVectorizer.
+/// This class is also used by SingleBlockLoopVectorizer for identifying
+/// the induction variable and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+ LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
+ TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
+
+ /// This represents the kinds of reductions that we support.
+ enum ReductionKind {
+ NoReduction, /// Not a reduction.
+ IntegerAdd, /// Sum of numbers.
+ IntegerMult, /// Product of numbers.
+ IntegerOr, /// Bitwise or logical OR of numbers.
+ IntegerAnd, /// Bitwise or logical AND of numbers.
+ IntegerXor /// Bitwise or logical XOR of numbers.
+ };
+
+ /// This POD struct holds information about reduction variables.
+ struct ReductionDescriptor {
+ // Default C'tor
+ ReductionDescriptor():
+ StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+
+ // C'tor.
+ ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
+ StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+ // The starting value of the reduction.
+ // It does not have to be zero!
+ Value *StartValue;
+ // The instruction whose value is used outside the loop.
+ Instruction *LoopExitInstr;
+ // The kind of the reduction.
+ ReductionKind Kind;
+ };
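+
+ // As an informal example (not code from this patch): for a loop computing
+ // s += A[i], the phi node for 's' is the reduction variable, StartValue is
+ // the value of 's' entering the loop, Kind is IntegerAdd, and LoopExitInstr
+ // is the add whose result is used after the loop.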
+
+ // This POD struct holds information about the memory runtime legality
+ // check that a group of pointers do not overlap.
+ struct RuntimePointerCheck {
+ /// This flag indicates if we need to add the runtime check.
+ bool Need;
+ /// Holds the pointers that we need to check.
+ SmallVector<Value*, 2> Pointers;
+ };
+
+ /// ReductionList contains the reduction descriptors for all
+ /// of the reductions that were found in the loop.
+ typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ bool canVectorize();
+
+ /// Returns the Induction variable.
+ PHINode *getInduction() {return Induction;}
+
+ /// Returns the reduction variables found in the loop.
+ ReductionList *getReductionVars() { return &Reductions; }
+
+ /// Check if the pointer returned by this GEP is consecutive
+ /// when the index is vectorized. This happens when the last
+ /// index of the GEP is consecutive, like the induction variable.
+ /// This check allows us to vectorize A[idx] into a wide load/store.
+ bool isConsecutiveGep(Value *Ptr);
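+
+ // For instance (a hypothetical GEP, not taken from this patch):
+ //   %p = getelementptr inbounds i32* %A, i64 %indvar
+ // is consecutive when %indvar is the loop induction variable, so the
+ // accesses A[i], A[i+1], ... can become a single wide load or store.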
+
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
+ /// Returns true if this instruction will remain scalar after vectorization.
+ bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+
+ /// Returns the information that we collected about the runtime memory check.
+ RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
+private:
+ /// Check if a single basic block loop is vectorizable.
+ /// At this point we know that this is a loop with a constant trip count
+ /// and we only need to check individual instructions.
+ bool canVectorizeBlock(BasicBlock &BB);
+
+ /// When we vectorize loops we may change the order in which
+ /// we read from and write to memory. This method checks if it is
+ /// legal to vectorize the code, considering only memory constraints.
+ /// Returns true if BB is vectorizable.
+ bool canVectorizeMemory(BasicBlock &BB);
+
+ /// Returns true if 'Phi' is a reduction variable of kind 'Kind'.
+ /// If it is, the variable is added to the ReductionList.
+ bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+ /// Returns true if the instruction I can be a reduction variable of type
+ /// 'Kind'.
+ bool isReductionInstr(Instruction *I, ReductionKind Kind);
+ /// Returns true if 'Phi' is an induction variable.
+ bool isInductionVariable(PHINode *Phi);
+ /// Returns true if we can compute the address bounds of Ptr within the loop.
+ bool hasComputableBounds(Value *Ptr);
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ /// DataLayout analysis.
+ DataLayout *DL;
+
+ // --- vectorization state --- //
+
+ /// Holds the induction variable.
+ PHINode *Induction;
+ /// Holds the reduction variables.
+ ReductionList Reductions;
+ /// Allowed outside users. This holds the reduction
+ /// vars which can be accessed from outside the loop.
+ SmallPtrSet<Value*, 4> AllowedExit;
+ /// This set holds the variables which are known to be uniform after
+ /// vectorization.
+ SmallPtrSet<Instruction*, 4> Uniforms;
+ /// We need to check that all of the pointers in this list are disjoint
+ /// at runtime.
+ RuntimePointerCheck PtrRtCheck;
+};
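For orientation, the shape of loop this legality analysis is meant to accept is a single-block innermost loop with one integer induction variable and, optionally, a reduction of one of the supported kinds. The following is an illustrative C++ example only, not a test case from the patch:

    // Hypothetical candidate loop: one induction variable (i), one
    // IntegerAdd reduction (sum), and consecutive accesses a[i], b[i].
    int sum_and_scale(int *a, const int *b, int n) {
      int sum = 0;
      for (int i = 0; i < n; ++i) {
        a[i] = b[i] * 2;   // consecutive load and store
        sum += b[i];       // reduction cycle that ends in the phi for 'sum'
      }
      return sum;          // the single allowed out-of-loop user of the reduction
    }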
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable, for a number of reasons.
+/// In this class we mainly attempt to predict the expected speedup or
+/// slowdown due to the supported instruction set.
+/// We use the VectorTargetTransformInfo to query the different backends
+/// for the cost of different operations.
+class LoopVectorizationCostModel {
+public:
+ /// C'tor.
+ LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
+ LoopVectorizationLegality *Leg,
+ const VectorTargetTransformInfo *Vtti):
+ TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
+
+ /// Returns the most profitable vectorization factor for the loop that is
+ /// smaller than or equal to the VF argument. This method checks every
+ /// power of two up to VF.
+ unsigned findBestVectorizationFactor(unsigned VF = 8);
+
+private:
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the vectorization factor.
+ unsigned expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+ /// A helper function for converting Scalar types to vector types.
+ /// If the incoming type is void, we return void. If the VF is 1, we return
+ /// the scalar type.
+ static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+ /// Vector target information.
+ const VectorTargetTransformInfo *VTTI;
+};
+
+struct LoopVectorize : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ LoopVectorize() : LoopPass(ID) {
+ initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ ScalarEvolution *SE;
+ DataLayout *DL;
+ LoopInfo *LI;
+ TargetTransformInfo *TTI;
+ DominatorTree *DT;
+
+ virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+ // We only vectorize innermost loops.
+ if (!L->empty())
+ return false;
+
+ SE = &getAnalysis<ScalarEvolution>();
+ DL = getAnalysisIfAvailable<DataLayout>();
+ LI = &getAnalysis<LoopInfo>();
+ TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ DT = &getAnalysis<DominatorTree>();
+
+ DEBUG(dbgs() << "LV: Checking a loop in \"" <<
+ L->getHeader()->getParent()->getName() << "\"\n");
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationLegality LVL(L, SE, DL);
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing.\n");
+ return false;
+ }
+
+ // Select the preferred vectorization factor.
+ unsigned VF = 1;
+ if (VectorizationFactor == 0) {
+ const VectorTargetTransformInfo *VTTI = 0;
+ if (TTI)
+ VTTI = TTI->getVectorTargetTransformInfo();
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+ VF = CM.findBestVectorizationFactor();
+
+ if (VF == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ return false;
+ }
+
+ } else {
+ // Use the user command flag.
+ VF = VectorizationFactor;
+ }
+
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+ L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
+ "\n");
+
+ // If we decided that it is *legal* to vectorize the loop, then do it.
+ SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
+ LB.vectorize(&LVL);
+
+ DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ return true;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ LoopPass::getAnalysisUsage(AU);
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addRequired<DominatorTree>();
+ AU.addPreserved<LoopInfo>();
+ AU.addPreserved<DominatorTree>();
+ }
+
+};
+
+Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // Instructions that access the old induction variable
+ // actually want to get the new one.
+ if (V == OldInduction)
+ V = Induction;
+ // Create the types.
+ LLVMContext &C = V->getContext();
+ Type *VTy = VectorType::get(V->getType(), VF);
+ Type *I32 = IntegerType::getInt32Ty(C);
+ Constant *Zero = ConstantInt::get(I32, 0);
+ Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
+ Value *UndefVal = UndefValue::get(VTy);
+ // Insert the value into a new vector.
+ Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
+ "broadcast");
+ // We are accessing the induction variable. Make sure to promote the
+ // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
+ if (V == Induction)
+ return getConsecutiveVector(Shuf);
+ return Shuf;
+}
+
+Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+ assert(Val->getType()->isVectorTy() && "Must be a vector");
+ assert(Val->getType()->getScalarType()->isIntegerTy() &&
+ "Elem must be an integer");
+ // Create the types.
+ Type *ITy = Val->getType()->getScalarType();
+ VectorType *Ty = cast<VectorType>(Val->getType());
+ unsigned VLen = Ty->getNumElements();
+ SmallVector<Constant*, 8> Indices;
+
+ // Create a vector of consecutive numbers from 0 to VLen-1.
+ for (unsigned i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(ITy, i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ return Builder.CreateAdd(Val, Cv, "induction");
+}
+
+bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+ GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
+ if (!Gep)
+ return false;
+
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = Gep->getOperand(NumOperands - 1);
+
+ // Check that all of the gep indices are uniform except for the last.
+ for (unsigned i = 0; i < NumOperands - 1; ++i)
+ if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ return false;
+
+ // We can emit wide load/stores only if the last index is the induction
+ // variable.
+ const SCEV *Last = SE->getSCEV(LastIndex);
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+
+ // The memory is consecutive because the last index is consecutive
+ // and all other indices are loop invariant.
+ if (Step->isOne())
+ return true;
+ }
+
+ return false;
+}
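As a sketch of what isConsecutiveGep distinguishes (illustrative code, not from the patch): an access whose last GEP index follows the induction variable can become a wide load/store, while an indirect index cannot.

    void consecutive(int *a, const int *b, int n) {
      for (int i = 0; i < n; ++i)
        a[i] = b[i] + 1;   // last GEP index is the induction: wide load/store
    }

    void indirect(int *a, const int *idx, int n) {
      for (int i = 0; i < n; ++i)
        a[idx[i]] = 0;     // last index is idx[i]: not consecutive, scalarized
    }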
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
+Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+ assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+ // If we saved a vectorized copy of V, use it.
+ Value *&MapEntry = WidenMap[V];
+ if (MapEntry)
+ return MapEntry;
+
+ // Broadcast V and save the value for future uses.
+ Value *B = getBroadcastInstrs(V);
+ MapEntry = B;
+ return B;
+}
+
+Constant*
+SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+ SmallVector<Constant*, 8> Indices;
+ // Create VF copies of the scalar value Val.
+ for (unsigned i = 0; i < VF; ++i)
+ Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
+
+ // Return the uniform constant vector.
+ return ConstantVector::get(Indices);
+}
+
+void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+ // Holds vector parameters or scalars, in case of uniform vals.
+ SmallVector<Value*, 8> Params;
+
+ // Find all of the vectorized parameters.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *SrcOp = Instr->getOperand(op);
+
+ // If we are accessing the old induction variable, use the new one.
+ if (SrcOp == OldInduction) {
+ Params.push_back(getBroadcastInstrs(Induction));
+ continue;
+ }
+
+ // Try using previously calculated values.
+ Instruction *SrcInst = dyn_cast<Instruction>(SrcOp);
+
+ // If the src is an instruction that appeared earlier in the basic block
+ // then it should already be vectorized.
+ if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
+ assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
+ // The parameter is a vector value from earlier.
+ Params.push_back(WidenMap[SrcInst]);
+ } else {
+ // The parameter is a scalar from outside the loop. Maybe even a constant.
+ Params.push_back(SrcOp);
+ }
+ }
+
+ assert(Params.size() == Instr->getNumOperands() &&
+ "Invalid number of operands");
+
+ // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+ Value *VecResults = 0;
+
+ // If we have a return value, create an empty vector. We place the scalarized
+ // instructions in this vector.
+ if (!IsVoidRetTy)
+ VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
+
+ // For each scalar that we create:
+ for (unsigned i = 0; i < VF; ++i) {
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instructions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op];
+ // Param is a vector. Need to extract the right lane.
+ if (Op->getType()->isVectorTy())
+ Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
+ Cloned->setOperand(op, Op);
+ }
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
+ // If the original scalar returns a value we need to place it in a vector
+ // so that future users will be able to use it.
+ if (!IsVoidRetTy)
+ VecResults = Builder.CreateInsertElement(VecResults, Cloned,
+ Builder.getInt32(i));
+ }
+
+ if (!IsVoidRetTy)
+ WidenMap[Instr] = VecResults;
+}
+
+void
+SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+ /*
+ In this function we generate a new loop. The new loop will contain
+ the vectorized instructions while the old loop will continue to run the
+ scalar remainder.
+
+ [ ] <-- vector loop bypass.
+ / |
+ / v
+| [ ] <-- vector pre header.
+| |
+| v
+| [ ] \
+| [ ]_| <-- vector loop.
+| |
+ \ v
+ >[ ] <--- middle-block.
+ / |
+ / v
+| [ ] <--- new preheader.
+| |
+| v
+| [ ] \
+| [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
+ >[ ] <-- exit block.
+ ...
+ */
+
+ OldInduction = Legal->getInduction();
+ assert(OldInduction && "We must have a single phi node.");
+ Type *IdxTy = OldInduction->getType();
+
+ // Find the loop boundaries.
+ const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+ // Get the total trip count from the count by adding 1.
+ ExitCount = SE->getAddExpr(ExitCount,
+ SE->getConstant(ExitCount->getType(), 1));
+ // We may need to extend the index in case there is a type mismatch.
+ // We know that the count starts at zero and does not overflow.
+ // We are using Zext because it should be less expensive.
+ if (ExitCount->getType() != IdxTy)
+ ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
+ // This is the original scalar-loop preheader.
+ BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+ BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+ assert(ExitBlock && "Must have an exit block");
+
+ // The loop index does not have to start at Zero. It starts with this value.
+ Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+
+ assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
+ assert(BypassBlock && "Invalid loop structure");
+
+ BasicBlock *VectorPH =
+ BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+ BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
+ "vector.body");
+
+ BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
+ "middle.block");
+ BasicBlock *ScalarPH =
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+ "scalar.preheader");
+ // Keep a pointer to the old scalar loop header.
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+
+ // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
+ // inside the loop.
+ Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+ // Generate the induction variable.
+ Induction = Builder.CreatePHI(IdxTy, 2, "index");
+ Constant *Step = ConstantInt::get(IdxTy, VF);
+
+ // Expand the trip count and place the new instructions in the bypass block
+ // (the old preheader). The vector loop body itself is not affected.
+ SCEVExpander Exp(*SE, "induction");
+ Instruction *Loc = BypassBlock->getTerminator();
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+
+ // Add the start index to the loop count to get the new end index.
+ Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
+
+ // Now we need to generate the expression for N - (N % VF), which is
+ // the part that the vectorized body will execute.
+ Constant *CIVF = ConstantInt::get(IdxTy, VF);
+ Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
+ Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+ Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
+ "end.idx.rnd.down", Loc);
+
+ // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+ Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
+ IdxEndRoundDown,
+ StartIdx,
+ "cmp.zero", Loc);
+
+ LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+ Legal->getRuntimePointerCheck();
+ Value *MemoryRuntimeCheck = 0;
+ if (PtrRtCheck->Need) {
+ unsigned NumPointers = PtrRtCheck->Pointers.size();
+ SmallVector<Value* , 2> Starts;
+ SmallVector<Value* , 2> Ends;
+
+ // Use this type for pointer arithmetic.
+ Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+ for (unsigned i=0; i < NumPointers; ++i) {
+ Value *Ptr = PtrRtCheck->Pointers[i];
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, OrigLoop)) {
+ DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+ *Ptr <<"\n");
+ Starts.push_back(Ptr);
+ Ends.push_back(Ptr);
+ } else {
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
+ const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
+ Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
+ Starts.push_back(Start);
+ Ends.push_back(End);
+ }
+ }
+
+ for (unsigned i=0; i < NumPointers; ++i) {
+ for (unsigned j=i+1; j < NumPointers; ++j) {
+ Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[i], Ends[j], "bound0", Loc);
+ Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[j], Ends[i], "bound1", Loc);
+ Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+ "found.conflict", Loc);
+ if (MemoryRuntimeCheck) {
+ MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+ MemoryRuntimeCheck,
+ IsConflict,
+ "conflict.rdx", Loc);
+ } else {
+ MemoryRuntimeCheck = IsConflict;
+ }
+ }
+ }
+ }// end of need-runtime-check code.
+
+ // If we are using memory runtime checks, fold them into the branch condition.
+ if (MemoryRuntimeCheck) {
+ Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+ "CntOrMem", Loc);
+ }
+
+ BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
+ // Remove the old terminator.
+ Loc->eraseFromParent();
+
+ // We are going to resume the execution of the scalar loop.
+ // This PHI decides on what number to start. If we come from the
+ // vector loop then we need to start with the end index minus the
+ // index modulo VF. If we come from a bypass edge then we need to start
+ // from the real start.
+ PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
+ MiddleBlock->getTerminator());
+ ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+ ResumeIndex, "cmp.n",
+ MiddleBlock->getTerminator());
+
+ BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
+ // Remove the old terminator.
+ MiddleBlock->getTerminator()->eraseFromParent();
+
+ // Create i+1 and fill the PHINode.
+ Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(StartIdx, VectorPH);
+ Induction->addIncoming(NextIdx, VecBody);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
+ Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
+
+ // Now we have two terminators. Remove the old one from the block.
+ VecBody->getTerminator()->eraseFromParent();
+
+ // Fix the scalar body iteration count.
+ unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+ OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
+
+ // Get ready to start creating new instructions into the vectorized body.
+ Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
+
+ // Register the new loop.
+ Loop* Lp = new Loop();
+ LPM->insertLoop(Lp, OrigLoop->getParentLoop());
+
+ Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+ if (ParentLoop) {
+ ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
+ ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
+ ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+ }
+
+ // Save the state.
+ LoopVectorPreHeader = VectorPH;
+ LoopScalarPreHeader = ScalarPH;
+ LoopMiddleBlock = MiddleBlock;
+ LoopExitBlock = ExitBlock;
+ LoopVectorBody = VecBody;
+ LoopScalarBody = OldBasicBlock;
+ LoopBypassBlock = BypassBlock;
+}
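The index arithmetic that createEmptyLoop emits into the bypass block reduces to plain integer math. The sketch below (hypothetical names, not part of the patch) mirrors the end.idx, n.mod.vf, n.vec and end.idx.rnd.down values for a trip count N, start index StartIdx and vectorization factor VF:

    #include <cstdint>

    struct LoopBounds {
      uint64_t IdxEnd;          // end.idx          = StartIdx + N
      uint64_t CountRoundDown;  // n.vec            = N - N % VF
      uint64_t IdxEndRoundDown; // end.idx.rnd.down = StartIdx + (N - N % VF)
    };

    LoopBounds computeBounds(uint64_t N, uint64_t StartIdx, uint64_t VF) {
      LoopBounds B;
      B.IdxEnd = StartIdx + N;
      B.CountRoundDown = N - (N % VF);          // N % VF is the URem (n.mod.vf)
      B.IdxEndRoundDown = StartIdx + B.CountRoundDown;
      // If IdxEndRoundDown == StartIdx the vector loop is bypassed entirely.
      return B;
    }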
+
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
+static unsigned
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+ switch (K) {
+ case LoopVectorizationLegality::IntegerXor:
+ case LoopVectorizationLegality::IntegerAdd:
+ case LoopVectorizationLegality::IntegerOr:
+ // Adding, Xoring, Oring zero to a number does not change it.
+ return 0;
+ case LoopVectorizationLegality::IntegerMult:
+ // Multiplying a number by 1 does not change it.
+ return 1;
+ case LoopVectorizationLegality::IntegerAnd:
+ // AND-ing a number with an all-1 value does not change it.
+ return -1;
+ default:
+ llvm_unreachable("Unknown reduction kind");
+ }
+}
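vectorizeLoop (below) seeds the vector phi with this identity in every lane except lane 0, which receives the incoming scalar start value. A scalar sketch of why that is sound for the IntegerAdd case with a hypothetical VF of 4 (illustrative only):

    // The start vector is <StartValue, 0, 0, 0>; a horizontal reduction of it
    // yields StartValue unchanged, so the identity lanes do not perturb the sum.
    int reduceStartVector(int StartValue) {
      int Lanes[4] = {StartValue, 0, 0, 0};  // identity of IntegerAdd is 0
      int Acc = Lanes[0];
      for (int i = 1; i < 4; ++i)
        Acc += Lanes[i];
      return Acc;                            // == StartValue
    }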
+
+void
+SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+ typedef SmallVector<PHINode*, 4> PhiVector;
+ BasicBlock &BB = *OrigLoop->getHeader();
+ Constant *Zero = ConstantInt::get(
+ IntegerType::getInt32Ty(BB.getContext()), 0);
+
+ // In order to support reduction variables we need to be able to vectorize
+ // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
+ // stages. First, we create a new vector PHI node with no incoming edges.
+ // We use this value when we vectorize all of the instructions that use the
+ // PHI. Next, after all of the instructions in the block are complete we
+ // add the new incoming edges to the PHI. At this point all of the
+ // instructions in the basic block are vectorized, so we can use them to
+ // construct the PHI.
+ PhiVector PHIsToFix;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+ Instruction *Inst = it;
+
+ switch (Inst->getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ continue;
+ case Instruction::PHI:{
+ PHINode* P = cast<PHINode>(Inst);
+ // Special handling for the induction var.
+ if (OldInduction == Inst)
+ continue;
+ // This is phase one of vectorizing PHIs.
+ // This has to be a reduction variable.
+ assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
+ Type *VecTy = VectorType::get(Inst->getType(), VF);
+ WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
+ PHIsToFix.push_back(P);
+ continue;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+ Value *A = getVectorValue(Inst->getOperand(0));
+ Value *B = getVectorValue(Inst->getOperand(1));
+
+ // Use this vector value for all users of the original instruction.
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+ WidenMap[Inst] = V;
+
+ // Update the NSW, NUW and Exact flags.
+ BinaryOperator *VecOp = cast<BinaryOperator>(V);
+ if (isa<OverflowingBinaryOperator>(BinOp)) {
+ VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+ VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+ }
+ if (isa<PossiblyExactOperator>(VecOp))
+ VecOp->setIsExact(BinOp->isExact());
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ Value *Cond = Inst->getOperand(0);
+ bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ Cond = getVectorValue(Cond);
+ if (InvariantCond)
+ Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+ Value *Op0 = getVectorValue(Inst->getOperand(1));
+ Value *Op1 = getVectorValue(Inst->getOperand(2));
+ WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
+ CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
+ Value *A = getVectorValue(Inst->getOperand(0));
+ Value *B = getVectorValue(Inst->getOperand(1));
+ if (FCmp)
+ WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ else
+ WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ break;
+ }
+
+ case Instruction::Store: {
+ // Attempt to issue a wide store.
+ StoreInst *SI = dyn_cast<StoreInst>(Inst);
+ Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+ Value *Ptr = SI->getPointerOperand();
+ unsigned Alignment = SI->getAlignment();
+
+ assert(!Legal->isUniform(Ptr) &&
+ "We do not allow storing to uniform addresses");
+
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+ // If the pointer is not a consecutive GEP, scalarize this store.
+ if (!Legal->isConsecutiveGep(Gep)) {
+ scalarizeInstruction(Inst);
+ break;
+ }
+
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Ptr = Builder.Insert(Gep2);
+ Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+ Value *Val = getVectorValue(SI->getValueOperand());
+ Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+ break;
+ }
+ case Instruction::Load: {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Inst);
+ Type *RetTy = VectorType::get(LI->getType(), VF);
+ Value *Ptr = LI->getPointerOperand();
+ unsigned Alignment = LI->getAlignment();
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+ // If we don't have a GEP, if the pointer is loop invariant, or if the
+ // access is not consecutive, scalarize the load.
+ if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
+ scalarizeInstruction(Inst);
+ break;
+ }
+
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Ptr = Builder.Insert(Gep2);
+ Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+ LI = Builder.CreateLoad(Ptr);
+ LI->setAlignment(Alignment);
+ // Use this vector value for all users of the load.
+ WidenMap[Inst] = LI;
+ break;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ // Vectorize casts.
+ CastInst *CI = dyn_cast<CastInst>(Inst);
+ Value *A = getVectorValue(Inst->getOperand(0));
+ Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+ WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(Inst);
+ break;
+ }// end of switch.
+ }// end of for_each instr.
+
+ // At this point every instruction in the original loop is widened to
+ // a vector form. We are almost done. Now, we need to fix the PHI nodes
+ // that we vectorized. The PHI nodes are currently empty because we did
+ // not want to introduce cycles. Notice that the remaining PHI nodes
+ // that we need to fix are reduction variables.
+
+ // Create the 'reduced' values for each of the reduction vars.
+ // The reduced values are the vector values that we scalarize and combine
+ // after the loop is finished.
+ for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+ it != e; ++it) {
+ PHINode *RdxPhi = *it;
+ PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+ assert(VecRdxPhi && "Unable to recover vectorized PHI");
+
+ // Find the reduction variable descriptor.
+ assert(Legal->getReductionVars()->count(RdxPhi) &&
+ "Unable to find the reduction variable");
+ LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+ (*Legal->getReductionVars())[RdxPhi];
+
+ // We need to generate a reduction vector from the incoming scalar.
+ // To do so, we need to generate the 'identity' vector and override
+ // one of the elements with the incoming scalar reduction. We need
+ // to do it in the vector-loop preheader.
+ Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
+ // This is the vector-clone of the value that leaves the loop.
+ Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+ Type *VecTy = VectorExit->getType();
+
+ // Find the reduction identity value: zero for addition, or and xor;
+ // one for multiplication; -1 for and.
+ Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
+ VecTy->getScalarType());
+
+ // This vector is the Identity vector where the first element is the
+ // incoming scalar reduction.
+ Value *VectorStart = Builder.CreateInsertElement(Identity,
+ RdxDesc.StartValue, Zero);
+
+
+ // Fix the vector-loop phi.
+ // We created the induction variable so we know that the
+ // preheader is the first entry.
+ BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ VecRdxPhi->addIncoming(VectorStart, VecPreheader);
+ unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+ Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+ VecRdxPhi->addIncoming(Val, LoopVectorBody);
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+ // This PHINode contains the vectorized reduction variable, or
+ // the initial value vector, if we bypass the vector loop.
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+ NewPhi->addIncoming(VectorStart, LoopBypassBlock);
+ NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
+
+ // Extract the first scalar.
+ Value *Scalar0 =
+ Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
+ // Extract and reduce the remaining vector elements.
+ for (unsigned i=1; i < VF; ++i) {
+ Value *Scalar1 =
+ Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+ switch (RdxDesc.Kind) {
+ case LoopVectorizationLegality::IntegerAdd:
+ Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerMult:
+ Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerOr:
+ Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerAnd:
+ Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerXor:
+ Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
+ break;
+ default:
+ llvm_unreachable("Unknown reduction operation");
+ }
+ }
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+ // We know that the loop is in LCSSA form. We need to update the
+ // PHI nodes in the exit blocks.
+ for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+ LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+ PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+ if (!LCSSAPhi) continue;
+
+ // All PHINodes need to have a single entry edge, or two if
+ // we already fixed them.
+ assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+ // We found our reduction value exit-PHI. Update it with the
+ // incoming bypass edge.
+ if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
+ // Add an edge coming from the bypass.
+ LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+ break;
+ }
+ }// end of the LCSSA phi scan.
+
+ // Fix the scalar loop reduction variable with the incoming reduction sum
+ // from the vector body and from the backedge value.
+ int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+ (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+ (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
+ }// end of for each redux variable.
+}
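The tail of vectorizeLoop reduces the VF lanes of the exit phi with a chain of extractelement plus scalar reduction ops. In scalar terms the middle block computes something like the following sketch (IntegerAdd case, hypothetical VF; illustrative only):

    int horizontalAdd(const int *Lanes, unsigned VF) {
      int Scalar0 = Lanes[0];              // extract lane 0
      for (unsigned i = 1; i < VF; ++i)
        Scalar0 += Lanes[i];               // extract lane i and accumulate
      return Scalar0;                      // feeds the LCSSA phi and the scalar loop
    }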
+
+void SingleBlockLoopVectorizer::updateAnalysis() {
+ // Forget the original loop's cached analysis results.
+ SE->forgetLoop(OrigLoop);
+
+ // Update the dominator tree information.
+ assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+ "Entry does not dominate exit.");
+
+ DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+ DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+ DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+ DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+ DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+ DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+ DEBUG(DT->verifyAnalysis());
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+ if (!TheLoop->getLoopPreheader()) {
+ assert(false && "No preheader!!");
+ DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
+ return false;
+ }
+
+ // We can only vectorize single basic block loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1) {
+ DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+ return false;
+ }
+
+ // We need to have a loop header.
+ BasicBlock *BB = TheLoop->getHeader();
+ DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+
+ // ScalarEvolution needs to be able to find the exit count.
+ const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+ if (ExitCount == SE->getCouldNotCompute()) {
+ DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
+ return false;
+ }
+
+ // Do not loop-vectorize loops with a tiny trip count.
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+ if (TC > 0u && TC < TinyTripCountThreshold) {
+ DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+ "This loop is not worth vectorizing.\n");
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeBlock(*BB)) {
+ DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+ (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+ <<"!\n");
+
+ // Okay! We can vectorize. At this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return true;
+}
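A couple of illustrative loops that the checks above reject (hypothetical examples, not tests from the patch): extra control flow in the body, or a tiny constant trip count.

    void rejected(int *a, int n) {
      // More than one basic block and more than one exit: the early break
      // splits the loop body into several blocks.
      for (int i = 0; i < n; ++i) {
        if (a[i] < 0)
          break;
        a[i] = 0;
      }

      // Tiny constant trip count (assuming 4 is below TinyTripCountThreshold).
      for (int i = 0; i < 4; ++i)
        a[i] = i;
    }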
+
+bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+ // Scan the instructions in the block and look for hazards.
+ for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+ Instruction *I = it;
+
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (Phi) {
+ // This should not happen because the loop should be normalized.
+ if (Phi->getNumIncomingValues() != 2) {
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+ return false;
+ }
+ // We only look at integer phi nodes.
+ if (!Phi->getType()->isIntegerTy()) {
+ DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+ return false;
+ }
+
+ if (isInductionVariable(Phi)) {
+ if (Induction) {
+ DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+ return false;
+ }
+ DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+ Induction = Phi;
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerAdd)) {
+ DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerMult)) {
+ DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerOr)) {
+ DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerAnd)) {
+ DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerXor)) {
+ DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+
+ DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+ return false;
+ }// end of PHI handling
+
+ // We don't handle function calls yet.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ if (CI) {
+ DEBUG(dbgs() << "LV: Found a call site.\n");
+ return false;
+ }
+
+ // We do not re-vectorize vectors.
+ if (!VectorType::isValidElementType(I->getType()) &&
+ !I->getType()->isVoidTy()) {
+ DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+ return false;
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (!AllowedExit.count(I))
+ // Check that all of the users of the instruction are inside the BB.
+ for (Value::use_iterator it = I->use_begin(), e = I->use_end();
+ it != e; ++it) {
+ Instruction *U = cast<Instruction>(*it);
+ // This user may be a reduction exit value.
+ BasicBlock *Parent = U->getParent();
+ if (Parent != &BB) {
+ DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ return false;
+ }
+ }
+ } // next instr.
+
+ if (!Induction) {
+ DEBUG(dbgs() << "LV: Did not find an induction var.\n");
+ return false;
+ }
+
+ // Don't vectorize if the memory dependencies do not allow vectorization.
+ if (!canVectorizeMemory(BB))
+ return false;
+
+ // We now know that the loop is vectorizable!
+ // Collect variables that will remain uniform after vectorization.
+ std::vector<Value*> Worklist;
+
+ // Start with the conditional branch and walk up the block.
+ Worklist.push_back(BB.getTerminator()->getOperand(0));
+
+ while (Worklist.size()) {
+ Instruction *I = dyn_cast<Instruction>(Worklist.back());
+ Worklist.pop_back();
+ // Look at instructions inside this block.
+ if (!I) continue;
+ if (I->getParent() != &BB) continue;
+
+ // Stop when reaching PHI nodes.
+ if (isa<PHINode>(I)) {
+ assert(I == Induction && "Found a uniform PHI that is not the induction");
+ break;
+ }
+
+ // This is a known uniform.
+ Uniforms.insert(I);
+
+ // Insert all operands.
+ for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+ Worklist.push_back(I->getOperand(i));
+ }
+ }
+
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+ typedef SmallVector<Value*, 16> ValueVector;
+ typedef SmallPtrSet<Value*, 16> ValueSet;
+ // Holds the Load and Store *instructions*.
+ ValueVector Loads;
+ ValueVector Stores;
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
+
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
+ Instruction *I = it;
+
+ // If this is a load, save it. If this instruction can read from memory
+ // but is not a load, then we quit. Notice that we don't handle function
+ // calls that read or write.
+ if (I->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ if (!Ld) return false;
+ if (!Ld->isSimple()) {
+ DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+ return false;
+ }
+ Loads.push_back(Ld);
+ continue;
+ }
+
+ // Save store instructions. Abort if other instructions write to memory.
+ if (I->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St) return false;
+ if (!St->isSimple()) {
+ DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+ return false;
+ }
+ Stores.push_back(St);
+ }
+ } // next instr.
+
+ // Now we have two lists that hold the loads and the stores.
+ // Next, we find the pointers that they use.
+
+ // Check if we see any stores. If there are no stores, then we don't
+ // care if the pointers are *restrict*.
+ if (!Stores.size()) {
+ DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+ return true;
+ }
+
+ // Holds the read and read-write *pointers* that we find.
+ ValueVector Reads;
+ ValueVector ReadWrites;
+
+ // Holds the analyzed pointers. We don't want to call GetUnderlyingObjects
+ // multiple times on the same object. If the ptr is accessed twice, once
+ // for read and once for write, it will only appear once (on the write
+ // list). This is okay, since we are going to check for conflicts between
+ // writes and between reads and writes, but not between reads and reads.
+ ValueSet Seen;
+
+ ValueVector::iterator I, IE;
+ for (I = Stores.begin(), IE = Stores.end(); I != IE; ++I) {
+ StoreInst *ST = dyn_cast<StoreInst>(*I);
+ assert(ST && "Bad StoreInst");
+ Value* Ptr = ST->getPointerOperand();
+
+ if (isUniform(Ptr)) {
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
+ // If we did *not* see this pointer before, insert it into
+ // the read-write list. At this phase it is only a 'write' list.
+ if (Seen.insert(Ptr))
+ ReadWrites.push_back(Ptr);
+ }
+
+ for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
+ LoadInst *LD = dyn_cast<LoadInst>(*I);
+ assert(LD && "Bad LoadInst");
+ Value* Ptr = LD->getPointerOperand();
+ // If we did *not* see this pointer before, insert it into the
+ // read list. If we *did* see it before, then it is already in
+ // the read-write list. This allows us to vectorize expressions
+ // such as A[i] += x, because the address of A[i] is a read-write
+ // pointer. This only works if the index of A[i] is consecutive.
+ // If the index is unknown (for example A[B[i]]) then we may
+ // read a few words, modify, and write a few words, and some of the
+ // words may be written to the same address.
+ if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+ Reads.push_back(Ptr);
+ }
+
+ // If we write (or read-write) to a single destination and there are no
+ // other reads in this loop then it is safe to vectorize.
+ if (ReadWrites.size() == 1 && Reads.size() == 0) {
+ DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+ return true;
+ }
+
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ bool RT = true;
+ for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.Pointers.push_back(*I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+ for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.Pointers.push_back(*I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+
+ // Check that we did not collect too many pointers or find an
+ // unsizeable pointer.
+ if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ PtrRtCheck.Pointers.clear();
+ RT = false;
+ }
+
+ PtrRtCheck.Need = RT;
+
+ if (RT) {
+ DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+ }
+
+ // Now that the pointers are in two lists (Reads and ReadWrites), we
+ // can check that there are no conflicts between each of the writes, and
+ // between the writes and the reads.
+ ValueSet WriteObjects;
+ ValueVector TempObjects;
+
+ // Check that the read-writes do not conflict with other read-write
+ // pointers.
+ for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) {
+ GetUnderlyingObjects(*I, TempObjects, DL);
+ for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+ it != e; ++it) {
+ if (!isIdentifiedObject(*it)) {
+ DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
+ return RT;
+ }
+ if (!WriteObjects.insert(*it)) {
+ DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
+ << **it <<"\n");
+ return RT;
+ }
+ }
+ TempObjects.clear();
+ }
+
+ // Check that the reads don't conflict with the read-writes.
+ for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) {
+ GetUnderlyingObjects(*I, TempObjects, DL);
+ for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
+ it != e; ++it) {
+ if (!isIdentifiedObject(*it)) {
+ DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
+ return RT;
+ }
+ if (WriteObjects.count(*it)) {
+ DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
+ << **it <<"\n");
+ return RT;
+ }
+ }
+ TempObjects.clear();
+ }
+
+ // It is safe to vectorize and we don't need any runtime checks.
+ DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
+ return true;
+}
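When PtrRtCheck.Need ends up true, createEmptyLoop (above) emits pairwise interval tests between the expanded pointer ranges. The emitted predicate is the classic interval-overlap check sketched below (illustrative names, not part of the patch):

    #include <cstdint>

    // Two pointer ranges [StartA, EndA] and [StartB, EndB] may conflict iff
    // each start is not past the other range's end; the generated IR ORs
    // these pairwise tests into a single 'found.conflict' flag.
    bool mayConflict(uintptr_t StartA, uintptr_t EndA,
                     uintptr_t StartB, uintptr_t EndB) {
      return StartA <= EndB && StartB <= EndA;  // "bound0" AND "bound1"
    }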
+
+bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
+ ReductionKind Kind) {
+ if (Phi->getNumIncomingValues() != 2)
+ return false;
+
+ // Find the possible incoming reduction variable.
+ BasicBlock *BB = Phi->getParent();
+ int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
+ int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
+ Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+
+ // ExitInstruction is the single value which is used outside the loop.
+ // We only allow for a single reduction value to be used outside the loop.
+ // This includes users of the reduction variables (which form a cycle
+ // that ends in the phi node).
+ Instruction *ExitInstruction = 0;
+
+ // Iter is our iterator. We start with the PHI node and scan for all of the
+ // users of this instruction. All users must be instructions which can be
+ // used as reduction variables (such as ADD). We may have a single
+ // out-of-block user. The cycle must end with the original PHI.
+ // Also, we can't have multiple block-local users.
+ Instruction *Iter = Phi;
+ while (true) {
+ // Any reduction instr must be of one of the allowed kinds.
+ if (!isReductionInstr(Iter, Kind))
+ return false;
+
+ // Did we find a user inside this block?
+ bool FoundInBlockUser = false;
+ // Did we reach the initial PHI node?
+ bool FoundStartPHI = false;
+
+ // If the instruction has no users then this is a broken
+ // chain and can't be a reduction variable.
+ if (Iter->use_empty())
+ return false;
+
+ // For each of the *users* of iter.
+ for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
+ it != e; ++it) {
+ Instruction *U = cast<Instruction>(*it);
+ // We already know that the PHI is a user.
+ if (U == Phi) {
+ FoundStartPHI = true;
+ continue;
+ }
+ // Check if we found the exit user.
+ BasicBlock *Parent = U->getParent();
+ if (Parent != BB) {
+ // We must have a single exit instruction.
+ if (ExitInstruction != 0)
+ return false;
+ ExitInstruction = Iter;
+ }
+ // We can't have multiple inside users.
+ if (FoundInBlockUser)
+ return false;
+ FoundInBlockUser = true;
+ Iter = U;
+ }
+
+ // We found a reduction var if we have reached the original
+ // phi node and we only have a single instruction with out-of-loop
+ // users.
+ if (FoundStartPHI && ExitInstruction) {
+ // This instruction is allowed to have out-of-loop users.
+ AllowedExit.insert(ExitInstruction);
+
+ // Save the description of this reduction variable.
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+ Reductions[Phi] = RD;
+ return true;
+ }
+ }
+}
+
+bool
+LoopVectorizationLegality::isReductionInstr(Instruction *I,
+ ReductionKind Kind) {
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::PHI:
+ // possibly.
+ return true;
+ case Instruction::Add:
+ case Instruction::Sub:
+ return Kind == IntegerAdd;
+ case Instruction::Mul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ return Kind == IntegerMult;
+ case Instruction::And:
+ return Kind == IntegerAnd;
+ case Instruction::Or:
+ return Kind == IntegerOr;
+ case Instruction::Xor:
+ return Kind == IntegerXor;
+ }
+}
+
+bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+ // Check that the PHI is a recurrence with a step of one (consecutive).
+ const SCEV *PhiScev = SE->getSCEV(Phi);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+ if (!AR) {
+ DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
+ return false;
+ }
+ const SCEV *Step = AR->getStepRecurrence(*SE);
+
+ if (!Step->isOne()) {
+ DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
+ return false;
+ }
+ return true;
+}
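Concretely, the stride-one requirement accepts the canonical counted loop and rejects strided counters; an illustrative pair (not from the patch):

    void unit_stride(int *a, int n) {
      for (int i = 0; i < n; ++i)     // SCEV {0,+,1}: accepted as the induction
        a[i] = i;
    }

    void stride_two(int *a, int n) {
      for (int i = 0; i < n; i += 2)  // SCEV {0,+,2}: step is not one, rejected
        a[i] = i;
    }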
+
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+ const SCEV *PhiScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+ if (!AR)
+ return false;
+
+ return AR->isAffine();
+}
+
+unsigned
+LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
+ if (!VTTI) {
+ DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
+ return 1;
+ }
+
+ float Cost = expectedCost(1);
+ unsigned Width = 1;
+ DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+ for (unsigned i=2; i <= VF; i*=2) {
+ // Notice that the vector loop needs to be executed fewer times, so
+ // we need to divide the cost of the vector loop by the width of
+ // the vector elements.
+ float VectorCost = expectedCost(i) / (float)i;
+ DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+ (int)VectorCost << ".\n");
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+ return Width;
+}
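The selection loop above can be paraphrased with any cost oracle; a small stand-alone sketch, where costFn is a stand-in for expectedCost() and is not part of the VTTI interface:

    #include <functional>

    // Pick the power-of-two width up to MaxVF with the lowest per-lane cost.
    unsigned pickWidth(unsigned MaxVF, std::function<float(unsigned)> costFn) {
      float Best = costFn(1);
      unsigned Width = 1;
      for (unsigned W = 2; W <= MaxVF; W *= 2) {
        float PerLane = costFn(W) / W;  // the vector loop runs W times fewer iterations
        if (PerLane < Best) {
          Best = PerLane;
          Width = W;
        }
      }
      return Width;
    }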
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ // We can only estimate the cost of single basic block loops.
+ assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
+
+ BasicBlock *BB = TheLoop->getHeader();
+ unsigned Cost = 0;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ Instruction *Inst = it;
+ unsigned C = getInstructionCost(Inst, VF);
+ Cost += C;
+ DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
+ " For instruction: "<< *Inst << "\n");
+ }
+
+ return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+ assert(VTTI && "Invalid vector target transformation info");
+
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (Legal->isUniformAfterVectorization(I))
+ VF = 1;
+
+ Type *RetTy = I->getType();
+ Type *VectorTy = ToVectorTy(RetTy, VF);
+
+
+ // TODO: We need to estimate the cost of intrinsic calls.
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because scalar GEPs are usually
+ // lowered to the instruction addressing mode. At the moment we don't
+ // generate vector geps.
+ return 0;
+ case Instruction::Br: {
+ return VTTI->getCFInstrCost(I->getOpcode());
+ }
+ case Instruction::PHI:
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ VectorTy = ToVectorTy(ValTy, VF);
+ return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ Type *ValTy = SI->getValueOperand()->getType();
+ VectorTy = ToVectorTy(ValTy, VF);
+
+ if (VF == 1)
+ return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+ SI->getAlignment(), SI->getPointerAddressSpace());
+
+ // Scalarized stores.
+ if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+ unsigned Cost = 0;
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+ ValTy);
+ // The cost of extracting from the value vector.
+ Cost += VF * (ExtCost);
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+ ValTy->getScalarType(),
+ SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ return Cost;
+ }
+
+ // Wide stores.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ }
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(I);
+
+ if (VF == 1)
+ return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+ LI->getAlignment(),
+ LI->getPointerAddressSpace());
+
+ // Scalarized loads.
+ if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+ unsigned Cost = 0;
+ unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+ // The cost of inserting the loaded value into the result vector.
+ Cost += VF * (InCost);
+ // The cost of the scalar loads.
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+ RetTy->getScalarType(),
+ LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ return Cost;
+ }
+
+ // Wide loads.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+ return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ }
+ default: {
+ // We are scalarizing the instruction. Return the cost of the scalar
+ // instruction, plus the cost of insert and extract into vector
+ // elements, times the vector width.
+ unsigned Cost = 0;
+
+ bool IsVoid = RetTy->isVoidTy();
+
+ unsigned InsCost = (IsVoid ? 0 :
+ VTTI->getInstrCost(Instruction::InsertElement,
+ VectorTy));
+
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+ VectorTy);
+
+ // The cost of inserting the results plus extracting each one of the
+ // operands.
+ Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+
+ // The cost of executing VF copies of the scalar instruction.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
+ return Cost;
+ }
+ }// end of switch.
+}
+
+Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
+ if (Scalar->isVoidTy() || VF == 1)
+ return Scalar;
+ return VectorType::get(Scalar, VF);
+}
+
+} // namespace
+
+char LoopVectorize::ID = 0;
+static const char lv_name[] = "Loop Vectorization";
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+ Pass *createLoopVectorizePass() {
+ return new LoopVectorize();
+ }
+}
+
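For completeness, a sketch of how a client of this era's API might schedule the new pass; the exact headers and pass-manager flavor are assumptions, not part of the patch:

    #include "llvm/PassManager.h"
    #include "llvm/Transforms/Vectorize.h"

    // Hypothetical driver snippet: add the new loop vectorizer to a pass manager.
    static void addLoopVectorizer(llvm::PassManager &PM) {
      PM.add(llvm::createLoopVectorizePass());
    }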
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 1ef60029bcf4..d26973a7b380 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
// implements several vectorization transformations over the LLVM intermediate
// representation, including the C bindings for that library.
//
@@ -23,10 +23,11 @@
using namespace llvm;
-/// initializeVectorizationPasses - Initialize all passes linked into the
+/// initializeVectorizationPasses - Initialize all passes linked into the
/// Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
initializeBBVectorizePass(Registry);
+ initializeLoopVectorizePass(Registry);
}
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
@@ -37,3 +38,6 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createBBVectorizePass());
}
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopVectorizePass());
+}
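With the pass registered and exposed through the C bindings, a client can schedule it like any other transform. A minimal usage sketch, assuming an already-populated LLVMModuleRef M and that the matching declaration of LLVMAddLoopVectorizePass lands in llvm-c/Transforms/Vectorize.h; the other calls are the existing llvm-c pass-manager API:

  #include "llvm-c/Core.h"
  #include "llvm-c/Transforms/Vectorize.h"

  void runLoopVectorizer(LLVMModuleRef M) {
    LLVMPassManagerRef PM = LLVMCreatePassManager();
    LLVMAddLoopVectorizePass(PM);  // binding added by this patch
    LLVMRunPassManager(PM, M);     // runs the scheduled passes over M
    LLVMDisposePassManager(PM);
  }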
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 7ef1131de193..b72c17f667fb 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -66,6 +66,25 @@ static const Module *getModuleFromVal(const Value *V) {
return 0;
}
+static void PrintCallingConv(unsigned cc, raw_ostream &Out)
+{
+ switch (cc) {
+ case CallingConv::Fast: Out << "fastcc"; break;
+ case CallingConv::Cold: Out << "coldcc"; break;
+ case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
+ case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
+ case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
+ case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
+ case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
+ case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
+ case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break;
+ case CallingConv::MSP430_INTR: Out << "msp430_intrcc"; break;
+ case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break;
+ case CallingConv::PTX_Device: Out << "ptx_device"; break;
+ default: Out << "cc" << cc; break;
+ }
+}
+
// PrintEscapedString - Print each character of the specified string, escaping
// it if it is not printable or if it is an escape char.
static void PrintEscapedString(StringRef Name, raw_ostream &Out) {
@@ -141,8 +160,8 @@ static void PrintLLVMName(raw_ostream &OS, const Value *V) {
/// TypePrinting - Type printing machinery.
namespace {
class TypePrinting {
- TypePrinting(const TypePrinting &); // DO NOT IMPLEMENT
- void operator=(const TypePrinting&); // DO NOT IMPLEMENT
+ TypePrinting(const TypePrinting &) LLVM_DELETED_FUNCTION;
+ void operator=(const TypePrinting&) LLVM_DELETED_FUNCTION;
public:
/// NamedTypes - The named types that are used by the current module.
@@ -380,8 +399,8 @@ private:
/// Add all of the functions arguments, basic blocks, and instructions.
void processFunction();
- SlotTracker(const SlotTracker &); // DO NOT IMPLEMENT
- void operator=(const SlotTracker &); // DO NOT IMPLEMENT
+ SlotTracker(const SlotTracker &) LLVM_DELETED_FUNCTION;
+ void operator=(const SlotTracker &) LLVM_DELETED_FUNCTION;
};
} // end anonymous namespace
@@ -1029,6 +1048,9 @@ static void WriteAsOperandInternal(raw_ostream &Out, const Value *V,
Out << "sideeffect ";
if (IA->isAlignStack())
Out << "alignstack ";
+ // We don't emit the AD_ATT dialect as it's the assumed default.
+ if (IA->getDialect() == InlineAsm::AD_Intel)
+ Out << "inteldialect ";
Out << '"';
PrintEscapedString(IA->getAsmString(), Out);
Out << "\", \"";
@@ -1222,8 +1244,8 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
// Print the type
TypePrinter.print(Operand->getType(), Out);
// Print parameter attributes list
- if (Attrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(Attrs);
+ if (Attrs.hasAttributes())
+ Out << ' ' << Attrs.getAsString();
Out << ' ';
// Print the operand
WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
@@ -1285,8 +1307,9 @@ void AssemblyWriter::printModule(const Module *M) {
// Output all globals.
if (!M->global_empty()) Out << '\n';
for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
- I != E; ++I)
- printGlobal(I);
+ I != E; ++I) {
+ printGlobal(I); Out << '\n';
+ }
// Output all aliases.
if (!M->alias_empty()) Out << "\n";
@@ -1353,12 +1376,12 @@ static void PrintLinkage(GlobalValue::LinkageTypes LT,
case GlobalValue::LinkerPrivateWeakLinkage:
Out << "linker_private_weak ";
break;
- case GlobalValue::LinkerPrivateWeakDefAutoLinkage:
- Out << "linker_private_weak_def_auto ";
- break;
case GlobalValue::InternalLinkage: Out << "internal "; break;
case GlobalValue::LinkOnceAnyLinkage: Out << "linkonce "; break;
case GlobalValue::LinkOnceODRLinkage: Out << "linkonce_odr "; break;
+ case GlobalValue::LinkOnceODRAutoHideLinkage:
+ Out << "linkonce_odr_auto_hide ";
+ break;
case GlobalValue::WeakAnyLinkage: Out << "weak "; break;
case GlobalValue::WeakODRLinkage: Out << "weak_odr "; break;
case GlobalValue::CommonLinkage: Out << "common "; break;
@@ -1436,7 +1459,6 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
Out << ", align " << GV->getAlignment();
printInfoComment(*GV);
- Out << '\n';
}
void AssemblyWriter::printAlias(const GlobalAlias *GA) {
@@ -1527,27 +1549,16 @@ void AssemblyWriter::printFunction(const Function *F) {
PrintVisibility(F->getVisibility(), Out);
// Print the calling convention.
- switch (F->getCallingConv()) {
- case CallingConv::C: break; // default
- case CallingConv::Fast: Out << "fastcc "; break;
- case CallingConv::Cold: Out << "coldcc "; break;
- case CallingConv::X86_StdCall: Out << "x86_stdcallcc "; break;
- case CallingConv::X86_FastCall: Out << "x86_fastcallcc "; break;
- case CallingConv::X86_ThisCall: Out << "x86_thiscallcc "; break;
- case CallingConv::ARM_APCS: Out << "arm_apcscc "; break;
- case CallingConv::ARM_AAPCS: Out << "arm_aapcscc "; break;
- case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc "; break;
- case CallingConv::MSP430_INTR: Out << "msp430_intrcc "; break;
- case CallingConv::PTX_Kernel: Out << "ptx_kernel "; break;
- case CallingConv::PTX_Device: Out << "ptx_device "; break;
- default: Out << "cc" << F->getCallingConv() << " "; break;
+ if (F->getCallingConv() != CallingConv::C) {
+ PrintCallingConv(F->getCallingConv(), Out);
+ Out << " ";
}
FunctionType *FT = F->getFunctionType();
const AttrListPtr &Attrs = F->getAttributes();
Attributes RetAttrs = Attrs.getRetAttributes();
- if (RetAttrs != Attribute::None)
- Out << Attribute::getAsString(Attrs.getRetAttributes()) << ' ';
+ if (RetAttrs.hasAttributes())
+ Out << Attrs.getRetAttributes().getAsString() << ' ';
TypePrinter.print(F->getReturnType(), Out);
Out << ' ';
WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
@@ -1576,8 +1587,8 @@ void AssemblyWriter::printFunction(const Function *F) {
TypePrinter.print(FT->getParamType(i), Out);
Attributes ArgAttrs = Attrs.getParamAttributes(i+1);
- if (ArgAttrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(ArgAttrs);
+ if (ArgAttrs.hasAttributes())
+ Out << ' ' << ArgAttrs.getAsString();
}
}
@@ -1590,8 +1601,8 @@ void AssemblyWriter::printFunction(const Function *F) {
if (F->hasUnnamedAddr())
Out << " unnamed_addr";
Attributes FnAttrs = Attrs.getFnAttributes();
- if (FnAttrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(Attrs.getFnAttributes());
+ if (FnAttrs.hasAttributes())
+ Out << ' ' << Attrs.getFnAttributes().getAsString();
if (F->hasSection()) {
Out << " section \"";
PrintEscapedString(F->getSection(), Out);
@@ -1624,8 +1635,8 @@ void AssemblyWriter::printArgument(const Argument *Arg,
TypePrinter.print(Arg->getType(), Out);
// Output parameter attributes list
- if (Attrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(Attrs);
+ if (Attrs.hasAttributes())
+ Out << ' ' << Attrs.getAsString();
// Output name, if available...
if (Arg->hasName()) {
@@ -1828,20 +1839,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << " void";
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
// Print the calling convention being used.
- switch (CI->getCallingConv()) {
- case CallingConv::C: break; // default
- case CallingConv::Fast: Out << " fastcc"; break;
- case CallingConv::Cold: Out << " coldcc"; break;
- case CallingConv::X86_StdCall: Out << " x86_stdcallcc"; break;
- case CallingConv::X86_FastCall: Out << " x86_fastcallcc"; break;
- case CallingConv::X86_ThisCall: Out << " x86_thiscallcc"; break;
- case CallingConv::ARM_APCS: Out << " arm_apcscc "; break;
- case CallingConv::ARM_AAPCS: Out << " arm_aapcscc "; break;
- case CallingConv::ARM_AAPCS_VFP:Out << " arm_aapcs_vfpcc "; break;
- case CallingConv::MSP430_INTR: Out << " msp430_intrcc "; break;
- case CallingConv::PTX_Kernel: Out << " ptx_kernel"; break;
- case CallingConv::PTX_Device: Out << " ptx_device"; break;
- default: Out << " cc" << CI->getCallingConv(); break;
+ if (CI->getCallingConv() != CallingConv::C) {
+ Out << " ";
+ PrintCallingConv(CI->getCallingConv(), Out);
}
Operand = CI->getCalledValue();
@@ -1850,8 +1850,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Type *RetTy = FTy->getReturnType();
const AttrListPtr &PAL = CI->getAttributes();
- if (PAL.getRetAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getRetAttributes());
+ if (PAL.getRetAttributes().hasAttributes())
+ Out << ' ' << PAL.getRetAttributes().getAsString();
// If possible, print out the short form of the call instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -1874,8 +1874,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
writeParamOperand(CI->getArgOperand(op), PAL.getParamAttributes(op + 1));
}
Out << ')';
- if (PAL.getFnAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getFnAttributes());
+ if (PAL.getFnAttributes().hasAttributes())
+ Out << ' ' << PAL.getFnAttributes().getAsString();
} else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
Operand = II->getCalledValue();
PointerType *PTy = cast<PointerType>(Operand->getType());
@@ -1884,24 +1884,13 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
const AttrListPtr &PAL = II->getAttributes();
// Print the calling convention being used.
- switch (II->getCallingConv()) {
- case CallingConv::C: break; // default
- case CallingConv::Fast: Out << " fastcc"; break;
- case CallingConv::Cold: Out << " coldcc"; break;
- case CallingConv::X86_StdCall: Out << " x86_stdcallcc"; break;
- case CallingConv::X86_FastCall: Out << " x86_fastcallcc"; break;
- case CallingConv::X86_ThisCall: Out << " x86_thiscallcc"; break;
- case CallingConv::ARM_APCS: Out << " arm_apcscc "; break;
- case CallingConv::ARM_AAPCS: Out << " arm_aapcscc "; break;
- case CallingConv::ARM_AAPCS_VFP:Out << " arm_aapcs_vfpcc "; break;
- case CallingConv::MSP430_INTR: Out << " msp430_intrcc "; break;
- case CallingConv::PTX_Kernel: Out << " ptx_kernel"; break;
- case CallingConv::PTX_Device: Out << " ptx_device"; break;
- default: Out << " cc" << II->getCallingConv(); break;
+ if (II->getCallingConv() != CallingConv::C) {
+ Out << " ";
+ PrintCallingConv(II->getCallingConv(), Out);
}
- if (PAL.getRetAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getRetAttributes());
+ if (PAL.getRetAttributes().hasAttributes())
+ Out << ' ' << PAL.getRetAttributes().getAsString();
// If possible, print out the short form of the invoke instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -1925,8 +1914,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
}
Out << ')';
- if (PAL.getFnAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getFnAttributes());
+ if (PAL.getFnAttributes().hasAttributes())
+ Out << ' ' << PAL.getFnAttributes().getAsString();
Out << "\n to ";
writeOperand(II->getNormalDest(), true);
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index c8219eb78777..f1268e6ef86b 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -7,11 +7,14 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the AttributesList class and Attribute utilities.
+// This file implements the Attributes, AttributesImpl, AttrBuilder,
+// AttributeListImpl, and AttrListPtr classes.
//
//===----------------------------------------------------------------------===//
#include "llvm/Attributes.h"
+#include "AttributesImpl.h"
+#include "LLVMContextImpl.h"
#include "llvm/Type.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/FoldingSet.h"
@@ -23,215 +26,382 @@
using namespace llvm;
//===----------------------------------------------------------------------===//
-// Attribute Function Definitions
+// Attributes Implementation
//===----------------------------------------------------------------------===//
-std::string Attribute::getAsString(Attributes Attrs) {
+Attributes Attributes::get(LLVMContext &Context, ArrayRef<AttrVal> Vals) {
+ AttrBuilder B;
+ for (ArrayRef<AttrVal>::iterator I = Vals.begin(), E = Vals.end();
+ I != E; ++I)
+ B.addAttribute(*I);
+ return Attributes::get(Context, B);
+}
+
+Attributes Attributes::get(LLVMContext &Context, AttrBuilder &B) {
+ // If there are no attributes, return an empty Attributes class.
+ if (!B.hasAttributes())
+ return Attributes();
+
+ // Otherwise, build a key to look up the existing attributes.
+ LLVMContextImpl *pImpl = Context.pImpl;
+ FoldingSetNodeID ID;
+ ID.AddInteger(B.Raw());
+
+ void *InsertPoint;
+ AttributesImpl *PA = pImpl->AttrsSet.FindNodeOrInsertPos(ID, InsertPoint);
+
+ if (!PA) {
+ // If we didn't find any existing attributes of the same shape then create a
+ // new one and insert it.
+ PA = new AttributesImpl(B.Raw());
+ pImpl->AttrsSet.InsertNode(PA, InsertPoint);
+ }
+
+ // Return the AttributesList that we found or created.
+ return Attributes(PA);
+}
+
+bool Attributes::hasAttribute(AttrVal Val) const {
+ return Attrs && Attrs->hasAttribute(Val);
+}
+
+bool Attributes::hasAttributes() const {
+ return Attrs && Attrs->hasAttributes();
+}
+
+bool Attributes::hasAttributes(const Attributes &A) const {
+ return Attrs && Attrs->hasAttributes(A);
+}
+
+/// This returns the alignment field of an attribute as a byte alignment value.
+unsigned Attributes::getAlignment() const {
+ if (!hasAttribute(Attributes::Alignment))
+ return 0;
+ return 1U << ((Attrs->getAlignment() >> 16) - 1);
+}
+
+/// This returns the stack alignment field of an attribute as a byte alignment
+/// value.
+unsigned Attributes::getStackAlignment() const {
+ if (!hasAttribute(Attributes::StackAlignment))
+ return 0;
+ return 1U << ((Attrs->getStackAlignment() >> 26) - 1);
+}
+
+uint64_t Attributes::Raw() const {
+ return Attrs ? Attrs->Raw() : 0;
+}
+
+Attributes Attributes::typeIncompatible(Type *Ty) {
+ AttrBuilder Incompatible;
+
+ if (!Ty->isIntegerTy())
+ // Attributes that only apply to integers.
+ Incompatible.addAttribute(Attributes::SExt)
+ .addAttribute(Attributes::ZExt);
+
+ if (!Ty->isPointerTy())
+ // Attributes that only apply to pointers.
+ Incompatible.addAttribute(Attributes::ByVal)
+ .addAttribute(Attributes::Nest)
+ .addAttribute(Attributes::NoAlias)
+ .addAttribute(Attributes::NoCapture)
+ .addAttribute(Attributes::StructRet);
+
+ return Attributes::get(Ty->getContext(), Incompatible);
+}
+
+/// encodeLLVMAttributesForBitcode - This returns an integer containing an
+/// encoding of all the LLVM attributes found in the given attribute bitset.
+/// Any change to this encoding is a breaking change to bitcode compatibility.
+uint64_t Attributes::encodeLLVMAttributesForBitcode(Attributes Attrs) {
+ // FIXME: It doesn't make sense to store the alignment information as an
+ // expanded out value, we should store it as a log2 value. However, we can't
+ // just change that here without breaking bitcode compatibility. If this ever
+ // becomes a problem in practice, we should introduce new tag numbers in the
+ // bitcode file and have those tags use a more efficiently encoded alignment
+ // field.
+
+ // Store the alignment in the bitcode as a 16-bit raw value instead of a 5-bit
+ // log2 encoded value. Shift the bits above the alignment up by 11 bits.
+ uint64_t EncodedAttrs = Attrs.Raw() & 0xffff;
+ if (Attrs.hasAttribute(Attributes::Alignment))
+ EncodedAttrs |= Attrs.getAlignment() << 16;
+ EncodedAttrs |= (Attrs.Raw() & (0xffffULL << 21)) << 11;
+ return EncodedAttrs;
+}
+
+/// decodeLLVMAttributesForBitcode - This returns an attribute bitset containing
+/// the LLVM attributes that have been decoded from the given integer. This
+/// function must stay in sync with 'encodeLLVMAttributesForBitcode'.
+Attributes Attributes::decodeLLVMAttributesForBitcode(LLVMContext &C,
+ uint64_t EncodedAttrs) {
+ // The alignment is stored as a 16-bit raw value from bits 31--16. We shift
+ // the bits above 31 down by 11 bits.
+ unsigned Alignment = (EncodedAttrs & (0xffffULL << 16)) >> 16;
+ assert((!Alignment || isPowerOf2_32(Alignment)) &&
+ "Alignment must be a power of two.");
+
+ AttrBuilder B(EncodedAttrs & 0xffff);
+ if (Alignment)
+ B.addAlignmentAttr(Alignment);
+ B.addRawValue((EncodedAttrs & (0xffffULL << 32)) >> 11);
+ return Attributes::get(C, B);
+}
+
+std::string Attributes::getAsString() const {
std::string Result;
- if (Attrs & Attribute::ZExt)
+ if (hasAttribute(Attributes::ZExt))
Result += "zeroext ";
- if (Attrs & Attribute::SExt)
+ if (hasAttribute(Attributes::SExt))
Result += "signext ";
- if (Attrs & Attribute::NoReturn)
+ if (hasAttribute(Attributes::NoReturn))
Result += "noreturn ";
- if (Attrs & Attribute::NoUnwind)
+ if (hasAttribute(Attributes::NoUnwind))
Result += "nounwind ";
- if (Attrs & Attribute::UWTable)
+ if (hasAttribute(Attributes::UWTable))
Result += "uwtable ";
- if (Attrs & Attribute::ReturnsTwice)
+ if (hasAttribute(Attributes::ReturnsTwice))
Result += "returns_twice ";
- if (Attrs & Attribute::InReg)
+ if (hasAttribute(Attributes::InReg))
Result += "inreg ";
- if (Attrs & Attribute::NoAlias)
+ if (hasAttribute(Attributes::NoAlias))
Result += "noalias ";
- if (Attrs & Attribute::NoCapture)
+ if (hasAttribute(Attributes::NoCapture))
Result += "nocapture ";
- if (Attrs & Attribute::StructRet)
+ if (hasAttribute(Attributes::StructRet))
Result += "sret ";
- if (Attrs & Attribute::ByVal)
+ if (hasAttribute(Attributes::ByVal))
Result += "byval ";
- if (Attrs & Attribute::Nest)
+ if (hasAttribute(Attributes::Nest))
Result += "nest ";
- if (Attrs & Attribute::ReadNone)
+ if (hasAttribute(Attributes::ReadNone))
Result += "readnone ";
- if (Attrs & Attribute::ReadOnly)
+ if (hasAttribute(Attributes::ReadOnly))
Result += "readonly ";
- if (Attrs & Attribute::OptimizeForSize)
+ if (hasAttribute(Attributes::OptimizeForSize))
Result += "optsize ";
- if (Attrs & Attribute::NoInline)
+ if (hasAttribute(Attributes::NoInline))
Result += "noinline ";
- if (Attrs & Attribute::InlineHint)
+ if (hasAttribute(Attributes::InlineHint))
Result += "inlinehint ";
- if (Attrs & Attribute::AlwaysInline)
+ if (hasAttribute(Attributes::AlwaysInline))
Result += "alwaysinline ";
- if (Attrs & Attribute::StackProtect)
+ if (hasAttribute(Attributes::StackProtect))
Result += "ssp ";
- if (Attrs & Attribute::StackProtectReq)
+ if (hasAttribute(Attributes::StackProtectReq))
Result += "sspreq ";
- if (Attrs & Attribute::NoRedZone)
+ if (hasAttribute(Attributes::NoRedZone))
Result += "noredzone ";
- if (Attrs & Attribute::NoImplicitFloat)
+ if (hasAttribute(Attributes::NoImplicitFloat))
Result += "noimplicitfloat ";
- if (Attrs & Attribute::Naked)
+ if (hasAttribute(Attributes::Naked))
Result += "naked ";
- if (Attrs & Attribute::NonLazyBind)
+ if (hasAttribute(Attributes::NonLazyBind))
Result += "nonlazybind ";
- if (Attrs & Attribute::AddressSafety)
+ if (hasAttribute(Attributes::AddressSafety))
Result += "address_safety ";
- if (Attrs & Attribute::StackAlignment) {
+ if (hasAttribute(Attributes::MinSize))
+ Result += "minsize ";
+ if (hasAttribute(Attributes::StackAlignment)) {
Result += "alignstack(";
- Result += utostr(Attribute::getStackAlignmentFromAttrs(Attrs));
+ Result += utostr(getStackAlignment());
Result += ") ";
}
- if (Attrs & Attribute::Alignment) {
+ if (hasAttribute(Attributes::Alignment)) {
Result += "align ";
- Result += utostr(Attribute::getAlignmentFromAttrs(Attrs));
+ Result += utostr(getAlignment());
Result += " ";
}
- if (Attrs & Attribute::IANSDialect)
- Result += "ia_nsdialect ";
-
// Trim the trailing space.
assert(!Result.empty() && "Unknown attribute!");
Result.erase(Result.end()-1);
return Result;
}
-Attributes Attribute::typeIncompatible(Type *Ty) {
- Attributes Incompatible = None;
-
- if (!Ty->isIntegerTy())
- // Attributes that only apply to integers.
- Incompatible |= SExt | ZExt;
-
- if (!Ty->isPointerTy())
- // Attributes that only apply to pointers.
- Incompatible |= ByVal | Nest | NoAlias | StructRet | NoCapture;
-
- return Incompatible;
+//===----------------------------------------------------------------------===//
+// AttrBuilder Implementation
+//===----------------------------------------------------------------------===//
+
+AttrBuilder &AttrBuilder::addAttribute(Attributes::AttrVal Val){
+ Bits |= AttributesImpl::getAttrMask(Val);
+ return *this;
+}
+
+AttrBuilder &AttrBuilder::addRawValue(uint64_t Val) {
+ Bits |= Val;
+ return *this;
+}
+
+AttrBuilder &AttrBuilder::addAlignmentAttr(unsigned Align) {
+ if (Align == 0) return *this;
+ assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
+ assert(Align <= 0x40000000 && "Alignment too large.");
+ Bits |= (Log2_32(Align) + 1) << 16;
+ return *this;
+}
+AttrBuilder &AttrBuilder::addStackAlignmentAttr(unsigned Align){
+ // Default alignment, allow the target to define how to align it.
+ if (Align == 0) return *this;
+ assert(isPowerOf2_32(Align) && "Alignment must be a power of two.");
+ assert(Align <= 0x100 && "Alignment too large.");
+ Bits |= (Log2_32(Align) + 1) << 26;
+ return *this;
+}
+
+AttrBuilder &AttrBuilder::removeAttribute(Attributes::AttrVal Val) {
+ Bits &= ~AttributesImpl::getAttrMask(Val);
+ return *this;
+}
+
+AttrBuilder &AttrBuilder::addAttributes(const Attributes &A) {
+ Bits |= A.Raw();
+ return *this;
+}
+
+AttrBuilder &AttrBuilder::removeAttributes(const Attributes &A){
+ Bits &= ~A.Raw();
+ return *this;
+}
+
+bool AttrBuilder::hasAttribute(Attributes::AttrVal A) const {
+ return Bits & AttributesImpl::getAttrMask(A);
+}
+
+bool AttrBuilder::hasAttributes() const {
+ return Bits != 0;
+}
+bool AttrBuilder::hasAttributes(const Attributes &A) const {
+ return Bits & A.Raw();
+}
+bool AttrBuilder::hasAlignmentAttr() const {
+ return Bits & AttributesImpl::getAttrMask(Attributes::Alignment);
+}
+
+uint64_t AttrBuilder::getAlignment() const {
+ if (!hasAlignmentAttr())
+ return 0;
+ return 1U <<
+ (((Bits & AttributesImpl::getAttrMask(Attributes::Alignment)) >> 16) - 1);
+}
+
+uint64_t AttrBuilder::getStackAlignment() const {
+ if (!hasAlignmentAttr())
+ return 0;
+ return 1U <<
+ (((Bits & AttributesImpl::getAttrMask(Attributes::StackAlignment))>>26)-1);
}
//===----------------------------------------------------------------------===//
-// AttributeListImpl Definition
+// AttributesImpl Definition
//===----------------------------------------------------------------------===//
-namespace llvm {
- class AttributeListImpl;
+uint64_t AttributesImpl::getAttrMask(uint64_t Val) {
+ switch (Val) {
+ case Attributes::None: return 0;
+ case Attributes::ZExt: return 1 << 0;
+ case Attributes::SExt: return 1 << 1;
+ case Attributes::NoReturn: return 1 << 2;
+ case Attributes::InReg: return 1 << 3;
+ case Attributes::StructRet: return 1 << 4;
+ case Attributes::NoUnwind: return 1 << 5;
+ case Attributes::NoAlias: return 1 << 6;
+ case Attributes::ByVal: return 1 << 7;
+ case Attributes::Nest: return 1 << 8;
+ case Attributes::ReadNone: return 1 << 9;
+ case Attributes::ReadOnly: return 1 << 10;
+ case Attributes::NoInline: return 1 << 11;
+ case Attributes::AlwaysInline: return 1 << 12;
+ case Attributes::OptimizeForSize: return 1 << 13;
+ case Attributes::StackProtect: return 1 << 14;
+ case Attributes::StackProtectReq: return 1 << 15;
+ case Attributes::Alignment: return 31 << 16;
+ case Attributes::NoCapture: return 1 << 21;
+ case Attributes::NoRedZone: return 1 << 22;
+ case Attributes::NoImplicitFloat: return 1 << 23;
+ case Attributes::Naked: return 1 << 24;
+ case Attributes::InlineHint: return 1 << 25;
+ case Attributes::StackAlignment: return 7 << 26;
+ case Attributes::ReturnsTwice: return 1 << 29;
+ case Attributes::UWTable: return 1 << 30;
+ case Attributes::NonLazyBind: return 1U << 31;
+ case Attributes::AddressSafety: return 1ULL << 32;
+ case Attributes::MinSize: return 1ULL << 33;
+ }
+ llvm_unreachable("Unsupported attribute type");
}
-static ManagedStatic<FoldingSet<AttributeListImpl> > AttributesLists;
+bool AttributesImpl::hasAttribute(uint64_t A) const {
+ return (Bits & getAttrMask(A)) != 0;
+}
-namespace llvm {
-static ManagedStatic<sys::SmartMutex<true> > ALMutex;
+bool AttributesImpl::hasAttributes() const {
+ return Bits != 0;
+}
-class AttributeListImpl : public FoldingSetNode {
- sys::cas_flag RefCount;
-
- // AttributesList is uniqued, these should not be publicly available.
- void operator=(const AttributeListImpl &); // Do not implement
- AttributeListImpl(const AttributeListImpl &); // Do not implement
- ~AttributeListImpl(); // Private implementation
-public:
- SmallVector<AttributeWithIndex, 4> Attrs;
-
- AttributeListImpl(ArrayRef<AttributeWithIndex> attrs)
- : Attrs(attrs.begin(), attrs.end()) {
- RefCount = 0;
- }
-
- void AddRef() {
- sys::SmartScopedLock<true> Lock(*ALMutex);
- ++RefCount;
- }
- void DropRef() {
- sys::SmartScopedLock<true> Lock(*ALMutex);
- if (!AttributesLists.isConstructed())
- return;
- sys::cas_flag new_val = --RefCount;
- if (new_val == 0)
- delete this;
- }
-
- void Profile(FoldingSetNodeID &ID) const {
- Profile(ID, Attrs);
- }
- static void Profile(FoldingSetNodeID &ID, ArrayRef<AttributeWithIndex> Attrs){
- for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- ID.AddInteger(Attrs[i].Attrs.Raw());
- ID.AddInteger(Attrs[i].Index);
- }
- }
-};
+bool AttributesImpl::hasAttributes(const Attributes &A) const {
+ return Bits & A.Raw(); // FIXME: Raw() won't work here in the future.
}
-AttributeListImpl::~AttributeListImpl() {
- // NOTE: Lock must be acquired by caller.
- AttributesLists->RemoveNode(this);
+uint64_t AttributesImpl::getAlignment() const {
+ return Bits & getAttrMask(Attributes::Alignment);
}
+uint64_t AttributesImpl::getStackAlignment() const {
+ return Bits & getAttrMask(Attributes::StackAlignment);
+}
-AttrListPtr AttrListPtr::get(ArrayRef<AttributeWithIndex> Attrs) {
+//===----------------------------------------------------------------------===//
+// AttributeListImpl Definition
+//===----------------------------------------------------------------------===//
+
+AttrListPtr AttrListPtr::get(LLVMContext &C,
+ ArrayRef<AttributeWithIndex> Attrs) {
// If there are no attributes then return a null AttributesList pointer.
if (Attrs.empty())
return AttrListPtr();
-
+
#ifndef NDEBUG
for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- assert(Attrs[i].Attrs != Attribute::None &&
+ assert(Attrs[i].Attrs.hasAttributes() &&
"Pointless attribute!");
assert((!i || Attrs[i-1].Index < Attrs[i].Index) &&
"Misordered AttributesList!");
}
#endif
-
+
// Otherwise, build a key to look up the existing attributes.
+ LLVMContextImpl *pImpl = C.pImpl;
FoldingSetNodeID ID;
AttributeListImpl::Profile(ID, Attrs);
- void *InsertPos;
-
- sys::SmartScopedLock<true> Lock(*ALMutex);
-
- AttributeListImpl *PAL =
- AttributesLists->FindNodeOrInsertPos(ID, InsertPos);
-
+
+ void *InsertPoint;
+ AttributeListImpl *PA = pImpl->AttrsLists.FindNodeOrInsertPos(ID,
+ InsertPoint);
+
// If we didn't find any existing attributes of the same shape then
// create a new one and insert it.
- if (!PAL) {
- PAL = new AttributeListImpl(Attrs);
- AttributesLists->InsertNode(PAL, InsertPos);
+ if (!PA) {
+ PA = new AttributeListImpl(Attrs);
+ pImpl->AttrsLists.InsertNode(PA, InsertPoint);
}
-
+
// Return the AttributesList that we found or created.
- return AttrListPtr(PAL);
+ return AttrListPtr(PA);
}
-
//===----------------------------------------------------------------------===//
// AttrListPtr Method Implementations
//===----------------------------------------------------------------------===//
-AttrListPtr::AttrListPtr(AttributeListImpl *LI) : AttrList(LI) {
- if (LI) LI->AddRef();
-}
-
-AttrListPtr::AttrListPtr(const AttrListPtr &P) : AttrList(P.AttrList) {
- if (AttrList) AttrList->AddRef();
-}
-
const AttrListPtr &AttrListPtr::operator=(const AttrListPtr &RHS) {
- sys::SmartScopedLock<true> Lock(*ALMutex);
if (AttrList == RHS.AttrList) return *this;
- if (AttrList) AttrList->DropRef();
+
AttrList = RHS.AttrList;
- if (AttrList) AttrList->AddRef();
return *this;
}
-AttrListPtr::~AttrListPtr() {
- if (AttrList) AttrList->DropRef();
-}
-
-/// getNumSlots - Return the number of slots used in this attribute list.
+/// getNumSlots - Return the number of slots used in this attribute list.
/// This is the number of arguments that have an attribute set on them
/// (including the function itself).
unsigned AttrListPtr::getNumSlots() const {
@@ -245,48 +415,60 @@ const AttributeWithIndex &AttrListPtr::getSlot(unsigned Slot) const {
return AttrList->Attrs[Slot];
}
-
-/// getAttributes - The attributes for the specified index are
-/// returned. Attributes for the result are denoted with Idx = 0.
-/// Function notes are denoted with idx = ~0.
+/// getAttributes - The attributes for the specified index are returned.
+/// Attributes for the result are denoted with Idx = 0. Function notes are
+/// denoted with idx = ~0.
Attributes AttrListPtr::getAttributes(unsigned Idx) const {
- if (AttrList == 0) return Attribute::None;
-
+ if (AttrList == 0) return Attributes();
+
const SmallVector<AttributeWithIndex, 4> &Attrs = AttrList->Attrs;
for (unsigned i = 0, e = Attrs.size(); i != e && Attrs[i].Index <= Idx; ++i)
if (Attrs[i].Index == Idx)
return Attrs[i].Attrs;
- return Attribute::None;
+
+ return Attributes();
}
/// hasAttrSomewhere - Return true if the specified attribute is set for at
/// least one parameter or for the return value.
-bool AttrListPtr::hasAttrSomewhere(Attributes Attr) const {
+bool AttrListPtr::hasAttrSomewhere(Attributes::AttrVal Attr) const {
if (AttrList == 0) return false;
-
+
const SmallVector<AttributeWithIndex, 4> &Attrs = AttrList->Attrs;
for (unsigned i = 0, e = Attrs.size(); i != e; ++i)
- if (Attrs[i].Attrs & Attr)
+ if (Attrs[i].Attrs.hasAttribute(Attr))
return true;
+
return false;
}
+unsigned AttrListPtr::getNumAttrs() const {
+ return AttrList ? AttrList->Attrs.size() : 0;
+}
+
+Attributes &AttrListPtr::getAttributesAtIndex(unsigned i) const {
+ assert(AttrList && "Trying to get an attribute from an empty list!");
+ assert(i < AttrList->Attrs.size() && "Index out of range!");
+ return AttrList->Attrs[i].Attrs;
+}
-AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const {
+AttrListPtr AttrListPtr::addAttr(LLVMContext &C, unsigned Idx,
+ Attributes Attrs) const {
Attributes OldAttrs = getAttributes(Idx);
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment.
// For now, say we can't change a known alignment.
- Attributes OldAlign = OldAttrs & Attribute::Alignment;
- Attributes NewAlign = Attrs & Attribute::Alignment;
+ unsigned OldAlign = OldAttrs.getAlignment();
+ unsigned NewAlign = Attrs.getAlignment();
assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
"Attempt to change alignment!");
#endif
-
- Attributes NewAttrs = OldAttrs | Attrs;
- if (NewAttrs == OldAttrs)
+
+ AttrBuilder NewAttrs =
+ AttrBuilder(OldAttrs).addAttributes(Attrs);
+ if (NewAttrs == AttrBuilder(OldAttrs))
return *this;
-
+
SmallVector<AttributeWithIndex, 8> NewAttrList;
if (AttrList == 0)
NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs));
@@ -299,61 +481,67 @@ AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const {
// If there are attributes already at this index, merge them in.
if (i != e && OldAttrList[i].Index == Idx) {
- Attrs |= OldAttrList[i].Attrs;
+ Attrs =
+ Attributes::get(C, AttrBuilder(Attrs).
+ addAttributes(OldAttrList[i].Attrs));
++i;
}
-
+
NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs));
-
+
// Copy attributes for arguments after this one.
- NewAttrList.insert(NewAttrList.end(),
+ NewAttrList.insert(NewAttrList.end(),
OldAttrList.begin()+i, OldAttrList.end());
}
-
- return get(NewAttrList);
+
+ return get(C, NewAttrList);
}
-AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const {
+AttrListPtr AttrListPtr::removeAttr(LLVMContext &C, unsigned Idx,
+ Attributes Attrs) const {
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment.
// For now, say we can't pass in alignment, which no current use does.
- assert(!(Attrs & Attribute::Alignment) && "Attempt to exclude alignment!");
+ assert(!Attrs.hasAttribute(Attributes::Alignment) &&
+ "Attempt to exclude alignment!");
#endif
if (AttrList == 0) return AttrListPtr();
-
+
Attributes OldAttrs = getAttributes(Idx);
- Attributes NewAttrs = OldAttrs & ~Attrs;
- if (NewAttrs == OldAttrs)
+ AttrBuilder NewAttrs =
+ AttrBuilder(OldAttrs).removeAttributes(Attrs);
+ if (NewAttrs == AttrBuilder(OldAttrs))
return *this;
SmallVector<AttributeWithIndex, 8> NewAttrList;
const SmallVector<AttributeWithIndex, 4> &OldAttrList = AttrList->Attrs;
unsigned i = 0, e = OldAttrList.size();
-
+
// Copy attributes for arguments before this one.
for (; i != e && OldAttrList[i].Index < Idx; ++i)
NewAttrList.push_back(OldAttrList[i]);
-
+
// If there are attributes already at this index, merge them in.
assert(OldAttrList[i].Index == Idx && "Attribute isn't set?");
- Attrs = OldAttrList[i].Attrs & ~Attrs;
+ Attrs = Attributes::get(C, AttrBuilder(OldAttrList[i].Attrs).
+ removeAttributes(Attrs));
++i;
- if (Attrs) // If any attributes left for this parameter, add them.
+ if (Attrs.hasAttributes()) // If any attributes left for this param, add them.
NewAttrList.push_back(AttributeWithIndex::get(Idx, Attrs));
-
+
// Copy attributes for arguments after this one.
- NewAttrList.insert(NewAttrList.end(),
+ NewAttrList.insert(NewAttrList.end(),
OldAttrList.begin()+i, OldAttrList.end());
-
- return get(NewAttrList);
+
+ return get(C, NewAttrList);
}
void AttrListPtr::dump() const {
dbgs() << "PAL[ ";
for (unsigned i = 0; i < getNumSlots(); ++i) {
const AttributeWithIndex &PAWI = getSlot(i);
- dbgs() << "{" << PAWI.Index << "," << PAWI.Attrs << "} ";
+ dbgs() << "{" << PAWI.Index << "," << PAWI.Attrs.getAsString() << "} ";
}
-
+
dbgs() << "]\n";
}
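The alignment attributes above are packed into the bit vector as log2 values: addAlignmentAttr() stores Log2_32(Align) + 1 in the five bits starting at bit 16, and getAlignment() undoes that with 1 << (field - 1). A small worked round-trip, assuming a 16-byte alignment (a sketch, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t Bits = 0;
    unsigned Align = 16;                            // must be a power of two
    Bits |= uint64_t(4 /*Log2_32(16)*/ + 1) << 16;  // encoded field value is 5
    unsigned Field = (Bits & (31ULL << 16)) >> 16;  // Alignment mask is 31 << 16
    assert((1U << (Field - 1)) == Align);           // decodes back to 16
    return 0;
  }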
diff --git a/lib/VMCore/AttributesImpl.h b/lib/VMCore/AttributesImpl.h
new file mode 100644
index 000000000000..5c107e1ebba3
--- /dev/null
+++ b/lib/VMCore/AttributesImpl.h
@@ -0,0 +1,71 @@
+//===-- AttributesImpl.h - Attributes Internals -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines various helper methods and classes used by LLVMContextImpl
+// for creating and managing attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ATTRIBUTESIMPL_H
+#define LLVM_ATTRIBUTESIMPL_H
+
+#include "llvm/Attributes.h"
+#include "llvm/ADT/FoldingSet.h"
+
+namespace llvm {
+
+class AttributesImpl : public FoldingSetNode {
+ uint64_t Bits; // FIXME: We will be expanding this.
+public:
+ AttributesImpl(uint64_t bits) : Bits(bits) {}
+
+ bool hasAttribute(uint64_t A) const;
+
+ bool hasAttributes() const;
+ bool hasAttributes(const Attributes &A) const;
+
+ uint64_t getAlignment() const;
+ uint64_t getStackAlignment() const;
+
+ uint64_t Raw() const { return Bits; } // FIXME: Remove.
+
+ static uint64_t getAttrMask(uint64_t Val);
+
+ void Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, Bits);
+ }
+ static void Profile(FoldingSetNodeID &ID, uint64_t Bits) {
+ ID.AddInteger(Bits);
+ }
+};
+
+class AttributeListImpl : public FoldingSetNode {
+ // AttributesList is uniqued, these should not be publicly available.
+ void operator=(const AttributeListImpl &) LLVM_DELETED_FUNCTION;
+ AttributeListImpl(const AttributeListImpl &) LLVM_DELETED_FUNCTION;
+public:
+ SmallVector<AttributeWithIndex, 4> Attrs;
+
+ AttributeListImpl(ArrayRef<AttributeWithIndex> attrs)
+ : Attrs(attrs.begin(), attrs.end()) {}
+
+ void Profile(FoldingSetNodeID &ID) const {
+ Profile(ID, Attrs);
+ }
+ static void Profile(FoldingSetNodeID &ID, ArrayRef<AttributeWithIndex> Attrs){
+ for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
+ ID.AddInteger(Attrs[i].Attrs.Raw());
+ ID.AddInteger(Attrs[i].Index);
+ }
+ }
+};
+
+} // end llvm namespace
+
+#endif
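AttributesImpl carries the Profile() hook so LLVMContextImpl can keep one uniqued node per distinct bit pattern, which is what Attributes::get() relies on above. A sketch of the observable effect, assuming an existing LLVMContext and only the names introduced by this patch:

  #include "llvm/Attributes.h"

  void uniquingExample(llvm::LLVMContext &Ctx) {
    llvm::AttrBuilder B1, B2;
    B1.addAttribute(llvm::Attributes::NoUnwind)
      .addAttribute(llvm::Attributes::ReadOnly);
    B2.addAttribute(llvm::Attributes::ReadOnly)
      .addAttribute(llvm::Attributes::NoUnwind);
    llvm::Attributes A1 = llvm::Attributes::get(Ctx, B1);
    llvm::Attributes A2 = llvm::Attributes::get(Ctx, B2);
    // Same raw bits => FindNodeOrInsertPos returns the node created for A1,
    // so A1 and A2 wrap the same uniqued AttributesImpl.
    (void)A1; (void)A2;
  }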
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index 094ca755132c..5fff460e8bc4 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -148,7 +148,8 @@ bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) {
if (NewFn)
F = NewFn;
if (unsigned id = F->getIntrinsicID())
- F->setAttributes(Intrinsic::getAttributes((Intrinsic::ID)id));
+ F->setAttributes(Intrinsic::getAttributes(F->getContext(),
+ (Intrinsic::ID)id));
return Upgraded;
}
diff --git a/lib/VMCore/CMakeLists.txt b/lib/VMCore/CMakeLists.txt
index 6a20be6f3431..06eab0e8f026 100644
--- a/lib/VMCore/CMakeLists.txt
+++ b/lib/VMCore/CMakeLists.txt
@@ -1,5 +1,3 @@
-set(LLVM_REQUIRES_RTTI 1)
-
add_llvm_library(LLVMCore
AsmWriter.cpp
Attributes.cpp
@@ -8,6 +6,7 @@ add_llvm_library(LLVMCore
ConstantFold.cpp
Constants.cpp
Core.cpp
+ DataLayout.cpp
DebugInfo.cpp
DebugLoc.cpp
DIBuilder.cpp
@@ -32,6 +31,7 @@ add_llvm_library(LLVMCore
PrintModulePass.cpp
Type.cpp
TypeFinder.cpp
+ TargetTransformInfo.cpp
Use.cpp
User.cpp
Value.cpp
@@ -42,7 +42,7 @@ add_llvm_library(LLVMCore
# Workaround: It takes over 20 minutes to compile with msvc10.
# FIXME: Suppressing optimizations to core libraries would not be a good thing.
-if( MSVC_VERSION EQUAL 1600 )
+if( MSVC_VERSION LESS 1700 )
set_property(
SOURCE Function.cpp
PROPERTY COMPILE_FLAGS "/Og-"
diff --git a/lib/VMCore/ConstantFold.cpp b/lib/VMCore/ConstantFold.cpp
index 8e8287624359..fe3edac42e76 100644
--- a/lib/VMCore/ConstantFold.cpp
+++ b/lib/VMCore/ConstantFold.cpp
@@ -12,7 +12,7 @@
// ConstantExpr::get* methods to automatically fold constants when possible.
//
// The current constant folding implementation is implemented in two pieces: the
-// pieces that don't need TargetData, and the pieces that do. This is to avoid
+// pieces that don't need DataLayout, and the pieces that do. This is to avoid
// a dependence in VMCore on Target.
//
//===----------------------------------------------------------------------===//
@@ -87,9 +87,13 @@ foldConstantCastPair(
Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode());
Instruction::CastOps secondOp = Instruction::CastOps(opc);
+ // Assume that pointers are never more than 64 bits wide.
+ IntegerType *FakeIntPtrTy = Type::getInt64Ty(DstTy->getContext());
+
// Let CastInst::isEliminableCastPair do the heavy lifting.
return CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy,
- Type::getInt64Ty(DstTy->getContext()));
+ FakeIntPtrTy, FakeIntPtrTy,
+ FakeIntPtrTy);
}
static Constant *FoldBitCast(Constant *V, Type *DestTy) {
@@ -514,10 +518,6 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
return UndefValue::get(DestTy);
}
- // No compile-time operations on this type yet.
- if (V->getType()->isPPC_FP128Ty() || DestTy->isPPC_FP128Ty())
- return 0;
-
if (V->isNullValue() && !DestTy->isX86_MMXTy())
return Constant::getNullValue(DestTy);
@@ -576,6 +576,7 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
DestTy->isDoubleTy() ? APFloat::IEEEdouble :
DestTy->isX86_FP80Ty() ? APFloat::x87DoubleExtended :
DestTy->isFP128Ty() ? APFloat::IEEEquad :
+ DestTy->isPPC_FP128Ty() ? APFloat::PPCDoubleDouble :
APFloat::Bogus,
APFloat::rmNearestTiesToEven, &ignored);
return ConstantFP::get(V->getContext(), Val);
@@ -646,7 +647,8 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
case Instruction::SIToFP:
if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
APInt api = CI->getValue();
- APFloat apf(APInt::getNullValue(DestTy->getPrimitiveSizeInBits()), true);
+ APFloat apf(APInt::getNullValue(DestTy->getPrimitiveSizeInBits()),
+ !DestTy->isPPC_FP128Ty() /* isIEEE */);
(void)apf.convertFromAPInt(api,
opc==Instruction::SIToFP,
APFloat::rmNearestTiesToEven);
@@ -867,10 +869,6 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
Constant *C1, Constant *C2) {
- // No compile-time operations on this type yet.
- if (C1->getType()->isPPC_FP128Ty())
- return 0;
-
// Handle UndefValue up front.
if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
switch (Opcode) {
@@ -1273,10 +1271,6 @@ static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
assert(V1->getType() == V2->getType() &&
"Cannot compare values of different types!");
- // No compile-time operations on this type yet.
- if (V1->getType()->isPPC_FP128Ty())
- return FCmpInst::BAD_FCMP_PREDICATE;
-
// Handle degenerate case quickly
if (V1 == V2) return FCmpInst::FCMP_OEQ;
@@ -1602,10 +1596,6 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(pred));
}
- // No compile-time operations on this type yet.
- if (C1->getType()->isPPC_FP128Ty())
- return 0;
-
// icmp eq/ne(null,GV) -> false/true
if (C1->isNullValue()) {
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C2))
diff --git a/lib/VMCore/Constants.cpp b/lib/VMCore/Constants.cpp
index a4e21e16b3fc..edd6a73b0867 100644
--- a/lib/VMCore/Constants.cpp
+++ b/lib/VMCore/Constants.cpp
@@ -245,6 +245,33 @@ bool Constant::canTrap() const {
}
}
+/// isThreadDependent - Return true if the value can vary between threads.
+bool Constant::isThreadDependent() const {
+ SmallPtrSet<const Constant*, 64> Visited;
+ SmallVector<const Constant*, 64> WorkList;
+ WorkList.push_back(this);
+ Visited.insert(this);
+
+ while (!WorkList.empty()) {
+ const Constant *C = WorkList.pop_back_val();
+
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->isThreadLocal())
+ return true;
+ }
+
+ for (unsigned I = 0, E = C->getNumOperands(); I != E; ++I) {
+ const Constant *D = dyn_cast<Constant>(C->getOperand(I));
+ if (!D)
+ continue;
+ if (Visited.insert(D))
+ WorkList.push_back(D);
+ }
+ }
+
+ return false;
+}
+
/// isConstantUsed - Return true if the constant has users other than constant
/// exprs and other dangling things.
bool Constant::isConstantUsed() const {
diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h
index 8903a8f40f95..996eb12d69ea 100644
--- a/lib/VMCore/ConstantsContext.h
+++ b/lib/VMCore/ConstantsContext.h
@@ -33,7 +33,7 @@ struct ConstantTraits;
/// behind the scenes to implement unary constant exprs.
class UnaryConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -50,7 +50,7 @@ public:
/// behind the scenes to implement binary constant exprs.
class BinaryConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -71,7 +71,7 @@ public:
/// behind the scenes to implement select constant exprs.
class SelectConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly three operands
void *operator new(size_t s) {
@@ -92,7 +92,7 @@ public:
/// extractelement constant exprs.
class ExtractElementConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -113,7 +113,7 @@ public:
/// insertelement constant exprs.
class InsertElementConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly three operands
void *operator new(size_t s) {
@@ -135,7 +135,7 @@ public:
/// shufflevector constant exprs.
class ShuffleVectorConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly three operands
void *operator new(size_t s) {
@@ -160,7 +160,7 @@ public:
/// extractvalue constant exprs.
class ExtractValueConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -186,7 +186,7 @@ public:
/// insertvalue constant exprs.
class InsertValueConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -234,7 +234,7 @@ public:
// needed in order to store the predicate value for these instructions.
class CompareConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -352,18 +352,21 @@ struct ExprMapKeyType {
struct InlineAsmKeyType {
InlineAsmKeyType(StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
- bool isAlignStack)
+ bool isAlignStack, InlineAsm::AsmDialect asmDialect)
: asm_string(AsmString), constraints(Constraints),
- has_side_effects(hasSideEffects), is_align_stack(isAlignStack) {}
+ has_side_effects(hasSideEffects), is_align_stack(isAlignStack),
+ asm_dialect(asmDialect) {}
std::string asm_string;
std::string constraints;
bool has_side_effects;
bool is_align_stack;
+ InlineAsm::AsmDialect asm_dialect;
bool operator==(const InlineAsmKeyType& that) const {
return this->asm_string == that.asm_string &&
this->constraints == that.constraints &&
this->has_side_effects == that.has_side_effects &&
- this->is_align_stack == that.is_align_stack;
+ this->is_align_stack == that.is_align_stack &&
+ this->asm_dialect == that.asm_dialect;
}
bool operator<(const InlineAsmKeyType& that) const {
if (this->asm_string != that.asm_string)
@@ -374,6 +377,8 @@ struct InlineAsmKeyType {
return this->has_side_effects < that.has_side_effects;
if (this->is_align_stack != that.is_align_stack)
return this->is_align_stack < that.is_align_stack;
+ if (this->asm_dialect != that.asm_dialect)
+ return this->asm_dialect < that.asm_dialect;
return false;
}
@@ -490,7 +495,8 @@ template<>
struct ConstantCreator<InlineAsm, PointerType, InlineAsmKeyType> {
static InlineAsm *create(PointerType *Ty, const InlineAsmKeyType &Key) {
return new InlineAsm(Ty, Key.asm_string, Key.constraints,
- Key.has_side_effects, Key.is_align_stack);
+ Key.has_side_effects, Key.is_align_stack,
+ Key.asm_dialect);
}
};
@@ -499,7 +505,8 @@ struct ConstantKeyData<InlineAsm> {
typedef InlineAsmKeyType ValType;
static ValType getValType(InlineAsm *Asm) {
return InlineAsmKeyType(Asm->getAsmString(), Asm->getConstraintString(),
- Asm->hasSideEffects(), Asm->isAlignStack());
+ Asm->hasSideEffects(), Asm->isAlignStack(),
+ Asm->getDialect());
}
};
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index 972db3cb86e1..847bc134ddb7 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -568,6 +568,19 @@ const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) {
return 0;
}
+unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V)
+{
+ return cast<MDNode>(unwrap(V))->getNumOperands();
+}
+
+void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest)
+{
+ const MDNode *N = cast<MDNode>(unwrap(V));
+ const unsigned numOperands = N->getNumOperands();
+ for (unsigned i = 0; i < numOperands; i++)
+ Dest[i] = wrap(N->getOperand(i));
+}
+
unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char* name)
{
if (NamedMDNode *N = unwrap(M)->getNamedMetadata(name)) {
@@ -1084,6 +1097,8 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
return LLVMLinkOnceAnyLinkage;
case GlobalValue::LinkOnceODRLinkage:
return LLVMLinkOnceODRLinkage;
+ case GlobalValue::LinkOnceODRAutoHideLinkage:
+ return LLVMLinkOnceODRAutoHideLinkage;
case GlobalValue::WeakAnyLinkage:
return LLVMWeakAnyLinkage;
case GlobalValue::WeakODRLinkage:
@@ -1098,8 +1113,6 @@ LLVMLinkage LLVMGetLinkage(LLVMValueRef Global) {
return LLVMLinkerPrivateLinkage;
case GlobalValue::LinkerPrivateWeakLinkage:
return LLVMLinkerPrivateWeakLinkage;
- case GlobalValue::LinkerPrivateWeakDefAutoLinkage:
- return LLVMLinkerPrivateWeakDefAutoLinkage;
case GlobalValue::DLLImportLinkage:
return LLVMDLLImportLinkage;
case GlobalValue::DLLExportLinkage:
@@ -1129,6 +1142,9 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
case LLVMLinkOnceODRLinkage:
GV->setLinkage(GlobalValue::LinkOnceODRLinkage);
break;
+ case LLVMLinkOnceODRAutoHideLinkage:
+ GV->setLinkage(GlobalValue::LinkOnceODRAutoHideLinkage);
+ break;
case LLVMWeakAnyLinkage:
GV->setLinkage(GlobalValue::WeakAnyLinkage);
break;
@@ -1150,9 +1166,6 @@ void LLVMSetLinkage(LLVMValueRef Global, LLVMLinkage Linkage) {
case LLVMLinkerPrivateWeakLinkage:
GV->setLinkage(GlobalValue::LinkerPrivateWeakLinkage);
break;
- case LLVMLinkerPrivateWeakDefAutoLinkage:
- GV->setLinkage(GlobalValue::LinkerPrivateWeakDefAutoLinkage);
- break;
case LLVMDLLImportLinkage:
GV->setLinkage(GlobalValue::DLLImportLinkage);
break;
@@ -1368,14 +1381,20 @@ void LLVMSetGC(LLVMValueRef Fn, const char *GC) {
void LLVMAddFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
Function *Func = unwrap<Function>(Fn);
const AttrListPtr PAL = Func->getAttributes();
- const AttrListPtr PALnew = PAL.addAttr(~0U, Attributes(PA));
+ AttrBuilder B(PA);
+ const AttrListPtr PALnew =
+ PAL.addAttr(Func->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(Func->getContext(), B));
Func->setAttributes(PALnew);
}
void LLVMRemoveFunctionAttr(LLVMValueRef Fn, LLVMAttribute PA) {
Function *Func = unwrap<Function>(Fn);
const AttrListPtr PAL = Func->getAttributes();
- const AttrListPtr PALnew = PAL.removeAttr(~0U, Attributes(PA));
+ AttrBuilder B(PA);
+ const AttrListPtr PALnew =
+ PAL.removeAttr(Func->getContext(), AttrListPtr::FunctionIndex,
+ Attributes::get(Func->getContext(), B));
Func->setAttributes(PALnew);
}
@@ -1445,11 +1464,15 @@ LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
}
void LLVMAddAttribute(LLVMValueRef Arg, LLVMAttribute PA) {
- unwrap<Argument>(Arg)->addAttr(Attributes(PA));
+ Argument *A = unwrap<Argument>(Arg);
+ AttrBuilder B(PA);
+ A->addAttr(Attributes::get(A->getContext(), B));
}
void LLVMRemoveAttribute(LLVMValueRef Arg, LLVMAttribute PA) {
- unwrap<Argument>(Arg)->removeAttr(Attributes(PA));
+ Argument *A = unwrap<Argument>(Arg);
+ AttrBuilder B(PA);
+ A->removeAttr(Attributes::get(A->getContext(), B));
}
LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
@@ -1461,8 +1484,10 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
- unwrap<Argument>(Arg)->addAttr(
- Attribute::constructAlignmentFromInt(align));
+ AttrBuilder B;
+ B.addAlignmentAttr(align);
+ unwrap<Argument>(Arg)->addAttr(Attributes::
+ get(unwrap<Argument>(Arg)->getContext(), B));
}
/*--.. Operations on basic blocks ..........................................--*/
@@ -1651,23 +1676,28 @@ void LLVMSetInstructionCallConv(LLVMValueRef Instr, unsigned CC) {
void LLVMAddInstrAttribute(LLVMValueRef Instr, unsigned index,
LLVMAttribute PA) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
+ AttrBuilder B(PA);
Call.setAttributes(
- Call.getAttributes().addAttr(index, Attributes(PA)));
+ Call.getAttributes().addAttr(Call->getContext(), index,
+ Attributes::get(Call->getContext(), B)));
}
void LLVMRemoveInstrAttribute(LLVMValueRef Instr, unsigned index,
LLVMAttribute PA) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
+ AttrBuilder B(PA);
Call.setAttributes(
- Call.getAttributes().removeAttr(index, Attributes(PA)));
+ Call.getAttributes().removeAttr(Call->getContext(), index,
+ Attributes::get(Call->getContext(), B)));
}
void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
unsigned align) {
CallSite Call = CallSite(unwrap<Instruction>(Instr));
- Call.setAttributes(
- Call.getAttributes().addAttr(index,
- Attribute::constructAlignmentFromInt(align)));
+ AttrBuilder B;
+ B.addAlignmentAttr(align);
+ Call.setAttributes(Call.getAttributes().addAttr(Call->getContext(), index,
+ Attributes::get(Call->getContext(), B)));
}
/*--.. Operations on call instructions (only) ..............................--*/
diff --git a/lib/VMCore/DIBuilder.cpp b/lib/VMCore/DIBuilder.cpp
index f5894e9a32ea..152b825523da 100644
--- a/lib/VMCore/DIBuilder.cpp
+++ b/lib/VMCore/DIBuilder.cpp
@@ -492,7 +492,8 @@ DIType DIBuilder::createStructType(DIDescriptor Context, StringRef Name,
NULL,
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), RunTimeLang),
- Constant::getNullValue(Type::getInt32Ty(VMContext))
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0),
};
return DIType(MDNode::get(VMContext, Elts));
}
@@ -550,7 +551,7 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
uint64_t SizeInBits,
uint64_t AlignInBits,
DIArray Elements,
- DIType ClassType, unsigned Flags) {
+ DIType ClassType) {
// TAG_enumeration_type is encoded in DICompositeType format.
Value *Elts[] = {
GetTagConstant(VMContext, dwarf::DW_TAG_enumeration_type),
@@ -561,7 +562,7 @@ DIType DIBuilder::createEnumerationType(DIDescriptor Scope, StringRef Name,
ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext), Flags),
+ ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ClassType,
Elements,
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
@@ -640,6 +641,30 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
return DIType(MDNode::get(VMContext, Elts));
}
+/// createObjectPointerType - Create a new DIType with the "object pointer" flag set.
+DIType DIBuilder::createObjectPointerType(DIType Ty) {
+ if (Ty.isObjectPointer())
+ return Ty;
+
+ SmallVector<Value *, 9> Elts;
+ MDNode *N = Ty;
+ assert (N && "Unexpected input DIType!");
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (Value *V = N->getOperand(i))
+ Elts.push_back(V);
+ else
+ Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+ }
+
+ unsigned CurFlags = Ty.getFlags();
+ CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
+
+ // Flags are stored at this slot.
+ Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
+
+ return DIType(MDNode::get(VMContext, Elts));
+}
+
/// retainType - Retain DIType in a module even if it is not referenced
/// through debug info anchors.
void DIBuilder::retainType(DIType T) {
@@ -682,7 +707,9 @@ DIType DIBuilder::createTemporaryType(DIFile F) {
/// can be RAUW'd if the full type is seen.
DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
DIDescriptor Scope, DIFile F,
- unsigned Line, unsigned RuntimeLang) {
+ unsigned Line, unsigned RuntimeLang,
+ uint64_t SizeInBits,
+ uint64_t AlignInBits) {
// Create a temporary MDNode.
Value *Elts[] = {
GetTagConstant(VMContext, Tag),
@@ -690,9 +717,8 @@ DIType DIBuilder::createForwardDecl(unsigned Tag, StringRef Name,
MDString::get(VMContext, Name),
F,
ConstantInt::get(Type::getInt32Ty(VMContext), Line),
- // To ease transition include sizes etc of 0.
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
- ConstantInt::get(Type::getInt32Ty(VMContext), 0),
+ ConstantInt::get(Type::getInt64Ty(VMContext), SizeInBits),
+ ConstantInt::get(Type::getInt64Ty(VMContext), AlignInBits),
ConstantInt::get(Type::getInt32Ty(VMContext), 0),
ConstantInt::get(Type::getInt32Ty(VMContext),
DIDescriptor::FlagFwdDecl),
diff --git a/lib/Target/TargetData.cpp b/lib/VMCore/DataLayout.cpp
index cc6dc1e25998..19cf0f5cd3e8 100644
--- a/lib/Target/TargetData.cpp
+++ b/lib/VMCore/DataLayout.cpp
@@ -1,4 +1,4 @@
-//===-- TargetData.cpp - Data size & alignment routines --------------------==//
+//===-- DataLayout.cpp - Data size & alignment routines --------------------==//
//
// The LLVM Compiler Infrastructure
//
@@ -7,7 +7,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file defines target properties related to datatype size/offset/alignment
+// This file defines layout properties related to datatype size/offset/alignment
// information.
//
// This structure should be created once, filled in if the defaults are not
@@ -16,7 +16,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Module.h"
@@ -31,17 +31,17 @@
#include <cstdlib>
using namespace llvm;
-// Handle the Pass registration stuff necessary to use TargetData's.
+// Handle the Pass registration stuff necessary to use DataLayout's.
// Register the default SparcV9 implementation...
-INITIALIZE_PASS(TargetData, "targetdata", "Target Data Layout", false, true)
-char TargetData::ID = 0;
+INITIALIZE_PASS(DataLayout, "datalayout", "Data Layout", false, true)
+char DataLayout::ID = 0;
//===----------------------------------------------------------------------===//
// Support for StructLayout
//===----------------------------------------------------------------------===//
-StructLayout::StructLayout(StructType *ST, const TargetData &TD) {
+StructLayout::StructLayout(StructType *ST, const DataLayout &TD) {
assert(!ST->isOpaque() && "Cannot get layout of opaque structs");
StructAlignment = 0;
StructSize = 0;
@@ -54,7 +54,7 @@ StructLayout::StructLayout(StructType *ST, const TargetData &TD) {
// Add padding if necessary to align the data element properly.
if ((StructSize & (TyAlign-1)) != 0)
- StructSize = TargetData::RoundUpAlignment(StructSize, TyAlign);
+ StructSize = DataLayout::RoundUpAlignment(StructSize, TyAlign);
// Keep track of maximum alignment constraint.
StructAlignment = std::max(TyAlign, StructAlignment);
@@ -69,7 +69,7 @@ StructLayout::StructLayout(StructType *ST, const TargetData &TD) {
// Add padding to the end of the struct so that it could be put in an array
// and all array elements would be aligned correctly.
if ((StructSize & (StructAlignment-1)) != 0)
- StructSize = TargetData::RoundUpAlignment(StructSize, StructAlignment);
+ StructSize = DataLayout::RoundUpAlignment(StructSize, StructAlignment);
}
@@ -94,14 +94,14 @@ unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
}
//===----------------------------------------------------------------------===//
-// TargetAlignElem, TargetAlign support
+// LayoutAlignElem, LayoutAlign support
//===----------------------------------------------------------------------===//
-TargetAlignElem
-TargetAlignElem::get(AlignTypeEnum align_type, unsigned abi_align,
+LayoutAlignElem
+LayoutAlignElem::get(AlignTypeEnum align_type, unsigned abi_align,
unsigned pref_align, uint32_t bit_width) {
assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
- TargetAlignElem retval;
+ LayoutAlignElem retval;
retval.AlignType = align_type;
retval.ABIAlign = abi_align;
retval.PrefAlign = pref_align;
@@ -110,18 +110,46 @@ TargetAlignElem::get(AlignTypeEnum align_type, unsigned abi_align,
}
bool
-TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
+LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const {
return (AlignType == rhs.AlignType
&& ABIAlign == rhs.ABIAlign
&& PrefAlign == rhs.PrefAlign
&& TypeBitWidth == rhs.TypeBitWidth);
}
-const TargetAlignElem
-TargetData::InvalidAlignmentElem = { (AlignTypeEnum)0xFF, 0, 0, 0 };
+const LayoutAlignElem
+DataLayout::InvalidAlignmentElem =
+ LayoutAlignElem::get((AlignTypeEnum) -1, 0, 0, 0);
//===----------------------------------------------------------------------===//
-// TargetData Class Implementation
+// PointerAlignElem, PointerAlign support
+//===----------------------------------------------------------------------===//
+
+PointerAlignElem
+PointerAlignElem::get(uint32_t addr_space, unsigned abi_align,
+ unsigned pref_align, uint32_t bit_width) {
+ assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+ PointerAlignElem retval;
+ retval.AddressSpace = addr_space;
+ retval.ABIAlign = abi_align;
+ retval.PrefAlign = pref_align;
+ retval.TypeBitWidth = bit_width;
+ return retval;
+}
+
+bool
+PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
+ return (ABIAlign == rhs.ABIAlign
+ && AddressSpace == rhs.AddressSpace
+ && PrefAlign == rhs.PrefAlign
+ && TypeBitWidth == rhs.TypeBitWidth);
+}
+
+const PointerAlignElem
+DataLayout::InvalidPointerElem = PointerAlignElem::get(~0U, 0U, 0U, 0U);
+
+//===----------------------------------------------------------------------===//
+// DataLayout Class Implementation
//===----------------------------------------------------------------------===//
/// getInt - Get an integer ignoring errors.
@@ -131,14 +159,11 @@ static int getInt(StringRef R) {
return Result;
}
-void TargetData::init() {
- initializeTargetDataPass(*PassRegistry::getPassRegistry());
+void DataLayout::init() {
+ initializeDataLayoutPass(*PassRegistry::getPassRegistry());
LayoutMap = 0;
LittleEndian = false;
- PointerMemSize = 8;
- PointerABIAlign = 8;
- PointerPrefAlign = PointerABIAlign;
StackNaturalAlign = 0;
// Default alignments
@@ -154,9 +179,10 @@ void TargetData::init() {
setAlignment(VECTOR_ALIGN, 8, 8, 64); // v2i32, v1i64, ...
setAlignment(VECTOR_ALIGN, 16, 16, 128); // v16i8, v8i16, v4i32, ...
setAlignment(AGGREGATE_ALIGN, 0, 8, 0); // struct
+ setPointerAlignment(0, 8, 8, 8);
}
-std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
+std::string DataLayout::parseSpecifier(StringRef Desc, DataLayout *td) {
if (td)
td->init();
@@ -185,13 +211,16 @@ std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
td->LittleEndian = true;
break;
case 'p': {
- // Pointer size.
+ int AddrSpace = 0;
+ if (Specifier.size() > 1) {
+ AddrSpace = getInt(Specifier.substr(1));
+ if (AddrSpace < 0 || AddrSpace > (1 << 24))
+ return "Invalid address space, must be a positive 24bit integer";
+ }
Split = Token.split(':');
int PointerMemSizeBits = getInt(Split.first);
if (PointerMemSizeBits < 0 || PointerMemSizeBits % 8 != 0)
return "invalid pointer size, must be a positive 8-bit multiple";
- if (td)
- td->PointerMemSize = PointerMemSizeBits / 8;
// Pointer ABI alignment.
Split = Split.second.split(':');
@@ -200,8 +229,6 @@ std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
return "invalid pointer ABI alignment, "
"must be a positive 8-bit multiple";
}
- if (td)
- td->PointerABIAlign = PointerABIAlignBits / 8;
// Pointer preferred alignment.
Split = Split.second.split(':');
@@ -210,11 +237,12 @@ std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
return "invalid pointer preferred alignment, "
"must be a positive 8-bit multiple";
}
- if (td) {
- td->PointerPrefAlign = PointerPrefAlignBits / 8;
- if (td->PointerPrefAlign == 0)
- td->PointerPrefAlign = td->PointerABIAlign;
- }
+
+ if (PointerPrefAlignBits == 0)
+ PointerPrefAlignBits = PointerABIAlignBits;
+ if (td)
+ td->setPointerAlignment(AddrSpace, PointerABIAlignBits/8,
+ PointerPrefAlignBits/8, PointerMemSizeBits/8);
break;
}
case 'i':
@@ -256,7 +284,7 @@ std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
unsigned PrefAlign = PrefAlignBits / 8;
if (PrefAlign == 0)
PrefAlign = ABIAlign;
-
+
if (td)
td->setAlignment(AlignType, ABIAlign, PrefAlign, Size);
break;
@@ -266,8 +294,8 @@ std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
do {
int Width = getInt(Specifier);
if (Width <= 0) {
- return std::string("invalid native integer size \'") + Specifier.str() +
- "\', must be a positive integer.";
+ return std::string("invalid native integer size \'") +
+ Specifier.str() + "\', must be a positive integer.";
}
if (td && Width != 0)
td->LegalIntWidths.push_back(Width);
@@ -298,24 +326,26 @@ std::string TargetData::parseSpecifier(StringRef Desc, TargetData *td) {
///
/// @note This has to exist, because this is a pass, but it should never be
/// used.
-TargetData::TargetData() : ImmutablePass(ID) {
- report_fatal_error("Bad TargetData ctor used. "
- "Tool did not specify a TargetData to use?");
+DataLayout::DataLayout() : ImmutablePass(ID) {
+ report_fatal_error("Bad DataLayout ctor used. "
+ "Tool did not specify a DataLayout to use?");
}
-TargetData::TargetData(const Module *M)
+DataLayout::DataLayout(const Module *M)
: ImmutablePass(ID) {
std::string errMsg = parseSpecifier(M->getDataLayout(), this);
- assert(errMsg == "" && "Module M has malformed target data layout string.");
+ assert(errMsg == "" && "Module M has malformed data layout string.");
(void)errMsg;
}
void
-TargetData::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
+DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
unsigned pref_align, uint32_t bit_width) {
assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+ assert(pref_align < (1 << 16) && "Alignment doesn't fit in bitfield");
+ assert(bit_width < (1 << 24) && "Bit width doesn't fit in bitfield");
for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
- if (Alignments[i].AlignType == align_type &&
+ if (Alignments[i].AlignType == (unsigned)align_type &&
Alignments[i].TypeBitWidth == bit_width) {
// Update the abi, preferred alignments.
Alignments[i].ABIAlign = abi_align;
@@ -324,20 +354,35 @@ TargetData::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
}
}
- Alignments.push_back(TargetAlignElem::get(align_type, abi_align,
+ Alignments.push_back(LayoutAlignElem::get(align_type, abi_align,
pref_align, bit_width));
}
+void
+DataLayout::setPointerAlignment(uint32_t addr_space, unsigned abi_align,
+ unsigned pref_align, uint32_t bit_width) {
+ assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+ DenseMap<unsigned,PointerAlignElem>::iterator val = Pointers.find(addr_space);
+ if (val == Pointers.end()) {
+ Pointers[addr_space] = PointerAlignElem::get(addr_space,
+ abi_align, pref_align, bit_width);
+ } else {
+ val->second.ABIAlign = abi_align;
+ val->second.PrefAlign = pref_align;
+ val->second.TypeBitWidth = bit_width;
+ }
+}
+
/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or
-/// preferred if ABIInfo = false) the target wants for the specified datatype.
-unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType,
+/// preferred if ABIInfo = false) the layout wants for the specified datatype.
+unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
uint32_t BitWidth, bool ABIInfo,
Type *Ty) const {
// Check to see if we have an exact match and remember the best match we see.
int BestMatchIdx = -1;
int LargestInt = -1;
for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
- if (Alignments[i].AlignType == AlignType &&
+ if (Alignments[i].AlignType == (unsigned)AlignType &&
Alignments[i].TypeBitWidth == BitWidth)
return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
@@ -410,11 +455,11 @@ public:
} // end anonymous namespace
-TargetData::~TargetData() {
+DataLayout::~DataLayout() {
delete static_cast<StructLayoutMap*>(LayoutMap);
}
-const StructLayout *TargetData::getStructLayout(StructType *Ty) const {
+const StructLayout *DataLayout::getStructLayout(StructType *Ty) const {
if (!LayoutMap)
LayoutMap = new StructLayoutMap();
@@ -437,17 +482,35 @@ const StructLayout *TargetData::getStructLayout(StructType *Ty) const {
return L;
}
-std::string TargetData::getStringRepresentation() const {
+std::string DataLayout::getStringRepresentation() const {
std::string Result;
raw_string_ostream OS(Result);
- OS << (LittleEndian ? "e" : "E")
- << "-p:" << PointerMemSize*8 << ':' << PointerABIAlign*8
- << ':' << PointerPrefAlign*8
- << "-S" << StackNaturalAlign*8;
+ OS << (LittleEndian ? "e" : "E");
+ SmallVector<unsigned, 8> addrSpaces;
+ // Let's get all of the known address spaces and sort them
+ // into increasing order so that we can emit the string
+ // in a cleaner format.
+ for (DenseMap<unsigned, PointerAlignElem>::const_iterator
+ pib = Pointers.begin(), pie = Pointers.end();
+ pib != pie; ++pib) {
+ addrSpaces.push_back(pib->first);
+ }
+ std::sort(addrSpaces.begin(), addrSpaces.end());
+ for (SmallVector<unsigned, 8>::iterator asb = addrSpaces.begin(),
+ ase = addrSpaces.end(); asb != ase; ++asb) {
+ const PointerAlignElem &PI = Pointers.find(*asb)->second;
+ OS << "-p";
+ if (PI.AddressSpace) {
+ OS << PI.AddressSpace;
+ }
+ OS << ":" << PI.TypeBitWidth*8 << ':' << PI.ABIAlign*8
+ << ':' << PI.PrefAlign*8;
+ }
+ OS << "-S" << StackNaturalAlign*8;
for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
- const TargetAlignElem &AI = Alignments[i];
+ const LayoutAlignElem &AI = Alignments[i];
OS << '-' << (char)AI.AlignType << AI.TypeBitWidth << ':'
<< AI.ABIAlign*8 << ':' << AI.PrefAlign*8;
}
@@ -462,12 +525,15 @@ std::string TargetData::getStringRepresentation() const {
}
-uint64_t TargetData::getTypeSizeInBits(Type *Ty) const {
+uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const {
assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
switch (Ty->getTypeID()) {
case Type::LabelTyID:
- case Type::PointerTyID:
- return getPointerSizeInBits();
+ return getPointerSizeInBits(0);
+ case Type::PointerTyID: {
+ unsigned AS = dyn_cast<PointerType>(Ty)->getAddressSpace();
+ return getPointerSizeInBits(AS);
+ }
case Type::ArrayTyID: {
ArrayType *ATy = cast<ArrayType>(Ty);
return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements();
@@ -493,10 +559,12 @@ uint64_t TargetData::getTypeSizeInBits(Type *Ty) const {
// only 80 bits contain information.
case Type::X86_FP80TyID:
return 80;
- case Type::VectorTyID:
- return cast<VectorType>(Ty)->getBitWidth();
+ case Type::VectorTyID: {
+ VectorType *VTy = cast<VectorType>(Ty);
+ return VTy->getNumElements()*getTypeSizeInBits(VTy->getElementType());
+ }
default:
- llvm_unreachable("TargetData::getTypeSizeInBits(): Unsupported type");
+ llvm_unreachable("DataLayout::getTypeSizeInBits(): Unsupported type");
}
}
@@ -508,17 +576,22 @@ uint64_t TargetData::getTypeSizeInBits(Type *Ty) const {
Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
== false) for the requested type \a Ty.
*/
-unsigned TargetData::getAlignment(Type *Ty, bool abi_or_pref) const {
+unsigned DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
int AlignType = -1;
assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
switch (Ty->getTypeID()) {
// Early escape for the non-numeric types.
case Type::LabelTyID:
- case Type::PointerTyID:
return (abi_or_pref
- ? getPointerABIAlignment()
- : getPointerPrefAlignment());
+ ? getPointerABIAlignment(0)
+ : getPointerPrefAlignment(0));
+ case Type::PointerTyID: {
+ unsigned AS = dyn_cast<PointerType>(Ty)->getAddressSpace();
+ return (abi_or_pref
+ ? getPointerABIAlignment(AS)
+ : getPointerPrefAlignment(AS));
+ }
case Type::ArrayTyID:
return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref);
@@ -558,18 +631,18 @@ unsigned TargetData::getAlignment(Type *Ty, bool abi_or_pref) const {
abi_or_pref, Ty);
}
-unsigned TargetData::getABITypeAlignment(Type *Ty) const {
+unsigned DataLayout::getABITypeAlignment(Type *Ty) const {
return getAlignment(Ty, true);
}
/// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for
/// an integer type of the specified bitwidth.
-unsigned TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const {
+unsigned DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const {
return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0);
}
-unsigned TargetData::getCallFrameTypeAlignment(Type *Ty) const {
+unsigned DataLayout::getCallFrameTypeAlignment(Type *Ty) const {
for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
if (Alignments[i].AlignType == STACK_ALIGN)
return Alignments[i].ABIAlign;
@@ -577,24 +650,37 @@ unsigned TargetData::getCallFrameTypeAlignment(Type *Ty) const {
return getABITypeAlignment(Ty);
}
-unsigned TargetData::getPrefTypeAlignment(Type *Ty) const {
+unsigned DataLayout::getPrefTypeAlignment(Type *Ty) const {
return getAlignment(Ty, false);
}
-unsigned TargetData::getPreferredTypeAlignmentShift(Type *Ty) const {
+unsigned DataLayout::getPreferredTypeAlignmentShift(Type *Ty) const {
unsigned Align = getPrefTypeAlignment(Ty);
assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
return Log2_32(Align);
}
-/// getIntPtrType - Return an unsigned integer type that is the same size or
-/// greater to the host pointer size.
-IntegerType *TargetData::getIntPtrType(LLVMContext &C) const {
- return IntegerType::get(C, getPointerSizeInBits());
+/// getIntPtrType - Return an integer type with size at least as big as that
+/// of a pointer in the given address space.
+IntegerType *DataLayout::getIntPtrType(LLVMContext &C,
+ unsigned AddressSpace) const {
+ return IntegerType::get(C, getPointerSizeInBits(AddressSpace));
}
+/// getIntPtrType - Return an integer (vector of integer) type with size at
+/// least as big as that of a pointer of the given pointer (vector of pointer)
+/// type.
+Type *DataLayout::getIntPtrType(Type *Ty) const {
+ assert(Ty->isPtrOrPtrVectorTy() &&
+ "Expected a pointer or pointer vector type.");
+ unsigned NumBits = getTypeSizeInBits(Ty->getScalarType());
+ IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VectorType::get(IntTy, VecTy->getNumElements());
+ return IntTy;
+}
-uint64_t TargetData::getIndexedOffset(Type *ptrTy,
+uint64_t DataLayout::getIndexedOffset(Type *ptrTy,
ArrayRef<Value *> Indices) const {
Type *Ty = ptrTy;
assert(Ty->isPointerTy() && "Illegal argument for getIndexedOffset()");
@@ -634,7 +720,7 @@ uint64_t TargetData::getIndexedOffset(Type *ptrTy,
/// getPreferredAlignment - Return the preferred alignment of the specified
/// global. This includes an explicitly requested alignment (if the global
/// has one).
-unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
+unsigned DataLayout::getPreferredAlignment(const GlobalVariable *GV) const {
Type *ElemType = GV->getType()->getElementType();
unsigned Alignment = getPrefTypeAlignment(ElemType);
unsigned GVAlignment = GV->getAlignment();
@@ -658,6 +744,6 @@ unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
/// getPreferredAlignmentLog - Return the preferred alignment of the
/// specified global, returned in log form. This includes an explicitly
/// requested alignment (if the global has one).
-unsigned TargetData::getPreferredAlignmentLog(const GlobalVariable *GV) const {
+unsigned DataLayout::getPreferredAlignmentLog(const GlobalVariable *GV) const {
return Log2_32(getPreferredAlignment(GV));
}
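Illustrative sketch, not part of the patch: what the per-address-space pointer handling enables. The layout string below is invented for the example, the string-based DataLayout constructor carried over from TargetData is assumed, and Ctx is an existing LLVMContext.

  // "p" without a number describes address space 0; "p3:..." describes AS 3.
  DataLayout DL("e-p:64:64:64-p3:32:32:32-i32:32:32");

  unsigned DefaultBits = DL.getPointerSizeInBits(0);   // 64
  unsigned LocalBits   = DL.getPointerSizeInBits(3);   // 32

  // getIntPtrType(Type*) picks the width matching the pointer's address
  // space, so a pointer in AS 3 maps to i32 here.
  Type *IntPtr =
      DL.getIntPtrType(PointerType::get(Type::getInt8Ty(Ctx), 3));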
diff --git a/lib/VMCore/DebugInfo.cpp b/lib/VMCore/DebugInfo.cpp
index c8f8f7d67b84..3029ce273434 100644
--- a/lib/VMCore/DebugInfo.cpp
+++ b/lib/VMCore/DebugInfo.cpp
@@ -111,6 +111,16 @@ Function *DIDescriptor::getFunctionField(unsigned Elt) const {
return 0;
}
+void DIDescriptor::replaceFunctionField(unsigned Elt, Function *F) {
+ if (DbgNode == 0)
+ return;
+
+ if (Elt < DbgNode->getNumOperands()) {
+ MDNode *Node = const_cast<MDNode*>(DbgNode);
+ Node->replaceOperandWith(Elt, F);
+ }
+}
+
unsigned DIVariable::getNumAddrElements() const {
if (getVersion() <= LLVMDebugVersion8)
return DbgNode->getNumOperands()-6;
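Illustrative sketch, not part of the patch: how the new replaceFunctionField helper might be used, for example to re-point a subprogram descriptor at a freshly materialized function. The operand index FunctionFieldIdx is a stand-in; the actual slot depends on the descriptor kind and is not taken from this hunk.

  DIDescriptor Desc(SomeMDNode);
  // Swap the descriptor's Function operand in place via replaceOperandWith.
  Desc.replaceFunctionField(FunctionFieldIdx, NewFn);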
diff --git a/lib/VMCore/Dominators.cpp b/lib/VMCore/Dominators.cpp
index 60bdeac16b36..77b2403d87dd 100644
--- a/lib/VMCore/Dominators.cpp
+++ b/lib/VMCore/Dominators.cpp
@@ -161,6 +161,11 @@ bool DominatorTree::dominates(const Instruction *Def,
bool DominatorTree::dominates(const BasicBlockEdge &BBE,
const BasicBlock *UseBB) const {
+ // Assert that we have a single edge. We could handle multiple edges by
+ // simply returning false, but since isSingleEdge is linear in the number
+ // of edges, callers can normally handle them more efficiently.
+ assert(BBE.isSingleEdge());
+
// If the BB the edge ends in doesn't dominate the use BB, then the
// edge also doesn't.
const BasicBlock *Start = BBE.getStart();
@@ -207,6 +212,11 @@ bool DominatorTree::dominates(const BasicBlockEdge &BBE,
bool DominatorTree::dominates(const BasicBlockEdge &BBE,
const Use &U) const {
+ // Assert that we have a single edge. We could handle multiple edges by
+ // simply returning false, but since isSingleEdge is linear in the number
+ // of edges, callers can normally handle them more efficiently.
+ assert(BBE.isSingleEdge());
+
Instruction *UserInst = cast<Instruction>(U.getUser());
// A PHI in the end of the edge is dominated by it.
PHINode *PN = dyn_cast<PHINode>(UserInst);
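Illustrative sketch, not part of the patch: the caller-side pattern the new asserts expect, i.e. filtering out multi-edges before asking about edge dominance. DT, BB, Succ, and U are assumed to be in scope.

  BasicBlockEdge Edge(BB, Succ);
  // Callers now reject critical multi-edges up front rather than relying on
  // dominates() to return false for them.
  if (Edge.isSingleEdge() && DT->dominates(Edge, U)) {
    // ... safe to use the value along this edge ...
  }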
diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp
index 2e0b3168c953..9c4f2d939952 100644
--- a/lib/VMCore/Function.cpp
+++ b/lib/VMCore/Function.cpp
@@ -78,7 +78,8 @@ unsigned Argument::getArgNo() const {
/// in its containing function.
bool Argument::hasByValAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->paramHasAttr(getArgNo()+1, Attribute::ByVal);
+ return getParent()->getParamAttributes(getArgNo()+1).
+ hasAttribute(Attributes::ByVal);
}
unsigned Argument::getParamAlignment() const {
@@ -91,21 +92,24 @@ unsigned Argument::getParamAlignment() const {
/// it in its containing function.
bool Argument::hasNestAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->paramHasAttr(getArgNo()+1, Attribute::Nest);
+ return getParent()->getParamAttributes(getArgNo()+1).
+ hasAttribute(Attributes::Nest);
}
/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
/// it in its containing function.
bool Argument::hasNoAliasAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->paramHasAttr(getArgNo()+1, Attribute::NoAlias);
+ return getParent()->getParamAttributes(getArgNo()+1).
+ hasAttribute(Attributes::NoAlias);
}
/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
/// on it in its containing function.
bool Argument::hasNoCaptureAttr() const {
if (!getType()->isPointerTy()) return false;
- return getParent()->paramHasAttr(getArgNo()+1, Attribute::NoCapture);
+ return getParent()->getParamAttributes(getArgNo()+1).
+ hasAttribute(Attributes::NoCapture);
}
/// hasStructRetAttr - Return true if this argument has the sret attribute on
@@ -114,7 +118,8 @@ bool Argument::hasStructRetAttr() const {
if (!getType()->isPointerTy()) return false;
if (this != getParent()->arg_begin())
return false; // StructRet param must be first param
- return getParent()->paramHasAttr(1, Attribute::StructRet);
+ return getParent()->getParamAttributes(1).
+ hasAttribute(Attributes::StructRet);
}
/// addAttr - Add a Attribute to an argument
@@ -180,7 +185,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage,
// Ensure intrinsics have the right parameter attributes.
if (unsigned IID = getIntrinsicID())
- setAttributes(Intrinsic::getAttributes(Intrinsic::ID(IID)));
+ setAttributes(Intrinsic::getAttributes(getContext(), Intrinsic::ID(IID)));
}
@@ -244,13 +249,13 @@ void Function::dropAllReferences() {
void Function::addAttribute(unsigned i, Attributes attr) {
AttrListPtr PAL = getAttributes();
- PAL = PAL.addAttr(i, attr);
+ PAL = PAL.addAttr(getContext(), i, attr);
setAttributes(PAL);
}
void Function::removeAttribute(unsigned i, Attributes attr) {
AttrListPtr PAL = getAttributes();
- PAL = PAL.removeAttr(i, attr);
+ PAL = PAL.removeAttr(getContext(), i, attr);
setAttributes(PAL);
}
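Illustrative sketch, not part of the patch: the query pattern implied by the move from paramHasAttr to the Attributes-based interface, assuming a Function *F and an argument number ArgNo are in scope (argument attribute indices remain 1-based).

  // Old style: F->paramHasAttr(ArgNo + 1, Attribute::ByVal)
  // New style: fetch the Attributes object for the slot, then test one bit.
  bool IsByVal = F->getParamAttributes(ArgNo + 1)
                     .hasAttribute(Attributes::ByVal);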
diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp
index 003a5d4e4eba..ea2f0a6d556f 100644
--- a/lib/VMCore/GCOV.cpp
+++ b/lib/VMCore/GCOV.cpp
@@ -28,19 +28,19 @@ GCOVFile::~GCOVFile() {
}
/// isGCDAFile - Return true if Format identifies a .gcda file.
-static bool isGCDAFile(GCOVFormat Format) {
- return Format == GCDA_402 || Format == GCDA_404;
+static bool isGCDAFile(GCOV::GCOVFormat Format) {
+ return Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404;
}
/// isGCNOFile - Return true if Format identifies a .gcno file.
-static bool isGCNOFile(GCOVFormat Format) {
- return Format == GCNO_402 || Format == GCNO_404;
+static bool isGCNOFile(GCOV::GCOVFormat Format) {
+ return Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404;
}
/// read - Read GCOV buffer.
bool GCOVFile::read(GCOVBuffer &Buffer) {
- GCOVFormat Format = Buffer.readGCOVFormat();
- if (Format == InvalidGCOV)
+ GCOV::GCOVFormat Format = Buffer.readGCOVFormat();
+ if (Format == GCOV::InvalidGCOV)
return false;
unsigned i = 0;
@@ -48,7 +48,7 @@ bool GCOVFile::read(GCOVBuffer &Buffer) {
GCOVFunction *GFun = NULL;
if (isGCDAFile(Format)) {
// Use existing function while reading .gcda file.
- assert (i < Functions.size() && ".gcda data does not match .gcno data");
+ assert(i < Functions.size() && ".gcda data does not match .gcno data");
GFun = Functions[i];
} else if (isGCNOFile(Format)){
GFun = new GCOVFunction();
@@ -87,21 +87,21 @@ GCOVFunction::~GCOVFunction() {
/// read - Read a function from the buffer. Return false if the buffer cursor
/// does not point to a function tag.
-bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
+bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
if (!Buff.readFunctionTag())
return false;
Buff.readInt(); // Function header length
Ident = Buff.readInt();
Buff.readInt(); // Checksum #1
- if (Format != GCNO_402)
+ if (Format != GCOV::GCNO_402)
Buff.readInt(); // Checksum #2
Name = Buff.readString();
- if (Format == GCNO_402 || Format == GCNO_404)
+ if (Format == GCOV::GCNO_402 || Format == GCOV::GCNO_404)
Filename = Buff.readString();
- if (Format == GCDA_402 || Format == GCDA_404) {
+ if (Format == GCOV::GCDA_402 || Format == GCOV::GCDA_404) {
Buff.readArcTag();
uint32_t Count = Buff.readInt() / 2;
for (unsigned i = 0, e = Count; i != e; ++i) {
@@ -113,7 +113,9 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
LineNumber = Buff.readInt();
// read blocks.
- assert (Buff.readBlockTag() && "Block Tag not found!");
+ bool BlockTagFound = Buff.readBlockTag();
+ (void)BlockTagFound;
+ assert(BlockTagFound && "Block Tag not found!");
uint32_t BlockCount = Buff.readInt();
for (int i = 0, e = BlockCount; i != e; ++i) {
Buff.readInt(); // Block flags;
@@ -124,7 +126,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
while (Buff.readEdgeTag()) {
uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
uint32_t BlockNo = Buff.readInt();
- assert (BlockNo < BlockCount && "Unexpected Block number!");
+ assert(BlockNo < BlockCount && "Unexpected Block number!");
for (int i = 0, e = EdgeCount; i != e; ++i) {
Blocks[BlockNo]->addEdge(Buff.readInt());
Buff.readInt(); // Edge flag
@@ -136,7 +138,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOVFormat Format) {
uint32_t LineTableLength = Buff.readInt();
uint32_t Size = Buff.getCursor() + LineTableLength*4;
uint32_t BlockNo = Buff.readInt();
- assert (BlockNo < BlockCount && "Unexpected Block number!");
+ assert(BlockNo < BlockCount && "Unexpected Block number!");
GCOVBlock *Block = Blocks[BlockNo];
Buff.readInt(); // flag
while (Buff.getCursor() != (Size - 4)) {
diff --git a/lib/VMCore/IRBuilder.cpp b/lib/VMCore/IRBuilder.cpp
index 5c4e6d964274..04f08fe28e00 100644
--- a/lib/VMCore/IRBuilder.cpp
+++ b/lib/VMCore/IRBuilder.cpp
@@ -80,7 +80,7 @@ CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
CallInst *IRBuilderBase::
CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
- bool isVolatile, MDNode *TBAATag) {
+ bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag) {
Dst = getCastedInt8PtrValue(Dst);
Src = getCastedInt8PtrValue(Src);
@@ -94,6 +94,10 @@ CreateMemCpy(Value *Dst, Value *Src, Value *Size, unsigned Align,
// Set the TBAA info if present.
if (TBAATag)
CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+ // Set the TBAA Struct info if present.
+ if (TBAAStructTag)
+ CI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag);
return CI;
}
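Illustrative sketch, not part of the patch: threading a tbaa.struct node through the extended CreateMemCpy. Builder, Dst, Src, and TBAAStructNode are assumed to exist.

  // The new trailing parameter attaches !tbaa.struct metadata to the memcpy,
  // describing the member layout of the copied aggregate.
  Builder.CreateMemCpy(Dst, Src, /*Size=*/Builder.getInt64(16), /*Align=*/8,
                       /*isVolatile=*/false, /*TBAATag=*/0,
                       /*TBAAStructTag=*/TBAAStructNode);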
diff --git a/lib/VMCore/InlineAsm.cpp b/lib/VMCore/InlineAsm.cpp
index 736e370a6de6..2e636aacfde8 100644
--- a/lib/VMCore/InlineAsm.cpp
+++ b/lib/VMCore/InlineAsm.cpp
@@ -27,19 +27,20 @@ InlineAsm::~InlineAsm() {
InlineAsm *InlineAsm::get(FunctionType *Ty, StringRef AsmString,
StringRef Constraints, bool hasSideEffects,
- bool isAlignStack) {
- InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack);
+ bool isAlignStack, AsmDialect asmDialect) {
+ InlineAsmKeyType Key(AsmString, Constraints, hasSideEffects, isAlignStack,
+ asmDialect);
LLVMContextImpl *pImpl = Ty->getContext().pImpl;
return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(Ty), Key);
}
InlineAsm::InlineAsm(PointerType *Ty, const std::string &asmString,
const std::string &constraints, bool hasSideEffects,
- bool isAlignStack)
+ bool isAlignStack, AsmDialect asmDialect)
: Value(Ty, Value::InlineAsmVal),
- AsmString(asmString),
- Constraints(constraints), HasSideEffects(hasSideEffects),
- IsAlignStack(isAlignStack) {
+ AsmString(asmString), Constraints(constraints),
+ HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack),
+ Dialect(asmDialect) {
// Do various checks on the constraint string and type.
assert(Verify(getFunctionType(), constraints) &&
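Illustrative sketch, not part of the patch: constructing inline asm with the new dialect flag. FTy is an existing FunctionType, and the Intel enumerator is assumed to be spelled InlineAsm::AD_Intel, which this hunk does not show.

  InlineAsm *IA = InlineAsm::get(FTy, "mov eax, $1", "=r,r",
                                 /*hasSideEffects=*/false,
                                 /*isAlignStack=*/false,
                                 InlineAsm::AD_Intel);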
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 9af98e8a9b3d..94bd2a15632d 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -332,21 +332,30 @@ CallInst::CallInst(const CallInst &CI)
void CallInst::addAttribute(unsigned i, Attributes attr) {
AttrListPtr PAL = getAttributes();
- PAL = PAL.addAttr(i, attr);
+ PAL = PAL.addAttr(getContext(), i, attr);
setAttributes(PAL);
}
void CallInst::removeAttribute(unsigned i, Attributes attr) {
AttrListPtr PAL = getAttributes();
- PAL = PAL.removeAttr(i, attr);
+ PAL = PAL.removeAttr(getContext(), i, attr);
setAttributes(PAL);
}
-bool CallInst::paramHasAttr(unsigned i, Attributes attr) const {
- if (AttributeList.paramHasAttr(i, attr))
+bool CallInst::hasFnAttr(Attributes::AttrVal A) const {
+ if (AttributeList.getParamAttributes(AttrListPtr::FunctionIndex)
+ .hasAttribute(A))
return true;
if (const Function *F = getCalledFunction())
- return F->paramHasAttr(i, attr);
+ return F->getParamAttributes(AttrListPtr::FunctionIndex).hasAttribute(A);
+ return false;
+}
+
+bool CallInst::paramHasAttr(unsigned i, Attributes::AttrVal A) const {
+ if (AttributeList.getParamAttributes(i).hasAttribute(A))
+ return true;
+ if (const Function *F = getCalledFunction())
+ return F->getParamAttributes(i).hasAttribute(A);
return false;
}
@@ -562,23 +571,32 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
return setSuccessor(idx, B);
}
-bool InvokeInst::paramHasAttr(unsigned i, Attributes attr) const {
- if (AttributeList.paramHasAttr(i, attr))
+bool InvokeInst::hasFnAttr(Attributes::AttrVal A) const {
+ if (AttributeList.getParamAttributes(AttrListPtr::FunctionIndex).
+ hasAttribute(A))
return true;
if (const Function *F = getCalledFunction())
- return F->paramHasAttr(i, attr);
+ return F->getParamAttributes(AttrListPtr::FunctionIndex).hasAttribute(A);
+ return false;
+}
+
+bool InvokeInst::paramHasAttr(unsigned i, Attributes::AttrVal A) const {
+ if (AttributeList.getParamAttributes(i).hasAttribute(A))
+ return true;
+ if (const Function *F = getCalledFunction())
+ return F->getParamAttributes(i).hasAttribute(A);
return false;
}
void InvokeInst::addAttribute(unsigned i, Attributes attr) {
AttrListPtr PAL = getAttributes();
- PAL = PAL.addAttr(i, attr);
+ PAL = PAL.addAttr(getContext(), i, attr);
setAttributes(PAL);
}
void InvokeInst::removeAttribute(unsigned i, Attributes attr) {
AttrListPtr PAL = getAttributes();
- PAL = PAL.removeAttr(i, attr);
+ PAL = PAL.removeAttr(getContext(), i, attr);
setAttributes(PAL);
}
@@ -1381,18 +1399,6 @@ Type *GetElementPtrInst::getIndexedType(Type *Ptr, ArrayRef<uint64_t> IdxList) {
return getIndexedTypeInternal(Ptr, IdxList);
}
-unsigned GetElementPtrInst::getAddressSpace(Value *Ptr) {
- Type *Ty = Ptr->getType();
-
- if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- Ty = VTy->getElementType();
-
- if (PointerType *PTy = dyn_cast<PointerType>(Ty))
- return PTy->getAddressSpace();
-
- llvm_unreachable("Invalid GEP pointer type");
-}
-
/// hasAllZeroIndices - Return true if all of the indices of this GEP are
/// zeros. If so, the result pointer and the first operand have the same
/// value, just potentially different types.
@@ -2112,7 +2118,8 @@ bool CastInst::isNoopCast(Type *IntPtrTy) const {
/// If no such cast is permited, the function returns 0.
unsigned CastInst::isEliminableCastPair(
Instruction::CastOps firstOp, Instruction::CastOps secondOp,
- Type *SrcTy, Type *MidTy, Type *DstTy, Type *IntPtrTy) {
+ Type *SrcTy, Type *MidTy, Type *DstTy, Type *SrcIntPtrTy, Type *MidIntPtrTy,
+ Type *DstIntPtrTy) {
// Define the 144 possibilities for these two cast instructions. The values
// in this matrix determine what to do in a given situation and select the
// case in the switch below. The rows correspond to firstOp, the columns
@@ -2215,9 +2222,9 @@ unsigned CastInst::isEliminableCastPair(
return 0;
case 7: {
// ptrtoint, inttoptr -> bitcast (ptr -> ptr) if int size is >= ptr size
- if (!IntPtrTy)
+ if (!SrcIntPtrTy || DstIntPtrTy != SrcIntPtrTy)
return 0;
- unsigned PtrSize = IntPtrTy->getScalarSizeInBits();
+ unsigned PtrSize = SrcIntPtrTy->getScalarSizeInBits();
unsigned MidSize = MidTy->getScalarSizeInBits();
if (MidSize >= PtrSize)
return Instruction::BitCast;
@@ -2256,9 +2263,9 @@ unsigned CastInst::isEliminableCastPair(
return 0;
case 13: {
// inttoptr, ptrtoint -> bitcast if SrcSize<=PtrSize and SrcSize==DstSize
- if (!IntPtrTy)
+ if (!MidIntPtrTy)
return 0;
- unsigned PtrSize = IntPtrTy->getScalarSizeInBits();
+ unsigned PtrSize = MidIntPtrTy->getScalarSizeInBits();
unsigned SrcSize = SrcTy->getScalarSizeInBits();
unsigned DstSize = DstTy->getScalarSizeInBits();
if (SrcSize <= PtrSize && SrcSize == DstSize)
@@ -2836,7 +2843,7 @@ BitCastInst::BitCastInst(
// CmpInst Classes
//===----------------------------------------------------------------------===//
-void CmpInst::Anchor() const {}
+void CmpInst::anchor() {}
CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate,
Value *LHS, Value *RHS, const Twine &Name,
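Illustrative sketch, not part of the patch: the call-site queries enabled by the split into hasFnAttr and the AttrVal-based paramHasAttr, assuming a CallInst *CI is in scope.

  // Function-level attribute (looked up at AttrListPtr::FunctionIndex).
  bool IsReadNone = CI->hasFnAttr(Attributes::ReadNone);

  // Per-parameter attribute; argument indices remain 1-based.
  bool FirstArgNoCapture = CI->paramHasAttr(1, Attributes::NoCapture);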
diff --git a/lib/VMCore/LLVMContext.cpp b/lib/VMCore/LLVMContext.cpp
index f07f0b393926..2446ec996d04 100644
--- a/lib/VMCore/LLVMContext.cpp
+++ b/lib/VMCore/LLVMContext.cpp
@@ -53,6 +53,11 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
unsigned RangeID = getMDKindID("range");
assert(RangeID == MD_range && "range kind id drifted");
(void)RangeID;
+
+ // Create the 'tbaa.struct' metadata kind.
+ unsigned TBAAStructID = getMDKindID("tbaa.struct");
+ assert(TBAAStructID == MD_tbaa_struct && "tbaa.struct kind id drifted");
+ (void)TBAAStructID;
}
LLVMContext::~LLVMContext() { delete pImpl; }
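Illustrative sketch, not part of the patch: the fixed-kind lookup that the new assert guards, assuming an LLVMContext named Ctx.

  // 'tbaa.struct' gets a well-known metadata kind ID, like 'tbaa' and 'range'.
  unsigned KindID = Ctx.getMDKindID("tbaa.struct");
  assert(KindID == LLVMContext::MD_tbaa_struct);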
diff --git a/lib/VMCore/LLVMContextImpl.cpp b/lib/VMCore/LLVMContextImpl.cpp
index 6279bb823dbf..d35d2844b89b 100644
--- a/lib/VMCore/LLVMContextImpl.cpp
+++ b/lib/VMCore/LLVMContextImpl.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "LLVMContextImpl.h"
+#include "llvm/Attributes.h"
#include "llvm/Module.h"
#include "llvm/ADT/STLExtras.h"
#include <algorithm>
@@ -93,7 +94,21 @@ LLVMContextImpl::~LLVMContextImpl() {
E = CDSConstants.end(); I != E; ++I)
delete I->second;
CDSConstants.clear();
-
+
+ // Destroy attributes.
+ for (FoldingSetIterator<AttributesImpl> I = AttrsSet.begin(),
+ E = AttrsSet.end(); I != E; ) {
+ FoldingSetIterator<AttributesImpl> Elem = I++;
+ delete &*Elem;
+ }
+
+ // Destroy attribute lists.
+ for (FoldingSetIterator<AttributeListImpl> I = AttrsLists.begin(),
+ E = AttrsLists.end(); I != E; ) {
+ FoldingSetIterator<AttributeListImpl> Elem = I++;
+ delete &*Elem;
+ }
+
// Destroy MDNodes. ~MDNode can move and remove nodes between the MDNodeSet
// and the NonUniquedMDNodes sets, so copy the values out first.
SmallVector<MDNode*, 8> MDNodes;
@@ -107,6 +122,7 @@ LLVMContextImpl::~LLVMContextImpl() {
(*I)->destroy();
assert(MDNodeSet.empty() && NonUniquedMDNodes.empty() &&
"Destroying all MDNodes didn't empty the Context's sets.");
+
// Destroy MDStrings.
DeleteContainerSeconds(MDStringCache);
}
diff --git a/lib/VMCore/LLVMContextImpl.h b/lib/VMCore/LLVMContextImpl.h
index 2252028b1569..90cf424a3c92 100644
--- a/lib/VMCore/LLVMContextImpl.h
+++ b/lib/VMCore/LLVMContextImpl.h
@@ -16,6 +16,7 @@
#define LLVM_LLVMCONTEXT_IMPL_H
#include "llvm/LLVMContext.h"
+#include "AttributesImpl.h"
#include "ConstantsContext.h"
#include "LeaksContext.h"
#include "llvm/Constants.h"
@@ -253,10 +254,14 @@ public:
typedef DenseMap<DenseMapAPFloatKeyInfo::KeyTy, ConstantFP*,
DenseMapAPFloatKeyInfo> FPMapTy;
FPMapTy FPConstants;
-
+
+ FoldingSet<AttributesImpl> AttrsSet;
+ FoldingSet<AttributeListImpl> AttrsLists;
+
StringMap<Value*> MDStringCache;
-
+
FoldingSet<MDNode> MDNodeSet;
+
// MDNodes may be uniqued or not uniqued. When they're not uniqued, they
// aren't in the MDNodeSet, but they're still shared between objects, so no
// one object can destroy them. This set allows us to at least destroy them
diff --git a/lib/VMCore/Makefile b/lib/VMCore/Makefile
index 2b9b0f258cfa..8b9865152e24 100644
--- a/lib/VMCore/Makefile
+++ b/lib/VMCore/Makefile
@@ -9,7 +9,6 @@
LEVEL = ../..
LIBRARYNAME = LLVMCore
BUILD_ARCHIVE = 1
-REQUIRES_RTTI = 1
BUILT_SOURCES = $(PROJ_OBJ_ROOT)/include/llvm/Intrinsics.gen
diff --git a/lib/VMCore/PassManager.cpp b/lib/VMCore/PassManager.cpp
index 4530c0495f1a..53f11499e4b9 100644
--- a/lib/VMCore/PassManager.cpp
+++ b/lib/VMCore/PassManager.cpp
@@ -1189,7 +1189,7 @@ void PMDataManager::dumpAnalysisUsage(StringRef Msg, const Pass *P,
assert(PassDebugging >= Details);
if (Set.empty())
return;
- dbgs() << (void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
+ dbgs() << (const void*)P << std::string(getDepth()*2+3, ' ') << Msg << " Analyses:";
for (unsigned i = 0; i != Set.size(); ++i) {
if (i) dbgs() << ',';
const PassInfo *PInf = PassRegistry::getPassRegistry()->getPassInfo(Set[i]);
diff --git a/lib/VMCore/TargetTransformInfo.cpp b/lib/VMCore/TargetTransformInfo.cpp
new file mode 100644
index 000000000000..e91c29c45699
--- /dev/null
+++ b/lib/VMCore/TargetTransformInfo.cpp
@@ -0,0 +1,31 @@
+//===- llvm/VMCore/TargetTransformInfo.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+/// Default ctor.
+///
+/// @note This has to exist, because this is a pass, but it should never be
+/// used.
+TargetTransformInfo::TargetTransformInfo() : ImmutablePass(ID) {
+ /// You are seeing this error because your pass required the TTI
+ /// using a call to "getAnalysis<TargetTransformInfo>()", and you did
+ /// not initialize a machine target which can provide the TTI.
+ /// You should use "getAnalysisIfAvailable<TargetTransformInfo>()" instead.
+ report_fatal_error("Bad TargetTransformInfo ctor used. "
+ "Tool did not specify a TargetTransformInfo to use?");
+}
+
+INITIALIZE_PASS(TargetTransformInfo, "targettransforminfo",
+ "Target Transform Info", false, true)
+char TargetTransformInfo::ID = 0;
+
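Illustrative sketch, not part of the patch: the usage pattern the constructor comment recommends for the new pass, written as it would appear inside a hypothetical pass's runOnFunction.

  // Prefer the "if available" query so a missing target does not hit the
  // report_fatal_error in the default TargetTransformInfo constructor.
  if (const TargetTransformInfo *TTI =
          getAnalysisIfAvailable<TargetTransformInfo>()) {
    // ... cost-model driven decisions ...
  }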
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index 5e9a00fc085d..1656ab2cab3a 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -47,35 +47,17 @@ Type *Type::getScalarType() {
return this;
}
+const Type *Type::getScalarType() const {
+ if (const VectorType *VTy = dyn_cast<VectorType>(this))
+ return VTy->getElementType();
+ return this;
+}
+
/// isIntegerTy - Return true if this is an IntegerType of the specified width.
bool Type::isIntegerTy(unsigned Bitwidth) const {
return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
}
-/// isIntOrIntVectorTy - Return true if this is an integer type or a vector of
-/// integer types.
-///
-bool Type::isIntOrIntVectorTy() const {
- if (isIntegerTy())
- return true;
- if (getTypeID() != Type::VectorTyID) return false;
-
- return cast<VectorType>(this)->getElementType()->isIntegerTy();
-}
-
-/// isFPOrFPVectorTy - Return true if this is a FP type or a vector of FP types.
-///
-bool Type::isFPOrFPVectorTy() const {
- if (getTypeID() == Type::HalfTyID || getTypeID() == Type::FloatTyID ||
- getTypeID() == Type::DoubleTyID ||
- getTypeID() == Type::FP128TyID || getTypeID() == Type::X86_FP80TyID ||
- getTypeID() == Type::PPC_FP128TyID)
- return true;
- if (getTypeID() != Type::VectorTyID) return false;
-
- return cast<VectorType>(this)->getElementType()->isFloatingPointTy();
-}
-
// canLosslesslyBitCastTo - Return true if this type can be converted to
// 'Ty' without any reinterpretation of bits. For example, i8* to i32*.
//
@@ -220,8 +202,6 @@ Type *Type::getStructElementType(unsigned N) const {
return cast<StructType>(this)->getElementType(N);
}
-
-
Type *Type::getSequentialElementType() const {
return cast<SequentialType>(this)->getElementType();
}
@@ -235,12 +215,10 @@ unsigned Type::getVectorNumElements() const {
}
unsigned Type::getPointerAddressSpace() const {
- return cast<PointerType>(this)->getAddressSpace();
+ return cast<PointerType>(getScalarType())->getAddressSpace();
}
-
-
//===----------------------------------------------------------------------===//
// Primitive 'Type' data
//===----------------------------------------------------------------------===//
@@ -400,12 +378,10 @@ FunctionType *FunctionType::get(Type *ReturnType,
return FT;
}
-
FunctionType *FunctionType::get(Type *Result, bool isVarArg) {
return get(Result, ArrayRef<Type *>(), isVarArg);
}
-
/// isValidReturnType - Return true if the specified type is valid as a return
/// type.
bool FunctionType::isValidReturnType(Type *RetTy) {
@@ -553,7 +529,6 @@ StructType *StructType::create(LLVMContext &Context) {
return create(Context, StringRef());
}
-
StructType *StructType::create(ArrayRef<Type*> Elements, StringRef Name,
bool isPacked) {
assert(!Elements.empty() &&
@@ -637,7 +612,6 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
return std::equal(element_begin(), element_end(), Other->element_begin());
}
-
/// getTypeByName - Return the type with the specified name, or null if there
/// is none by that name.
StructType *Module::getTypeByName(StringRef Name) const {
@@ -700,7 +674,6 @@ ArrayType::ArrayType(Type *ElType, uint64_t NumEl)
NumElements = NumEl;
}
-
ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) {
Type *ElementType = const_cast<Type*>(elementType);
assert(isValidElementType(ElementType) && "Invalid type for array element!");
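Illustrative sketch, not part of the patch: the behavioural change in getPointerAddressSpace, which now looks through a vector of pointers via getScalarType. Ctx is an existing LLVMContext.

  Type *PtrTy    = PointerType::get(Type::getInt8Ty(Ctx), /*AS=*/2);
  Type *VecPtrTy = VectorType::get(PtrTy, 4);

  // Previously the cast<PointerType> would assert on the vector type; now
  // both calls return the element pointer's address space (2).
  unsigned AS1 = PtrTy->getPointerAddressSpace();
  unsigned AS2 = VecPtrTy->getPointerAddressSpace();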
diff --git a/lib/VMCore/User.cpp b/lib/VMCore/User.cpp
index 5f35ce4b9a4f..e847ce6ee5cd 100644
--- a/lib/VMCore/User.cpp
+++ b/lib/VMCore/User.cpp
@@ -10,6 +10,7 @@
#include "llvm/Constant.h"
#include "llvm/GlobalValue.h"
#include "llvm/User.h"
+#include "llvm/Operator.h"
namespace llvm {
@@ -78,4 +79,12 @@ void User::operator delete(void *Usr) {
::operator delete(Storage);
}
+//===----------------------------------------------------------------------===//
+// Operator Class
+//===----------------------------------------------------------------------===//
+
+Operator::~Operator() {
+ llvm_unreachable("should never destroy an Operator");
+}
+
} // End llvm namespace
diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp
index d8711082ff07..8d0720dc1223 100644
--- a/lib/VMCore/Value.cpp
+++ b/lib/VMCore/Value.cpp
@@ -394,7 +394,7 @@ static bool isDereferenceablePointer(const Value *V,
// It's also not always safe to follow a bitcast, for example:
// bitcast i8* (alloca i8) to i32*
// would result in a 4-byte load from a 1-byte alloca. Some cases could
- // be handled using TargetData to check sizes and alignments though.
+ // be handled using DataLayout to check sizes and alignments though.
// These are obviously ok.
if (isa<AllocaInst>(V)) return true;
diff --git a/lib/VMCore/ValueTypes.cpp b/lib/VMCore/ValueTypes.cpp
index d1ca95317539..2ee9f0f4c99f 100644
--- a/lib/VMCore/ValueTypes.cpp
+++ b/lib/VMCore/ValueTypes.cpp
@@ -55,24 +55,32 @@ bool EVT::isExtendedVector() const {
return LLVMTy->isVectorTy();
}
+bool EVT::isExtended16BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 16;
+}
+
+bool EVT::isExtended32BitVector() const {
+ return isExtendedVector() && getExtendedSizeInBits() == 32;
+}
+
bool EVT::isExtended64BitVector() const {
- return isExtendedVector() && getSizeInBits() == 64;
+ return isExtendedVector() && getExtendedSizeInBits() == 64;
}
bool EVT::isExtended128BitVector() const {
- return isExtendedVector() && getSizeInBits() == 128;
+ return isExtendedVector() && getExtendedSizeInBits() == 128;
}
bool EVT::isExtended256BitVector() const {
- return isExtendedVector() && getSizeInBits() == 256;
+ return isExtendedVector() && getExtendedSizeInBits() == 256;
}
bool EVT::isExtended512BitVector() const {
- return isExtendedVector() && getSizeInBits() == 512;
+ return isExtendedVector() && getExtendedSizeInBits() == 512;
}
bool EVT::isExtended1024BitVector() const {
- return isExtendedVector() && getSizeInBits() == 1024;
+ return isExtendedVector() && getExtendedSizeInBits() == 1024;
}
EVT EVT::getExtendedVectorElementType() const {
@@ -120,15 +128,21 @@ std::string EVT::getEVTString() const {
case MVT::Other: return "ch";
case MVT::Glue: return "glue";
case MVT::x86mmx: return "x86mmx";
+ case MVT::v2i1: return "v2i1";
+ case MVT::v4i1: return "v4i1";
+ case MVT::v8i1: return "v8i1";
+ case MVT::v16i1: return "v16i1";
case MVT::v2i8: return "v2i8";
case MVT::v4i8: return "v4i8";
case MVT::v8i8: return "v8i8";
case MVT::v16i8: return "v16i8";
case MVT::v32i8: return "v32i8";
+ case MVT::v1i16: return "v1i16";
case MVT::v2i16: return "v2i16";
case MVT::v4i16: return "v4i16";
case MVT::v8i16: return "v8i16";
case MVT::v16i16: return "v16i16";
+ case MVT::v1i32: return "v1i32";
case MVT::v2i32: return "v2i32";
case MVT::v4i32: return "v4i32";
case MVT::v8i32: return "v8i32";
@@ -171,15 +185,21 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::f128: return Type::getFP128Ty(Context);
case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
case MVT::x86mmx: return Type::getX86_MMXTy(Context);
+ case MVT::v2i1: return VectorType::get(Type::getInt1Ty(Context), 2);
+ case MVT::v4i1: return VectorType::get(Type::getInt1Ty(Context), 4);
+ case MVT::v8i1: return VectorType::get(Type::getInt1Ty(Context), 8);
+ case MVT::v16i1: return VectorType::get(Type::getInt1Ty(Context), 16);
case MVT::v2i8: return VectorType::get(Type::getInt8Ty(Context), 2);
case MVT::v4i8: return VectorType::get(Type::getInt8Ty(Context), 4);
case MVT::v8i8: return VectorType::get(Type::getInt8Ty(Context), 8);
case MVT::v16i8: return VectorType::get(Type::getInt8Ty(Context), 16);
case MVT::v32i8: return VectorType::get(Type::getInt8Ty(Context), 32);
+ case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1);
case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2);
case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4);
case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8);
case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16);
+ case MVT::v1i32: return VectorType::get(Type::getInt32Ty(Context), 1);
case MVT::v2i32: return VectorType::get(Type::getInt32Ty(Context), 2);
case MVT::v4i32: return VectorType::get(Type::getInt32Ty(Context), 4);
case MVT::v8i32: return VectorType::get(Type::getInt32Ty(Context), 8);
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index 38914b3fe7ec..eb40b09d29f7 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -400,8 +400,8 @@ void Verifier::visitGlobalValue(GlobalValue &GV) {
"Only global arrays can have appending linkage!", GVar);
}
- Assert1(!GV.hasLinkerPrivateWeakDefAutoLinkage() || GV.hasDefaultVisibility(),
- "linker_private_weak_def_auto can only have default visibility!",
+ Assert1(!GV.hasLinkOnceODRAutoHideLinkage() || GV.hasDefaultVisibility(),
+ "linkonce_odr_auto_hide can only have default visibility!",
&GV);
}
@@ -526,40 +526,60 @@ void Verifier::visitMDNode(MDNode &MD, Function *F) {
// value of the specified type. The value V is printed in error messages.
void Verifier::VerifyParameterAttrs(Attributes Attrs, Type *Ty,
bool isReturnValue, const Value *V) {
- if (Attrs == Attribute::None)
+ if (!Attrs.hasAttributes())
return;
- Attributes FnCheckAttr = Attrs & Attribute::FunctionOnly;
- Assert1(!FnCheckAttr, "Attribute " + Attribute::getAsString(FnCheckAttr) +
- " only applies to the function!", V);
-
- if (isReturnValue) {
- Attributes RetI = Attrs & Attribute::ParameterOnly;
- Assert1(!RetI, "Attribute " + Attribute::getAsString(RetI) +
- " does not apply to return values!", V);
- }
-
- for (unsigned i = 0;
- i < array_lengthof(Attribute::MutuallyIncompatible); ++i) {
- Attributes MutI = Attrs & Attribute::MutuallyIncompatible[i];
- Assert1(MutI.isEmptyOrSingleton(), "Attributes " +
- Attribute::getAsString(MutI) + " are incompatible!", V);
- }
-
- Attributes TypeI = Attrs & Attribute::typeIncompatible(Ty);
- Assert1(!TypeI, "Wrong type for attribute " +
- Attribute::getAsString(TypeI), V);
-
- Attributes ByValI = Attrs & Attribute::ByVal;
- if (PointerType *PTy = dyn_cast<PointerType>(Ty)) {
- Assert1(!ByValI || PTy->getElementType()->isSized(),
- "Attribute " + Attribute::getAsString(ByValI) +
- " does not support unsized types!", V);
- } else {
- Assert1(!ByValI,
- "Attribute " + Attribute::getAsString(ByValI) +
- " only applies to parameters with pointer type!", V);
- }
+ Assert1(!Attrs.hasFunctionOnlyAttrs(),
+ "Some attributes in '" + Attrs.getAsString() +
+ "' only apply to functions!", V);
+
+ if (isReturnValue)
+ Assert1(!Attrs.hasParameterOnlyAttrs(),
+ "Attributes 'byval', 'nest', 'sret', and 'nocapture' "
+ "do not apply to return values!", V);
+
+ // Check for mutually incompatible attributes.
+ Assert1(!((Attrs.hasAttribute(Attributes::ByVal) &&
+ Attrs.hasAttribute(Attributes::Nest)) ||
+ (Attrs.hasAttribute(Attributes::ByVal) &&
+ Attrs.hasAttribute(Attributes::StructRet)) ||
+ (Attrs.hasAttribute(Attributes::Nest) &&
+ Attrs.hasAttribute(Attributes::StructRet))), "Attributes "
+ "'byval, nest, and sret' are incompatible!", V);
+
+ Assert1(!((Attrs.hasAttribute(Attributes::ByVal) &&
+ Attrs.hasAttribute(Attributes::Nest)) ||
+ (Attrs.hasAttribute(Attributes::ByVal) &&
+ Attrs.hasAttribute(Attributes::InReg)) ||
+ (Attrs.hasAttribute(Attributes::Nest) &&
+ Attrs.hasAttribute(Attributes::InReg))), "Attributes "
+ "'byval, nest, and inreg' are incompatible!", V);
+
+ Assert1(!(Attrs.hasAttribute(Attributes::ZExt) &&
+ Attrs.hasAttribute(Attributes::SExt)), "Attributes "
+ "'zeroext and signext' are incompatible!", V);
+
+ Assert1(!(Attrs.hasAttribute(Attributes::ReadNone) &&
+ Attrs.hasAttribute(Attributes::ReadOnly)), "Attributes "
+ "'readnone and readonly' are incompatible!", V);
+
+ Assert1(!(Attrs.hasAttribute(Attributes::NoInline) &&
+ Attrs.hasAttribute(Attributes::AlwaysInline)), "Attributes "
+ "'noinline and alwaysinline' are incompatible!", V);
+
+ Assert1(!AttrBuilder(Attrs).
+ hasAttributes(Attributes::typeIncompatible(Ty)),
+ "Wrong types for attribute: " +
+ Attributes::typeIncompatible(Ty).getAsString(), V);
+
+ if (PointerType *PTy = dyn_cast<PointerType>(Ty))
+ Assert1(!Attrs.hasAttribute(Attributes::ByVal) ||
+ PTy->getElementType()->isSized(),
+ "Attribute 'byval' does not support unsized types!", V);
+ else
+ Assert1(!Attrs.hasAttribute(Attributes::ByVal),
+ "Attribute 'byval' only applies to parameters with pointer type!",
+ V);
}
// VerifyFunctionAttrs - Check parameter attributes against a function type.
@@ -585,26 +605,50 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT,
VerifyParameterAttrs(Attr.Attrs, Ty, Attr.Index == 0, V);
- if (Attr.Attrs & Attribute::Nest) {
+ if (Attr.Attrs.hasAttribute(Attributes::Nest)) {
Assert1(!SawNest, "More than one parameter has attribute nest!", V);
SawNest = true;
}
- if (Attr.Attrs & Attribute::StructRet)
+ if (Attr.Attrs.hasAttribute(Attributes::StructRet))
Assert1(Attr.Index == 1, "Attribute sret not on first parameter!", V);
}
Attributes FAttrs = Attrs.getFnAttributes();
- Attributes NotFn = FAttrs & (~Attribute::FunctionOnly);
- Assert1(!NotFn, "Attribute " + Attribute::getAsString(NotFn) +
- " does not apply to the function!", V);
-
- for (unsigned i = 0;
- i < array_lengthof(Attribute::MutuallyIncompatible); ++i) {
- Attributes MutI = FAttrs & Attribute::MutuallyIncompatible[i];
- Assert1(MutI.isEmptyOrSingleton(), "Attributes " +
- Attribute::getAsString(MutI) + " are incompatible!", V);
- }
+ AttrBuilder NotFn(FAttrs);
+ NotFn.removeFunctionOnlyAttrs();
+ Assert1(!NotFn.hasAttributes(), "Attributes '" +
+ Attributes::get(V->getContext(), NotFn).getAsString() +
+ "' do not apply to the function!", V);
+
+ // Check for mutually incompatible attributes.
+ Assert1(!((FAttrs.hasAttribute(Attributes::ByVal) &&
+ FAttrs.hasAttribute(Attributes::Nest)) ||
+ (FAttrs.hasAttribute(Attributes::ByVal) &&
+ FAttrs.hasAttribute(Attributes::StructRet)) ||
+ (FAttrs.hasAttribute(Attributes::Nest) &&
+ FAttrs.hasAttribute(Attributes::StructRet))), "Attributes "
+ "'byval, nest, and sret' are incompatible!", V);
+
+ Assert1(!((FAttrs.hasAttribute(Attributes::ByVal) &&
+ FAttrs.hasAttribute(Attributes::Nest)) ||
+ (FAttrs.hasAttribute(Attributes::ByVal) &&
+ FAttrs.hasAttribute(Attributes::InReg)) ||
+ (FAttrs.hasAttribute(Attributes::Nest) &&
+ FAttrs.hasAttribute(Attributes::InReg))), "Attributes "
+ "'byval, nest, and inreg' are incompatible!", V);
+
+ Assert1(!(FAttrs.hasAttribute(Attributes::ZExt) &&
+ FAttrs.hasAttribute(Attributes::SExt)), "Attributes "
+ "'zeroext and signext' are incompatible!", V);
+
+ Assert1(!(FAttrs.hasAttribute(Attributes::ReadNone) &&
+ FAttrs.hasAttribute(Attributes::ReadOnly)), "Attributes "
+ "'readnone and readonly' are incompatible!", V);
+
+ Assert1(!(FAttrs.hasAttribute(Attributes::NoInline) &&
+ FAttrs.hasAttribute(Attributes::AlwaysInline)), "Attributes "
+ "'noinline and alwaysinline' are incompatible!", V);
}
static bool VerifyAttributeCount(const AttrListPtr &Attrs, unsigned Params) {
@@ -661,6 +705,7 @@ void Verifier::visitFunction(Function &F) {
case CallingConv::Cold:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
+ case CallingConv::Intel_OCL_BI:
case CallingConv::PTX_Kernel:
case CallingConv::PTX_Device:
Assert1(!F.isVarArg(),
@@ -1170,9 +1215,8 @@ void Verifier::VerifyCallSite(CallSite CS) {
VerifyParameterAttrs(Attr, CS.getArgument(Idx-1)->getType(), false, I);
- Attributes VArgI = Attr & Attribute::VarArgsIncompatible;
- Assert1(!VArgI, "Attribute " + Attribute::getAsString(VArgI) +
- " cannot be used for vararg call arguments!", I);
+ Assert1(!Attr.hasIncompatibleWithVarArgsAttrs(),
+ "Attribute 'sret' cannot be used for vararg call arguments!", I);
}
// Verify that there's no metadata unless it's a direct call to an intrinsic.
@@ -1378,6 +1422,15 @@ void Verifier::visitLoadInst(LoadInst &LI) {
"Load cannot have Release ordering", &LI);
Assert1(LI.getAlignment() != 0,
"Atomic load must specify explicit alignment", &LI);
+ if (!ElTy->isPointerTy()) {
+ Assert2(ElTy->isIntegerTy(),
+ "atomic store operand must have integer type!",
+ &LI, ElTy);
+ unsigned Size = ElTy->getPrimitiveSizeInBits();
+ Assert2(Size >= 8 && !(Size & (Size - 1)),
+ "atomic store operand must be power-of-two byte-sized integer",
+ &LI, ElTy);
+ }
} else {
Assert1(LI.getSynchScope() == CrossThread,
"Non-atomic load cannot have SynchronizationScope specified", &LI);
@@ -1444,6 +1497,15 @@ void Verifier::visitStoreInst(StoreInst &SI) {
"Store cannot have Acquire ordering", &SI);
Assert1(SI.getAlignment() != 0,
"Atomic store must specify explicit alignment", &SI);
+ if (!ElTy->isPointerTy()) {
+ Assert2(ElTy->isIntegerTy(),
+ "atomic store operand must have integer type!",
+ &SI, ElTy);
+ unsigned Size = ElTy->getPrimitiveSizeInBits();
+ Assert2(Size >= 8 && !(Size & (Size - 1)),
+ "atomic store operand must be power-of-two byte-sized integer",
+ &SI, ElTy);
+ }
} else {
Assert1(SI.getSynchScope() == CrossThread,
"Non-atomic store cannot have SynchronizationScope specified", &SI);
@@ -1471,6 +1533,13 @@ void Verifier::visitAtomicCmpXchgInst(AtomicCmpXchgInst &CXI) {
PointerType *PTy = dyn_cast<PointerType>(CXI.getOperand(0)->getType());
Assert1(PTy, "First cmpxchg operand must be a pointer.", &CXI);
Type *ElTy = PTy->getElementType();
+ Assert2(ElTy->isIntegerTy(),
+ "cmpxchg operand must have integer type!",
+ &CXI, ElTy);
+ unsigned Size = ElTy->getPrimitiveSizeInBits();
+ Assert2(Size >= 8 && !(Size & (Size - 1)),
+ "cmpxchg operand must be power-of-two byte-sized integer",
+ &CXI, ElTy);
Assert2(ElTy == CXI.getOperand(1)->getType(),
"Expected value type does not match pointer operand type!",
&CXI, ElTy);
@@ -1488,6 +1557,13 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
PointerType *PTy = dyn_cast<PointerType>(RMWI.getOperand(0)->getType());
Assert1(PTy, "First atomicrmw operand must be a pointer.", &RMWI);
Type *ElTy = PTy->getElementType();
+ Assert2(ElTy->isIntegerTy(),
+ "atomicrmw operand must have integer type!",
+ &RMWI, ElTy);
+ unsigned Size = ElTy->getPrimitiveSizeInBits();
+ Assert2(Size >= 8 && !(Size & (Size - 1)),
+ "atomicrmw operand must be power-of-two byte-sized integer",
+ &RMWI, ElTy);
Assert2(ElTy == RMWI.getOperand(1)->getType(),
"Argument value type does not match pointer operand type!",
&RMWI, ElTy);
@@ -1575,6 +1651,13 @@ void Verifier::visitLandingPadInst(LandingPadInst &LPI) {
void Verifier::verifyDominatesUse(Instruction &I, unsigned i) {
Instruction *Op = cast<Instruction>(I.getOperand(i));
+ // If we have an invalid invoke, don't try to compute the dominance.
+ // We already reject it in the invoke specific checks and the dominance
+ // computation doesn't handle multiple edges.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Op)) {
+ if (II->getNormalDest() == II->getUnwindDest())
+ return;
+ }
const Use &U = I.getOperandUse(i);
Assert2(InstsInThisBlock.count(Op) || DT->dominates(Op, U),
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index dac637335bd1..36751cd31dac 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -10,3 +10,10 @@ foreach(entry ${entries})
endif()
endif()
endforeach(entry)
+
+# Also add the compiler-rt tree if it is present and we have a sufficiently
+# recent version of CMake.
+if(${CMAKE_VERSION} VERSION_GREATER 2.8.7 AND
+ ${LLVM_BUILD_RUNTIME})
+ add_llvm_external_project(compiler-rt)
+endif()
diff --git a/projects/sample/Makefile.llvm.rules b/projects/sample/Makefile.llvm.rules
index a6553020f86f..7ed1c1b4ed6b 100644
--- a/projects/sample/Makefile.llvm.rules
+++ b/projects/sample/Makefile.llvm.rules
@@ -1437,7 +1437,7 @@ install-local::
uninstall-local::
$(Echo) Uninstall circumvented with NO_INSTALL
else
-DestTool = $(DESTDIR)$(PROJ_bindir)/$(TOOLEXENAME)
+DestTool = $(DESTDIR)$(PROJ_bindir)/$(program_prefix)$(TOOLEXENAME)
install-local:: $(DestTool)
@@ -1451,7 +1451,7 @@ uninstall-local::
# TOOLALIAS install.
ifdef TOOLALIAS
-DestToolAlias = $(DESTDIR)$(PROJ_bindir)/$(TOOLALIAS)$(EXEEXT)
+DestToolAlias = $(DESTDIR)$(PROJ_bindir)/$(program_prefix)$(TOOLALIAS)$(EXEEXT)
install-local:: $(DestToolAlias)
diff --git a/projects/sample/autoconf/configure.ac b/projects/sample/autoconf/configure.ac
index bd0b16a4a698..8012c23412db 100644
--- a/projects/sample/autoconf/configure.ac
+++ b/projects/sample/autoconf/configure.ac
@@ -304,8 +304,8 @@ AC_CACHE_CHECK([target architecture],[llvm_cv_target_arch],
sparc*-*) llvm_cv_target_arch="Sparc" ;;
powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
arm*-*) llvm_cv_target_arch="ARM" ;;
- mips-*) llvm_cv_target_arch="Mips" ;;
- mipsel-*) llvm_cv_target_arch="Mips" ;;
+ mips-* | mips64-*) llvm_cv_target_arch="Mips" ;;
+ mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;;
xcore-*) llvm_cv_target_arch="XCore" ;;
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
diff --git a/projects/sample/configure b/projects/sample/configure
index df08c7c4e048..cfbb6c69224a 100755
--- a/projects/sample/configure
+++ b/projects/sample/configure
@@ -3840,8 +3840,8 @@ else
sparc*-*) llvm_cv_target_arch="Sparc" ;;
powerpc*-*) llvm_cv_target_arch="PowerPC" ;;
arm*-*) llvm_cv_target_arch="ARM" ;;
- mips-*) llvm_cv_target_arch="Mips" ;;
- mipsel-*) llvm_cv_target_arch="Mips" ;;
+ mips-* | mips64-*) llvm_cv_target_arch="Mips" ;;
+ mipsel-* | mips64el-*) llvm_cv_target_arch="Mips" ;;
xcore-*) llvm_cv_target_arch="XCore" ;;
msp430-*) llvm_cv_target_arch="MSP430" ;;
hexagon-*) llvm_cv_target_arch="Hexagon" ;;
diff --git a/runtime/libprofile/CMakeLists.txt b/runtime/libprofile/CMakeLists.txt
index 414ad00b4a80..8609715b33f0 100644
--- a/runtime/libprofile/CMakeLists.txt
+++ b/runtime/libprofile/CMakeLists.txt
@@ -13,7 +13,8 @@ set_target_properties( profile_rt-static
PROPERTIES
OUTPUT_NAME "profile_rt" )
-add_llvm_loadable_module( profile_rt-shared ${SOURCES} )
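+# BUILD_SHARED_LIBS makes the add_llvm_library call below build profile_rt as a
+# shared library rather than a static archive.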
+set(BUILD_SHARED_LIBS ON)
+add_llvm_library( profile_rt-shared ${SOURCES} )
set_target_properties( profile_rt-shared
PROPERTIES
OUTPUT_NAME "profile_rt" )
diff --git a/runtime/libprofile/CommonProfiling.c b/runtime/libprofile/CommonProfiling.c
index acc17ce11e01..8f4119c2c67d 100644
--- a/runtime/libprofile/CommonProfiling.c
+++ b/runtime/libprofile/CommonProfiling.c
@@ -28,14 +28,35 @@
static char *SavedArgs = 0;
static unsigned SavedArgsLength = 0;
+static const char *SavedEnvVar = 0;
static const char *OutputFilename = "llvmprof.out";
+/* check_environment_variable - Check to see if the LLVMPROF_OUTPUT environment
+ * variable is set. If it is then save it and set OutputFilename.
+ */
+static void check_environment_variable(void) {
+ const char *EnvVar;
+ if (SavedEnvVar) return; /* Guarantee that we can't leak memory. */
+
+ if ((EnvVar = getenv("LLVMPROF_OUTPUT")) != NULL) {
+ /* The string that getenv returns is allowed to be statically allocated,
+ * which means it may be changed by future calls to getenv, so copy it.
+ */
+ SavedEnvVar = strdup(EnvVar);
+ OutputFilename = SavedEnvVar;
+ }
+}
+
/* save_arguments - Save argc and argv as passed into the program for the file
* we output.
+ * If either the LLVMPROF_OUTPUT environment variable or the -llvmprof-output
+ * command line argument is set, then change OutputFilename to the provided
+ * value. The command line argument value overrides the environment variable.
*/
int save_arguments(int argc, const char **argv) {
unsigned Length, i;
+ if (!SavedEnvVar && !SavedArgs) check_environment_variable();
if (SavedArgs || !argv) return argc; /* This can be called multiple times */
/* Check to see if there are any arguments passed into the program for the
@@ -54,6 +75,7 @@ int save_arguments(int argc, const char **argv) {
puts("-llvmprof-output requires a filename argument!");
else {
OutputFilename = strdup(argv[1]);
+ if (SavedEnvVar) { free((void *)SavedEnvVar); SavedEnvVar = 0; }
memmove((char**)&argv[1], &argv[2], (argc-1)*sizeof(char*));
--argc;
}
diff --git a/runtime/libprofile/Makefile b/runtime/libprofile/Makefile
index d8511495ce6e..6e9225382a9e 100644
--- a/runtime/libprofile/Makefile
+++ b/runtime/libprofile/Makefile
@@ -44,8 +44,15 @@ ifeq ($(HOST_OS),Darwin)
# command line.
DARWIN_VERS := $(shell echo $(TARGET_TRIPLE) | sed 's/.*darwin\([0-9]*\).*/\1/')
ifneq ($(DARWIN_VERS),8)
- LLVMLibsOptions := $(LLVMLibsOptions) \
+ LLVMLibsOptions := $(LLVMLibsOptions) \
-Wl,-install_name \
-Wl,"@executable_path/../lib/lib$(LIBRARYNAME)$(SHLIBEXT)"
endif
+
+ # If we're doing an Apple-style build, add the LTO object path.
+ ifeq ($(RC_BUILDIT),YES)
+ TempFile := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/profile_rt-lto.XXXXXX)
+ LLVMLibsOptions := $(LLVMLibsOptions) \
+ -Wl,-object_path_lto -Wl,$(TempFile)
+ endif
endif
diff --git a/runtime/libprofile/Profiling.h b/runtime/libprofile/Profiling.h
index c6b9a4d71c02..acc6399a18f9 100644
--- a/runtime/libprofile/Profiling.h
+++ b/runtime/libprofile/Profiling.h
@@ -15,7 +15,7 @@
#ifndef PROFILING_H
#define PROFILING_H
-#include "llvm/Analysis/ProfileInfoTypes.h" /* for enum ProfilingType */
+#include "llvm/Analysis/ProfileDataTypes.h" /* for enum ProfilingType */
/* save_arguments - Save argc and argv as passed into the program for the file
* we output.
diff --git a/test/Analysis/BasicAA/noalias-geps.ll b/test/Analysis/BasicAA/noalias-geps.ll
new file mode 100644
index 000000000000..a93d778da074
--- /dev/null
+++ b/test/Analysis/BasicAA/noalias-geps.ll
@@ -0,0 +1,54 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+; Check that geps with equal base offsets of noalias base pointers stay noalias.
+define i32 @test(i32* %p, i16 %i) {
+ %pi = getelementptr i32* %p, i32 0
+ %pi.next = getelementptr i32* %p, i32 1
+ %b = icmp eq i16 %i, 0
+ br i1 %b, label %bb1, label %bb2
+
+bb1:
+ %f = getelementptr i32* %pi, i32 1
+ %g = getelementptr i32* %pi.next, i32 1
+ br label %bb3
+bb2:
+ %f2 = getelementptr i32* %pi, i32 1
+ %g2 = getelementptr i32* %pi.next, i32 1
+ br label %bb3
+
+bb3:
+ %ptr_phi = phi i32* [ %f, %bb1 ], [ %f2, %bb2 ]
+ %ptr_phi2 = phi i32* [ %g, %bb1 ], [ %g2, %bb2 ]
+; CHECK: NoAlias: i32* %f1, i32* %g1
+ %f1 = getelementptr i32* %ptr_phi , i32 1
+ %g1 = getelementptr i32* %ptr_phi2 , i32 1
+
+ret i32 0
+}
+
+; Check that geps with equal indices of noalias base pointers stay noalias.
+define i32 @test2([2 x i32]* %p, i32 %i) {
+ %pi = getelementptr [2 x i32]* %p, i32 0
+ %pi.next = getelementptr [2 x i32]* %p, i32 1
+ %b = icmp eq i32 %i, 0
+ br i1 %b, label %bb1, label %bb2
+
+bb1:
+ %f = getelementptr [2 x i32]* %pi, i32 1
+ %g = getelementptr [2 x i32]* %pi.next, i32 1
+ br label %bb3
+bb2:
+ %f2 = getelementptr [2 x i32]* %pi, i32 1
+ %g2 = getelementptr [2 x i32]* %pi.next, i32 1
+ br label %bb3
+bb3:
+ %ptr_phi = phi [2 x i32]* [ %f, %bb1 ], [ %f2, %bb2 ]
+ %ptr_phi2 = phi [2 x i32]* [ %g, %bb1 ], [ %g2, %bb2 ]
+; CHECK: NoAlias: i32* %f1, i32* %g1
+ %f1 = getelementptr [2 x i32]* %ptr_phi , i32 1, i32 %i
+ %g1 = getelementptr [2 x i32]* %ptr_phi2 , i32 1, i32 %i
+
+ret i32 0
+}
diff --git a/test/Analysis/BasicAA/nocapture.ll b/test/Analysis/BasicAA/nocapture.ll
index a8658ec801ac..ffc0a09a078d 100644
--- a/test/Analysis/BasicAA/nocapture.ll
+++ b/test/Analysis/BasicAA/nocapture.ll
@@ -13,3 +13,24 @@ define i32 @test2() {
ret i32 %c
}
+declare void @test3(i32** %p, i32* %q) nounwind
+
+define i32 @test4(i32* noalias nocapture %p) nounwind {
+; CHECK: call void @test3
+; CHECK: store i32 0, i32* %p
+; CHECK: store i32 1, i32* %x
+; CHECK: %y = load i32* %p
+; CHECK: ret i32 %y
+entry:
+ %q = alloca i32*
+ ; Here test3 might store %p to %q. This doesn't violate %p's nocapture
+ ; attribute since the copy doesn't outlive the function.
+ call void @test3(i32** %q, i32* %p) nounwind
+ store i32 0, i32* %p
+ %x = load i32** %q
+ ; This store might write to %p, so we can't eliminate the subsequent
+ ; load.
+ store i32 1, i32* %x
+ %y = load i32* %p
+ ret i32 %y
+}
diff --git a/test/Analysis/BasicAA/phi-speculation.ll b/test/Analysis/BasicAA/phi-speculation.ll
new file mode 100644
index 000000000000..21c65929862f
--- /dev/null
+++ b/test/Analysis/BasicAA/phi-speculation.ll
@@ -0,0 +1,33 @@
+target datalayout =
+"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+; ptr_phi and ptr2_phi do not alias.
+; CHECK: NoAlias: i32* %ptr2_phi, i32* %ptr_phi
+
+define i32 @test_noalias(i32* %ptr2, i32 %count, i32* %coeff) {
+entry:
+ %ptr = getelementptr inbounds i32* %ptr2, i64 1
+ br label %while.body
+
+while.body:
+ %num = phi i32 [ %count, %entry ], [ %dec, %while.body ]
+ %ptr_phi = phi i32* [ %ptr, %entry ], [ %ptr_inc, %while.body ]
+ %ptr2_phi = phi i32* [ %ptr2, %entry ], [ %ptr2_inc, %while.body ]
+ %result.09 = phi i32 [ 0 , %entry ], [ %add, %while.body ]
+ %dec = add nsw i32 %num, -1
+ %0 = load i32* %ptr_phi, align 4
+ store i32 %0, i32* %ptr2_phi, align 4
+ %1 = load i32* %coeff, align 4
+ %2 = load i32* %ptr_phi, align 4
+ %mul = mul nsw i32 %1, %2
+ %add = add nsw i32 %mul, %result.09
+ %tobool = icmp eq i32 %dec, 0
+ %ptr_inc = getelementptr inbounds i32* %ptr_phi, i64 1
+ %ptr2_inc = getelementptr inbounds i32* %ptr2_phi, i64 1
+ br i1 %tobool, label %the_exit, label %while.body
+
+the_exit:
+ ret i32 %add
+}
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 74d06a18f7b9..08adfa8a36fb 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -88,3 +88,30 @@ exit:
}
!1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
+
+define i32 @test4(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Printing analysis {{.*}} for function 'test4'
+entry:
+ %conv = sext i32 %x to i64
+ switch i64 %conv, label %return [
+ i64 0, label %sw.bb
+ i64 1, label %sw.bb
+ i64 2, label %sw.bb
+ i64 5, label %sw.bb1
+ ], !prof !2
+; CHECK: edge entry -> return probability is 7 / 85
+; CHECK: edge entry -> sw.bb probability is 14 / 85
+; CHECK: edge entry -> sw.bb1 probability is 64 / 85
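+; The !2 weights map, in order, to the default edge (7) and the four case edges
+; (6, 4 and 4 into sw.bb, 64 into sw.bb1), which gives the 7/85, 14/85 and
+; 64/85 probabilities above.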
+
+sw.bb:
+ br label %return
+
+sw.bb1:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+!2 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/Analysis/CallGraph/do-nothing-intrinsic.ll b/test/Analysis/CallGraph/do-nothing-intrinsic.ll
new file mode 100644
index 000000000000..f28ad10f57c8
--- /dev/null
+++ b/test/Analysis/CallGraph/do-nothing-intrinsic.ll
@@ -0,0 +1,13 @@
+; RUN: opt < %s -basiccg
+; PR13903
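+; No FileCheck here: the test only verifies that building the call graph does
+; not crash on an invoke of the llvm.donothing intrinsic.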
+
+define void @main() {
+ invoke void @llvm.donothing()
+ to label %ret unwind label %unw
+unw:
+ %tmp = landingpad i8 personality i8 0 cleanup
+ br label %ret
+ret:
+ ret void
+}
+declare void @llvm.donothing() nounwind readnone
diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll
new file mode 100644
index 000000000000..37cca8d54067
--- /dev/null
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @add(i32 %arg) {
+ ;CHECK: cost of 1 {{.*}} add
+ %A = add <4 x i32> undef, undef
+ ;CHECK: cost of 4 {{.*}} add
+ %B = add <8 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} add
+ %C = add <2 x i64> undef, undef
+ ;CHECK: cost of 4 {{.*}} add
+ %D = add <4 x i64> undef, undef
+ ;CHECK: cost of 8 {{.*}} add
+ %E = add <8 x i64> undef, undef
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+
+define i32 @xor(i32 %arg) {
+ ;CHECK: cost of 1 {{.*}} xor
+ %A = xor <4 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} xor
+ %B = xor <8 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} xor
+ %C = xor <2 x i64> undef, undef
+ ;CHECK: cost of 1 {{.*}} xor
+ %D = xor <4 x i64> undef, undef
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+
+define i32 @fmul(i32 %arg) {
+ ;CHECK: cost of 1 {{.*}} fmul
+ %A = fmul <4 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fmul
+ %B = fmul <8 x float> undef, undef
+ ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
new file mode 100644
index 000000000000..75c97a781e7f
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @add(i32 %arg) {
+
+ ; -- Same size registers --
+ ;CHECK: cost of 1 {{.*}} zext
+ %A = zext <4 x i1> undef to <4 x i32>
+ ;CHECK: cost of 2 {{.*}} sext
+ %B = sext <4 x i1> undef to <4 x i32>
+ ;CHECK: cost of 0 {{.*}} trunc
+ %C = trunc <4 x i32> undef to <4 x i1>
+
+ ; -- Different size registers --
+ ;CHECK-NOT: cost of 1 {{.*}} zext
+ %D = zext <8 x i1> undef to <8 x i32>
+ ;CHECK-NOT: cost of 2 {{.*}} sext
+ %E = sext <8 x i1> undef to <8 x i32>
+ ;CHECK-NOT: cost of 2 {{.*}} trunc
+ %F = trunc <8 x i32> undef to <8 x i1>
+
+ ; -- scalars --
+
+ ;CHECK: cost of 1 {{.*}} zext
+ %G = zext i1 undef to i32
+ ;CHECK: cost of 0 {{.*}} trunc
+ %H = trunc i32 undef to i1
+
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+define i32 @zext_sext(<8 x i1> %in) {
+ ;CHECK: cost of 6 {{.*}} zext
+ %Z = zext <8 x i1> %in to <8 x i32>
+ ;CHECK: cost of 9 {{.*}} sext
+ %S = sext <8 x i1> %in to <8 x i32>
+
+ ;CHECK: cost of 1 {{.*}} sext
+ %A = sext <8 x i16> undef to <8 x i32>
+ ;CHECK: cost of 1 {{.*}} zext
+ %B = zext <8 x i16> undef to <8 x i32>
+ ;CHECK: cost of 1 {{.*}} sext
+ %C = sext <4 x i32> undef to <4 x i64>
+
+ ;CHECK: cost of 1 {{.*}} zext
+ %D = zext <4 x i32> undef to <4 x i64>
+ ;CHECK: cost of 1 {{.*}} trunc
+
+ %E = trunc <4 x i64> undef to <4 x i32>
+ ;CHECK: cost of 1 {{.*}} trunc
+ %F = trunc <8 x i32> undef to <8 x i16>
+
+ ;CHECK: cost of 3 {{.*}} trunc
+ %G = trunc <8 x i64> undef to <8 x i32>
+
+ ret i32 undef
+}
+
+define i32 @masks(<8 x i1> %in) {
+ ;CHECK: cost of 6 {{.*}} zext
+ %Z = zext <8 x i1> %in to <8 x i32>
+ ;CHECK: cost of 9 {{.*}} sext
+ %S = sext <8 x i1> %in to <8 x i32>
+ ret i32 undef
+}
+
diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll
new file mode 100644
index 000000000000..f868bd18b54f
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cmp.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @cmp(i32 %arg) {
+ ; -- floats --
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %A = fcmp olt <2 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %B = fcmp olt <4 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %C = fcmp olt <8 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %D = fcmp olt <2 x double> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %E = fcmp olt <4 x double> undef, undef
+
+ ; -- integers --
+
+ ;CHECK: cost of 1 {{.*}} icmp
+ %F = icmp eq <16 x i8> undef, undef
+ ;CHECK: cost of 1 {{.*}} icmp
+ %G = icmp eq <8 x i16> undef, undef
+ ;CHECK: cost of 1 {{.*}} icmp
+ %H = icmp eq <4 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} icmp
+ %I = icmp eq <2 x i64> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %J = icmp eq <4 x i64> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %K = icmp eq <8 x i32> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %L = icmp eq <16 x i16> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %M = icmp eq <32 x i8> undef, undef
+
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+
diff --git a/test/Analysis/CostModel/X86/i32.ll b/test/Analysis/CostModel/X86/i32.ll
new file mode 100644
index 000000000000..4015e0b1eef4
--- /dev/null
+++ b/test/Analysis/CostModel/X86/i32.ll
@@ -0,0 +1,9 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s
+
+
+;CHECK: cost of 2 {{.*}} add
+;CHECK: cost of 1 {{.*}} ret
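+; On this 32-bit target the i64 add is legalized into two 32-bit operations,
+; hence the expected cost of 2.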
+define i32 @no_info(i32 %arg) {
+ %e = add i64 undef, undef
+ ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/insert-extract-at-zero.ll b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
new file mode 100644
index 000000000000..87bf7c488b91
--- /dev/null
+++ b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
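+; Extracts and inserts of floating-point values at lane 0 of a legal (sub-)vector
+; are modeled as free, since the scalar already sits in the low lane of the
+; register; integer accesses at lane 0 still pay for the move to or from a GPR.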
+define i32 @insert-extract-at-zero-idx(i32 %arg, float %fl) {
+ ;CHECK: cost of 0 {{.*}} extract
+ %A = extractelement <4 x float> undef, i32 0
+ ;CHECK: cost of 1 {{.*}} extract
+ %B = extractelement <4 x i32> undef, i32 0
+ ;CHECK: cost of 1 {{.*}} extract
+ %C = extractelement <4 x float> undef, i32 1
+
+ ;CHECK: cost of 0 {{.*}} extract
+ %D = extractelement <8 x float> undef, i32 0
+ ;CHECK: cost of 1 {{.*}} extract
+ %E = extractelement <8 x float> undef, i32 1
+
+ ;CHECK: cost of 1 {{.*}} extract
+ %F = extractelement <8 x float> undef, i32 %arg
+
+ ;CHECK: cost of 0 {{.*}} insert
+ %G = insertelement <4 x float> undef, float %fl, i32 0
+ ;CHECK: cost of 1 {{.*}} insert
+ %H = insertelement <4 x float> undef, float %fl, i32 1
+ ;CHECK: cost of 1 {{.*}} insert
+ %I = insertelement <4 x i32> undef, i32 %arg, i32 0
+
+ ;CHECK: cost of 0 {{.*}} insert
+ %J = insertelement <4 x double> undef, double undef, i32 0
+
+ ;CHECK: cost of 0 {{.*}} insert
+ %K = insertelement <8 x double> undef, double undef, i32 4
+ ;CHECK: cost of 0 {{.*}} insert
+ %L = insertelement <16 x double> undef, double undef, i32 8
+ ;CHECK: cost of 1 {{.*}} insert
+ %M = insertelement <16 x double> undef, double undef, i32 9
+ ret i32 0
+}
+
diff --git a/test/Analysis/CostModel/X86/lit.local.cfg b/test/Analysis/CostModel/X86/lit.local.cfg
new file mode 100644
index 000000000000..a8ad0f1a28b2
--- /dev/null
+++ b/test/Analysis/CostModel/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+ config.unsupported = True
+
diff --git a/test/Analysis/CostModel/X86/loop_v2.ll b/test/Analysis/CostModel/X86/loop_v2.ll
new file mode 100644
index 000000000000..260a60676ab7
--- /dev/null
+++ b/test/Analysis/CostModel/X86/loop_v2.ll
@@ -0,0 +1,43 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+define i32 @foo(i32* nocapture %A) nounwind uwtable readonly ssp {
+vector.ph:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ %12, %vector.body ]
+ %0 = getelementptr inbounds i32* %A, i64 %index
+ %1 = bitcast i32* %0 to <2 x i32>*
+ %2 = load <2 x i32>* %1, align 4
+ %3 = sext <2 x i32> %2 to <2 x i64>
+ ;CHECK: cost of 1 {{.*}} extract
+ %4 = extractelement <2 x i64> %3, i32 0
+ %5 = getelementptr inbounds i32* %A, i64 %4
+ ;CHECK: cost of 1 {{.*}} extract
+ %6 = extractelement <2 x i64> %3, i32 1
+ %7 = getelementptr inbounds i32* %A, i64 %6
+ %8 = load i32* %5, align 4, !tbaa !0
+ ;CHECK: cost of 1 {{.*}} insert
+ %9 = insertelement <2 x i32> undef, i32 %8, i32 0
+ %10 = load i32* %7, align 4, !tbaa !0
+ ;CHECK: cost of 1 {{.*}} insert
+ %11 = insertelement <2 x i32> %9, i32 %10, i32 1
+ %12 = add nsw <2 x i32> %11, %vec.phi
+ %index.next = add i64 %index, 2
+ %13 = icmp eq i64 %index.next, 192
+ br i1 %13, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+ %14 = extractelement <2 x i32> %12, i32 0
+ %15 = extractelement <2 x i32> %12, i32 1
+ %16 = add i32 %14, %15
+ ret i32 %16
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Analysis/CostModel/X86/tiny.ll b/test/Analysis/CostModel/X86/tiny.ll
new file mode 100644
index 000000000000..cc7b443a7dfc
--- /dev/null
+++ b/test/Analysis/CostModel/X86/tiny.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: cost of 1 {{.*}} add
+;CHECK: cost of 1 {{.*}} ret
+define i32 @no_info(i32 %arg) {
+ %e = add i32 %arg, %arg
+ ret i32 %e
+}
diff --git a/test/Analysis/CostModel/X86/vectorized-loop.ll b/test/Analysis/CostModel/X86/vectorized-loop.ll
new file mode 100644
index 000000000000..7919a9ca9a64
--- /dev/null
+++ b/test/Analysis/CostModel/X86/vectorized-loop.ll
@@ -0,0 +1,78 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @foo(i32* noalias nocapture %A, i32* noalias nocapture %B, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+ ;CHECK: cost of 1 {{.*}} icmp
+ %cmp7 = icmp slt i32 %start, %end
+ br i1 %cmp7, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ ;CHECK: cost of 1 {{.*}} sext
+ %0 = sext i32 %start to i64
+ %1 = sub i32 %end, %start
+ %2 = zext i32 %1 to i64
+ %end.idx = add i64 %2, %0
+ ;CHECK: cost of 1 {{.*}} add
+ %n.vec = and i64 %2, 4294967288
+ %end.idx.rnd.down = add i64 %n.vec, %0
+ ;CHECK: cost of 1 {{.*}} icmp
+ %cmp.zero = icmp eq i64 %n.vec, 0
+ br i1 %cmp.zero, label %middle.block, label %vector.body
+
+vector.body: ; preds = %for.body.lr.ph, %vector.body
+ %index = phi i64 [ %index.next, %vector.body ], [ %0, %for.body.lr.ph ]
+ %3 = add i64 %index, 2
+ %4 = getelementptr inbounds i32* %B, i64 %3
+ ;CHECK: cost of 0 {{.*}} bitcast
+ %5 = bitcast i32* %4 to <8 x i32>*
+ ;CHECK: cost of 1 {{.*}} load
+ %6 = load <8 x i32>* %5, align 4
+ ;CHECK: cost of 4 {{.*}} mul
+ %7 = mul nsw <8 x i32> %6, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+ %8 = getelementptr inbounds i32* %A, i64 %index
+ %9 = bitcast i32* %8 to <8 x i32>*
+ %10 = load <8 x i32>* %9, align 4
+ ;CHECK: cost of 4 {{.*}} add
+ %11 = add nsw <8 x i32> %10, %7
+ ;CHECK: cost of 1 {{.*}} store
+ store <8 x i32> %11, <8 x i32>* %9, align 4
+ %index.next = add i64 %index, 8
+ %12 = icmp eq i64 %index.next, %end.idx.rnd.down
+ ;CHECK: cost of 1 {{.*}} br
+ br i1 %12, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body, %for.body.lr.ph
+ %cmp.n = icmp eq i64 %end.idx, %end.idx.rnd.down
+ br i1 %cmp.n, label %for.end, label %for.body
+
+for.body: ; preds = %middle.block, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %end.idx.rnd.down, %middle.block ]
+ %13 = add nsw i64 %indvars.iv, 2
+ %arrayidx = getelementptr inbounds i32* %B, i64 %13
+ ;CHECK: cost of 1 {{.*}} load
+ %14 = load i32* %arrayidx, align 4, !tbaa !0
+ ;CHECK: cost of 1 {{.*}} mul
+ %mul = mul nsw i32 %14, 5
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ ;CHECK: cost of 1 {{.*}} load
+ %15 = load i32* %arrayidx2, align 4, !tbaa !0
+ %add3 = add nsw i32 %15, %mul
+ store i32 %add3, i32* %arrayidx2, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ ;CHECK: cost of 0 {{.*}} trunc
+ %16 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %16, %end
+ ;CHECK: cost of 1 {{.*}} br
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %middle.block, %for.body, %entry
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Analysis/LoopDependenceAnalysis/lit.local.cfg b/test/Analysis/CostModel/lit.local.cfg
index 19eebc0ac7ac..19eebc0ac7ac 100644
--- a/test/Analysis/LoopDependenceAnalysis/lit.local.cfg
+++ b/test/Analysis/CostModel/lit.local.cfg
diff --git a/test/Analysis/CostModel/no_info.ll b/test/Analysis/CostModel/no_info.ll
new file mode 100644
index 000000000000..d20d56b79a7f
--- /dev/null
+++ b/test/Analysis/CostModel/no_info.ll
@@ -0,0 +1,15 @@
+; RUN: opt < %s -cost-model -analyze | FileCheck %s
+
+; The cost model has no target information, so it cannot make a decision.
+; Note that opt does not read the triple information from the module itself, only from the command line.
+
+; This info is ignored:
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: Unknown cost {{.*}} add
+;CHECK: Unknown cost {{.*}} ret
+define i32 @no_info(i32 %arg) {
+ %e = add i32 %arg, %arg
+ ret i32 %e
+}
diff --git a/test/Analysis/DependenceAnalysis/Banerjee.ll b/test/Analysis/DependenceAnalysis/Banerjee.ll
new file mode 100644
index 000000000000..8865ee94016f
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Banerjee.ll
@@ -0,0 +1,595 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Banerjee.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 1; i <= 10; i++)
+;; for (long int j = 1; j <= 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j - 1];
+
+define void @banerjee0(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 1, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 1, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %sub = add nsw i64 %add5, -1
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+ %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [<= <>]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 11
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 11
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
+
+;; for (long int i = 1; i <= n; i++)
+;; for (long int j = 1; j <= m; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j - 1];
+
+define void @banerjee1(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end9
+
+for.cond1.preheader.preheader: ; preds = %entry
+ %0 = add i64 %n, 1
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc7
+ %B.addr.06 = phi i64* [ %B.addr.1.lcssa, %for.inc7 ], [ %B, %for.cond1.preheader.preheader ]
+ %i.05 = phi i64 [ %inc8, %for.inc7 ], [ 1, %for.cond1.preheader.preheader ]
+ %1 = add i64 %m, 1
+ %cmp21 = icmp sgt i64 %m, 0
+ br i1 %cmp21, label %for.body3.preheader, label %for.inc7
+
+for.body3.preheader: ; preds = %for.cond1.preheader
+ br label %for.body3
+
+for.body3: ; preds = %for.body3.preheader, %for.body3
+ %j.03 = phi i64 [ %inc, %for.body3 ], [ 1, %for.body3.preheader ]
+ %B.addr.12 = phi i64* [ %incdec.ptr, %for.body3 ], [ %B.addr.06, %for.body3.preheader ]
+ %mul = mul nsw i64 %i.05, 10
+ %add = add nsw i64 %mul, %j.03
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.05, 10
+ %add5 = add nsw i64 %mul4, %j.03
+ %sub = add nsw i64 %add5, -1
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+ %2 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [* <>]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.12, i64 1
+ store i64 %2, i64* %B.addr.12, align 8
+ %inc = add nsw i64 %j.03, 1
+ %exitcond = icmp eq i64 %inc, %1
+ br i1 %exitcond, label %for.inc7.loopexit, label %for.body3
+
+for.inc7.loopexit: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.06, i64 %m
+ br label %for.inc7
+
+for.inc7: ; preds = %for.inc7.loopexit, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i64* [ %B.addr.06, %for.cond1.preheader ], [ %scevgep, %for.inc7.loopexit ]
+ %inc8 = add nsw i64 %i.05, 1
+ %exitcond7 = icmp eq i64 %inc8, %0
+ br i1 %exitcond7, label %for.end9.loopexit, label %for.cond1.preheader
+
+for.end9.loopexit: ; preds = %for.inc7
+ br label %for.end9
+
+for.end9: ; preds = %for.end9.loopexit, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = 0;
+;; *B++ = A[10*i + j + 100];
+
+define void @banerjee2(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %add6 = add nsw i64 %add5, 100
+ %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+ %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j + 99];
+
+define void @banerjee3(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %add6 = add nsw i64 %add5, 99
+ %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+ %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [> >]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j - 100];
+
+define void @banerjee4(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %sub = add nsw i64 %add5, -100
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+ %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j - 99];
+
+define void @banerjee5(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %sub = add nsw i64 %add5, -99
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %sub
+ %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [< <]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j + 9];
+
+define void @banerjee6(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %add6 = add nsw i64 %add5, 9
+ %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+ %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [=> <>]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j + 10];
+
+define void @banerjee7(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %add6 = add nsw i64 %add5, 10
+ %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+ %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [> <=]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 10; i++)
+;; for (long int j = 0; j < 10; j++) {
+;; A[10*i + j] = ...
+;; ... = A[10*i + j + 11];
+
+define void @banerjee8(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 10
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 10
+ %add5 = add nsw i64 %mul4, %j.02
+ %add6 = add nsw i64 %add5, 11
+ %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+ %0 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [> <>]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 10
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 10
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 10
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 20; i++)
+;; for (long int j = 0; j < 20; j++) {
+;; A[30*i + 500*j] = ...
+;; ... = A[i - 500*j + 11];
+
+define void @banerjee9(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 30
+ %mul4 = mul nsw i64 %j.02, 500
+ %add = add nsw i64 %mul, %mul4
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %0 = mul i64 %j.02, -500
+ %sub = add i64 %i.03, %0
+ %add6 = add nsw i64 %sub, 11
+ %arrayidx7 = getelementptr inbounds i64* %A, i64 %add6
+ %1 = load i64* %arrayidx7, align 8
+; CHECK: da analyze - flow [<= =|<]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %1, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 20
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 20
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 20; i++)
+;; for (long int j = 0; j < 20; j++) {
+;; A[i + 500*j] = ...
+;; ... = A[i - 500*j + 11];
+
+define void @banerjee10(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %j.02, 500
+ %add = add nsw i64 %i.03, %mul
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %0 = mul i64 %j.02, -500
+ %sub = add i64 %i.03, %0
+ %add5 = add nsw i64 %sub, 11
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5
+ %1 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [<> =]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %1, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 20
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 20
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
+
+;; for (long int i = 0; i < 20; i++)
+;; for (long int j = 0; j < 20; j++) {
+;; A[300*i + j] = ...
+;; ... = A[250*i - j + 11];
+
+define void @banerjee11(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 300
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 250
+ %sub = sub nsw i64 %mul4, %j.02
+ %add5 = add nsw i64 %sub, 11
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5
+ %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [<= <>]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 20
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 20
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
+
+;; for (long int i = 0; i < 20; i++)
+;; for (long int j = 0; j < 20; j++) {
+;; A[100*i + j] = ...
+;; ... = A[100*i - j + 11];
+
+define void @banerjee12(i64* %A, i64* %B, i64 %m, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i64* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i64* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %mul = mul nsw i64 %i.03, 100
+ %add = add nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i64* %A, i64 %add
+ store i64 0, i64* %arrayidx, align 8
+ %mul4 = mul nsw i64 %i.03, 100
+ %sub = sub nsw i64 %mul4, %j.02
+ %add5 = add nsw i64 %sub, 11
+ %arrayidx6 = getelementptr inbounds i64* %A, i64 %add5
+ %0 = load i64* %arrayidx6, align 8
+; CHECK: da analyze - flow [= <>]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.11, i64 1
+ store i64 %0, i64* %B.addr.11, align 8
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 20
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i64* %B.addr.04, i64 20
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 20
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Coupled.ll b/test/Analysis/DependenceAnalysis/Coupled.ll
new file mode 100644
index 000000000000..60163fe7c2d0
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Coupled.ll
@@ -0,0 +1,509 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Coupled.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[i][i] = ...
+;; ... = A[i + 10][i + 9]
+
+define void @couple0([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ store i32 %conv, i32* %arrayidx1, align 4
+ %add = add nsw i64 %i.02, 9
+ %add2 = add nsw i64 %i.02, 10
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %add2, i64 %add
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[i][i] = ...
+;; ... = A[i + 9][i + 9]
+
+define void @couple1([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ store i32 %conv, i32* %arrayidx1, align 4
+ %add = add nsw i64 %i.02, 9
+ %add2 = add nsw i64 %i.02, 9
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %add2, i64 %add
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - consistent flow [-9]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[3*i - 6][3*i - 6] = ...
+;; ... = A[i][i]
+
+define void @couple2([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul nsw i64 %i.02, 3
+ %sub = add nsw i64 %mul, -6
+ %mul1 = mul nsw i64 %i.02, 3
+ %sub2 = add nsw i64 %mul1, -6
+ %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %sub2, i64 %sub
+ store i32 %conv, i32* %arrayidx3, align 4
+ %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[3*i - 6][3*i - 5] = ...
+;; ... = A[i][i]
+
+define void @couple3([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul nsw i64 %i.02, 3
+ %sub = add nsw i64 %mul, -5
+ %mul1 = mul nsw i64 %i.02, 3
+ %sub2 = add nsw i64 %mul1, -6
+ %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %sub2, i64 %sub
+ store i32 %conv, i32* %arrayidx3, align 4
+ %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[3*i - 6][3*i - n] = ...
+;; ... = A[i][i]
+
+define void @couple4([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul nsw i64 %i.02, 3
+ %conv1 = sext i32 %n to i64
+ %sub = sub nsw i64 %mul, %conv1
+ %mul2 = mul nsw i64 %i.02, 3
+ %sub3 = add nsw i64 %mul2, -6
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %sub3, i64 %sub
+ store i32 %conv, i32* %arrayidx4, align 4
+ %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[3*i - n + 1][3*i - n] = ...
+;; ... = A[i][i]
+
+define void @couple5([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul nsw i64 %i.02, 3
+ %conv1 = sext i32 %n to i64
+ %sub = sub nsw i64 %mul, %conv1
+ %mul2 = mul nsw i64 %i.02, 3
+ %conv3 = sext i32 %n to i64
+ %sub4 = sub nsw i64 %mul2, %conv3
+ %add = add nsw i64 %sub4, 1
+ %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %add, i64 %sub
+ store i32 %conv, i32* %arrayidx5, align 4
+ %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[i][3*i - 6] = ...
+;; ... = A[i][i]
+
+define void @couple6([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul nsw i64 %i.02, 3
+ %sub = add nsw i64 %mul, -6
+ %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %sub
+ store i32 %conv, i32* %arrayidx1, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - flow [=|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; A[i][3*i - 5] = ...
+;; ... = A[i][i]
+
+define void @couple7([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul nsw i64 %i.02, 3
+ %sub = add nsw i64 %mul, -5
+ %arrayidx1 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %sub
+ store i32 %conv, i32* %arrayidx1, align 4
+ %arrayidx3 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 50
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++)
+;; A[3*i - 18][3 - i] = ...
+;; ... = A[i][i]
+
+define void @couple8([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 3, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub
+ store i32 %conv, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++)
+;; A[3*i - 18][2 - i] = ...
+;; ... = A[i][i]
+
+define void @couple9([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 2, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub
+ store i32 %conv, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++)
+;; A[3*i - 18][6 - i] = ...
+;; ... = A[i][i]
+
+define void @couple10([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 6, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub
+ store i32 %conv, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [>] splitable!
+; CHECK: da analyze - split level = 1, iteration = 3!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 15; i++)
+;; A[3*i - 18][18 - i] = ...
+;; ... = A[i][i]
+
+define void @couple11([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 18, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub
+ store i32 %conv, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [=|<] splitable!
+; CHECK: da analyze - split level = 1, iteration = 9!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 16
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 12; i++)
+;; A[3*i - 18][22 - i] = ...
+;; ... = A[i][i]
+
+define void @couple12([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 22, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub
+ store i32 %conv, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [<] splitable!
+; CHECK: da analyze - split level = 1, iteration = 11!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 13
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 12; i++)
+;; A[3*i - 18][22 - i] = ...
+;; ... = A[i][i]
+
+define void @couple13([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 22, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx2 = getelementptr inbounds [100 x i32]* %A, i64 %sub1, i64 %sub
+ store i32 %conv, i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 12
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; A[3*i - 18][18 - i][i] = ...
+;; ... = A[i][i][i]
+
+define void @couple14([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 18, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub1, i64 %sub, i64 %i.02
+ store i32 %conv, i32* %arrayidx3, align 4
+ %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.02, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - flow [=|<] splitable!
+; CHECK: da analyze - split level = 1, iteration = 9!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; A[3*i - 18][22 - i][i] = ...
+;; ... = A[i][i][i]
+
+define void @couple15([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %sub = sub nsw i64 22, %i.02
+ %mul = mul nsw i64 %i.02, 3
+ %sub1 = add nsw i64 %mul, -18
+ %arrayidx3 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub1, i64 %sub, i64 %i.02
+ store i32 %conv, i32* %arrayidx3, align 4
+ %arrayidx6 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.02, i64 %i.02, i64 %i.02
+ %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add nsw i64 %i.02, 1
+ %cmp = icmp slt i64 %inc, 100
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/ExactRDIV.ll b/test/Analysis/DependenceAnalysis/ExactRDIV.ll
new file mode 100644
index 000000000000..aa5d254a0ce2
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/ExactRDIV.ll
@@ -0,0 +1,508 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'ExactRDIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < 10; i++)
+;; A[4*i + 10] = ...
+;; for (long int j = 0; j < 10; j++)
+;; ... = A[2*j + 1];
+
+define void @rdiv0(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 2
+ %add = add nsw i64 %mul, 10
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 10
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %mul5 = shl nsw i64 %j.02, 1
+ %add64 = or i64 %mul5, 1
+ %arrayidx7 = getelementptr inbounds i32* %A, i64 %add64
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc9 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc9, 10
+ br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i < 5; i++)
+;; A[11*i - 45] = ...
+;; for (long int j = 0; j < 10; j++)
+;; ... = A[j];
+
+define void @rdiv1(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = add nsw i64 %mul, -45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 5
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 10
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 5; i++)
+;; A[11*i - 45] = ...
+;; for (long int j = 0; j < 10; j++)
+;; ... = A[j];
+
+define void @rdiv2(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = add nsw i64 %mul, -45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 6
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 10
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i < 5; i++)
+;; A[11*i - 45] = ...
+;; for (long int j = 0; j <= 10; j++)
+;; ... = A[j];
+
+define void @rdiv3(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = add nsw i64 %mul, -45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 5
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 11
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 5; i++)
+;; A[11*i - 45] = ...
+;; for (long int j = 0; j <= 10; j++)
+;; ... = A[j];
+
+define void @rdiv4(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = add nsw i64 %mul, -45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 6
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %j.02
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 11
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i < 5; i++)
+;; A[-11*i + 45] = ...
+;; for (long int j = 0; j < 10; j++)
+;; ... = A[-j];
+
+define void @rdiv5(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -11
+ %add = add nsw i64 %mul, 45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 5
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %sub = sub nsw i64 0, %j.02
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 10
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 5; i++)
+;; A[-11*i + 45] = ...
+;; for (long int j = 0; j < 10; j++)
+;; ... = A[-j];
+
+define void @rdiv6(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -11
+ %add = add nsw i64 %mul, 45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 6
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %sub = sub nsw i64 0, %j.02
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 10
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i < 5; i++)
+;; A[-11*i + 45] = ...
+;; for (long int j = 0; j <= 10; j++)
+;; ... = A[-j];
+
+define void @rdiv7(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -11
+ %add = add nsw i64 %mul, 45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 5
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %sub = sub nsw i64 0, %j.02
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 11
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 5; i++)
+;; A[-11*i + 45] = ...
+;; for (long int j = 0; j <= 10; j++)
+;; ... = A[-j];
+
+define void @rdiv8(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -11
+ %add = add nsw i64 %mul, 45
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, 6
+ br i1 %cmp, label %for.body, label %for.body4
+
+for.body4: ; preds = %for.body4, %for.body
+ %j.02 = phi i64 [ %inc7, %for.body4 ], [ 0, %for.body ]
+ %B.addr.01 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.body ]
+ %sub = sub nsw i64 0, %j.02
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc7 = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc7, 11
+ br i1 %cmp2, label %for.body4, label %for.end8
+
+for.end8: ; preds = %for.body4
+ ret void
+}
+
+
+;; for (long int i = 0; i < 5; i++)
+;; for (long int j = 0; j < 10; j++)
+;; A[11*i - j] = ...
+;; ... = A[45];
+
+define void @rdiv9(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc5, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = sub nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 10
+ br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5: ; preds = %for.body3
+ %inc6 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc6, 5
+ br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7: ; preds = %for.inc5
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 5; i++)
+;; for (long int j = 0; j < 10; j++)
+;; A[11*i - j] = ...
+;; ... = A[45];
+
+define void @rdiv10(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc5, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = sub nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 10
+ br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5: ; preds = %for.body3
+ %inc6 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc6, 6
+ br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7: ; preds = %for.inc5
+ ret void
+}
+
+
+;; for (long int i = 0; i < 5; i++)
+;; for (long int j = 0; j <= 10; j++)
+;; A[11*i - j] = ...
+;; ... = A[45];
+
+define void @rdiv11(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc5, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = sub nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 11
+ br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5: ; preds = %for.body3
+ %inc6 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc6, 5
+ br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7: ; preds = %for.inc5
+ ret void
+}
+
+
+;; for (long int i = 0; i <= 5; i++)
+;; for (long int j = 0; j <= 10; j++)
+;; A[11*i - j] = ...
+;; ... = A[45];
+
+define void @rdiv12(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc5, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc5 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc6, %for.inc5 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 11
+ %sub = sub nsw i64 %mul, %j.02
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 45
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - flow [* *|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 11
+ br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5: ; preds = %for.body3
+ %inc6 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc6, 6
+ br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7: ; preds = %for.inc5
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/ExactSIV.ll b/test/Analysis/DependenceAnalysis/ExactSIV.ll
new file mode 100644
index 000000000000..71e050246291
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/ExactSIV.ll
@@ -0,0 +1,428 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'ExactSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long unsigned i = 0; i < 10; i++) {
+;; A[i + 10] = ...
+;; ... = A[2*i + 1];
+
+define void @exact0(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %add = add i64 %i.02, 10
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %i.02, 1
+ %add13 = or i64 %mul, 1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %add13
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [<=|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 10; i++) {
+;; A[4*i + 10] = ...
+;; ... = A[2*i + 1];
+
+define void @exact1(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 2
+ %add = add i64 %mul, 10
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul1 = shl i64 %i.02, 1
+ %add23 = or i64 %mul1, 1
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %add23
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 10; i++) {
+;; A[6*i] = ...
+;; ... = A[i + 60];
+
+define void @exact2(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add i64 %i.02, 60
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i <= 10; i++) {
+;; A[6*i] = ...
+;; ... = A[i + 60];
+
+define void @exact3(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add i64 %i.02, 60
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [>]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 11
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 12; i++) {
+;; A[6*i] = ...
+;; ... = A[i + 60];
+
+define void @exact4(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add i64 %i.02, 60
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [>]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 12
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i <= 12; i++) {
+;; A[6*i] = ...
+;; ... = A[i + 60];
+
+define void @exact5(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add i64 %i.02, 60
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 13
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 18; i++) {
+;; A[6*i] = ...
+;; ... = A[i + 60];
+
+define void @exact6(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add i64 %i.02, 60
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 18
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i <= 18; i++) {
+;; A[6*i] = ...
+;; ... = A[i + 60];
+
+define void @exact7(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %add = add i64 %i.02, 60
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 19
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 10; i++) {
+;; A[-6*i] = ...
+;; ... = A[-i - 60];
+
+define void @exact8(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, -6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub1 = sub i64 -60, %i.02
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i <= 10; i++) {
+;; A[-6*i] = ...
+;; ... = A[-i - 60];
+
+define void @exact9(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, -6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub1 = sub i64 -60, %i.02
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [>]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 11
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 12; i++) {
+;; A[-6*i] = ...
+;; ... = A[-i - 60];
+
+define void @exact10(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, -6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub1 = sub i64 -60, %i.02
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [>]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 12
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i <= 12; i++) {
+;; A[-6*i] = ...
+;; ... = A[-i - 60];
+
+define void @exact11(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, -6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub1 = sub i64 -60, %i.02
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [=>|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 13
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 18; i++) {
+;; A[-6*i] = ...
+;; ... = A[-i - 60];
+
+define void @exact12(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, -6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub1 = sub i64 -60, %i.02
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [=>|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 18
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i <= 18; i++) {
+;; A[-6*i] = ...
+;; ... = A[-i - 60];
+
+define void @exact13(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, -6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub1 = sub i64 -60, %i.02
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 19
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/GCD.ll b/test/Analysis/DependenceAnalysis/GCD.ll
new file mode 100644
index 000000000000..94c93a8a0dd4
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/GCD.ll
@@ -0,0 +1,597 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'GCD.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[2*i - 4*j] = ...
+;; ... = A[6*i + 8*j];
+
+define void @gcd0(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc8
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %mul4 = shl nsw i64 %j.02, 2
+ %sub = sub nsw i64 %mul, %mul4
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul5 = mul nsw i64 %i.03, 6
+ %mul6 = shl nsw i64 %j.02, 3
+ %add = add nsw i64 %mul5, %mul6
+ %arrayidx7 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - flow [=> *|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 100
+ br i1 %exitcond, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.04, i64 100
+ %inc9 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc9, 100
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[2*i - 4*j] = ...
+;; ... = A[6*i + 8*j + 1];
+
+define void @gcd1(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc9
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc9 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %mul4 = shl nsw i64 %j.02, 2
+ %sub = sub nsw i64 %mul, %mul4
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul5 = mul nsw i64 %i.03, 6
+ %mul6 = shl nsw i64 %j.02, 3
+ %add = add nsw i64 %mul5, %mul6
+ %add7 = or i64 %add, 1
+ %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7
+ %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 100
+ br i1 %exitcond, label %for.body3, label %for.inc9
+
+for.inc9: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.04, i64 100
+ %inc10 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc10, 100
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end11
+
+for.end11: ; preds = %for.inc9
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[2*i - 4*j + 1] = ...
+;; ... = A[6*i + 8*j];
+
+define void @gcd2(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc9
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc9 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %mul4 = shl nsw i64 %j.02, 2
+ %sub = sub nsw i64 %mul, %mul4
+ %add5 = or i64 %sub, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add5
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul5 = mul nsw i64 %i.03, 6
+ %mul6 = shl nsw i64 %j.02, 3
+ %add7 = add nsw i64 %mul5, %mul6
+ %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7
+ %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 100
+ br i1 %exitcond, label %for.body3, label %for.inc9
+
+for.inc9: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.04, i64 100
+ %inc10 = add nsw i64 %i.03, 1
+ %exitcond6 = icmp ne i64 %inc10, 100
+ br i1 %exitcond6, label %for.cond1.preheader, label %for.end11
+
+for.end11: ; preds = %for.inc9
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[i + 2*j] = ...
+;; ... = A[i + 2*j - 1];
+
+define void @gcd3(i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc7
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc7 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc8, %for.inc7 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %j.02, 1
+ %add = add nsw i64 %i.03, %mul
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul4 = shl nsw i64 %j.02, 1
+ %add5 = add nsw i64 %i.03, %mul4
+ %sub = add nsw i64 %add5, -1
+ %arrayidx6 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - flow [<> *]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 100
+ br i1 %exitcond, label %for.body3, label %for.inc7
+
+for.inc7: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.04, i64 100
+ %inc8 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc8, 100
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end9
+
+for.end9: ; preds = %for.inc7
+ ret void
+}
+
+
+;; void gcd4(int *A, int *B, long int M, long int N) {
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++) {
+;; A[5*i + 10*j*M + 9*M*N] = i;
+;; *B++ = A[15*i + 20*j*M - 21*N*M + 4];
+
+define void @gcd4(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc17
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc17 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc18, %for.inc17 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 5
+ %mul4 = mul nsw i64 %j.02, 10
+ %mul5 = mul nsw i64 %mul4, %M
+ %add = add nsw i64 %mul, %mul5
+ %mul6 = mul nsw i64 %M, 9
+ %mul7 = mul nsw i64 %mul6, %N
+ %add8 = add nsw i64 %add, %mul7
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add8
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul9 = mul nsw i64 %i.03, 15
+ %mul10 = mul nsw i64 %j.02, 20
+ %mul11 = mul nsw i64 %mul10, %M
+ %add12 = add nsw i64 %mul9, %mul11
+ %mul13 = mul nsw i64 %N, 21
+ %mul14 = mul nsw i64 %mul13, %M
+ %sub = sub nsw i64 %add12, %mul14
+ %add15 = add nsw i64 %sub, 4
+ %arrayidx16 = getelementptr inbounds i32* %A, i64 %add15
+ %0 = load i32* %arrayidx16, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 100
+ br i1 %exitcond, label %for.body3, label %for.inc17
+
+for.inc17: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.04, i64 100
+ %inc18 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc18, 100
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end19
+
+for.end19: ; preds = %for.inc17
+ ret void
+}
+
+
+;; void gcd5(int *A, int *B, long int M, long int N) {
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++) {
+;; A[5*i + 10*j*M + 9*M*N] = i;
+;; *B++ = A[15*i + 20*j*M - 21*N*M + 5];
+
+define void @gcd5(i32* %A, i32* %B, i64 %M, i64 %N) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %entry, %for.inc17
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %scevgep, %for.inc17 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc18, %for.inc17 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.cond1.preheader, %for.body3
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 5
+ %mul4 = mul nsw i64 %j.02, 10
+ %mul5 = mul nsw i64 %mul4, %M
+ %add = add nsw i64 %mul, %mul5
+ %mul6 = mul nsw i64 %M, 9
+ %mul7 = mul nsw i64 %mul6, %N
+ %add8 = add nsw i64 %add, %mul7
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add8
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul9 = mul nsw i64 %i.03, 15
+ %mul10 = mul nsw i64 %j.02, 20
+ %mul11 = mul nsw i64 %mul10, %M
+ %add12 = add nsw i64 %mul9, %mul11
+ %mul13 = mul nsw i64 %N, 21
+ %mul14 = mul nsw i64 %mul13, %M
+ %sub = sub nsw i64 %add12, %mul14
+ %add15 = add nsw i64 %sub, 5
+ %arrayidx16 = getelementptr inbounds i32* %A, i64 %add15
+ %0 = load i32* %arrayidx16, align 4
+; CHECK: da analyze - flow [<> *]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %exitcond = icmp ne i64 %inc, 100
+ br i1 %exitcond, label %for.body3, label %for.inc17
+
+for.inc17: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.04, i64 100
+ %inc18 = add nsw i64 %i.03, 1
+ %exitcond5 = icmp ne i64 %inc18, 100
+ br i1 %exitcond5, label %for.cond1.preheader, label %for.end19
+
+for.end19: ; preds = %for.inc17
+ ret void
+}
+
+
+;; void gcd6(long int n, int A[][n], int *B) {
+;; for (long int i = 0; i < n; i++)
+;; for (long int j = 0; j < n; j++) {
+;; A[2*i][4*j] = i;
+;; *B++ = A[8*i][6*j + 1];
+
+define void @gcd6(i64 %n, i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end12
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc10
+ %i.06 = phi i64 [ %inc11, %for.inc10 ], [ 0, %for.cond1.preheader.preheader ]
+ %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc10 ], [ %B, %for.cond1.preheader.preheader ]
+ %cmp21 = icmp sgt i64 %n, 0
+ br i1 %cmp21, label %for.body3.preheader, label %for.inc10
+
+for.body3.preheader: ; preds = %for.cond1.preheader
+ br label %for.body3
+
+for.body3: ; preds = %for.body3.preheader, %for.body3
+ %j.03 = phi i64 [ %inc, %for.body3 ], [ 0, %for.body3.preheader ]
+ %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+ %conv = trunc i64 %i.06 to i32
+ %mul = shl nsw i64 %j.03, 2
+ %mul4 = shl nsw i64 %i.06, 1
+ %0 = mul nsw i64 %mul4, %n
+ %arrayidx.sum = add i64 %0, %mul
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %arrayidx.sum
+ store i32 %conv, i32* %arrayidx5, align 4
+ %mul6 = mul nsw i64 %j.03, 6
+ %add7 = or i64 %mul6, 1
+ %mul7 = shl nsw i64 %i.06, 3
+ %1 = mul nsw i64 %mul7, %n
+ %arrayidx8.sum = add i64 %1, %add7
+ %arrayidx9 = getelementptr inbounds i32* %A, i64 %arrayidx8.sum
+ %2 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+ store i32 %2, i32* %B.addr.12, align 4
+ %inc = add nsw i64 %j.03, 1
+ %exitcond = icmp ne i64 %inc, %n
+ br i1 %exitcond, label %for.body3, label %for.inc10.loopexit
+
+for.inc10.loopexit: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.05, i64 %n
+ br label %for.inc10
+
+for.inc10: ; preds = %for.inc10.loopexit, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc10.loopexit ]
+ %inc11 = add nsw i64 %i.06, 1
+ %exitcond8 = icmp ne i64 %inc11, %n
+ br i1 %exitcond8, label %for.cond1.preheader, label %for.end12.loopexit
+
+for.end12.loopexit: ; preds = %for.inc10
+ br label %for.end12
+
+for.end12: ; preds = %for.end12.loopexit, %entry
+ ret void
+}
+
+
+;; void gcd7(int n, int A[][n], int *B) {
+;; for (int i = 0; i < n; i++)
+;; for (int j = 0; j < n; j++) {
+;; A[2*i][4*j] = i;
+;; *B++ = A[8*i][6*j + 1];
+
+define void @gcd7(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ %0 = zext i32 %n to i64
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc13
+ %indvars.iv8 = phi i64 [ 0, %for.cond1.preheader.preheader ], [ %indvars.iv.next9, %for.inc13 ]
+ %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ]
+ %1 = add i32 %n, -1
+ %2 = zext i32 %1 to i64
+ %3 = add i64 %2, 1
+ %cmp21 = icmp sgt i32 %n, 0
+ br i1 %cmp21, label %for.body3.preheader, label %for.inc13
+
+for.body3.preheader: ; preds = %for.cond1.preheader
+ br label %for.body3
+
+for.body3: ; preds = %for.body3.preheader, %for.body3
+ %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+ %4 = trunc i64 %indvars.iv to i32
+ %mul = shl nsw i32 %4, 2
+ %idxprom = sext i32 %mul to i64
+ %5 = trunc i64 %indvars.iv8 to i32
+ %mul4 = shl nsw i32 %5, 1
+ %idxprom5 = sext i32 %mul4 to i64
+ %6 = mul nsw i64 %idxprom5, %0
+ %arrayidx.sum = add i64 %6, %idxprom
+ %arrayidx6 = getelementptr inbounds i32* %A, i64 %arrayidx.sum
+ %7 = trunc i64 %indvars.iv8 to i32
+ store i32 %7, i32* %arrayidx6, align 4
+ %8 = trunc i64 %indvars.iv to i32
+ %mul7 = mul nsw i32 %8, 6
+ %add7 = or i32 %mul7, 1
+ %idxprom8 = sext i32 %add7 to i64
+ %9 = trunc i64 %indvars.iv8 to i32
+ %mul9 = shl nsw i32 %9, 3
+ %idxprom10 = sext i32 %mul9 to i64
+ %10 = mul nsw i64 %idxprom10, %0
+ %arrayidx11.sum = add i64 %10, %idxprom8
+ %arrayidx12 = getelementptr inbounds i32* %A, i64 %arrayidx11.sum
+ %11 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - flow [* *|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+ store i32 %11, i32* %B.addr.12, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.body3, label %for.inc13.loopexit
+
+for.inc13.loopexit: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.05, i64 %3
+ br label %for.inc13
+
+for.inc13: ; preds = %for.inc13.loopexit, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ]
+ %indvars.iv.next9 = add i64 %indvars.iv8, 1
+ %lftr.wideiv10 = trunc i64 %indvars.iv.next9 to i32
+ %exitcond11 = icmp ne i32 %lftr.wideiv10, %n
+ br i1 %exitcond11, label %for.cond1.preheader, label %for.end15.loopexit
+
+for.end15.loopexit: ; preds = %for.inc13
+ br label %for.end15
+
+for.end15: ; preds = %for.end15.loopexit, %entry
+ ret void
+}
+
+
+;; void gcd8(int n, int *A, int *B) {
+;; for (int i = 0; i < n; i++)
+;; for (int j = 0; j < n; j++) {
+;; A[n*2*i + 4*j] = i;
+;; *B++ = A[n*8*i + 6*j + 1];
+
+define void @gcd8(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp sgt i32 %n, 0
+ br i1 %cmp4, label %for.cond1.preheader.preheader, label %for.end15
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc13
+ %i.06 = phi i32 [ %inc14, %for.inc13 ], [ 0, %for.cond1.preheader.preheader ]
+ %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ]
+ %0 = add i32 %n, -1
+ %1 = zext i32 %0 to i64
+ %2 = add i64 %1, 1
+ %cmp21 = icmp sgt i32 %n, 0
+ br i1 %cmp21, label %for.body3.preheader, label %for.inc13
+
+for.body3.preheader: ; preds = %for.cond1.preheader
+ br label %for.body3
+
+for.body3: ; preds = %for.body3.preheader, %for.body3
+ %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+ %mul = shl nsw i32 %n, 1
+ %mul4 = mul nsw i32 %mul, %i.06
+ %3 = trunc i64 %indvars.iv to i32
+ %mul5 = shl nsw i32 %3, 2
+ %add = add nsw i32 %mul4, %mul5
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 %i.06, i32* %arrayidx, align 4
+ %mul6 = shl nsw i32 %n, 3
+ %mul7 = mul nsw i32 %mul6, %i.06
+ %4 = trunc i64 %indvars.iv to i32
+ %mul8 = mul nsw i32 %4, 6
+ %add9 = add nsw i32 %mul7, %mul8
+ %add10 = or i32 %add9, 1
+ %idxprom11 = sext i32 %add10 to i64
+ %arrayidx12 = getelementptr inbounds i32* %A, i64 %idxprom11
+ %5 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+ store i32 %5, i32* %B.addr.12, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.body3, label %for.inc13.loopexit
+
+for.inc13.loopexit: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.05, i64 %2
+ br label %for.inc13
+
+for.inc13: ; preds = %for.inc13.loopexit, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ]
+ %inc14 = add nsw i32 %i.06, 1
+ %exitcond7 = icmp ne i32 %inc14, %n
+ br i1 %exitcond7, label %for.cond1.preheader, label %for.end15.loopexit
+
+for.end15.loopexit: ; preds = %for.inc13
+ br label %for.end15
+
+for.end15: ; preds = %for.end15.loopexit, %entry
+ ret void
+}
+
+
+;; void gcd9(unsigned n, int A[][n], int *B) {
+;; for (unsigned i = 0; i < n; i++)
+;; for (unsigned j = 0; j < n; j++) {
+;; A[2*i][4*j] = i;
+;; *B++ = A[8*i][6*j + 1];
+
+define void @gcd9(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ %0 = zext i32 %n to i64
+ %cmp4 = icmp eq i32 %n, 0
+ br i1 %cmp4, label %for.end15, label %for.cond1.preheader.preheader
+
+for.cond1.preheader.preheader: ; preds = %entry
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc13
+ %indvars.iv8 = phi i64 [ 0, %for.cond1.preheader.preheader ], [ %indvars.iv.next9, %for.inc13 ]
+ %B.addr.05 = phi i32* [ %B.addr.1.lcssa, %for.inc13 ], [ %B, %for.cond1.preheader.preheader ]
+ %1 = add i32 %n, -1
+ %2 = zext i32 %1 to i64
+ %3 = add i64 %2, 1
+ %cmp21 = icmp eq i32 %n, 0
+ br i1 %cmp21, label %for.inc13, label %for.body3.preheader
+
+for.body3.preheader: ; preds = %for.cond1.preheader
+ br label %for.body3
+
+for.body3: ; preds = %for.body3.preheader, %for.body3
+ %indvars.iv = phi i64 [ 0, %for.body3.preheader ], [ %indvars.iv.next, %for.body3 ]
+ %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.05, %for.body3.preheader ]
+ %4 = trunc i64 %indvars.iv to i32
+ %mul = shl i32 %4, 2
+ %idxprom = zext i32 %mul to i64
+ %5 = trunc i64 %indvars.iv8 to i32
+ %mul4 = shl i32 %5, 1
+ %idxprom5 = zext i32 %mul4 to i64
+ %6 = mul nsw i64 %idxprom5, %0
+ %arrayidx.sum = add i64 %6, %idxprom
+ %arrayidx6 = getelementptr inbounds i32* %A, i64 %arrayidx.sum
+ %7 = trunc i64 %indvars.iv8 to i32
+ store i32 %7, i32* %arrayidx6, align 4
+ %8 = trunc i64 %indvars.iv to i32
+ %mul7 = mul i32 %8, 6
+ %add7 = or i32 %mul7, 1
+ %idxprom8 = zext i32 %add7 to i64
+ %9 = trunc i64 %indvars.iv8 to i32
+ %mul9 = shl i32 %9, 3
+ %idxprom10 = zext i32 %mul9 to i64
+ %10 = mul nsw i64 %idxprom10, %0
+ %arrayidx11.sum = add i64 %10, %idxprom8
+ %arrayidx12 = getelementptr inbounds i32* %A, i64 %arrayidx11.sum
+ %11 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - flow [* *|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+ store i32 %11, i32* %B.addr.12, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.body3, label %for.inc13.loopexit
+
+for.inc13.loopexit: ; preds = %for.body3
+ %scevgep = getelementptr i32* %B.addr.05, i64 %3
+ br label %for.inc13
+
+for.inc13: ; preds = %for.inc13.loopexit, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i32* [ %B.addr.05, %for.cond1.preheader ], [ %scevgep, %for.inc13.loopexit ]
+ %indvars.iv.next9 = add i64 %indvars.iv8, 1
+ %lftr.wideiv10 = trunc i64 %indvars.iv.next9 to i32
+ %exitcond11 = icmp ne i32 %lftr.wideiv10, %n
+ br i1 %exitcond11, label %for.cond1.preheader, label %for.end15.loopexit
+
+for.end15.loopexit: ; preds = %for.inc13
+ br label %for.end15
+
+for.end15: ; preds = %for.end15.loopexit, %entry
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Preliminary.ll b/test/Analysis/DependenceAnalysis/Preliminary.ll
new file mode 100644
index 000000000000..3ef63fd5592f
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Preliminary.ll
@@ -0,0 +1,469 @@
+; RUN: opt < %s -analyze -basicaa -indvars -da | FileCheck %s
+
+; This series of tests is more interesting when debugging is enabled.
+
+; ModuleID = 'Preliminary.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; may alias
+;; int p0(int n, int *A, int *B) {
+;; A[0] = n;
+;; return B[1];
+
+define i32 @p0(i32 %n, i32* %A, i32* %B) nounwind uwtable ssp {
+entry:
+ store i32 %n, i32* %A, align 4
+ %arrayidx1 = getelementptr inbounds i32* %B, i64 1
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - confused!
+ ret i32 %0
+}
+
+
+;; no alias
+;; int p1(int n, int *restrict A, int *restrict B) {
+;; A[0] = n;
+;; return B[1];
+
+define i32 @p1(i32 %n, i32* noalias %A, i32* noalias %B) nounwind uwtable ssp {
+entry:
+ store i32 %n, i32* %A, align 4
+ %arrayidx1 = getelementptr inbounds i32* %B, i64 1
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ ret i32 %0
+}
+
+;; check loop nesting levels
+;; for (long int i = 0; i < n; i++)
+;; for (long int j = 0; j < n; j++)
+;; for (long int k = 0; k < n; k++)
+;; A[i][j][k] = ...
+;; for (long int k = 0; k < n; k++)
+;; ... = A[i + 3][j + 2][k + 1];
+
+define void @p2(i64 %n, [100 x [100 x i64]]* %A, i64* %B) nounwind uwtable ssp {
+entry:
+ %cmp10 = icmp sgt i64 %n, 0
+ br i1 %cmp10, label %for.cond1.preheader, label %for.end26
+
+for.cond1.preheader: ; preds = %for.inc24, %entry
+ %B.addr.012 = phi i64* [ %B.addr.1.lcssa, %for.inc24 ], [ %B, %entry ]
+ %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %entry ]
+ %cmp26 = icmp sgt i64 %n, 0
+ br i1 %cmp26, label %for.cond4.preheader, label %for.inc24
+
+for.cond4.preheader: ; preds = %for.inc21, %for.cond1.preheader
+ %B.addr.18 = phi i64* [ %B.addr.2.lcssa, %for.inc21 ], [ %B.addr.012, %for.cond1.preheader ]
+ %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond1.preheader ]
+ %cmp51 = icmp sgt i64 %n, 0
+ br i1 %cmp51, label %for.body6, label %for.cond10.loopexit
+
+for.body6: ; preds = %for.body6, %for.cond4.preheader
+ %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.cond4.preheader ]
+ %arrayidx8 = getelementptr inbounds [100 x [100 x i64]]* %A, i64 %i.011, i64 %j.07, i64 %k.02
+ store i64 %i.011, i64* %arrayidx8, align 8
+ %inc = add nsw i64 %k.02, 1
+ %cmp5 = icmp slt i64 %inc, %n
+ br i1 %cmp5, label %for.body6, label %for.cond10.loopexit
+
+for.cond10.loopexit: ; preds = %for.body6, %for.cond4.preheader
+ %cmp113 = icmp sgt i64 %n, 0
+ br i1 %cmp113, label %for.body12, label %for.inc21
+
+for.body12: ; preds = %for.body12, %for.cond10.loopexit
+ %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.cond10.loopexit ]
+ %B.addr.24 = phi i64* [ %incdec.ptr, %for.body12 ], [ %B.addr.18, %for.cond10.loopexit ]
+ %add = add nsw i64 %k9.05, 1
+ %add13 = add nsw i64 %j.07, 2
+ %add14 = add nsw i64 %i.011, 3
+ %arrayidx17 = getelementptr inbounds [100 x [100 x i64]]* %A, i64 %add14, i64 %add13, i64 %add
+ %0 = load i64* %arrayidx17, align 8
+; CHECK: da analyze - flow [-3 -2]!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.24, i64 1
+ store i64 %0, i64* %B.addr.24, align 8
+ %inc19 = add nsw i64 %k9.05, 1
+ %cmp11 = icmp slt i64 %inc19, %n
+ br i1 %cmp11, label %for.body12, label %for.inc21
+
+for.inc21: ; preds = %for.body12, %for.cond10.loopexit
+ %B.addr.2.lcssa = phi i64* [ %B.addr.18, %for.cond10.loopexit ], [ %incdec.ptr, %for.body12 ]
+ %inc22 = add nsw i64 %j.07, 1
+ %cmp2 = icmp slt i64 %inc22, %n
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc24
+
+for.inc24: ; preds = %for.inc21, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i64* [ %B.addr.012, %for.cond1.preheader ], [ %B.addr.2.lcssa, %for.inc21 ]
+ %inc25 = add nsw i64 %i.011, 1
+ %cmp = icmp slt i64 %inc25, %n
+ br i1 %cmp, label %for.cond1.preheader, label %for.end26
+
+for.end26: ; preds = %for.inc24, %entry
+ ret void
+}
+
+
+;; classify subscripts
+;; for (long int i = 0; i < n; i++)
+;; for (long int j = 0; j < n; j++)
+;; for (long int k = 0; k < n; k++)
+;; for (long int l = 0; l < n; l++)
+;; for (long int m = 0; m < n; m++)
+;; for (long int o = 0; o < n; o++)
+;; for (long int p = 0; p < n; p++)
+;; for (long int q = 0; q < n; q++)
+;; for (long int r = 0; r < n; r++)
+;; for (long int s = 0; s < n; s++)
+;; for (long int u = 0; u < n; u++)
+;; for (long int t = 0; t < n; t++) {
+;; A[i - 3] [j] [2] [k-1] [2*l + 1] [m] [p + q] [r + s] = ...
+;; ... = A[i + 3] [2] [u] [1-k] [3*l - 1] [o] [1 + n] [t + 2];
+
+define void @p3(i64 %n, [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64* %B) nounwind uwtable ssp {
+entry:
+ %cmp44 = icmp sgt i64 %n, 0
+ br i1 %cmp44, label %for.cond1.preheader, label %for.end90
+
+for.cond1.preheader: ; preds = %for.inc88, %entry
+ %B.addr.046 = phi i64* [ %B.addr.1.lcssa, %for.inc88 ], [ %B, %entry ]
+ %i.045 = phi i64 [ %inc89, %for.inc88 ], [ 0, %entry ]
+ %cmp240 = icmp sgt i64 %n, 0
+ br i1 %cmp240, label %for.cond4.preheader, label %for.inc88
+
+for.cond4.preheader: ; preds = %for.inc85, %for.cond1.preheader
+ %B.addr.142 = phi i64* [ %B.addr.2.lcssa, %for.inc85 ], [ %B.addr.046, %for.cond1.preheader ]
+ %j.041 = phi i64 [ %inc86, %for.inc85 ], [ 0, %for.cond1.preheader ]
+ %cmp536 = icmp sgt i64 %n, 0
+ br i1 %cmp536, label %for.cond7.preheader, label %for.inc85
+
+for.cond7.preheader: ; preds = %for.inc82, %for.cond4.preheader
+ %B.addr.238 = phi i64* [ %B.addr.3.lcssa, %for.inc82 ], [ %B.addr.142, %for.cond4.preheader ]
+ %k.037 = phi i64 [ %inc83, %for.inc82 ], [ 0, %for.cond4.preheader ]
+ %cmp832 = icmp sgt i64 %n, 0
+ br i1 %cmp832, label %for.cond10.preheader, label %for.inc82
+
+for.cond10.preheader: ; preds = %for.inc79, %for.cond7.preheader
+ %B.addr.334 = phi i64* [ %B.addr.4.lcssa, %for.inc79 ], [ %B.addr.238, %for.cond7.preheader ]
+ %l.033 = phi i64 [ %inc80, %for.inc79 ], [ 0, %for.cond7.preheader ]
+ %cmp1128 = icmp sgt i64 %n, 0
+ br i1 %cmp1128, label %for.cond13.preheader, label %for.inc79
+
+for.cond13.preheader: ; preds = %for.inc76, %for.cond10.preheader
+ %B.addr.430 = phi i64* [ %B.addr.5.lcssa, %for.inc76 ], [ %B.addr.334, %for.cond10.preheader ]
+ %m.029 = phi i64 [ %inc77, %for.inc76 ], [ 0, %for.cond10.preheader ]
+ %cmp1424 = icmp sgt i64 %n, 0
+ br i1 %cmp1424, label %for.cond16.preheader, label %for.inc76
+
+for.cond16.preheader: ; preds = %for.inc73, %for.cond13.preheader
+ %B.addr.526 = phi i64* [ %B.addr.6.lcssa, %for.inc73 ], [ %B.addr.430, %for.cond13.preheader ]
+ %o.025 = phi i64 [ %inc74, %for.inc73 ], [ 0, %for.cond13.preheader ]
+ %cmp1720 = icmp sgt i64 %n, 0
+ br i1 %cmp1720, label %for.cond19.preheader, label %for.inc73
+
+for.cond19.preheader: ; preds = %for.inc70, %for.cond16.preheader
+ %B.addr.622 = phi i64* [ %B.addr.7.lcssa, %for.inc70 ], [ %B.addr.526, %for.cond16.preheader ]
+ %p.021 = phi i64 [ %inc71, %for.inc70 ], [ 0, %for.cond16.preheader ]
+ %cmp2016 = icmp sgt i64 %n, 0
+ br i1 %cmp2016, label %for.cond22.preheader, label %for.inc70
+
+for.cond22.preheader: ; preds = %for.inc67, %for.cond19.preheader
+ %B.addr.718 = phi i64* [ %B.addr.8.lcssa, %for.inc67 ], [ %B.addr.622, %for.cond19.preheader ]
+ %q.017 = phi i64 [ %inc68, %for.inc67 ], [ 0, %for.cond19.preheader ]
+ %cmp2312 = icmp sgt i64 %n, 0
+ br i1 %cmp2312, label %for.cond25.preheader, label %for.inc67
+
+for.cond25.preheader: ; preds = %for.inc64, %for.cond22.preheader
+ %B.addr.814 = phi i64* [ %B.addr.9.lcssa, %for.inc64 ], [ %B.addr.718, %for.cond22.preheader ]
+ %r.013 = phi i64 [ %inc65, %for.inc64 ], [ 0, %for.cond22.preheader ]
+ %cmp268 = icmp sgt i64 %n, 0
+ br i1 %cmp268, label %for.cond28.preheader, label %for.inc64
+
+for.cond28.preheader: ; preds = %for.inc61, %for.cond25.preheader
+ %B.addr.910 = phi i64* [ %B.addr.10.lcssa, %for.inc61 ], [ %B.addr.814, %for.cond25.preheader ]
+ %s.09 = phi i64 [ %inc62, %for.inc61 ], [ 0, %for.cond25.preheader ]
+ %cmp294 = icmp sgt i64 %n, 0
+ br i1 %cmp294, label %for.cond31.preheader, label %for.inc61
+
+for.cond31.preheader: ; preds = %for.inc58, %for.cond28.preheader
+ %u.06 = phi i64 [ %inc59, %for.inc58 ], [ 0, %for.cond28.preheader ]
+ %B.addr.105 = phi i64* [ %B.addr.11.lcssa, %for.inc58 ], [ %B.addr.910, %for.cond28.preheader ]
+ %cmp321 = icmp sgt i64 %n, 0
+ br i1 %cmp321, label %for.body33, label %for.inc58
+
+for.body33: ; preds = %for.body33, %for.cond31.preheader
+ %t.03 = phi i64 [ %inc, %for.body33 ], [ 0, %for.cond31.preheader ]
+ %B.addr.112 = phi i64* [ %incdec.ptr, %for.body33 ], [ %B.addr.105, %for.cond31.preheader ]
+ %add = add nsw i64 %r.013, %s.09
+ %add34 = add nsw i64 %p.021, %q.017
+ %mul = shl nsw i64 %l.033, 1
+ %add3547 = or i64 %mul, 1
+ %sub = add nsw i64 %k.037, -1
+ %sub36 = add nsw i64 %i.045, -3
+ %arrayidx43 = getelementptr inbounds [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64 %sub36, i64 %j.041, i64 2, i64 %sub, i64 %add3547, i64 %m.029, i64 %add34, i64 %add
+ store i64 %i.045, i64* %arrayidx43, align 8
+ %add44 = add nsw i64 %t.03, 2
+ %add45 = add nsw i64 %n, 1
+ %mul46 = mul nsw i64 %l.033, 3
+ %sub47 = add nsw i64 %mul46, -1
+ %sub48 = sub nsw i64 1, %k.037
+ %add49 = add nsw i64 %i.045, 3
+ %arrayidx57 = getelementptr inbounds [100 x [100 x [100 x [100 x [100 x [100 x [100 x i64]]]]]]]* %A, i64 %add49, i64 2, i64 %u.06, i64 %sub48, i64 %sub47, i64 %o.025, i64 %add45, i64 %add44
+ %0 = load i64* %arrayidx57, align 8
+; CHECK: da analyze - flow [-6 * * => * * * * * * * *] splitable!
+; CHECK: da analyze - split level = 3, iteration = 1!
+ %incdec.ptr = getelementptr inbounds i64* %B.addr.112, i64 1
+ store i64 %0, i64* %B.addr.112, align 8
+ %inc = add nsw i64 %t.03, 1
+ %cmp32 = icmp slt i64 %inc, %n
+ br i1 %cmp32, label %for.body33, label %for.inc58
+
+for.inc58: ; preds = %for.body33, %for.cond31.preheader
+ %B.addr.11.lcssa = phi i64* [ %B.addr.105, %for.cond31.preheader ], [ %incdec.ptr, %for.body33 ]
+ %inc59 = add nsw i64 %u.06, 1
+ %cmp29 = icmp slt i64 %inc59, %n
+ br i1 %cmp29, label %for.cond31.preheader, label %for.inc61
+
+for.inc61: ; preds = %for.inc58, %for.cond28.preheader
+ %B.addr.10.lcssa = phi i64* [ %B.addr.910, %for.cond28.preheader ], [ %B.addr.11.lcssa, %for.inc58 ]
+ %inc62 = add nsw i64 %s.09, 1
+ %cmp26 = icmp slt i64 %inc62, %n
+ br i1 %cmp26, label %for.cond28.preheader, label %for.inc64
+
+for.inc64: ; preds = %for.inc61, %for.cond25.preheader
+ %B.addr.9.lcssa = phi i64* [ %B.addr.814, %for.cond25.preheader ], [ %B.addr.10.lcssa, %for.inc61 ]
+ %inc65 = add nsw i64 %r.013, 1
+ %cmp23 = icmp slt i64 %inc65, %n
+ br i1 %cmp23, label %for.cond25.preheader, label %for.inc67
+
+for.inc67: ; preds = %for.inc64, %for.cond22.preheader
+ %B.addr.8.lcssa = phi i64* [ %B.addr.718, %for.cond22.preheader ], [ %B.addr.9.lcssa, %for.inc64 ]
+ %inc68 = add nsw i64 %q.017, 1
+ %cmp20 = icmp slt i64 %inc68, %n
+ br i1 %cmp20, label %for.cond22.preheader, label %for.inc70
+
+for.inc70: ; preds = %for.inc67, %for.cond19.preheader
+ %B.addr.7.lcssa = phi i64* [ %B.addr.622, %for.cond19.preheader ], [ %B.addr.8.lcssa, %for.inc67 ]
+ %inc71 = add nsw i64 %p.021, 1
+ %cmp17 = icmp slt i64 %inc71, %n
+ br i1 %cmp17, label %for.cond19.preheader, label %for.inc73
+
+for.inc73: ; preds = %for.inc70, %for.cond16.preheader
+ %B.addr.6.lcssa = phi i64* [ %B.addr.526, %for.cond16.preheader ], [ %B.addr.7.lcssa, %for.inc70 ]
+ %inc74 = add nsw i64 %o.025, 1
+ %cmp14 = icmp slt i64 %inc74, %n
+ br i1 %cmp14, label %for.cond16.preheader, label %for.inc76
+
+for.inc76: ; preds = %for.inc73, %for.cond13.preheader
+ %B.addr.5.lcssa = phi i64* [ %B.addr.430, %for.cond13.preheader ], [ %B.addr.6.lcssa, %for.inc73 ]
+ %inc77 = add nsw i64 %m.029, 1
+ %cmp11 = icmp slt i64 %inc77, %n
+ br i1 %cmp11, label %for.cond13.preheader, label %for.inc79
+
+for.inc79: ; preds = %for.inc76, %for.cond10.preheader
+ %B.addr.4.lcssa = phi i64* [ %B.addr.334, %for.cond10.preheader ], [ %B.addr.5.lcssa, %for.inc76 ]
+ %inc80 = add nsw i64 %l.033, 1
+ %cmp8 = icmp slt i64 %inc80, %n
+ br i1 %cmp8, label %for.cond10.preheader, label %for.inc82
+
+for.inc82: ; preds = %for.inc79, %for.cond7.preheader
+ %B.addr.3.lcssa = phi i64* [ %B.addr.238, %for.cond7.preheader ], [ %B.addr.4.lcssa, %for.inc79 ]
+ %inc83 = add nsw i64 %k.037, 1
+ %cmp5 = icmp slt i64 %inc83, %n
+ br i1 %cmp5, label %for.cond7.preheader, label %for.inc85
+
+for.inc85: ; preds = %for.inc82, %for.cond4.preheader
+ %B.addr.2.lcssa = phi i64* [ %B.addr.142, %for.cond4.preheader ], [ %B.addr.3.lcssa, %for.inc82 ]
+ %inc86 = add nsw i64 %j.041, 1
+ %cmp2 = icmp slt i64 %inc86, %n
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc88
+
+for.inc88: ; preds = %for.inc85, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i64* [ %B.addr.046, %for.cond1.preheader ], [ %B.addr.2.lcssa, %for.inc85 ]
+ %inc89 = add nsw i64 %i.045, 1
+ %cmp = icmp slt i64 %inc89, %n
+ br i1 %cmp, label %for.cond1.preheader, label %for.end90
+
+for.end90: ; preds = %for.inc88, %entry
+ ret void
+}
+
+
+;; cleanup around chars, shorts, ints
+;;void p4(int *A, int *B, long int n)
+;; for (char i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @p4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp sgt i64 %n, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i8 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv2 = sext i8 %i.03 to i32
+ %conv3 = sext i8 %i.03 to i64
+ %add = add i64 %conv3, 2
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv2, i32* %arrayidx, align 4
+ %idxprom4 = sext i8 %i.03 to i64
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %idxprom4
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i8 %i.03, 1
+ %conv = sext i8 %inc to i64
+ %cmp = icmp slt i64 %conv, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;;void p5(int *A, int *B, long int n)
+;; for (short i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @p5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp sgt i64 %n, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i16 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv2 = sext i16 %i.03 to i32
+ %conv3 = sext i16 %i.03 to i64
+ %add = add i64 %conv3, 2
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv2, i32* %arrayidx, align 4
+ %idxprom4 = sext i16 %i.03 to i64
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %idxprom4
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i16 %i.03, 1
+ %conv = sext i16 %inc to i64
+ %cmp = icmp slt i64 %conv, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;;void p6(int *A, int *B, long int n)
+;; for (int i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @p6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp sgt i64 %n, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %add = add nsw i32 %i.03, 2
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 %i.03, i32* %arrayidx, align 4
+ %idxprom2 = sext i32 %i.03 to i64
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i32 %i.03, 1
+ %conv = sext i32 %inc to i64
+ %cmp = icmp slt i64 %conv, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;;void p7(unsigned *A, unsigned *B, char n)
+;; A[n] = ...
+;; ... = A[n + 1];
+
+define void @p7(i32* %A, i32* %B, i8 signext %n) nounwind uwtable ssp {
+entry:
+ %idxprom = sext i8 %n to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 0, i32* %arrayidx, align 4
+ %conv = sext i8 %n to i64
+ %add = add i64 %conv, 1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
+
+
+
+;;void p8(unsigned *A, unsigned *B, short n)
+;; A[n] = ...
+;; ... = A[n + 1];
+
+define void @p8(i32* %A, i32* %B, i16 signext %n) nounwind uwtable ssp {
+entry:
+ %idxprom = sext i16 %n to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 0, i32* %arrayidx, align 4
+ %conv = sext i16 %n to i64
+ %add = add i64 %conv, 1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
+
+
+;;void p9(unsigned *A, unsigned *B, int n)
+;; A[n] = ...
+;; ... = A[n + 1];
+
+define void @p9(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 0, i32* %arrayidx, align 4
+ %add = add nsw i32 %n, 1
+ %idxprom1 = sext i32 %add to i64
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
+
+
+;;void p10(unsigned *A, unsigned *B, unsigned n)
+;; A[n] = ...
+;; ... = A[n + 1];
+
+define void @p10(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ %idxprom = zext i32 %n to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 0, i32* %arrayidx, align 4
+ %add = add i32 %n, 1
+ %idxprom1 = zext i32 %add to i64
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Propagating.ll b/test/Analysis/DependenceAnalysis/Propagating.ll
new file mode 100644
index 000000000000..076348c68dc8
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Propagating.ll
@@ -0,0 +1,467 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Propagating.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[i + 1][i + j] = i;
+;; *B++ = A[i][i + j];
+
+define void @prop0([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc9, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc9 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %add = add nsw i64 %i.03, %j.02
+ %add4 = add nsw i64 %i.03, 1
+ %arrayidx5 = getelementptr inbounds [100 x i32]* %A, i64 %add4, i64 %add
+ store i32 %conv, i32* %arrayidx5, align 4
+ %add6 = add nsw i64 %i.03, %j.02
+ %arrayidx8 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add6
+ %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - consistent flow [1 -1]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.inc9: ; preds = %for.body3
+ %inc10 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc10, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end11
+
+for.end11: ; preds = %for.inc9
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; for (long int k = 0; k < 100; k++)
+;; A[j - i][i + 1][j + k] = ...
+;; ... = A[j - i][i][j + k];
+
+define void @prop1([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc18, %entry
+ %B.addr.06 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc18 ]
+ %i.05 = phi i64 [ 0, %entry ], [ %inc19, %for.inc18 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc15, %for.cond1.preheader
+ %B.addr.14 = phi i32* [ %B.addr.06, %for.cond1.preheader ], [ %incdec.ptr, %for.inc15 ]
+ %j.03 = phi i64 [ 0, %for.cond1.preheader ], [ %inc16, %for.inc15 ]
+ br label %for.body6
+
+for.body6: ; preds = %for.body6, %for.cond4.preheader
+ %k.02 = phi i64 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
+ %B.addr.21 = phi i32* [ %B.addr.14, %for.cond4.preheader ], [ %incdec.ptr, %for.body6 ]
+ %conv = trunc i64 %i.05 to i32
+ %add = add nsw i64 %j.03, %k.02
+ %add7 = add nsw i64 %i.05, 1
+ %sub = sub nsw i64 %j.03, %i.05
+ %arrayidx9 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub, i64 %add7, i64 %add
+ store i32 %conv, i32* %arrayidx9, align 4
+ %add10 = add nsw i64 %j.03, %k.02
+ %sub11 = sub nsw i64 %j.03, %i.05
+ %arrayidx14 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub11, i64 %i.05, i64 %add10
+ %0 = load i32* %arrayidx14, align 4
+; CHECK: da analyze - consistent flow [1 1 -1]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.21, i64 1
+ store i32 %0, i32* %B.addr.21, align 4
+ %inc = add nsw i64 %k.02, 1
+ %cmp5 = icmp slt i64 %inc, 100
+ br i1 %cmp5, label %for.body6, label %for.inc15
+
+for.inc15: ; preds = %for.body6
+ %inc16 = add nsw i64 %j.03, 1
+ %cmp2 = icmp slt i64 %inc16, 100
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc18
+
+for.inc18: ; preds = %for.inc15
+ %inc19 = add nsw i64 %i.05, 1
+ %cmp = icmp slt i64 %inc19, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end20
+
+for.end20: ; preds = %for.inc18
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[i - 1][2*i] = ...
+;; ... = A[i][i + j + 110];
+
+define void @prop2([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc8, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc8 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc9, %for.inc8 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %sub = add nsw i64 %i.03, -1
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %sub, i64 %mul
+ store i32 %conv, i32* %arrayidx4, align 4
+ %add = add nsw i64 %i.03, %j.02
+ %add5 = add nsw i64 %add, 110
+ %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add5
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc8
+
+for.inc8: ; preds = %for.body3
+ %inc9 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc9, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end10
+
+for.end10: ; preds = %for.inc8
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[i][2*j + i] = ...
+;; ... = A[i][2*j - i + 5];
+
+define void @prop3([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc9, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc9 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc10, %for.inc9 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %j.02, 1
+ %add = add nsw i64 %mul, %i.03
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add
+ store i32 %conv, i32* %arrayidx4, align 4
+ %mul5 = shl nsw i64 %j.02, 1
+ %sub = sub nsw i64 %mul5, %i.03
+ %add6 = add nsw i64 %sub, 5
+ %arrayidx8 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add6
+ %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc9
+
+for.inc9: ; preds = %for.body3
+ %inc10 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc10, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end11
+
+for.end11: ; preds = %for.inc9
+ ret void
+}
+
+
+;; propagate Distance
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[i + 2][2*i + j + 1] = ...
+;; ... = A[i][2*i + j];
+
+define void @prop4([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc11, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc11 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc12, %for.inc11 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %add = add nsw i64 %mul, %j.02
+ %add4 = add nsw i64 %add, 1
+ %add5 = add nsw i64 %i.03, 2
+ %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %add5, i64 %add4
+ store i32 %conv, i32* %arrayidx6, align 4
+ %mul7 = shl nsw i64 %i.03, 1
+ %add8 = add nsw i64 %mul7, %j.02
+ %arrayidx10 = getelementptr inbounds [100 x i32]* %A, i64 %i.03, i64 %add8
+ %0 = load i32* %arrayidx10, align 4
+; CHECK: da analyze - consistent flow [2 -3]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc11
+
+for.inc11: ; preds = %for.body3
+ %inc12 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc12, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end13
+
+for.end13: ; preds = %for.inc11
+ ret void
+}
+
+
+;; propagate Point
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[3*i - 18][22 - i][2*i + j] = ...
+;; ... = A[i][i][3*i + j];
+
+define void @prop5([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc13, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc13 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc14, %for.inc13 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %add = add nsw i64 %mul, %j.02
+ %sub = sub nsw i64 22, %i.03
+ %mul4 = mul nsw i64 %i.03, 3
+ %sub5 = add nsw i64 %mul4, -18
+ %arrayidx7 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %sub5, i64 %sub, i64 %add
+ store i32 %conv, i32* %arrayidx7, align 4
+ %mul8 = mul nsw i64 %i.03, 3
+ %add9 = add nsw i64 %mul8, %j.02
+ %arrayidx12 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.03, i64 %i.03, i64 %add9
+ %0 = load i32* %arrayidx12, align 4
+; CHECK: da analyze - flow [< -16] splitable!
+; CHECK: da analyze - split level = 1, iteration = 11!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc13
+
+for.inc13: ; preds = %for.body3
+ %inc14 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc14, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end15
+
+for.end15: ; preds = %for.inc13
+ ret void
+}
+
+
+;; propagate Line
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[i + 1][4*i + j + 2] = ...
+;; ... = A[2*i][8*i + j];
+
+define void @prop6([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc12, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc12 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc13, %for.inc12 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 2
+ %add = add nsw i64 %mul, %j.02
+ %add4 = add nsw i64 %add, 2
+ %add5 = add nsw i64 %i.03, 1
+ %arrayidx6 = getelementptr inbounds [100 x i32]* %A, i64 %add5, i64 %add4
+ store i32 %conv, i32* %arrayidx6, align 4
+ %mul7 = shl nsw i64 %i.03, 3
+ %add8 = add nsw i64 %mul7, %j.02
+ %mul9 = shl nsw i64 %i.03, 1
+ %arrayidx11 = getelementptr inbounds [100 x i32]* %A, i64 %mul9, i64 %add8
+ %0 = load i32* %arrayidx11, align 4
+; CHECK: da analyze - flow [=> -2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc12
+
+for.inc12: ; preds = %for.body3
+ %inc13 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc13, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end14
+
+for.end14: ; preds = %for.inc12
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[2*i + 4][-5*i + j + 2] = ...
+;; ... = A[-2*i + 20][5*i + j];
+
+define void @prop7([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc14, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc14 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc15, %for.inc14 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -5
+ %add = add nsw i64 %mul, %j.02
+ %add4 = add nsw i64 %add, 2
+ %mul5 = shl nsw i64 %i.03, 1
+ %add6 = add nsw i64 %mul5, 4
+ %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %add6, i64 %add4
+ store i32 %conv, i32* %arrayidx7, align 4
+ %mul8 = mul nsw i64 %i.03, 5
+ %add9 = add nsw i64 %mul8, %j.02
+ %mul10 = mul nsw i64 %i.03, -2
+ %add11 = add nsw i64 %mul10, 20
+ %arrayidx13 = getelementptr inbounds [100 x i32]* %A, i64 %add11, i64 %add9
+ %0 = load i32* %arrayidx13, align 4
+; CHECK: da analyze - flow [* -38] splitable!
+; CHECK: da analyze - split level = 1, iteration = 4!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc14
+
+for.inc14: ; preds = %for.body3
+ %inc15 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc15, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end16
+
+for.end16: ; preds = %for.inc14
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[4][j + 2] = ...
+;; ... = A[-2*i + 4][5*i + j];
+
+define void @prop8([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc10, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc10 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc11, %for.inc10 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %add = add nsw i64 %j.02, 2
+ %arrayidx4 = getelementptr inbounds [100 x i32]* %A, i64 4, i64 %add
+ store i32 %conv, i32* %arrayidx4, align 4
+ %mul = mul nsw i64 %i.03, 5
+ %add5 = add nsw i64 %mul, %j.02
+ %mul6 = mul nsw i64 %i.03, -2
+ %add7 = add nsw i64 %mul6, 4
+ %arrayidx9 = getelementptr inbounds [100 x i32]* %A, i64 %add7, i64 %add5
+ %0 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - flow [p<= 2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc10
+
+for.inc10: ; preds = %for.body3
+ %inc11 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc11, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end12
+
+for.end12: ; preds = %for.inc10
+ ret void
+}
+
+
+;; for (long int i = 0; i < 100; i++)
+;; for (long int j = 0; j < 100; j++)
+;; A[2*i + 4][5*i + j + 2] = ...
+;; ... = A[4][j];
+
+define void @prop9([100 x i32]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc10, %entry
+ %B.addr.04 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc10 ]
+ %i.03 = phi i64 [ 0, %entry ], [ %inc11, %for.inc10 ]
+ br label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.02 = phi i64 [ 0, %for.cond1.preheader ], [ %inc, %for.body3 ]
+ %B.addr.11 = phi i32* [ %B.addr.04, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, 5
+ %add = add nsw i64 %mul, %j.02
+ %add4 = add nsw i64 %add, 2
+ %mul5 = shl nsw i64 %i.03, 1
+ %add6 = add nsw i64 %mul5, 4
+ %arrayidx7 = getelementptr inbounds [100 x i32]* %A, i64 %add6, i64 %add4
+ store i32 %conv, i32* %arrayidx7, align 4
+ %arrayidx9 = getelementptr inbounds [100 x i32]* %A, i64 4, i64 %j.02
+ %0 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - flow [p<= 2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.11, i64 1
+ store i32 %0, i32* %B.addr.11, align 4
+ %inc = add nsw i64 %j.02, 1
+ %cmp2 = icmp slt i64 %inc, 100
+ br i1 %cmp2, label %for.body3, label %for.inc10
+
+for.inc10: ; preds = %for.body3
+ %inc11 = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc11, 100
+ br i1 %cmp, label %for.cond1.preheader, label %for.end12
+
+for.end12: ; preds = %for.inc10
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/Separability.ll b/test/Analysis/DependenceAnalysis/Separability.ll
new file mode 100644
index 000000000000..d42d3cdb39e5
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/Separability.ll
@@ -0,0 +1,267 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'Separability.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < 50; i++)
+;; for (long int j = 0; j < 50; j++)
+;; for (long int k = 0; k < 50; k++)
+;; for (long int l = 0; l < 50; l++)
+;; A[n][i][j + k] = ...
+;; ... = A[10][i + 10][2*j - l];
+
+define void @sep0([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc22, %entry
+ %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc22 ]
+ %i.07 = phi i64 [ 0, %entry ], [ %inc23, %for.inc22 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc19, %for.cond1.preheader
+ %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc19 ]
+ %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc20, %for.inc19 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc16, %for.cond4.preheader
+ %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc16 ]
+ %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc17, %for.inc16 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
+ %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]
+ %conv = trunc i64 %i.07 to i32
+ %add = add nsw i64 %j.05, %k.03
+ %idxprom = sext i32 %n to i64
+ %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %idxprom, i64 %i.07, i64 %add
+ store i32 %conv, i32* %arrayidx11, align 4
+ %mul = shl nsw i64 %j.05, 1
+ %sub = sub nsw i64 %mul, %l.02
+ %add12 = add nsw i64 %i.07, 10
+ %arrayidx15 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 10, i64 %add12, i64 %sub
+ %0 = load i32* %arrayidx15, align 4
+; CHECK: da analyze - flow [-10 * * *]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1
+ store i32 %0, i32* %B.addr.31, align 4
+ %inc = add nsw i64 %l.02, 1
+ %cmp8 = icmp slt i64 %inc, 50
+ br i1 %cmp8, label %for.body9, label %for.inc16
+
+for.inc16: ; preds = %for.body9
+ %inc17 = add nsw i64 %k.03, 1
+ %cmp5 = icmp slt i64 %inc17, 50
+ br i1 %cmp5, label %for.cond7.preheader, label %for.inc19
+
+for.inc19: ; preds = %for.inc16
+ %inc20 = add nsw i64 %j.05, 1
+ %cmp2 = icmp slt i64 %inc20, 50
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc22
+
+for.inc22: ; preds = %for.inc19
+ %inc23 = add nsw i64 %i.07, 1
+ %cmp = icmp slt i64 %inc23, 50
+ br i1 %cmp, label %for.cond1.preheader, label %for.end24
+
+for.end24: ; preds = %for.inc22
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; for (long int j = 0; j < 50; j++)
+;; for (long int k = 0; k < 50; k++)
+;; for (long int l = 0; l < 50; l++)
+;; A[i][i][j + k] = ...
+;; ... = A[10][i + 10][2*j - l];
+
+define void @sep1([100 x [100 x i32]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc22, %entry
+ %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc22 ]
+ %i.07 = phi i64 [ 0, %entry ], [ %inc23, %for.inc22 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc19, %for.cond1.preheader
+ %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc19 ]
+ %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc20, %for.inc19 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc16, %for.cond4.preheader
+ %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc16 ]
+ %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc17, %for.inc16 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
+ %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]
+ %conv = trunc i64 %i.07 to i32
+ %add = add nsw i64 %j.05, %k.03
+ %arrayidx11 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 %i.07, i64 %i.07, i64 %add
+ store i32 %conv, i32* %arrayidx11, align 4
+ %mul = shl nsw i64 %j.05, 1
+ %sub = sub nsw i64 %mul, %l.02
+ %add12 = add nsw i64 %i.07, 10
+ %arrayidx15 = getelementptr inbounds [100 x [100 x i32]]* %A, i64 10, i64 %add12, i64 %sub
+ %0 = load i32* %arrayidx15, align 4
+; CHECK: da analyze - flow [> * * *]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1
+ store i32 %0, i32* %B.addr.31, align 4
+ %inc = add nsw i64 %l.02, 1
+ %cmp8 = icmp slt i64 %inc, 50
+ br i1 %cmp8, label %for.body9, label %for.inc16
+
+for.inc16: ; preds = %for.body9
+ %inc17 = add nsw i64 %k.03, 1
+ %cmp5 = icmp slt i64 %inc17, 50
+ br i1 %cmp5, label %for.cond7.preheader, label %for.inc19
+
+for.inc19: ; preds = %for.inc16
+ %inc20 = add nsw i64 %j.05, 1
+ %cmp2 = icmp slt i64 %inc20, 50
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc22
+
+for.inc22: ; preds = %for.inc19
+ %inc23 = add nsw i64 %i.07, 1
+ %cmp = icmp slt i64 %inc23, 50
+ br i1 %cmp, label %for.cond1.preheader, label %for.end24
+
+for.end24: ; preds = %for.inc22
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; for (long int j = 0; j < 50; j++)
+;; for (long int k = 0; k < 50; k++)
+;; for (long int l = 0; l < 50; l++)
+;; A[i][i][i + k][l] = ...
+;; ... = A[10][i + 10][j + k][l + 10];
+
+define void @sep2([100 x [100 x [100 x i32]]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc26, %entry
+ %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc26 ]
+ %i.07 = phi i64 [ 0, %entry ], [ %inc27, %for.inc26 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc23, %for.cond1.preheader
+ %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc23 ]
+ %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc24, %for.inc23 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc20, %for.cond4.preheader
+ %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc20 ]
+ %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc21, %for.inc20 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
+ %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]
+ %conv = trunc i64 %i.07 to i32
+ %add = add nsw i64 %i.07, %k.03
+ %arrayidx12 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 %i.07, i64 %i.07, i64 %add, i64 %l.02
+ store i32 %conv, i32* %arrayidx12, align 4
+ %add13 = add nsw i64 %l.02, 10
+ %add14 = add nsw i64 %j.05, %k.03
+ %add15 = add nsw i64 %i.07, 10
+ %arrayidx19 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 10, i64 %add15, i64 %add14, i64 %add13
+ %0 = load i32* %arrayidx19, align 4
+; CHECK: da analyze - flow [> * * -10]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1
+ store i32 %0, i32* %B.addr.31, align 4
+ %inc = add nsw i64 %l.02, 1
+ %cmp8 = icmp slt i64 %inc, 50
+ br i1 %cmp8, label %for.body9, label %for.inc20
+
+for.inc20: ; preds = %for.body9
+ %inc21 = add nsw i64 %k.03, 1
+ %cmp5 = icmp slt i64 %inc21, 50
+ br i1 %cmp5, label %for.cond7.preheader, label %for.inc23
+
+for.inc23: ; preds = %for.inc20
+ %inc24 = add nsw i64 %j.05, 1
+ %cmp2 = icmp slt i64 %inc24, 50
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc26
+
+for.inc26: ; preds = %for.inc23
+ %inc27 = add nsw i64 %i.07, 1
+ %cmp = icmp slt i64 %inc27, 50
+ br i1 %cmp, label %for.cond1.preheader, label %for.end28
+
+for.end28: ; preds = %for.inc26
+ ret void
+}
+
+
+;; for (long int i = 0; i < 50; i++)
+;; for (long int j = 0; j < 50; j++)
+;; for (long int k = 0; k < 50; k++)
+;; for (long int l = 0; l < 50; l++)
+;; A[i][i][i + k][l + k] = ...
+;; ... = A[10][i + 10][j + k][l + 10];
+
+define void @sep3([100 x [100 x [100 x i32]]]* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ br label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc27, %entry
+ %B.addr.08 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.inc27 ]
+ %i.07 = phi i64 [ 0, %entry ], [ %inc28, %for.inc27 ]
+ br label %for.cond4.preheader
+
+for.cond4.preheader: ; preds = %for.inc24, %for.cond1.preheader
+ %B.addr.16 = phi i32* [ %B.addr.08, %for.cond1.preheader ], [ %incdec.ptr, %for.inc24 ]
+ %j.05 = phi i64 [ 0, %for.cond1.preheader ], [ %inc25, %for.inc24 ]
+ br label %for.cond7.preheader
+
+for.cond7.preheader: ; preds = %for.inc21, %for.cond4.preheader
+ %B.addr.24 = phi i32* [ %B.addr.16, %for.cond4.preheader ], [ %incdec.ptr, %for.inc21 ]
+ %k.03 = phi i64 [ 0, %for.cond4.preheader ], [ %inc22, %for.inc21 ]
+ br label %for.body9
+
+for.body9: ; preds = %for.body9, %for.cond7.preheader
+ %l.02 = phi i64 [ 0, %for.cond7.preheader ], [ %inc, %for.body9 ]
+ %B.addr.31 = phi i32* [ %B.addr.24, %for.cond7.preheader ], [ %incdec.ptr, %for.body9 ]
+ %conv = trunc i64 %i.07 to i32
+ %add = add nsw i64 %l.02, %k.03
+ %add10 = add nsw i64 %i.07, %k.03
+ %arrayidx13 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 %i.07, i64 %i.07, i64 %add10, i64 %add
+ store i32 %conv, i32* %arrayidx13, align 4
+ %add14 = add nsw i64 %l.02, 10
+ %add15 = add nsw i64 %j.05, %k.03
+ %add16 = add nsw i64 %i.07, 10
+ %arrayidx20 = getelementptr inbounds [100 x [100 x [100 x i32]]]* %A, i64 10, i64 %add16, i64 %add15, i64 %add14
+ %0 = load i32* %arrayidx20, align 4
+; CHECK: da analyze - flow [> * * *]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.31, i64 1
+ store i32 %0, i32* %B.addr.31, align 4
+ %inc = add nsw i64 %l.02, 1
+ %cmp8 = icmp slt i64 %inc, 50
+ br i1 %cmp8, label %for.body9, label %for.inc21
+
+for.inc21: ; preds = %for.body9
+ %inc22 = add nsw i64 %k.03, 1
+ %cmp5 = icmp slt i64 %inc22, 50
+ br i1 %cmp5, label %for.cond7.preheader, label %for.inc24
+
+for.inc24: ; preds = %for.inc21
+ %inc25 = add nsw i64 %j.05, 1
+ %cmp2 = icmp slt i64 %inc25, 50
+ br i1 %cmp2, label %for.cond4.preheader, label %for.inc27
+
+for.inc27: ; preds = %for.inc24
+ %inc28 = add nsw i64 %i.07, 1
+ %cmp = icmp slt i64 %inc28, 50
+ br i1 %cmp, label %for.cond1.preheader, label %for.end29
+
+for.end29: ; preds = %for.inc27
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/StrongSIV.ll b/test/Analysis/DependenceAnalysis/StrongSIV.ll
new file mode 100644
index 000000000000..be336c3580ce
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/StrongSIV.ll
@@ -0,0 +1,342 @@
+; RUN: opt < %s -analyze -basicaa -indvars -da | FileCheck %s
+
+; ModuleID = 'StrongSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (int i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @strong0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp sgt i64 %n, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %add = add nsw i32 %i.03, 2
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 %i.03, i32* %arrayidx, align 4
+ %idxprom2 = sext i32 %i.03 to i64
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %idxprom2
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i32 %i.03, 1
+ %conv = sext i32 %inc to i64
+ %cmp = icmp slt i64 %conv, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @strong1(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ %conv = sext i32 %n to i64
+ %cmp1 = icmp sgt i32 %n, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv2 = trunc i64 %i.03 to i32
+ %add = add nsw i64 %i.03, 2
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv2, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %i.03
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp slt i64 %inc, %conv
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @strong2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %add = add i64 %i.03, 2
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.03
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - consistent flow [2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (int i = 0; i < n; i++)
+;; A[i + 2] = ...
+;; ... = A[i];
+
+define void @strong3(i32* %A, i32* %B, i32 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp sgt i32 %n, 0
+ br i1 %cmp1, label %for.body, label %for.end
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %add = add nsw i32 %i.03, 2
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i32* %A, i64 %idxprom
+ store i32 %i.03, i32* %arrayidx, align 4
+ %idxprom1 = sext i32 %i.03 to i64
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - consistent flow [2]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i32 %i.03, 1
+ %cmp = icmp slt i32 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 19; i++)
+;; A[i + 19] = ...
+;; ... = A[i];
+
+define void @strong4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %add = add i64 %i.02, 19
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 19
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 20; i++)
+;; A[i + 19] = ...
+;; ... = A[i];
+
+define void @strong5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %add = add i64 %i.02, 19
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - consistent flow [19]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 20
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 20; i++)
+;; A[2*i + 6] = ...
+;; ... = A[2*i];
+
+define void @strong6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %add = add i64 %mul, 6
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul1 = shl i64 %i.02, 1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %mul1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - consistent flow [3]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 20
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 20; i++)
+;; A[2*i + 7] = ...
+;; ... = A[2*i];
+
+define void @strong7(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %add = add i64 %mul, 7
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul1 = shl i64 %i.02, 1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %mul1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 20
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 20; i++)
+;; A[i + n] = ...
+;; ... = A[i];
+
+define void @strong8(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %add = add i64 %i.02, %n
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %i.02
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - consistent flow [%n|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 20
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[i + n] = ...
+;; ... = A[i + 2*n];
+
+define void @strong9(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %add = add i64 %i.03, %n
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %n, 1
+ %add1 = add i64 %i.03, %mul
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %add1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 1000; i++)
+;; A[n*i + 5] = ...
+;; ... = A[n*i + 5];
+
+define void @strong10(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = mul i64 %i.02, %n
+ %add = add i64 %mul, 5
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul1 = mul i64 %i.02, %n
+ %add2 = add i64 %mul1, 5
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %add2
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - consistent flow [0|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 1000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
new file mode 100644
index 000000000000..2a1b4e7e971d
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/SymbolicRDIV.ll
@@ -0,0 +1,312 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'SymbolicRDIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < n1; i++)
+;; A[2*i + n1] = ...
+;; for (long int j = 0; j < n2; j++)
+;; ... = A[3*j + 3*n1];
+
+define void @symbolicrdiv0(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader: ; preds = %for.body, %entry
+ %cmp21 = icmp eq i64 %n2, 0
+ br i1 %cmp21, label %for.end11, label %for.body4
+
+for.body: ; preds = %for.body, %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %conv = trunc i64 %i.05 to i32
+ %mul = shl nsw i64 %i.05, 1
+ %add = add i64 %mul, %n1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc, %n1
+ br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4: ; preds = %for.body4, %for.cond1.preheader
+ %j.03 = phi i64 [ %inc10, %for.body4 ], [ 0, %for.cond1.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+ %mul56 = add i64 %j.03, %n1
+ %add7 = mul i64 %mul56, 3
+ %arrayidx8 = getelementptr inbounds i32* %A, i64 %add7
+ %0 = load i32* %arrayidx8, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc10 = add nsw i64 %j.03, 1
+ %cmp2 = icmp ult i64 %inc10, %n2
+ br i1 %cmp2, label %for.body4, label %for.end11
+
+for.end11: ; preds = %for.body4, %for.cond1.preheader
+ ret void
+}
+
+
+;; for (long int i = 0; i < n1; i++)
+;; A[2*i + 5*n2] = ...
+;; for (long int j = 0; j < n2; j++)
+;; ... = A[3*j + 2*n2];
+
+define void @symbolicrdiv1(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.cond2.preheader, label %for.body
+
+for.cond2.preheader: ; preds = %for.body, %entry
+ %cmp31 = icmp eq i64 %n2, 0
+ br i1 %cmp31, label %for.end12, label %for.body5
+
+for.body: ; preds = %for.body, %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %conv = trunc i64 %i.05 to i32
+ %mul = shl nsw i64 %i.05, 1
+ %mul1 = mul i64 %n2, 5
+ %add = add i64 %mul, %mul1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc, %n1
+ br i1 %cmp, label %for.body, label %for.cond2.preheader
+
+for.body5: ; preds = %for.body5, %for.cond2.preheader
+ %j.03 = phi i64 [ %inc11, %for.body5 ], [ 0, %for.cond2.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body5 ], [ %B, %for.cond2.preheader ]
+ %mul6 = mul nsw i64 %j.03, 3
+ %mul7 = shl i64 %n2, 1
+ %add8 = add i64 %mul6, %mul7
+ %arrayidx9 = getelementptr inbounds i32* %A, i64 %add8
+ %0 = load i32* %arrayidx9, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc11 = add nsw i64 %j.03, 1
+ %cmp3 = icmp ult i64 %inc11, %n2
+ br i1 %cmp3, label %for.body5, label %for.end12
+
+for.end12: ; preds = %for.body5, %for.cond2.preheader
+ ret void
+}
+
+
+;; for (long int i = 0; i < n1; i++)
+;; A[2*i - n2] = ...
+;; for (long int j = 0; j < n2; j++)
+;; ... = A[-j + 2*n1];
+
+define void @symbolicrdiv2(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader: ; preds = %for.body, %entry
+ %cmp21 = icmp eq i64 %n2, 0
+ br i1 %cmp21, label %for.end10, label %for.body4
+
+for.body: ; preds = %for.body, %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %conv = trunc i64 %i.05 to i32
+ %mul = shl nsw i64 %i.05, 1
+ %sub = sub i64 %mul, %n2
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc, %n1
+ br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4: ; preds = %for.body4, %for.cond1.preheader
+ %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+ %mul6 = shl i64 %n1, 1
+ %add = sub i64 %mul6, %j.03
+ %arrayidx7 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc9 = add nsw i64 %j.03, 1
+ %cmp2 = icmp ult i64 %inc9, %n2
+ br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10: ; preds = %for.body4, %for.cond1.preheader
+ ret void
+}
+
+
+;; for (long int i = 0; i < n1; i++)
+;; A[-i + n2] = ...
+;; for (long int j = 0; j < n2; j++)
+;; ... = A[j - n1];
+
+define void @symbolicrdiv3(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader: ; preds = %for.body, %entry
+ %cmp21 = icmp eq i64 %n2, 0
+ br i1 %cmp21, label %for.end9, label %for.body4
+
+for.body: ; preds = %for.body, %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %conv = trunc i64 %i.05 to i32
+ %add = sub i64 %n2, %i.05
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc, %n1
+ br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4: ; preds = %for.body4, %for.cond1.preheader
+ %j.03 = phi i64 [ %inc8, %for.body4 ], [ 0, %for.cond1.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+ %sub5 = sub i64 %j.03, %n1
+ %arrayidx6 = getelementptr inbounds i32* %A, i64 %sub5
+ %0 = load i32* %arrayidx6, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc8 = add nsw i64 %j.03, 1
+ %cmp2 = icmp ult i64 %inc8, %n2
+ br i1 %cmp2, label %for.body4, label %for.end9
+
+for.end9: ; preds = %for.body4, %for.cond1.preheader
+ ret void
+}
+
+
+;; for (long int i = 0; i < n1; i++)
+;; A[-i + 2*n1] = ...
+;; for (long int j = 0; j < n2; j++)
+;; ... = A[-j + n1];
+
+define void @symbolicrdiv4(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader: ; preds = %for.body, %entry
+ %cmp21 = icmp eq i64 %n2, 0
+ br i1 %cmp21, label %for.end10, label %for.body4
+
+for.body: ; preds = %for.body, %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %conv = trunc i64 %i.05 to i32
+ %mul = shl i64 %n1, 1
+ %add = sub i64 %mul, %i.05
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc, %n1
+ br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4: ; preds = %for.body4, %for.cond1.preheader
+ %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+ %add6 = sub i64 %n1, %j.03
+ %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc9 = add nsw i64 %j.03, 1
+ %cmp2 = icmp ult i64 %inc9, %n2
+ br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10: ; preds = %for.body4, %for.cond1.preheader
+ ret void
+}
+
+
+;; for (long int i = 0; i < n1; i++)
+;; A[-i + n2] = ...
+;; for (long int j = 0; j < n2; j++)
+;; ... = A[-j + 2*n2];
+
+define void @symbolicrdiv5(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.cond1.preheader, label %for.body
+
+for.cond1.preheader: ; preds = %for.body, %entry
+ %cmp21 = icmp eq i64 %n2, 0
+ br i1 %cmp21, label %for.end10, label %for.body4
+
+for.body: ; preds = %for.body, %entry
+ %i.05 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %conv = trunc i64 %i.05 to i32
+ %add = sub i64 %n2, %i.05
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %inc = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc, %n1
+ br i1 %cmp, label %for.body, label %for.cond1.preheader
+
+for.body4: ; preds = %for.body4, %for.cond1.preheader
+ %j.03 = phi i64 [ %inc9, %for.body4 ], [ 0, %for.cond1.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body4 ], [ %B, %for.cond1.preheader ]
+ %mul = shl i64 %n2, 1
+ %add6 = sub i64 %mul, %j.03
+ %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6
+ %0 = load i32* %arrayidx7, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc9 = add nsw i64 %j.03, 1
+ %cmp2 = icmp ult i64 %inc9, %n2
+ br i1 %cmp2, label %for.body4, label %for.end10
+
+for.end10: ; preds = %for.body4, %for.cond1.preheader
+ ret void
+}
+
+
+;; for (long int i = 0; i < n1; i++)
+;; for (long int j = 0; j < n2; j++)
+;; A[j -i + n2] = ...
+;; ... = A[2*n2];
+
+define void @symbolicrdiv6(i32* %A, i32* %B, i64 %n1, i64 %n2) nounwind uwtable ssp {
+entry:
+ %cmp4 = icmp eq i64 %n1, 0
+ br i1 %cmp4, label %for.end7, label %for.cond1.preheader
+
+for.cond1.preheader: ; preds = %for.inc5, %entry
+ %B.addr.06 = phi i32* [ %B.addr.1.lcssa, %for.inc5 ], [ %B, %entry ]
+ %i.05 = phi i64 [ %inc6, %for.inc5 ], [ 0, %entry ]
+ %cmp21 = icmp eq i64 %n2, 0
+ br i1 %cmp21, label %for.inc5, label %for.body3
+
+for.body3: ; preds = %for.body3, %for.cond1.preheader
+ %j.03 = phi i64 [ %inc, %for.body3 ], [ 0, %for.cond1.preheader ]
+ %B.addr.12 = phi i32* [ %incdec.ptr, %for.body3 ], [ %B.addr.06, %for.cond1.preheader ]
+ %conv = trunc i64 %i.05 to i32
+ %sub = sub nsw i64 %j.03, %i.05
+ %add = add i64 %sub, %n2
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %n2, 1
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 %mul
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.12, i64 1
+ store i32 %0, i32* %B.addr.12, align 4
+ %inc = add nsw i64 %j.03, 1
+ %cmp2 = icmp ult i64 %inc, %n2
+ br i1 %cmp2, label %for.body3, label %for.inc5
+
+for.inc5: ; preds = %for.body3, %for.cond1.preheader
+ %B.addr.1.lcssa = phi i32* [ %B.addr.06, %for.cond1.preheader ], [ %incdec.ptr, %for.body3 ]
+ %inc6 = add nsw i64 %i.05, 1
+ %cmp = icmp ult i64 %inc6, %n1
+ br i1 %cmp, label %for.cond1.preheader, label %for.end7
+
+for.end7: ; preds = %for.inc5, %entry
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/SymbolicSIV.ll b/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
new file mode 100644
index 000000000000..ee2343fa51e9
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/SymbolicSIV.ll
@@ -0,0 +1,330 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'SymbolicSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[2*i + n] = ...
+;; ... = A[3*i + 3*n];
+
+define void @symbolicsiv0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %add = add i64 %mul, %n
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul14 = add i64 %i.03, %n
+ %add3 = mul i64 %mul14, 3
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 %add3
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[2*i + 5*n] = ...
+;; ... = A[3*i + 2*n];
+
+define void @symbolicsiv1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %mul1 = mul i64 %n, 5
+ %add = add i64 %mul, %mul1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul2 = mul nsw i64 %i.03, 3
+ %mul3 = shl i64 %n, 1
+ %add4 = add i64 %mul2, %mul3
+ %arrayidx5 = getelementptr inbounds i32* %A, i64 %add4
+ %0 = load i32* %arrayidx5, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[2*i - n] = ...
+;; ... = A[-i + 2*n];
+
+define void @symbolicsiv2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl nsw i64 %i.03, 1
+ %sub = sub i64 %mul, %n
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul2 = shl i64 %n, 1
+ %add = sub i64 %mul2, %i.03
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[-2*i + n + 1] = ...
+;; ... = A[i - 2*n];
+
+define void @symbolicsiv3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -2
+ %add = add i64 %mul, %n
+ %add1 = add i64 %add, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add1
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul2 = shl i64 %n, 1
+ %sub = sub i64 %i.03, %mul2
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[-2*i + 3*n] = ...
+;; ... = A[-i + n];
+
+define void @symbolicsiv4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -2
+ %mul1 = mul i64 %n, 3
+ %add = add i64 %mul, %mul1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %add2 = sub i64 %n, %i.03
+ %arrayidx3 = getelementptr inbounds i32* %A, i64 %add2
+ %0 = load i32* %arrayidx3, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long int i = 0; i < n; i++)
+;; A[-2*i - 2*n] = ...
+;; ... = A[-i - n];
+
+define void @symbolicsiv5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul nsw i64 %i.03, -2
+ %mul1 = shl i64 %n, 1
+ %sub = sub i64 %mul, %mul1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %sub
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub2 = sub nsw i64 0, %i.03
+ %sub3 = sub i64 %sub2, %n
+ %arrayidx4 = getelementptr inbounds i32* %A, i64 %sub3
+ %0 = load i32* %arrayidx4, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; why doesn't the SCEV package understand that n >= 0?
+;; void weaktest(int *A, int *B, long unsigned n)
+;; for (long unsigned i = 0; i < n; i++)
+;; A[i + n + 1] = ...
+;; ... = A[-i];
+
+define void @weaktest(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %add = add i64 %i.03, %n
+ %add1 = add i64 %add, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add1
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub = sub i64 0, %i.03
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [*|<] splitable!
+; CHECK: da analyze - split level = 1, iteration = ((0 smax (-1 + (-1 * %n))) /u 2)!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; void symbolicsiv6(int *A, int *B, long unsigned n, long unsigned N, long unsigned M) {
+;; for (long int i = 0; i < n; i++) {
+;; A[4*N*i + M] = i;
+;; *B++ = A[4*N*i + 3*M + 1];
+
+define void @symbolicsiv6(i32* %A, i32* %B, i64 %n, i64 %N, i64 %M) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %for.body.preheader ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl i64 %N, 2
+ %mul1 = mul i64 %mul, %i.03
+ %add = add i64 %mul1, %M
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul2 = shl i64 %N, 2
+ %mul3 = mul i64 %mul2, %i.03
+ %mul4 = mul i64 %M, 3
+ %add5 = add i64 %mul3, %mul4
+ %add6 = add i64 %add5, 1
+ %arrayidx7 = getelementptr inbounds i32* %A, i64 %add6
+ %0 = load i32* %arrayidx7, align 4
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+; CHECK: da analyze - none!
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %exitcond = icmp ne i64 %inc, %n
+ br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
+
+
+;; void symbolicsiv7(int *A, int *B, long unsigned n, long unsigned N, long unsigned M) {
+;; for (long int i = 0; i < n; i++) {
+;; A[2*N*i + M] = i;
+;; *B++ = A[2*N*i - 3*M + 2];
+
+define void @symbolicsiv7(i32* %A, i32* %B, i64 %n, i64 %N, i64 %M) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body.preheader
+
+for.body.preheader: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %for.body.preheader ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = shl i64 %N, 1
+ %mul1 = mul i64 %mul, %i.03
+ %add = add i64 %mul1, %M
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul2 = shl i64 %N, 1
+ %mul3 = mul i64 %mul2, %i.03
+ %0 = mul i64 %M, -3
+ %sub = add i64 %mul3, %0
+ %add5 = add i64 %sub, 2
+ %arrayidx6 = getelementptr inbounds i32* %A, i64 %add5
+ %1 = load i32* %arrayidx6, align 4
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+; CHECK: da analyze - flow [<>]!
+ store i32 %1, i32* %B.addr.02, align 4
+ %inc = add nsw i64 %i.03, 1
+ %exitcond = icmp ne i64 %inc, %n
+ br i1 %exitcond, label %for.body, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %entry
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll b/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll
new file mode 100644
index 000000000000..343e8f49bf9e
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/WeakCrossingSIV.ll
@@ -0,0 +1,220 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'WeakCrossingSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[1 + n*i] = ...
+;; ... = A[1 - n*i];
+
+define void @weakcrossing0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul i64 %i.03, %n
+ %add = add i64 %mul, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul1 = mul i64 %i.03, %n
+ %sub = sub i64 1, %mul1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [0|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[n + i] = ...
+;; ... = A[1 + n - i];
+
+define void @weakcrossing1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %add = add i64 %i.03, %n
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %add1 = add i64 %n, 1
+ %sub = sub i64 %add1, %i.03
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - flow [<>] splitable!
+; CHECK: da analyze - split level = 1, iteration = 0!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 3; i++)
+;; A[i] = ...
+;; ... = A[6 - i];
+
+define void @weakcrossing2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub = sub i64 6, %i.02
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 3
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 4; i++)
+;; A[i] = ...
+;; ... = A[6 - i];
+
+define void @weakcrossing3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub = sub i64 6, %i.02
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [0|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 4
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 10; i++)
+;; A[i] = ...
+;; ... = A[-6 - i];
+
+define void @weakcrossing4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub = sub i64 -6, %i.02
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 10
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[3*i] = ...
+;; ... = A[5 - 3*i];
+
+define void @weakcrossing5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul i64 %i.03, 3
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %0 = mul i64 %i.03, -3
+ %sub = add i64 %0, 5
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %sub
+ %1 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %1, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 4; i++)
+;; A[i] = ...
+;; ... = A[5 - i];
+
+define void @weakcrossing6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 %i.02
+ store i32 %conv, i32* %arrayidx, align 4
+ %sub = sub i64 5, %i.02
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %sub
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [<>] splitable!
+; CHECK: da analyze - split level = 1, iteration = 2!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 4
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll b/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll
new file mode 100644
index 000000000000..a59871602b6c
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/WeakZeroDstSIV.ll
@@ -0,0 +1,212 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'WeakZeroDstSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long unsigned i = 0; i < 30; i++)
+;; A[2*i + 10] = ...
+;; ... = A[10];
+
+define void @weakzerodst0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %add = add i64 %mul, 10
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 30
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[n*i + 10] = ...
+;; ... = A[10];
+
+define void @weakzerodst1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul i64 %i.03, %n
+ %add = add i64 %mul, 10
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 5; i++)
+;; A[2*i] = ...
+;; ... = A[10];
+
+define void @weakzerodst2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 5
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 6; i++)
+;; A[2*i] = ...
+;; ... = A[10];
+
+define void @weakzerodst3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>p|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 6
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 7; i++)
+;; A[2*i] = ...
+;; ... = A[10];
+
+define void @weakzerodst4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 7
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 7; i++)
+;; A[2*i] = ...
+;; ... = A[-10];
+
+define void @weakzerodst5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %mul = shl i64 %i.02, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 -10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 7
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[3*i] = ...
+;; ... = A[10];
+
+define void @weakzerodst6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %mul = mul i64 %i.03, 3
+ %arrayidx = getelementptr inbounds i32* %A, i64 %mul
+ store i32 %conv, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 10
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll b/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll
new file mode 100644
index 000000000000..fd4f46269546
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/WeakZeroSrcSIV.ll
@@ -0,0 +1,212 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'WeakZeroSrcSIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; for (long unsigned i = 0; i < 30; i++)
+;; A[10] = ...
+;; ... = A[2*i + 10];
+
+define void @weakzerosrc0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %i.02, 1
+ %add = add i64 %mul, 10
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 30
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[10] = ...
+;; ... = A[n*i + 10];
+
+define void @weakzerosrc1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = mul i64 %i.03, %n
+ %add = add i64 %mul, 10
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [p<=|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 5; i++)
+;; A[10] = ...
+;; ... = A[2*i];
+
+define void @weakzerosrc2(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %i.02, 1
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 5
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 6; i++)
+;; A[10] = ...
+;; ... = A[2*i];
+
+define void @weakzerosrc3(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %i.02, 1
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [=>p|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 6
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 7; i++)
+;; A[10] = ...
+;; ... = A[2*i];
+
+define void @weakzerosrc4(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %i.02, 1
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow [*|<]!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 7
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < 7; i++)
+;; A[-10] = ...
+;; ... = A[2*i];
+
+define void @weakzerosrc5(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.02 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %B.addr.01 = phi i32* [ %B, %entry ], [ %incdec.ptr, %for.body ]
+ %conv = trunc i64 %i.02 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 -10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = shl i64 %i.02, 1
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.01, i64 1
+ store i32 %0, i32* %B.addr.01, align 4
+ %inc = add i64 %i.02, 1
+ %cmp = icmp ult i64 %inc, 7
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+
+;; for (long unsigned i = 0; i < n; i++)
+;; A[10] = ...
+;; ... = A[3*i];
+
+define void @weakzerosrc6(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %cmp1 = icmp eq i64 %n, 0
+ br i1 %cmp1, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %i.03 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+ %B.addr.02 = phi i32* [ %incdec.ptr, %for.body ], [ %B, %entry ]
+ %conv = trunc i64 %i.03 to i32
+ %arrayidx = getelementptr inbounds i32* %A, i64 10
+ store i32 %conv, i32* %arrayidx, align 4
+ %mul = mul i64 %i.03, 3
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %mul
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ %incdec.ptr = getelementptr inbounds i32* %B.addr.02, i64 1
+ store i32 %0, i32* %B.addr.02, align 4
+ %inc = add i64 %i.03, 1
+ %cmp = icmp ult i64 %inc, %n
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/ZIV.ll b/test/Analysis/DependenceAnalysis/ZIV.ll
new file mode 100644
index 000000000000..42b2389df268
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/ZIV.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -analyze -basicaa -da | FileCheck %s
+
+; ModuleID = 'ZIV.bc'
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.6.0"
+
+
+;; A[n + 1] = ...
+;; ... = A[1 + n];
+
+define void @z0(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %add = add i64 %n, 1
+ %arrayidx = getelementptr inbounds i32* %A, i64 %add
+ store i32 0, i32* %arrayidx, align 4
+ %add1 = add i64 %n, 1
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %add1
+ %0 = load i32* %arrayidx2, align 4
+; CHECK: da analyze - consistent flow!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
+
+
+;; A[n] = ...
+;; ... = A[n + 1];
+
+define void @z1(i32* %A, i32* %B, i64 %n) nounwind uwtable ssp {
+entry:
+ %arrayidx = getelementptr inbounds i32* %A, i64 %n
+ store i32 0, i32* %arrayidx, align 4
+ %add = add i64 %n, 1
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %add
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - none!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
+
+
+;; A[n] = ...
+;; ... = A[m];
+
+define void @z2(i32* %A, i32* %B, i64 %n, i64 %m) nounwind uwtable ssp {
+entry:
+ %arrayidx = getelementptr inbounds i32* %A, i64 %n
+ store i32 0, i32* %arrayidx, align 4
+ %arrayidx1 = getelementptr inbounds i32* %A, i64 %m
+ %0 = load i32* %arrayidx1, align 4
+; CHECK: da analyze - flow!
+ store i32 %0, i32* %B, align 4
+ ret void
+}
diff --git a/test/Analysis/DependenceAnalysis/lit.local.cfg b/test/Analysis/DependenceAnalysis/lit.local.cfg
new file mode 100644
index 000000000000..c6106e4746f2
--- /dev/null
+++ b/test/Analysis/DependenceAnalysis/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Analysis/LoopDependenceAnalysis/alias.ll b/test/Analysis/LoopDependenceAnalysis/alias.ll
deleted file mode 100644
index 78d0bf4fee1a..000000000000
--- a/test/Analysis/LoopDependenceAnalysis/alias.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-;; x[5] = x[6] // with x being a pointer passed as argument
-
-define void @f1(i32* nocapture %xptr) nounwind {
-entry:
- %x.ld.addr = getelementptr i32* %xptr, i64 6
- %x.st.addr = getelementptr i32* %xptr, i64 5
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x = load i32* %x.ld.addr
- store i32 %x, i32* %x.st.addr
-; CHECK: 0,1: dep
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; x[5] = x[6] // with x being an array on the stack
-
-define void @foo(...) nounwind {
-entry:
- %xptr = alloca [256 x i32], align 4
- %x.ld.addr = getelementptr [256 x i32]* %xptr, i64 0, i64 6
- %x.st.addr = getelementptr [256 x i32]* %xptr, i64 0, i64 5
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x = load i32* %x.ld.addr
- store i32 %x, i32* %x.st.addr
-; CHECK: 0,1: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/siv-strong.ll b/test/Analysis/LoopDependenceAnalysis/siv-strong.ll
deleted file mode 100644
index 401e466d6669..000000000000
--- a/test/Analysis/LoopDependenceAnalysis/siv-strong.ll
+++ /dev/null
@@ -1,110 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-@y = common global [256 x i32] zeroinitializer, align 4
-
-;; for (i = 0; i < 256; i++)
-;; x[i] = x[i] + y[i]
-
-define void @f1(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %x = load i32* %x.addr ; 0
- %y = load i32* %y.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; for (i = 0; i < 256; i++)
-;; x[i+1] = x[i] + y[i]
-
-define void @f2(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %i.next = add i64 %i, 1
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.next
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; for (i = 0; i < 10; i++)
-;; x[i+20] = x[i] + y[i]
-
-define void @f3(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %i.20 = add i64 %i, 20
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.20
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 10
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; for (i = 0; i < 10; i++)
-;; x[10*i+1] = x[10*i] + y[i]
-
-define void @f4(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %i.10 = mul i64 %i, 10
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i.10
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.10
- %i.10.1 = add i64 %i.10, 1
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.10.1
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 10
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll b/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll
deleted file mode 100644
index 9d0128c5fec4..000000000000
--- a/test/Analysis/LoopDependenceAnalysis/siv-weak-crossing.ll
+++ /dev/null
@@ -1,118 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-@y = common global [256 x i32] zeroinitializer, align 4
-
-;; for (i = 0; i < 256; i++)
-;; x[i] = x[255 - i] + y[i]
-
-define void @f1(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %i.255 = sub i64 255, %i
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.255
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; for (i = 0; i < 100; i++)
-;; x[i] = x[255 - i] + y[i]
-
-define void @f2(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %i.255 = sub i64 255, %i
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.255
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 100
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; // the first iteration (i=0) leads to an out-of-bounds access of x. as the
-;; // result of this access is undefined, _any_ dependence result is safe.
-;; for (i = 0; i < 256; i++)
-;; x[i] = x[256 - i] + y[i]
-
-define void @f3(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %i.256 = sub i64 0, %i
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 1, i64 %i.256
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2:
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; // slightly contrived but valid IR for the following loop, where all
-;; // accesses in all iterations are within bounds. while this example's first
-;; // (ZIV-)subscript is (0, 1), accesses are dependent.
-;; for (i = 1; i < 256; i++)
-;; x[i] = x[256 - i] + y[i]
-
-define void @f4(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %i.1 = add i64 1, %i
- %i.256 = sub i64 -1, %i
- %y.ld.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i.1
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 1, i64 %i.256
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i.1
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.ld.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.st.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll b/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll
deleted file mode 100644
index 1c5ae4c490e3..000000000000
--- a/test/Analysis/LoopDependenceAnalysis/siv-weak-zero.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-@y = common global [256 x i32] zeroinitializer, align 4
-
-;; for (i = 0; i < 256; i++)
-;; x[i] = x[42] + y[i]
-
-define void @f1(...) nounwind {
-entry:
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 42
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; for (i = 0; i < 250; i++)
-;; x[i] = x[255] + y[i]
-
-define void @f2(...) nounwind {
-entry:
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 255
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x.addr = getelementptr [256 x i32]* @x, i64 0, i64 %i
- %y.addr = getelementptr [256 x i32]* @y, i64 0, i64 %i
- %x = load i32* %x.ld.addr ; 0
- %y = load i32* %y.addr ; 1
- %r = add i32 %y, %x
- store i32 %r, i32* %x.addr ; 2
-; CHECK: 0,2: dep
-; CHECK: 1,2: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 250
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
diff --git a/test/Analysis/LoopDependenceAnalysis/ziv.ll b/test/Analysis/LoopDependenceAnalysis/ziv.ll
deleted file mode 100644
index 645ae7f152e2..000000000000
--- a/test/Analysis/LoopDependenceAnalysis/ziv.ll
+++ /dev/null
@@ -1,63 +0,0 @@
-; RUN: opt < %s -analyze -basicaa -lda | FileCheck %s
-
-@x = common global [256 x i32] zeroinitializer, align 4
-
-;; x[5] = x[6]
-
-define void @f1(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x = load i32* getelementptr ([256 x i32]* @x, i32 0, i64 6)
- store i32 %x, i32* getelementptr ([256 x i32]* @x, i32 0, i64 5)
-; CHECK: 0,1: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; x[c] = x[c+1] // with c being a loop-invariant constant
-
-define void @f2(i64 %c0) nounwind {
-entry:
- %c1 = add i64 %c0, 1
- %x.ld.addr = getelementptr [256 x i32]* @x, i64 0, i64 %c0
- %x.st.addr = getelementptr [256 x i32]* @x, i64 0, i64 %c1
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x = load i32* %x.ld.addr
- store i32 %x, i32* %x.st.addr
-; CHECK: 0,1: ind
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
-
-;; x[6] = x[6]
-
-define void @f3(...) nounwind {
-entry:
- br label %for.body
-
-for.body:
- %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
- %x = load i32* getelementptr ([256 x i32]* @x, i32 0, i64 6)
- store i32 %x, i32* getelementptr ([256 x i32]* @x, i32 0, i64 6)
-; CHECK: 0,1: dep
- %i.next = add i64 %i, 1
- %exitcond = icmp eq i64 %i.next, 256
- br i1 %exitcond, label %for.end, label %for.body
-
-for.end:
- ret void
-}
diff --git a/test/Analysis/Profiling/load-branch-weights-ifs.ll b/test/Analysis/Profiling/load-branch-weights-ifs.ll
new file mode 100644
index 000000000000..7ed090b7c366
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-ifs.ll
@@ -0,0 +1,122 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN: -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN: | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_mod - Branch taken 6 times in 7.
+define i32 @func_mod(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %rem = srem i32 %0, 7
+ %tobool = icmp ne i32 %rem, 0
+ br i1 %tobool, label %if.then, label %if.else
+; CHECK: br i1 %tobool, label %if.then, label %if.else, !prof !0
+
+if.then:
+ store i32 1, i32* %retval
+ br label %return
+
+if.else:
+ store i32 0, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+;; func_const_true - conditional branch with 100% taken probability.
+define i32 @func_const_true(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %cmp = icmp eq i32 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !1
+
+if.then:
+ store i32 1, i32* %retval
+ br label %return
+
+if.end:
+ store i32 0, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+;; func_const_false - conditional branch with 100% not-taken probability.
+define i32 @func_const_false(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %cmp = icmp eq i32 %0, 1
+ br i1 %cmp, label %if.then, label %if.end
+; CHECK: br i1 %cmp, label %if.then, label %if.end, !prof !2
+
+if.then:
+ store i32 1, i32* %retval
+ br label %return
+
+if.end:
+ store i32 0, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ %loop = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %cmp = icmp slt i32 %0, 7000
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !3
+
+for.body:
+ %1 = load i32* %loop, align 4
+ %call = call i32 @func_mod(i32 %1)
+ br label %for.inc
+
+for.inc:
+ %2 = load i32* %loop, align 4
+ %inc = add nsw i32 %2, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ %call1 = call i32 @func_const_true(i32 1)
+ %call2 = call i32 @func_const_false(i32 0)
+ ret i32 0
+}
+
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 6000, i32 1000}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 0}
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7000, i32 1}
+; CHECK-NOT: !4
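+
+;; For reference, the expected weights above follow from @main: the for loop
+;; calls func_mod 7000 times, and N % 7 != 0 for 6000 of the values 0..6999,
+;; giving !0 = {6000, 1000}; func_const_true and func_const_false are each
+;; called once with a constant argument, giving {1, 0} and {0, 1}; the loop
+;; branch itself is taken 7000 times and exits once, giving !3 = {7000, 1}.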
diff --git a/test/Analysis/Profiling/load-branch-weights-loops.ll b/test/Analysis/Profiling/load-branch-weights-loops.ll
new file mode 100644
index 000000000000..9d1925a2d701
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-loops.ll
@@ -0,0 +1,188 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN: -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN: | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_for - Test branch probabilities for a vanilla for loop.
+define i32 @func_for(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %1 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !0
+
+for.body:
+ %2 = load i32* %N.addr, align 4
+ %3 = load i32* %ret, align 4
+ %add = add nsw i32 %3, %2
+ store i32 %add, i32* %ret, align 4
+ br label %for.inc
+
+for.inc:
+ %4 = load i32* %loop, align 4
+ %inc = add nsw i32 %4, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ %5 = load i32* %ret, align 4
+ ret i32 %5
+}
+
+;; func_for_odd - Test branch probabilities for a for loop with a continue and
+;; a break.
+define i32 @func_for_odd(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %1 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !1
+
+for.body:
+ %2 = load i32* %loop, align 4
+ %rem = srem i32 %2, 10
+ %tobool = icmp ne i32 %rem, 0
+ br i1 %tobool, label %if.then, label %if.end
+; CHECK: br i1 %tobool, label %if.then, label %if.end, !prof !2
+
+if.then:
+ br label %for.inc
+
+if.end:
+ %3 = load i32* %loop, align 4
+ %cmp1 = icmp eq i32 %3, 500
+ br i1 %cmp1, label %if.then2, label %if.end3
+; CHECK: br i1 %cmp1, label %if.then2, label %if.end3, !prof !3
+
+if.then2:
+ br label %for.end
+
+if.end3:
+ %4 = load i32* %N.addr, align 4
+ %5 = load i32* %ret, align 4
+ %add = add nsw i32 %5, %4
+ store i32 %add, i32* %ret, align 4
+ br label %for.inc
+
+for.inc:
+ %6 = load i32* %loop, align 4
+ %inc = add nsw i32 %6, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ %7 = load i32* %ret, align 4
+ ret i32 %7
+}
+
+;; func_while - Test branch probability in a vanilla while loop.
+define i32 @func_while(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %while.cond
+
+while.cond:
+ %0 = load i32* %loop, align 4
+ %1 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %while.body, label %while.end
+; CHECK: br i1 %cmp, label %while.body, label %while.end, !prof !0
+
+while.body:
+ %2 = load i32* %N.addr, align 4
+ %3 = load i32* %ret, align 4
+ %add = add nsw i32 %3, %2
+ store i32 %add, i32* %ret, align 4
+ %4 = load i32* %loop, align 4
+ %inc = add nsw i32 %4, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %while.cond
+
+while.end:
+ %5 = load i32* %ret, align 4
+ ret i32 %5
+}
+
+;; func_do_while - Test branch probability in a vanilla do-while loop.
+define i32 @func_do_while(i32 %N) nounwind uwtable {
+entry:
+ %N.addr = alloca i32, align 4
+ %ret = alloca i32, align 4
+ %loop = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ store i32 0, i32* %ret, align 4
+ store i32 0, i32* %loop, align 4
+ br label %do.body
+
+do.body:
+ %0 = load i32* %N.addr, align 4
+ %1 = load i32* %ret, align 4
+ %add = add nsw i32 %1, %0
+ store i32 %add, i32* %ret, align 4
+ %2 = load i32* %loop, align 4
+ %inc = add nsw i32 %2, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %do.cond
+
+do.cond:
+ %3 = load i32* %loop, align 4
+ %4 = load i32* %N.addr, align 4
+ %cmp = icmp slt i32 %3, %4
+ br i1 %cmp, label %do.body, label %do.end
+; CHECK: br i1 %cmp, label %do.body, label %do.end, !prof !4
+
+do.end:
+ %5 = load i32* %ret, align 4
+ ret i32 %5
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ store i32 0, i32* %retval
+ %call = call i32 @func_for(i32 1000)
+ %call1 = call i32 @func_for_odd(i32 1000)
+ %call2 = call i32 @func_while(i32 1000)
+ %call3 = call i32 @func_do_while(i32 1000)
+ ret i32 0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1000, i32 1}
+!1 = metadata !{metadata !"branch_weights", i32 501, i32 0}
+!2 = metadata !{metadata !"branch_weights", i32 450, i32 51}
+!3 = metadata !{metadata !"branch_weights", i32 1, i32 50}
+!4 = metadata !{metadata !"branch_weights", i32 999, i32 1}
+; CHECK-NOT: !5
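+
+;; For reference, the weights above follow from the 1000-iteration calls in
+;; @main: the plain for and while loop back-edges are taken 1000 times and
+;; exit once ({1000, 1}); func_for_odd leaves its loop via the break at
+;; i == 500, so its loop test is taken 501 times and never falls through
+;; ({501, 0}), its continue test sees i % 10 != 0 in 450 of those 501
+;; iterations ({450, 51}), and its break test fires once ({1, 50}); the
+;; do-while condition is true 999 times and false once ({999, 1}).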
diff --git a/test/Analysis/Profiling/load-branch-weights-switches.ll b/test/Analysis/Profiling/load-branch-weights-switches.ll
new file mode 100644
index 000000000000..5587c7172bb6
--- /dev/null
+++ b/test/Analysis/Profiling/load-branch-weights-switches.ll
@@ -0,0 +1,165 @@
+; RUN: opt -insert-edge-profiling -o %t1 < %s
+; RUN: rm -f %t1.prof_data
+; RUN: lli %defaultjit -load %llvmshlibdir/libprofile_rt%shlibext %t1 \
+; RUN: -llvmprof-output %t1.prof_data
+; RUN: opt -profile-file %t1.prof_data -profile-metadata-loader -S -o - < %s \
+; RUN: | FileCheck %s
+; RUN: rm -f %t1.prof_data
+
+; FIXME: profile_rt.dll could be built on win32.
+; REQUIRES: loadable_module
+
+;; func_switch - Test branch probabilities for a switch instruction with an
+;; even chance of taking each case (or no case).
+define i32 @func_switch(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %rem = srem i32 %0, 4
+ switch i32 %rem, label %sw.epilog [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ ]
+; CHECK: ], !prof !0
+
+sw.bb:
+ store i32 5, i32* %retval
+ br label %return
+
+sw.bb1:
+ store i32 6, i32* %retval
+ br label %return
+
+sw.bb2:
+ store i32 7, i32* %retval
+ br label %return
+
+sw.epilog:
+ store i32 8, i32* %retval
+ br label %return
+
+return:
+ %1 = load i32* %retval
+ ret i32 %1
+}
+
+;; func_switch_switch - Test branch probabilities in a switch-instruction that
+;; leads to further switch instructions. The first-tier switch occludes some
+;; possibilities in the second-tier switches, leading to some branches having a
+;; 0 probability.
+define i32 @func_switch_switch(i32 %N) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %N.addr = alloca i32, align 4
+ store i32 %N, i32* %N.addr, align 4
+ %0 = load i32* %N.addr, align 4
+ %rem = srem i32 %0, 2
+ switch i32 %rem, label %sw.default11 [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb5
+ ]
+; CHECK: ], !prof !1
+
+sw.bb:
+ %1 = load i32* %N.addr, align 4
+ %rem1 = srem i32 %1, 4
+ switch i32 %rem1, label %sw.default [
+ i32 0, label %sw.bb2
+ i32 1, label %sw.bb3
+ i32 2, label %sw.bb4
+ ]
+; CHECK: ], !prof !2
+
+sw.bb2:
+ store i32 5, i32* %retval
+ br label %return
+
+sw.bb3:
+ store i32 6, i32* %retval
+ br label %return
+
+sw.bb4:
+ store i32 7, i32* %retval
+ br label %return
+
+sw.default:
+ store i32 8, i32* %retval
+ br label %return
+
+sw.bb5:
+ %2 = load i32* %N.addr, align 4
+ %rem6 = srem i32 %2, 4
+ switch i32 %rem6, label %sw.default10 [
+ i32 0, label %sw.bb7
+ i32 1, label %sw.bb8
+ i32 2, label %sw.bb9
+ ]
+; CHECK: ], !prof !3
+
+sw.bb7:
+ store i32 9, i32* %retval
+ br label %return
+
+sw.bb8:
+ store i32 10, i32* %retval
+ br label %return
+
+sw.bb9:
+ store i32 11, i32* %retval
+ br label %return
+
+sw.default10:
+ store i32 12, i32* %retval
+ br label %return
+
+sw.default11:
+ store i32 13, i32* %retval
+ br label %return
+
+return:
+ %3 = load i32* %retval
+ ret i32 %3
+}
+
+define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
+entry:
+ %retval = alloca i32, align 4
+ %argc.addr = alloca i32, align 4
+ %argv.addr = alloca i8**, align 8
+ %loop = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 0, i32* %loop, align 4
+ br label %for.cond
+
+for.cond:
+ %0 = load i32* %loop, align 4
+ %cmp = icmp slt i32 %0, 4000
+ br i1 %cmp, label %for.body, label %for.end
+; CHECK: br i1 %cmp, label %for.body, label %for.end, !prof !4
+
+for.body:
+ %1 = load i32* %loop, align 4
+ %call = call i32 @func_switch(i32 %1)
+ %2 = load i32* %loop, align 4
+ %call1 = call i32 @func_switch_switch(i32 %2)
+ br label %for.inc
+
+for.inc:
+ %3 = load i32* %loop, align 4
+ %inc = add nsw i32 %3, 1
+ store i32 %inc, i32* %loop, align 4
+ br label %for.cond
+
+for.end:
+ ret i32 0
+}
+
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 1000, i32 1000, i32 1000, i32 1000}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 0, i32 2000, i32 2000}
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 0, i32 1000, i32 0, i32 1000}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 1000, i32 0, i32 1000, i32 0}
+; CHECK: !4 = metadata !{metadata !"branch_weights", i32 4000, i32 1}
+; CHECK-NOT: !5
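+
+;; For reference: @main calls each function 4000 times, so func_switch's
+;; N % 4 lands on each of the three cases and the default 1000 times apiece
+;; (!0); in func_switch_switch the outer N % 2 switch never hits its default
+;; and splits 2000/2000 (!1), while the inner switches only ever see even or
+;; odd values of N % 4, so half of their targets get a zero weight (!2, !3);
+;; the driver loop itself is taken 4000 times and exits once (!4).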
diff --git a/test/Assembler/2008-09-02-FunctionNotes2.ll b/test/Assembler/2008-09-02-FunctionNotes2.ll
index 97351e2a5713..47eb011343fb 100644
--- a/test/Assembler/2008-09-02-FunctionNotes2.ll
+++ b/test/Assembler/2008-09-02-FunctionNotes2.ll
@@ -1,5 +1,5 @@
; Test function notes
-; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes noinline alwaysinline are incompatible"
+; RUN: not llvm-as %s -o /dev/null 2>&1 | grep "Attributes 'noinline and alwaysinline' are incompatible"
define void @fn1() alwaysinline noinline {
ret void
}
diff --git a/test/Assembler/global-addrspace-forwardref.ll b/test/Assembler/global-addrspace-forwardref.ll
new file mode 100644
index 000000000000..f0f094a2248d
--- /dev/null
+++ b/test/Assembler/global-addrspace-forwardref.ll
@@ -0,0 +1,8 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; Make sure the address space of forward declarations is preserved.
+
+; CHECK: @a2 = global i8 addrspace(1)* @a
+; CHECK: @a = addrspace(1) global i8 0
+@a2 = global i8 addrspace(1)* @a
+@a = addrspace(1) global i8 0
diff --git a/test/Assembler/invalid-fwdref1.ll b/test/Assembler/invalid-fwdref1.ll
new file mode 100644
index 000000000000..ef8b16cadceb
--- /dev/null
+++ b/test/Assembler/invalid-fwdref1.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as %s -disable-output 2>&1 | grep "invalid forward reference to function as global value!"
+
+define i8* @test1() { ret i8* @test1a }
+define void @test1a() { }
diff --git a/test/Bindings/Ocaml/ipo_opts.ml b/test/Bindings/Ocaml/ipo_opts.ml
index 3a362319a731..d4537e4413fb 100644
--- a/test/Bindings/Ocaml/ipo_opts.ml
+++ b/test/Bindings/Ocaml/ipo_opts.ml
@@ -43,10 +43,10 @@ let test_transforms () =
ignore (build_ret (build_call fn [| |] "" b) b);
end;
- let td = TargetData.create (target_triple m) in
+ let td = DataLayout.create (target_triple m) in
ignore (PassManager.create ()
- ++ TargetData.add td
+ ++ DataLayout.add td
++ add_argument_promotion
++ add_constant_merge
++ add_dead_arg_elimination
@@ -63,7 +63,7 @@ let test_transforms () =
++ PassManager.run_module m
++ PassManager.dispose);
- TargetData.dispose td
+ DataLayout.dispose td
(*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/scalar_opts.ml b/test/Bindings/Ocaml/scalar_opts.ml
index 34a7a6a01bd0..0760dad4ad02 100644
--- a/test/Bindings/Ocaml/scalar_opts.ml
+++ b/test/Bindings/Ocaml/scalar_opts.ml
@@ -38,10 +38,10 @@ let test_transforms () =
let fn = define_function "fn" fty m in
ignore (build_ret_void (builder_at_end context (entry_block fn)));
- let td = TargetData.create (target_triple m) in
+ let td = DataLayout.create (target_triple m) in
ignore (PassManager.create_function m
- ++ TargetData.add td
+ ++ DataLayout.add td
++ add_verifier
++ add_constant_propagation
++ add_sccp
@@ -78,7 +78,7 @@ let test_transforms () =
++ PassManager.finalize
++ PassManager.dispose);
- TargetData.dispose td
+ DataLayout.dispose td
(*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/target.ml b/test/Bindings/Ocaml/target.ml
index 1b6b71e2759b..7a35a790ab3a 100644
--- a/test/Bindings/Ocaml/target.ml
+++ b/test/Bindings/Ocaml/target.ml
@@ -33,10 +33,10 @@ let m = create_module context filename
(*===-- Target Data -------------------------------------------------------===*)
let test_target_data () =
- let td = TargetData.create (target_triple m) in
+ let td = DataLayout.create (target_triple m) in
let sty = struct_type context [| i32_type; i64_type |] in
- ignore (TargetData.as_string td);
+ ignore (DataLayout.as_string td);
ignore (byte_order td);
ignore (pointer_size td);
ignore (intptr_type td);
@@ -49,7 +49,7 @@ let test_target_data () =
ignore (element_at_offset td sty (Int64.of_int 1));
ignore (offset_of_element td sty 1);
- TargetData.dispose td
+ DataLayout.dispose td
(*===-- Driver ------------------------------------------------------------===*)
diff --git a/test/Bindings/Ocaml/vmcore.ml b/test/Bindings/Ocaml/vmcore.ml
index b8eb6d3e3dd1..61be4b770358 100644
--- a/test/Bindings/Ocaml/vmcore.ml
+++ b/test/Bindings/Ocaml/vmcore.ml
@@ -113,14 +113,14 @@ let test_constants () =
ignore (define_global "const_int_string" c m);
insist (i32_type = type_of c);
- (* RUN: grep 'const_string.*"cruel\00world"' < %t.ll
+ (* RUN: grep 'const_string.*"cruel\\00world"' < %t.ll
*)
group "string";
let c = const_string context "cruel\000world" in
ignore (define_global "const_string" c m);
insist ((array_type i8_type 11) = type_of c);
- (* RUN: grep 'const_stringz.*"hi\00again\00"' < %t.ll
+ (* RUN: grep 'const_stringz.*"hi\\00again\\00"' < %t.ll
*)
group "stringz";
let c = const_stringz context "hi\000again" in
@@ -187,7 +187,7 @@ let test_constants () =
ignore (define_global "const_all_ones" c m);
group "pointer null"; begin
- (* RUN: grep "const_pointer_null = global i64* null" < %t.ll
+ (* RUN: grep "const_pointer_null = global i64\* null" < %t.ll
*)
let c = const_pointer_null (pointer_type i64_type) in
ignore (define_global "const_pointer_null" c m);
@@ -542,7 +542,7 @@ let test_users () =
(*===-- Aliases -----------------------------------------------------------===*)
let test_aliases () =
- (* RUN: grep "@alias = alias i32* @aliasee" < %t.ll
+ (* RUN: grep "@alias = alias i32\* @aliasee" < %t.ll
*)
let v = declare_global i32_type "aliasee" m in
ignore (add_alias m (pointer_type i32_type) v "alias")
@@ -554,7 +554,7 @@ let test_functions () =
let ty = function_type i32_type [| i32_type; i64_type |] in
let ty2 = function_type i8_type [| i8_type; i64_type |] in
- (* RUN: grep "declare i32 @Fn1\(i32, i64\)" < %t.ll
+ (* RUN: grep 'declare i32 @Fn1(i32, i64)' < %t.ll
*)
begin group "declare";
insist (None = lookup_function "Fn1" m);
@@ -935,7 +935,7 @@ let test_builder () =
group "malloc/free"; begin
(* RUN: grep "call.*@malloc(i32 ptrtoint" < %t.ll
- * RUN: grep "call.*@free(i8*" < %t.ll
+ * RUN: grep "call.*@free(i8\*" < %t.ll
* RUN: grep "call.*@malloc(i32 %" < %t.ll
*)
let bb1 = append_block context "MallocBlock1" fn in
@@ -947,7 +947,7 @@ let test_builder () =
end;
group "indirectbr"; begin
- (* RUN: grep "indirectbr i8* blockaddress(@X7, %IBRBlock2), [label %IBRBlock2, label %IBRBlock3]" < %t.ll
+ (* RUN: grep "indirectbr i8\* blockaddress(@X7, %IBRBlock2), \[label %IBRBlock2, label %IBRBlock3\]" < %t.ll
*)
let bb1 = append_block context "IBRBlock1" fn in
@@ -1054,10 +1054,10 @@ let test_builder () =
(* RUN: grep "%build_alloca = alloca i32" < %t.ll
* RUN: grep "%build_array_alloca = alloca i32, i32 %P2" < %t.ll
- * RUN: grep "%build_load = load i32* %build_array_alloca" < %t.ll
- * RUN: grep "store i32 %P2, i32* %build_alloca" < %t.ll
- * RUN: grep "%build_gep = getelementptr i32* %build_array_alloca, i32 %P2" < %t.ll
- * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32* %build_array_alloca, i32 %P2" < %t.ll
+ * RUN: grep "%build_load = load i32\* %build_array_alloca" < %t.ll
+ * RUN: grep "store i32 %P2, i32\* %build_alloca" < %t.ll
+ * RUN: grep "%build_gep = getelementptr i32\* %build_array_alloca, i32 %P2" < %t.ll
+ * RUN: grep "%build_in_bounds_gep = getelementptr inbounds i32\* %build_array_alloca, i32 %P2" < %t.ll
* RUN: grep "%build_struct_gep = getelementptr inbounds.*%build_alloca2, i32 0, i32 1" < %t.ll
*)
let alloca = build_alloca i32_type "build_alloca" b in
@@ -1106,14 +1106,14 @@ let test_builder () =
* RUN: grep "%build_fptrunc2 = fptrunc double %build_sitofp to float" < %t.ll
* RUN: grep "%build_fpext = fpext float %build_fptrunc to double" < %t.ll
* RUN: grep "%build_fpext2 = fpext float %build_fptrunc to double" < %t.ll
- * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8*" < %t.ll
- * RUN: grep "%build_ptrtoint = ptrtoint i8* %build_inttoptr to i64" < %t.ll
- * RUN: grep "%build_ptrtoint2 = ptrtoint i8* %build_inttoptr to i64" < %t.ll
+ * RUN: grep "%build_inttoptr = inttoptr i32 %P1 to i8\*" < %t.ll
+ * RUN: grep "%build_ptrtoint = ptrtoint i8\* %build_inttoptr to i64" < %t.ll
+ * RUN: grep "%build_ptrtoint2 = ptrtoint i8\* %build_inttoptr to i64" < %t.ll
* RUN: grep "%build_bitcast = bitcast i64 %build_ptrtoint to double" < %t.ll
* RUN: grep "%build_bitcast2 = bitcast i64 %build_ptrtoint to double" < %t.ll
* RUN: grep "%build_bitcast3 = bitcast i64 %build_ptrtoint to double" < %t.ll
* RUN: grep "%build_bitcast4 = bitcast i64 %build_ptrtoint to double" < %t.ll
- * RUN: grep "%build_pointercast = bitcast i8* %build_inttoptr to i16*" < %t.ll
+ * RUN: grep "%build_pointercast = bitcast i8\* %build_inttoptr to i16*" < %t.ll
*)
let inst28 = build_trunc p1 i8_type "build_trunc" atentry in
let inst29 = build_zext inst28 i32_type "build_zext" atentry in
@@ -1148,7 +1148,7 @@ let test_builder () =
* RUN: grep "%build_fcmp_false = fcmp false float %F1, %F2" < %t.ll
* RUN: grep "%build_fcmp_true = fcmp true float %F2, %F1" < %t.ll
* RUN: grep "%build_is_null.*= icmp eq.*%X0,.*null" < %t.ll
- * RUN: grep "%build_is_not_null = icmp ne i8* %X1, null" < %t.ll
+ * RUN: grep "%build_is_not_null = icmp ne i8\* %X1, null" < %t.ll
* RUN: grep "%build_ptrdiff" < %t.ll
*)
ignore (build_icmp Icmp.Ne p1 p2 "build_icmp_ne" atentry);
@@ -1167,7 +1167,7 @@ let test_builder () =
group "miscellaneous"; begin
(* RUN: grep "%build_call = tail call cc63 i32 @.*(i32 signext %P2, i32 %P1)" < %t.ll
* RUN: grep "%build_select = select i1 %build_icmp, i32 %P1, i32 %P2" < %t.ll
- * RUN: grep "%build_va_arg = va_arg i8** null, i32" < %t.ll
+ * RUN: grep "%build_va_arg = va_arg i8\*\* null, i32" < %t.ll
* RUN: grep "%build_extractelement = extractelement <4 x i32> %Vec1, i32 %P2" < %t.ll
* RUN: grep "%build_insertelement = insertelement <4 x i32> %Vec1, i32 %P1, i32 %P2" < %t.ll
* RUN: grep "%build_shufflevector = shufflevector <4 x i32> %Vec1, <4 x i32> %Vec2, <4 x i32> <i32 1, i32 1, i32 0, i32 0>" < %t.ll
@@ -1240,8 +1240,8 @@ let test_builder () =
end;
group "dbg"; begin
- (* RUN: grep "%dbg = add i32 %P1, %P2, !dbg !1" < %t.ll
- * RUN: grep "!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}" < %t.ll
+ (* RUN: grep '%dbg = add i32 %P1, %P2, !dbg !1' < %t.ll
+ * RUN: grep '!1 = metadata !{i32 2, i32 3, metadata !2, metadata !2}' < %t.ll
*)
insist ((current_debug_location atentry) = None);
diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll
index b9f334176caa..8ac54be00d54 100644
--- a/test/Bitcode/blockaddress.ll
+++ b/test/Bitcode/blockaddress.ll
@@ -28,3 +28,18 @@ here:
end:
ret void
}
+
+; PR13895
+define void @doitagain(i8** nocapture %pptr) {
+; CHECK: define void @doitagain
+entry:
+ br label %here
+
+here:
+ store i8* blockaddress(@doit, %here), i8** %pptr, align 8
+; CHECK: blockaddress(@doit, %here)
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/Bitcode/function-encoding-rel-operands.ll b/test/Bitcode/function-encoding-rel-operands.ll
new file mode 100644
index 000000000000..aedb0c32676f
--- /dev/null
+++ b/test/Bitcode/function-encoding-rel-operands.ll
@@ -0,0 +1,49 @@
+; Basic sanity test to check that instruction operands are encoded with
+; relative IDs.
+; RUN: llvm-as < %s | llvm-bcanalyzer -dump | FileCheck %s
+
+; CHECK: FUNCTION_BLOCK
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_RET {{.*}}op0=1
+define i32 @test_int_binops(i32 %a) nounwind {
+entry:
+ %0 = add i32 %a, %a
+ %1 = sub i32 %0, %0
+ %2 = mul i32 %1, %1
+ ret i32 %2
+}
+
+
+; CHECK: FUNCTION_BLOCK
+; CHECK: INST_CAST {{.*}}op0=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_BINOP {{.*}}op0=1 op1=1
+; CHECK: INST_RET {{.*}}op0=1
+define double @test_float_binops(i32 %a) nounwind {
+ %1 = sitofp i32 %a to double
+ %2 = fadd double %1, %1
+ %3 = fsub double %2, %2
+ %4 = fmul double %3, %3
+ %5 = fdiv double %4, %4
+ ret double %5
+}
+
+
+; CHECK: FUNCTION_BLOCK
+; Skip checking the operands of INST_INBOUNDS_GEP since they depend on the
+; ordering between literals and the formal parameters.
+; CHECK: INST_INBOUNDS_GEP {{.*}}
+; CHECK: INST_LOAD {{.*}}op0=1 {{.*}}
+; CHECK: INST_CMP2 op0=1 {{.*}}
+; CHECK: INST_RET {{.*}}op0=1
+define i1 @test_load(i32 %a, {i32, i32}* %ptr) nounwind {
+entry:
+ %0 = getelementptr inbounds {i32, i32}* %ptr, i32 %a, i32 0
+ %1 = load i32* %0
+ %2 = icmp eq i32 %1, %a
+ ret i1 %2
+}
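+
+; A note on the encoding being checked: with relative operand IDs, an operand
+; is recorded as its distance back from the instruction's own value number, so
+; op0=1/op1=1 above should be read as "the value defined immediately before
+; this instruction" (e.g. both operands of the sub are the result of the
+; preceding add), independent of the absolute value numbering.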
diff --git a/test/BugPoint/crash-narrowfunctiontest.ll b/test/BugPoint/crash-narrowfunctiontest.ll
index d080d9dd4b0c..c81283695731 100644
--- a/test/BugPoint/crash-narrowfunctiontest.ll
+++ b/test/BugPoint/crash-narrowfunctiontest.ll
@@ -2,6 +2,7 @@
;
; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null
; REQUIRES: loadable_module
+; XFAIL: lto_on_osx
define i32 @foo() { ret i32 1 }
diff --git a/test/BugPoint/metadata.ll b/test/BugPoint/metadata.ll
index 0eda5667ba4a..6dc9574bbe4b 100644
--- a/test/BugPoint/metadata.ll
+++ b/test/BugPoint/metadata.ll
@@ -1,6 +1,7 @@
; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes > /dev/null
; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
; REQUIRES: loadable_module
+; XFAIL: lto_on_osx
; Bugpoint should keep the call's metadata attached to the call.
diff --git a/test/BugPoint/remove_arguments_test.ll b/test/BugPoint/remove_arguments_test.ll
index 29a03b831077..5a45f846e103 100644
--- a/test/BugPoint/remove_arguments_test.ll
+++ b/test/BugPoint/remove_arguments_test.ll
@@ -1,6 +1,7 @@
; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crashcalls -silence-passes
; RUN: llvm-dis %t-reduced-simplified.bc -o - | FileCheck %s
; REQUIRES: loadable_module
+; XFAIL: lto_on_osx
; Test to make sure that arguments are removed from the function if they are
; unnecessary. And clean up any types that that frees up too.
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 991cc9df1639..e10a532341e6 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -7,6 +7,11 @@ configure_lit_site_cfg(
${CMAKE_CURRENT_BINARY_DIR}/Unit/lit.site.cfg
)
+# Don't include check-llvm in check-all unless LLVM_BUILD_TOOLS is enabled.
+if(NOT LLVM_BUILD_TOOLS)
+ set(EXCLUDE_FROM_ALL ON)
+endif()
+
add_lit_testsuite(check-llvm "Running the LLVM regression tests"
${CMAKE_CURRENT_BINARY_DIR}
PARAMS llvm_site_config=${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg
@@ -14,10 +19,16 @@ add_lit_testsuite(check-llvm "Running the LLVM regression tests"
DEPENDS UnitTests
BugpointPasses LLVMHello
llc lli llvm-ar llvm-as
- llvm-diff
+ llvm-bcanalyzer llvm-diff
llvm-dis llvm-extract llvm-dwarfdump
- llvm-link llvm-mc llvm-nm llvm-objdump llvm-readobj
+ llvm-link
+ llvm-mc
+ llvm-mcmarkup
+ llvm-nm
+ llvm-objdump
+ llvm-readobj
macho-dump opt
+ profile_rt-shared
FileCheck count not
yaml2obj
)
diff --git a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
index 99db63713d42..36d15757c314 100644
--- a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
+++ b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
@@ -13,12 +13,12 @@
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x0000003c
-; BASIC-NEXT: 0x00000020
+; BASIC-NEXT: 0x00000022
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x00000001
; BASIC-NEXT: 0x00000000
-; BASIC-NEXT: '411f0000 00616561 62690001 15000000 06020801 09011401 15011703 18011901'
+; BASIC-NEXT: '41210000 00616561 62690001 17000000 060a0741 08010902 14011501 17031801 1901'
; CORTEXA8: .ARM.attributes
; CORTEXA8-NEXT: 0x70000003
diff --git a/test/CodeGen/ARM/2010-12-07-PEIBug.ll b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
index 770ad4466aff..4879f4e10bac 100644
--- a/test/CodeGen/ARM/2010-12-07-PEIBug.ll
+++ b/test/CodeGen/ARM/2010-12-07-PEIBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a9 | FileCheck %s
; rdar://8728956
define hidden void @foo() nounwind ssp {
diff --git a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
index 3e78c4623859..101a91396eb7 100644
--- a/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
+++ b/test/CodeGen/ARM/2011-06-16-TailCallByVal.ll
@@ -1,4 +1,9 @@
; RUN: llc < %s -arm-tail-calls=1 | FileCheck %s
+
+; A tail call inside a function where a byval argument is split between
+; registers and the stack is currently unsupported.
+; XFAIL: *
+
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
target triple = "thumbv7-apple-ios"
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index 42b14914814a..6e0ef9619657 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -9,8 +9,8 @@ entry:
}
; Trigger multiple NEON stores.
-; CHECK: vstmia
-; CHECK-NEXT: vstmia
+; CHECK: vst1.64
+; CHECK-NEXT: vst1.64
define void @f_0_40(i8* nocapture %c) nounwind optsize {
entry:
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 40, i32 16, i1 false)
diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index 89c01d58c398..f9ede7401a3c 100644
--- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -8,12 +8,12 @@ define void @test_sqrt(<4 x float>* %X) nounwind {
; CHECK: movw r1, :lower16:{{.*}}
; CHECK: movt r1, :upper16:{{.*}}
-; CHECK: vldmia r1
+; CHECK: vld1.64 {{.*}}, [r1, :128]
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64 {{.*}}
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -31,21 +31,21 @@ define void @test_cos(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -62,21 +62,21 @@ define void @test_exp(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -93,21 +93,21 @@ define void @test_exp2(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -124,21 +124,21 @@ define void @test_log10(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -155,21 +155,21 @@ define void @test_log(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -186,21 +186,21 @@ define void @test_log2(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -218,21 +218,21 @@ define void @test_pow(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
@@ -252,10 +252,10 @@ define void @test_powi(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia [[reg0]], {{.*}}
+; CHECK: vld1.64 {{.*}}, :128
; CHECK: vmul.f32 {{.*}}
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
@@ -275,21 +275,21 @@ define void @test_sin(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
-; CHECK: {{[mov|vmov.32]}} r0,
+; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -300,3 +300,34 @@ L.entry:
declare <4 x float> @llvm.sin.v4f32(<4 x float>) nounwind readonly
+define void @test_floor(<4 x float>* %X) nounwind {
+
+; CHECK: test_floor:
+
+; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
+; CHECK: movt [[reg0]], :upper16:{{.*}}
+; CHECK: vld1.64
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: vst1.64
+
+L.entry:
+ %0 = load <4 x float>* @A, align 16
+ %1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* %X, align 16
+ ret void
+}
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readonly
+
diff --git a/test/CodeGen/ARM/2012-05-04-vmov.ll b/test/CodeGen/ARM/2012-05-04-vmov.ll
new file mode 100644
index 000000000000..d52ef2cc5a1c
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-04-vmov.ll
@@ -0,0 +1,11 @@
+; RUN: llc -O1 -march=arm -mcpu=cortex-a9 < %s | FileCheck -check-prefix=A9-CHECK %s
+; RUN: llc -O1 -march=arm -mcpu=swift < %s | FileCheck -check-prefix=SWIFT-CHECK %s
+; Check that swift doesn't use vmov.32. <rdar://problem/10453003>.
+
+define <2 x i32> @testuvec(<2 x i32> %A, <2 x i32> %B) nounwind {
+entry:
+ %div = udiv <2 x i32> %A, %B
+ ret <2 x i32> %div
+; A9-CHECK: vmov.32
+; SWIFT-CHECK-NOT: vmov.32
+}
diff --git a/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
new file mode 100644
index 000000000000..dd678436c04e
--- /dev/null
+++ b/test/CodeGen/ARM/2012-05-10-PreferVMOVtoVDUP32.ll
@@ -0,0 +1,14 @@
+; RUN: llc -march=arm -mcpu=swift < %s | FileCheck %s
+; <rdar://problem/10451892>
+
+define void @f(i32 %x, i32* %p) nounwind ssp {
+entry:
+; CHECK-NOT: vdup.32
+ %vecinit.i = insertelement <2 x i32> undef, i32 %x, i32 0
+ %vecinit1.i = insertelement <2 x i32> %vecinit.i, i32 %x, i32 1
+ %0 = bitcast i32* %p to i8*
+ tail call void @llvm.arm.neon.vst1.v2i32(i8* %0, <2 x i32> %vecinit1.i, i32 4)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind
diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
new file mode 100644
index 000000000000..ec7f72d7c2e8
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -mcpu=cortex-a8 -march=thumb
+; Test that this doesn't crash.
+; <rdar://problem/12183003>
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.1.0"
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+
+define void @findEdges(i8*) nounwind ssp {
+ %2 = icmp sgt i32 undef, 0
+ br i1 %2, label %5, label %3
+
+; <label>:3 ; preds = %5, %1
+ %4 = phi i8* [ %0, %1 ], [ %19, %5 ]
+ ret void
+
+; <label>:5 ; preds = %5, %1
+ %6 = phi i8* [ %19, %5 ], [ %0, %1 ]
+ %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1)
+ %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0
+ %9 = getelementptr inbounds i8* null, i32 3
+ %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1)
+ %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2
+ %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1)
+ %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0
+ %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1
+ %15 = getelementptr inbounds i8* %6, i32 3
+ %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1)
+ %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1
+ %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2
+ %19 = getelementptr inbounds i8* %6, i32 48
+ %20 = bitcast <16 x i8> %13 to <2 x i64>
+ %21 = bitcast <16 x i8> %8 to <2 x i64>
+ %22 = bitcast <16 x i8> %14 to <2 x i64>
+ %23 = shufflevector <2 x i64> %22, <2 x i64> undef, <1 x i32> zeroinitializer
+ %24 = bitcast <1 x i64> %23 to <8 x i8>
+ %25 = zext <8 x i8> %24 to <8 x i16>
+ %26 = sub <8 x i16> zeroinitializer, %25
+ %27 = bitcast <16 x i8> %17 to <2 x i64>
+ %28 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %26) nounwind
+ %29 = mul <8 x i16> %28, %28
+ %30 = add <8 x i16> zeroinitializer, %29
+ %31 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> undef, <8 x i16> %30) nounwind
+ %32 = bitcast <16 x i8> %11 to <2 x i64>
+ %33 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> zeroinitializer
+ %34 = bitcast <1 x i64> %33 to <8 x i8>
+ %35 = zext <8 x i8> %34 to <8 x i16>
+ %36 = sub <8 x i16> %35, zeroinitializer
+ %37 = bitcast <16 x i8> %18 to <2 x i64>
+ %38 = shufflevector <2 x i64> %37, <2 x i64> undef, <1 x i32> zeroinitializer
+ %39 = bitcast <1 x i64> %38 to <8 x i8>
+ %40 = zext <8 x i8> %39 to <8 x i16>
+ %41 = sub <8 x i16> zeroinitializer, %40
+ %42 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %36) nounwind
+ %43 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %41) nounwind
+ %44 = mul <8 x i16> %42, %42
+ %45 = mul <8 x i16> %43, %43
+ %46 = add <8 x i16> %45, %44
+ %47 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %31, <8 x i16> %46) nounwind
+ %48 = bitcast <8 x i16> %47 to <2 x i64>
+ %49 = shufflevector <2 x i64> %48, <2 x i64> undef, <1 x i32> zeroinitializer
+ %50 = bitcast <1 x i64> %49 to <4 x i16>
+ %51 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %50, <4 x i16> undef) nounwind
+ %52 = tail call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %51, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+ %53 = bitcast <4 x i16> %52 to <1 x i64>
+ %54 = shufflevector <1 x i64> %53, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %55 = bitcast <2 x i64> %54 to <8 x i16>
+ %56 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %55, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %57 = shufflevector <2 x i64> %20, <2 x i64> undef, <1 x i32> <i32 1>
+ %58 = bitcast <1 x i64> %57 to <8 x i8>
+ %59 = zext <8 x i8> %58 to <8 x i16>
+ %60 = sub <8 x i16> zeroinitializer, %59
+ %61 = shufflevector <2 x i64> %21, <2 x i64> undef, <1 x i32> <i32 1>
+ %62 = bitcast <1 x i64> %61 to <8 x i8>
+ %63 = zext <8 x i8> %62 to <8 x i16>
+ %64 = sub <8 x i16> %63, zeroinitializer
+ %65 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %60) nounwind
+ %66 = mul <8 x i16> %65, %65
+ %67 = add <8 x i16> zeroinitializer, %66
+ %68 = shufflevector <2 x i64> %27, <2 x i64> undef, <1 x i32> <i32 1>
+ %69 = bitcast <1 x i64> %68 to <8 x i8>
+ %70 = zext <8 x i8> %69 to <8 x i16>
+ %71 = sub <8 x i16> zeroinitializer, %70
+ %72 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> undef) nounwind
+ %73 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %71) nounwind
+ %74 = mul <8 x i16> %72, %72
+ %75 = mul <8 x i16> %73, %73
+ %76 = add <8 x i16> %75, %74
+ %77 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> <i32 1>
+ %78 = bitcast <1 x i64> %77 to <8 x i8>
+ %79 = zext <8 x i8> %78 to <8 x i16>
+ %80 = sub <8 x i16> %79, zeroinitializer
+ %81 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %80) nounwind
+ %82 = mul <8 x i16> %81, %81
+ %83 = add <8 x i16> zeroinitializer, %82
+ %84 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %76, <8 x i16> %83) nounwind
+ %85 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %67, <8 x i16> %84) nounwind
+ %86 = bitcast <8 x i16> %85 to <2 x i64>
+ %87 = shufflevector <2 x i64> %86, <2 x i64> undef, <1 x i32> <i32 1>
+ %88 = bitcast <1 x i64> %87 to <4 x i16>
+ %89 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %88, <4 x i16> undef) nounwind
+ %90 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %89, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+ %91 = bitcast <4 x i16> %90 to <1 x i64>
+ %92 = shufflevector <1 x i64> undef, <1 x i64> %91, <2 x i32> <i32 0, i32 1>
+ %93 = bitcast <2 x i64> %92 to <8 x i16>
+ %94 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %93, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %95 = bitcast <8 x i8> %56 to <1 x i64>
+ %96 = bitcast <8 x i8> %94 to <1 x i64>
+ %97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1>
+ %98 = bitcast <2 x i64> %97 to <16 x i8>
+ tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1)
+ %99 = icmp slt i32 undef, undef
+ br i1 %99, label %5, label %3
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll
new file mode 100644
index 000000000000..8471be5330b8
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-30-select.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; rdar://12201387
+
+;CHECK: select_s_v_v
+;CHECK: it ne
+;CHECK-NEXT: vmovne.i32
+;CHECK: bx
+define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+entry:
+ %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+ %and = and i32 %avail, 1
+ %tobool = icmp eq i32 %and, 0
+ %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
+ ret <16 x i8> %vld1.
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
+
diff --git a/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
new file mode 100644
index 000000000000..e761ffe72c13
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=arm7tdmi | FileCheck %s
+
+; movw is only legal for V6T2 and later.
+; rdar://12300648
+
+define i32 @t(i32 %x) {
+; CHECK: t:
+; CHECK-NOT: movw
+ %tmp = add i32 %x, -65535
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
new file mode 100644
index 000000000000..75766099a220
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+
+; Check for error message:
+; CHECK: non-trivial scalar-to-vector conversion, possible invalid constraint for vector type
+
+define void @f() nounwind ssp {
+ %1 = call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } asm "vldm $4, { ${0:q}, ${1:q}, ${2:q}, ${3:q} }", "=r,=r,=r,=r,r"(i64* undef) nounwind, !srcloc !0
+ ret void
+}
+
+!0 = metadata !{i32 318437}
diff --git a/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
new file mode 100644
index 000000000000..6fa1391474bb
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-25-InlineAsmScalarToVectorConv2.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 2>&1 | FileCheck %s
+
+; Check for error message:
+; CHECK: scalar-to-vector conversion failed, possible invalid constraint for vector type
+
+define hidden void @f(i32* %corr, i32 %order) nounwind ssp {
+ tail call void asm sideeffect "vst1.s32 { ${1:q}, ${2:q} }, [$0]", "r,{q0},{q1}"(i32* %corr, <2 x i64>* undef, <2 x i64>* undef) nounwind, !srcloc !0
+ ret void
+}
+
+!0 = metadata !{i32 257}
diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
new file mode 100644
index 000000000000..b5f6d311cb9c
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -0,0 +1,56 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
+; Test that we correctly use registers and align elements when using va_arg
+
+%struct_t = type { double, double, double }
+@static_val = constant %struct_t { double 1.0, double 2.0, double 3.0 }
+
+declare void @llvm.va_start(i8*) nounwind
+declare void @llvm.va_end(i8*) nounwind
+
+; CHECK: test_byval_8_bytes_alignment:
+define void @test_byval_8_bytes_alignment(i32 %i, ...) {
+entry:
+; CHECK: stm r0, {r1, r2, r3}
+ %g = alloca i8*
+ %g1 = bitcast i8** %g to i8*
+ call void @llvm.va_start(i8* %g1)
+
+; CHECK: add [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
+; CHECK: bfc [[REG]], #0, #3
+ %0 = va_arg i8** %g, double
+ call void @llvm.va_end(i8* %g1)
+
+ ret void
+}
+
+; CHECK: main:
+; CHECK: ldm r0, {r2, r3}
+define i32 @main() {
+entry:
+ call void (i32, ...)* @test_byval_8_bytes_alignment(i32 555, %struct_t* byval @static_val)
+ ret i32 0
+}
+
+declare void @f(double);
+
+; CHECK: test_byval_8_bytes_alignment_fixed_arg:
+; CHECK-NOT: str r1
+; CHECK: str r3, [sp, #12]
+; CHECK: str r2, [sp, #8]
+; CHECK-NOT: str r1
+define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %val) nounwind {
+entry:
+ %a = getelementptr inbounds %struct_t* %val, i32 0, i32 0
+ %0 = load double* %a
+ call void (double)* @f(double %0)
+ ret void
+}
+
+; CHECK: main_fixed_arg:
+; CHECK: ldm r0, {r2, r3}
+define i32 @main_fixed_arg() {
+entry:
+ call void (i32, %struct_t*)* @test_byval_8_bytes_alignment_fixed_arg(i32 555, %struct_t* byval @static_val)
+ ret i32 0
+}
+
diff --git a/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
new file mode 100644
index 000000000000..478048d09600
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s
+
+@.str = private unnamed_addr constant [12 x i8] c"val.a = %f\0A\00"
+%struct_t = type { double, double, double }
+@static_val = constant %struct_t { double 1.0, double 2.0, double 3.0 }
+
+declare i32 @printf(i8*, ...)
+
+; CHECK: test_byval_usage_scheduling:
+; CHECK: str r3, [sp, #12]
+; CHECK: str r2, [sp, #8]
+; CHECK: vldr d16, [sp, #8]
+define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val) nounwind {
+entry:
+ %a = getelementptr inbounds %struct_t* %val, i32 0, i32 0
+ %0 = load double* %a
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([12 x i8]* @.str, i32 0, i32 0), double %0)
+ ret void
+}
diff --git a/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll b/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll
new file mode 100644
index 000000000000..f2395107d426
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-04-LDRB_POST_IMM-Crash.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=armv7-none-linux- | FileCheck %s
+; Check that the LDRB_POST_IMM instruction is emitted properly.
+
+%my_struct_t = type { i8, i8, i8, i8, i8 }
+@main.val = private unnamed_addr constant %my_struct_t { i8 1, i8 2, i8 3, i8 4, i8 5 }
+
+declare void @f(i32 %n1, i32 %n2, i32 %n3, %my_struct_t* byval %val);
+
+; CHECK: main:
+define i32 @main() nounwind {
+entry:
+; CHECK: ldrb {{(r[0-9]+)}}, {{(\[r[0-9]+\])}}, #1
+ call void @f(i32 555, i32 555, i32 555, %my_struct_t* byval @main.val)
+ ret i32 0
+}
+
diff --git a/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
new file mode 100644
index 000000000000..fcc6a7f7e96f
--- /dev/null
+++ b/test/CodeGen/ARM/2012-10-18-PR14099-ByvalFrameAddress.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=armv7-linux-gnueabi | FileCheck %s
+
+%struct.s = type { [4 x i32] }
+@v = constant %struct.s zeroinitializer;
+
+declare void @f(%struct.s* %p);
+
+; CHECK: t:
+define void @t(i32 %a, %struct.s* byval %s) nounwind {
+entry:
+
+; Here we only need to check the proper start address of the restored %s argument.
+; CHECK: sub sp, sp, #16
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #12
+; CHECK: stm r0, {r1, r2, r3}
+; CHECK: add r0, sp, #12
+; CHECK-NEXT: bl f
+ call void @f(%struct.s* %s)
+ ret void
+}
+
+; CHECK: caller:
+define void @caller() {
+
+; CHECK: ldm r0, {r1, r2, r3}
+ call void @t(i32 0, %struct.s* @v);
+ ret void
+}
diff --git a/test/CodeGen/ARM/a15-mla.ll b/test/CodeGen/ARM/a15-mla.ll
new file mode 100644
index 000000000000..25f6de4762d5
--- /dev/null
+++ b/test/CodeGen/ARM/a15-mla.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s
+
+; This test checks that the VMLxForwarding feature is disabled for A15.
+; CHECK: fun_a
+define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind {
+ %1 = add <4 x i32> %x, %y
+; CHECK-NOT: vmul
+; CHECK: vmla
+ %2 = mul <4 x i32> %1, %1
+ %3 = add <4 x i32> %y, %2
+ ret <4 x i32> %3
+}
diff --git a/test/CodeGen/ARM/a15.ll b/test/CodeGen/ARM/a15.ll
new file mode 100644
index 000000000000..6f816c1c2c53
--- /dev/null
+++ b/test/CodeGen/ARM/a15.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -mcpu=cortex-a15 | FileCheck %s
+
+; CHECK: a
+define i32 @a(i32 %x) {
+ ret i32 %x;
+}
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 8967730835a5..6e6b36377fde 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -159,3 +159,13 @@ entry:
store i8 %3, i8* %old
ret void
}
+
+; CHECK: func4
+; This function should not need to use callee-saved registers.
+; rdar://problem/12203728
+; CHECK-NOT: r4
+define i32 @func4(i32* %p) nounwind optsize ssp {
+entry:
+ %0 = atomicrmw add i32* %p, i32 1 monotonic
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM/atomicrmw_minmax.ll b/test/CodeGen/ARM/atomicrmw_minmax.ll
new file mode 100644
index 000000000000..69f1384e125c
--- /dev/null
+++ b/test/CodeGen/ARM/atomicrmw_minmax.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 < %s | FileCheck %s
+
+; CHECK: max:
+define i32 @max(i8 %ctx, i32* %ptr, i32 %val)
+{
+; CHECK: ldrex
+; CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]]
+; CHECK: movhi {{r[0-9]*}}, [[old]]
+ %old = atomicrmw umax i32* %ptr, i32 %val monotonic
+ ret i32 %old
+}
+
+; CHECK: min:
+define i32 @min(i8 %ctx, i32* %ptr, i32 %val)
+{
+; CHECK: ldrex
+; CHECK: cmp [[old:r[0-9]*]], [[val:r[0-9]*]]
+; CHECK: movlo {{r[0-9]*}}, [[old]]
+ %old = atomicrmw umin i32* %ptr, i32 %val monotonic
+ ret i32 %old
+}
diff --git a/test/CodeGen/ARM/avoid-cpsr-rmw.ll b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
index 1b385ab79c4e..96e83dd88e92 100644
--- a/test/CodeGen/ARM/avoid-cpsr-rmw.ll
+++ b/test/CodeGen/ARM/avoid-cpsr-rmw.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
; dependency) when it isn't dependent on last CPSR defining instruction.
; rdar://8928208
diff --git a/test/CodeGen/ARM/call-noret-minsize.ll b/test/CodeGen/ARM/call-noret-minsize.ll
new file mode 100644
index 000000000000..df3c19eca6a0
--- /dev/null
+++ b/test/CodeGen/ARM/call-noret-minsize.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; rdar://12348580
+
+define void @t1() noreturn minsize nounwind ssp {
+entry:
+; ARM: t1:
+; ARM: bl _bar
+
+; SWIFT: t1:
+; SWIFT: bl _bar
+ tail call void @bar() noreturn nounwind
+ unreachable
+}
+
+define void @t2() noreturn minsize nounwind ssp {
+entry:
+; ARM: t2:
+; ARM: bl _t1
+
+; SWIFT: t2:
+; SWIFT: bl _t1
+ tail call void @t1() noreturn nounwind
+ unreachable
+}
+
+declare void @bar() noreturn
diff --git a/test/CodeGen/ARM/call-noret.ll b/test/CodeGen/ARM/call-noret.ll
new file mode 100644
index 000000000000..27062dca38dc
--- /dev/null
+++ b/test/CodeGen/ARM/call-noret.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; rdar://8979299
+
+define void @t1() noreturn nounwind ssp {
+entry:
+; ARM: t1:
+; ARM: mov lr, pc
+; ARM: b _bar
+
+; SWIFT: t1:
+; SWIFT: mov lr, pc
+; SWIFT: b _bar
+ tail call void @bar() noreturn nounwind
+ unreachable
+}
+
+define void @t2() noreturn nounwind ssp {
+entry:
+; ARM: t2:
+; ARM: mov lr, pc
+; ARM: b _t1
+
+; SWIFT: t2:
+; SWIFT: mov lr, pc
+; SWIFT: b _t1
+ tail call void @t1() noreturn nounwind
+ unreachable
+}
+
+declare void @bar() noreturn
diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll
index f84774d9b615..bf51cd627b3c 100644
--- a/test/CodeGen/ARM/carry.ll
+++ b/test/CodeGen/ARM/carry.ll
@@ -45,3 +45,16 @@ entry:
%0 = sub nsw i64 0, %x
ret i64 %0
}
+
+; rdar://12559385
+define i64 @f5(i32 %vi) {
+entry:
+; CHECK: f5:
+; CHECK: movw [[REG:r[0-9]+]], #36102
+; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]]
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
+}
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
index fb0f4c67c927..3ba947579a3a 100644
--- a/test/CodeGen/ARM/coalesce-subregs.ll
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a9 -verify-coalescing -verify-machineinstrs | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios0.0.0"
@@ -66,3 +66,295 @@ do.end: ; preds = %do.body
declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+
+; CHECK: f3
+; This function has lane insertions that span basic blocks.
+; The trivial REG_SEQUENCE lowering can't handle that, but the coalescer can.
+;
+; void f3(float *p, float *q) {
+; float32x2_t x;
+; x[1] = p[3];
+; if (q)
+; x[0] = q[0] + q[1];
+; else
+; x[0] = p[2];
+; vst1_f32(p+4, x);
+; }
+;
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+define void @f3(float* %p, float* %q) nounwind ssp {
+entry:
+ %arrayidx = getelementptr inbounds float* %p, i32 3
+ %0 = load float* %arrayidx, align 4
+ %vecins = insertelement <2 x float> undef, float %0, i32 1
+ %tobool = icmp eq float* %q, null
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load float* %q, align 4
+ %arrayidx2 = getelementptr inbounds float* %q, i32 1
+ %2 = load float* %arrayidx2, align 4
+ %add = fadd float %1, %2
+ %vecins3 = insertelement <2 x float> %vecins, float %add, i32 0
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx4 = getelementptr inbounds float* %p, i32 2
+ %3 = load float* %arrayidx4, align 4
+ %vecins5 = insertelement <2 x float> %vecins, float %3, i32 0
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ]
+ %add.ptr = getelementptr inbounds float* %p, i32 4
+ %4 = bitcast float* %add.ptr to i8*
+ tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
+
+; CHECK: f4
+; This function inserts a lane into a fully defined vector.
+; The destination lane isn't read, so the subregs can coalesce.
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+define void @f4(float* %p, float* %q) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %0, i32 4)
+ %tobool = icmp eq float* %q, null
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load float* %q, align 4
+ %arrayidx1 = getelementptr inbounds float* %q, i32 1
+ %2 = load float* %arrayidx1, align 4
+ %add = fadd float %1, %2
+ %vecins = insertelement <2 x float> %vld1, float %add, i32 1
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ]
+ tail call void @llvm.arm.neon.vst1.v2f32(i8* %0, <2 x float> %x.0, i32 4)
+ ret void
+}
+
+; CHECK: f5
+; Coalesce vector lanes through phis.
+; CHECK: vmov.f32 {{.*}}, #1.0
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+; CHECK: %if.end
+; We may leave the last insertelement in the if.end block.
+; It is inserting the %add value into a dead lane, but %add causes interference
+; in the entry block, and we don't do dead lane checks across basic blocks.
+define void @f5(float* %p, float* %q) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %0, i32 4)
+ %vecext = extractelement <4 x float> %vld1, i32 0
+ %vecext1 = extractelement <4 x float> %vld1, i32 1
+ %vecext2 = extractelement <4 x float> %vld1, i32 2
+ %vecext3 = extractelement <4 x float> %vld1, i32 3
+ %add = fadd float %vecext3, 1.000000e+00
+ %tobool = icmp eq float* %q, null
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %arrayidx = getelementptr inbounds float* %q, i32 1
+ %1 = load float* %arrayidx, align 4
+ %add4 = fadd float %vecext, %1
+ %2 = load float* %q, align 4
+ %add6 = fadd float %vecext1, %2
+ %arrayidx7 = getelementptr inbounds float* %q, i32 2
+ %3 = load float* %arrayidx7, align 4
+ %add8 = fadd float %vecext2, %3
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %a.0 = phi float [ %add4, %if.then ], [ %vecext, %entry ]
+ %b.0 = phi float [ %add6, %if.then ], [ %vecext1, %entry ]
+ %c.0 = phi float [ %add8, %if.then ], [ %vecext2, %entry ]
+ %vecinit = insertelement <4 x float> undef, float %a.0, i32 0
+ %vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1
+ %vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2
+ %vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3
+ tail call void @llvm.arm.neon.vst1.v4f32(i8* %0, <4 x float> %vecinit11, i32 4)
+ ret void
+}
+
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
+
+; CHECK: pr13999
+define void @pr13999() nounwind readonly {
+entry:
+ br i1 true, label %outer_loop, label %loop.end
+
+outer_loop:
+ %d = phi double [ 0.0, %entry ], [ %add, %after_inner_loop ]
+ %0 = insertelement <2 x double> <double 0.0, double 0.0>, double %d, i32 0
+ br i1 undef, label %after_inner_loop, label %inner_loop
+
+inner_loop:
+ br i1 true, label %after_inner_loop, label %inner_loop
+
+after_inner_loop:
+ %1 = phi <2 x double> [ %0, %outer_loop ], [ <double 0.0, double 0.0>,
+%inner_loop ]
+ %2 = extractelement <2 x double> %1, i32 1
+ %add = fadd double 1.0, %2
+ br i1 false, label %loop.end, label %outer_loop
+
+loop.end:
+ %d.end = phi double [ 0.0, %entry ], [ %add, %after_inner_loop ]
+ ret void
+}
+
+; CHECK: pr14078
+define arm_aapcs_vfpcc i32 @pr14078(i8* nocapture %arg, i8* nocapture %arg1, i32 %arg2) nounwind uwtable readonly {
+bb:
+ br i1 undef, label %bb31, label %bb3
+
+bb3: ; preds = %bb12, %bb
+ %tmp = shufflevector <2 x i64> undef, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp4 = bitcast <1 x i64> %tmp to <2 x float>
+ %tmp5 = shufflevector <2 x float> %tmp4, <2 x float> undef, <4 x i32> zeroinitializer
+ %tmp6 = bitcast <4 x float> %tmp5 to <2 x i64>
+ %tmp7 = shufflevector <2 x i64> %tmp6, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp8 = bitcast <1 x i64> %tmp7 to <2 x float>
+ %tmp9 = tail call <2 x float> @baz(<2 x float> <float 0xFFFFFFFFE0000000, float 0.000000e+00>, <2 x float> %tmp8, <2 x float> zeroinitializer) nounwind
+ br i1 undef, label %bb10, label %bb12
+
+bb10: ; preds = %bb3
+ %tmp11 = load <4 x float>* undef, align 8
+ br label %bb12
+
+bb12: ; preds = %bb10, %bb3
+ %tmp13 = shufflevector <2 x float> %tmp9, <2 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+ %tmp14 = bitcast <2 x float> %tmp13 to <1 x i64>
+ %tmp15 = shufflevector <1 x i64> %tmp14, <1 x i64> zeroinitializer, <2 x i32> <i32 0, i32 1>
+ %tmp16 = bitcast <2 x i64> %tmp15 to <4 x float>
+ %tmp17 = fmul <4 x float> zeroinitializer, %tmp16
+ %tmp18 = bitcast <4 x float> %tmp17 to <2 x i64>
+ %tmp19 = shufflevector <2 x i64> %tmp18, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp20 = bitcast <1 x i64> %tmp19 to <2 x float>
+ %tmp21 = tail call <2 x float> @baz67(<2 x float> %tmp20, <2 x float> undef) nounwind
+ %tmp22 = tail call <2 x float> @baz67(<2 x float> %tmp21, <2 x float> %tmp21) nounwind
+ %tmp23 = shufflevector <2 x float> %tmp22, <2 x float> undef, <4 x i32> zeroinitializer
+ %tmp24 = bitcast <4 x float> %tmp23 to <2 x i64>
+ %tmp25 = shufflevector <2 x i64> %tmp24, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp26 = bitcast <1 x i64> %tmp25 to <2 x float>
+ %tmp27 = extractelement <2 x float> %tmp26, i32 0
+ %tmp28 = fcmp olt float %tmp27, 0.000000e+00
+ %tmp29 = select i1 %tmp28, i32 0, i32 undef
+ %tmp30 = icmp ult i32 undef, %arg2
+ br i1 %tmp30, label %bb3, label %bb31
+
+bb31: ; preds = %bb12, %bb
+ %tmp32 = phi i32 [ 1, %bb ], [ %tmp29, %bb12 ]
+ ret i32 %tmp32
+}
+
+declare <2 x float> @baz(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+
+declare <2 x float> @baz67(<2 x float>, <2 x float>) nounwind readnone
+
+%struct.wombat.5 = type { %struct.quux, %struct.quux, %struct.quux, %struct.quux }
+%struct.quux = type { <4 x float> }
+
+; CHECK: pr14079
+define linkonce_odr arm_aapcs_vfpcc %struct.wombat.5 @pr14079(i8* nocapture %arg, i8* nocapture %arg1, i8* nocapture %arg2) nounwind uwtable inlinehint {
+bb:
+ %tmp = shufflevector <2 x i64> zeroinitializer, <2 x i64> undef, <1 x i32> zeroinitializer
+ %tmp3 = bitcast <1 x i64> %tmp to <2 x float>
+ %tmp4 = shufflevector <2 x float> %tmp3, <2 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+ %tmp5 = shufflevector <2 x float> %tmp4, <2 x float> undef, <2 x i32> <i32 1, i32 3>
+ %tmp6 = bitcast <2 x float> %tmp5 to <1 x i64>
+ %tmp7 = shufflevector <1 x i64> undef, <1 x i64> %tmp6, <2 x i32> <i32 0, i32 1>
+ %tmp8 = bitcast <2 x i64> %tmp7 to <4 x float>
+ %tmp9 = shufflevector <2 x i64> zeroinitializer, <2 x i64> undef, <1 x i32> <i32 1>
+ %tmp10 = bitcast <1 x i64> %tmp9 to <2 x float>
+ %tmp11 = shufflevector <2 x float> %tmp10, <2 x float> undef, <2 x i32> <i32 0, i32 2>
+ %tmp12 = shufflevector <2 x float> %tmp11, <2 x float> undef, <2 x i32> <i32 0, i32 2>
+ %tmp13 = bitcast <2 x float> %tmp12 to <1 x i64>
+ %tmp14 = shufflevector <1 x i64> %tmp13, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %tmp15 = bitcast <2 x i64> %tmp14 to <4 x float>
+ %tmp16 = insertvalue %struct.wombat.5 undef, <4 x float> %tmp8, 1, 0
+ %tmp17 = insertvalue %struct.wombat.5 %tmp16, <4 x float> %tmp15, 2, 0
+ %tmp18 = insertvalue %struct.wombat.5 %tmp17, <4 x float> undef, 3, 0
+ ret %struct.wombat.5 %tmp18
+}
+
+; CHECK: adjustCopiesBackFrom
+; The shuffle in if.else3 must be preserved even though adjustCopiesBackFrom
+; is tempted to remove it.
+; CHECK: %if.else3
+; CHECK: vorr d
+define internal void @adjustCopiesBackFrom(<2 x i64>* noalias nocapture sret %agg.result, <2 x i64> %in) {
+entry:
+ %0 = extractelement <2 x i64> %in, i32 0
+ %cmp = icmp slt i64 %0, 1
+ %.in = select i1 %cmp, <2 x i64> <i64 0, i64 undef>, <2 x i64> %in
+ %1 = extractelement <2 x i64> %in, i32 1
+ %cmp1 = icmp slt i64 %1, 1
+ br i1 %cmp1, label %if.then2, label %if.else3
+
+if.then2: ; preds = %entry
+ %2 = insertelement <2 x i64> %.in, i64 0, i32 1
+ br label %if.end4
+
+if.else3: ; preds = %entry
+ %3 = shufflevector <2 x i64> %.in, <2 x i64> %in, <2 x i32> <i32 0, i32 3>
+ br label %if.end4
+
+if.end4: ; preds = %if.else3, %if.then2
+ %result.2 = phi <2 x i64> [ %2, %if.then2 ], [ %3, %if.else3 ]
+ store <2 x i64> %result.2, <2 x i64>* %agg.result, align 128
+ ret void
+}
+
+; <rdar://problem/12758887>
+; RegisterCoalescer::updateRegDefsUses() could visit an instruction more than
+; once under rare circumstances. When widening a register from QPR to DTriple
+; with the original virtual register in dsub_1_dsub_2, the double rewrite would
+; produce an invalid sub-register.
+;
+; This is because dsub_1_dsub_2 is not an idempotent sub-register index.
+; It will translate %vr:dsub_0 -> %vr:dsub_1.
+define hidden fastcc void @radar12758887() nounwind optsize ssp {
+entry:
+ br i1 undef, label %for.body, label %for.end70
+
+for.body: ; preds = %for.end, %entry
+ br i1 undef, label %for.body29, label %for.end
+
+for.body29: ; preds = %for.body29, %for.body
+ %0 = load <2 x double>* null, align 1
+ %splat40 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> zeroinitializer
+ %mul41 = fmul <2 x double> undef, %splat40
+ %add42 = fadd <2 x double> undef, %mul41
+ %splat44 = shufflevector <2 x double> %0, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+ %mul45 = fmul <2 x double> undef, %splat44
+ %add46 = fadd <2 x double> undef, %mul45
+ br i1 undef, label %for.end, label %for.body29
+
+for.end: ; preds = %for.body29, %for.body
+ %accumR2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add42, %for.body29 ]
+ %accumI2.0.lcssa = phi <2 x double> [ zeroinitializer, %for.body ], [ %add46, %for.body29 ]
+ %1 = shufflevector <2 x double> %accumI2.0.lcssa, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+ %add58 = fadd <2 x double> undef, %1
+ %mul61 = fmul <2 x double> %add58, undef
+ %add63 = fadd <2 x double> undef, %mul61
+ %add64 = fadd <2 x double> undef, %add63
+ %add67 = fadd <2 x double> undef, %add64
+ store <2 x double> %add67, <2 x double>* undef, align 1
+ br i1 undef, label %for.end70, label %for.body
+
+for.end70: ; preds = %for.end, %entry
+ ret void
+}
diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll
index f4c1b5acef91..3baa103e3d5d 100644
--- a/test/CodeGen/ARM/constants.ll
+++ b/test/CodeGen/ARM/constants.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv4t-unknown-linux-gnueabi -disable-cgp-branch-opts | FileCheck %s
+; RUN: llc < %s -mtriple=armv4t-unknown-linux-gnueabi -disable-cgp-branch-opts -verify-machineinstrs | FileCheck %s
define i32 @f1() {
; CHECK: f1
@@ -45,6 +45,16 @@ r:
ret void
}
+define i32 @f8() nounwind {
+; Check that constant propagation through (i32)-1 => (float)NaN => (i32)-1
+; gives the expected result.
+; CHECK: f8
+; CHECK: mvn r0, #0
+ %tmp0 = bitcast i32 -1 to float
+ %tmp1 = bitcast float %tmp0 to i32
+ ret i32 %tmp1
+}
+
%t1 = type { <3 x float>, <3 x float> }
@const1 = global %t1 { <3 x float> zeroinitializer,
diff --git a/test/CodeGen/ARM/crash-shufflevector.ll b/test/CodeGen/ARM/crash-shufflevector.ll
new file mode 100644
index 000000000000..bdc0e0ea4db0
--- /dev/null
+++ b/test/CodeGen/ARM/crash-shufflevector.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=armv7
+
+declare void @g(<16 x i8>)
+define void @f(<4 x i8> %param1, <4 x i8> %param2) {
+ %y1 = shufflevector <4 x i8> %param1, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %y2 = shufflevector <4 x i8> %param2, <4 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %z = shufflevector <16 x i8> %y1, <16 x i8> %y2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
+ call void @g(<16 x i8> %z)
+ ret void
+} \ No newline at end of file
diff --git a/test/CodeGen/ARM/darwin-section-order.ll b/test/CodeGen/ARM/darwin-section-order.ll
new file mode 100644
index 000000000000..701028c0a537
--- /dev/null
+++ b/test/CodeGen/ARM/darwin-section-order.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=armv7-apple-darwin | FileCheck %s
+
+; CHECK: .section __TEXT,__text,regular,pure_instructions
+; CHECK: .section __TEXT,myprecious
+; CHECK: .section __TEXT,__textcoal_nt,coalesced,pure_instructions
+; CHECK: .section __TEXT,__const_coal,coalesced
+; CHECK: .section __TEXT,__picsymbolstub4,symbol_stubs,none,16
+; CHECK: .section __TEXT,__StaticInit,regular,pure_instructions
+
+
+define void @normal() nounwind readnone {
+; CHECK: .section __TEXT,__text,regular,pure_instructions
+; CHECK: _normal:
+ ret void
+}
+
+define void @special() nounwind readnone section "__TEXT,myprecious" {
+; CHECK: .section __TEXT,myprecious
+; CHECK: _special:
+ ret void
+}
diff --git a/test/CodeGen/ARM/deps-fix.ll b/test/CodeGen/ARM/deps-fix.ll
new file mode 100644
index 000000000000..288697a4dc7f
--- /dev/null
+++ b/test/CodeGen/ARM/deps-fix.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=arm -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard -mtriple armv7-linux-gnueabi | FileCheck %s
+
+;; This test checks that the ExecutionDepsFix pass performs the domain changes
+;; even when some dependencies are propagated through implicit definitions.
+
+; CHECK: fun_a
+define <4 x float> @fun_a(<4 x float> %in, <4 x float> %x, float %y) nounwind {
+; CHECK: vext
+; CHECK: vext
+; CHECK: vadd.f32
+ %1 = insertelement <4 x float> %in, float %y, i32 0
+ %2 = fadd <4 x float> %1, %x
+ ret <4 x float> %2
+}
+; CHECK: fun_b
+define <4 x i32> @fun_b(<4 x i32> %in, <4 x i32> %x, i32 %y) nounwind {
+; CHECK: vmov.32
+; CHECK: vadd.i32
+ %1 = insertelement <4 x i32> %in, i32 %y, i32 0
+ %2 = add <4 x i32> %1, %x
+ ret <4 x i32> %2
+}
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 3d29e05a0ccf..82cfca182b80 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -1,9 +1,13 @@
-; RUN: llc < %s -mtriple=arm-apple-darwin | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=CHECK-ARM
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=swift | FileCheck %s -check-prefix=CHECK-SWIFT
define i32 @f1(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f1
; CHECK-ARM: __divsi3
+
+; CHECK-SWIFT: f1
+; CHECK-SWIFT: sdiv
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -12,6 +16,9 @@ define i32 @f2(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f2
; CHECK-ARM: __udivsi3
+
+; CHECK-SWIFT: f2
+; CHECK-SWIFT: udiv
%tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -20,6 +27,10 @@ define i32 @f3(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f3
; CHECK-ARM: __modsi3
+
+; CHECK-SWIFT: f3
+; CHECK-SWIFT: sdiv
+; CHECK-SWIFT: mls
%tmp1 = srem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -28,6 +39,10 @@ define i32 @f4(i32 %a, i32 %b) {
entry:
; CHECK-ARM: f4
; CHECK-ARM: __umodsi3
+
+; CHECK-SWIFT: f4
+; CHECK-SWIFT: udiv
+; CHECK-SWIFT: mls
%tmp1 = urem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll
index 7fbf8f409036..577f8aa7d39b 100644
--- a/test/CodeGen/ARM/divmod.ll
+++ b/test/CodeGen/ARM/divmod.ll
@@ -1,10 +1,18 @@
-; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+
+; rdar://12481395
define void @foo(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
entry:
-; CHECK: foo:
-; CHECK: bl ___divmodsi4
-; CHECK-NOT: bl ___divmodsi4
+; A8: foo:
+; A8: bl ___divmodsi4
+; A8-NOT: bl ___divmodsi4
+
+; SWIFT: foo:
+; SWIFT: sdiv
+; SWIFT: mls
+; SWIFT-NOT: bl __divmodsi4
%div = sdiv i32 %x, %y
store i32 %div, i32* %P, align 4
%rem = srem i32 %x, %y
@@ -15,9 +23,14 @@ entry:
define void @bar(i32 %x, i32 %y, i32* nocapture %P) nounwind ssp {
entry:
-; CHECK: bar:
-; CHECK: bl ___udivmodsi4
-; CHECK-NOT: bl ___udivmodsi4
+; A8: bar:
+; A8: bl ___udivmodsi4
+; A8-NOT: bl ___udivmodsi4
+
+; SWIFT: bar:
+; SWIFT: udiv
+; SWIFT: mls
+; SWIFT-NOT: bl __udivmodsi4
%div = udiv i32 %x, %y
store i32 %div, i32* %P, align 4
%rem = urem i32 %x, %y
@@ -32,14 +45,18 @@ entry:
define void @do_indent(i32 %cols) nounwind {
entry:
-; CHECK: do_indent:
+; A8: do_indent:
+; SWIFT: do_indent:
%0 = load i32* @flags, align 4
%1 = and i32 %0, 67108864
%2 = icmp eq i32 %1, 0
br i1 %2, label %bb1, label %bb
bb:
-; CHECK: bl ___divmodsi4
+; A8: bl ___divmodsi4
+; SWIFT: sdiv
+; SWIFT: mls
+; SWIFT-NOT: bl __divmodsi4
%3 = load i32* @tabsize, align 4
%4 = srem i32 %cols, %3
%5 = sdiv i32 %cols, %3
@@ -60,9 +77,14 @@ declare i8* @__memset_chk(i8*, i32, i32, i32) nounwind
; rdar://11714607
define i32 @howmany(i32 %x, i32 %y) nounwind {
entry:
-; CHECK: howmany:
-; CHECK: bl ___udivmodsi4
-; CHECK-NOT: ___udivsi3
+; A8: howmany:
+; A8: bl ___udivmodsi4
+; A8-NOT: ___udivsi3
+
+; SWIFT: howmany:
+; SWIFT: udiv
+; SWIFT: mls
+; SWIFT-NOT: bl __udivmodsi4
%rem = urem i32 %x, %y
%div = udiv i32 %x, %y
%not.cmp = icmp ne i32 %rem, 0
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
new file mode 100644
index 000000000000..a5c41144584c
--- /dev/null
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -0,0 +1,100 @@
+; RUN: llc -verify-machineinstrs -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard < %s | FileCheck %s
+
+define <2 x float> @test_vmovs_via_vext_lane0to0(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to0:
+ %vec = insertelement <2 x float> %in, float %arg, i32 0
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane0to1(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to1:
+ %vec = insertelement <2 x float> %in, float %arg, i32 1
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to0(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to0:
+ %vec = insertelement <2 x float> %in, float %arg, i32 0
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to1(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to1:
+ %vec = insertelement <2 x float> %in, float %arg, i32 1
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+
+define float @test_vmovs_via_vdup(float, float %ret, float %lhs, float %rhs) {
+; CHECK: test_vmovs_via_vdup:
+
+ ; Do an operation (which will end up NEON because of +neonfp) to convince the
+ ; execution-domain pass that NEON is a good thing to use.
+ %res = fadd float %ret, %ret
+ ; It makes sense for LLVM to do the addition in d0 here, because it's going
+ ; to be returned. This means it will want a "vmov s0, s1":
+; CHECK: vdup.32 d0, d0[1]
+
+ ret float %res
+}
+
+declare float @llvm.sqrt.f32(float)
+
+declare void @bar()
+
+; This is primarily a compile test.
+define float @test_ineligible(float, float %in) {
+; CHECK: test_ineligible:
+
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %val = fadd float %sqrt, %sqrt
+
+ ; This call forces a move from a callee-saved register to the return-reg. That
+ ; move is not eligible for conversion to a d-register instruction because the
+ ; use-def chains would be messed up. Primarily a compile test (we used to
+ ; hit an internal fault).
+ call void @bar()
+; CHECK: bl bar
+; CHECK: vext.32
+; CHECK: vext.32
+ ret float %val
+}
+
+define i32 @test_vmovs_no_sreg(i32 %in) {
+; CHECK: test_vmovs_no_sreg:
+
+ ; Check that the movement to and from GPRs takes place in the NEON domain.
+; CHECK: vmov.32 d
+ %x = bitcast i32 %in to float
+
+ %res = fadd float %x, %x
+
+; CHECK: vmov.32 r{{[0-9]+}}, d
+ %resi = bitcast float %res to i32
+
+ ret i32 %resi
+}
diff --git a/test/CodeGen/ARM/fabss.ll b/test/CodeGen/ARM/fabss.ll
index bcb4ee745234..46c2f1c65fe5 100644
--- a/test/CodeGen/ARM/fabss.ll
+++ b/test/CodeGen/ARM/fabss.ll
@@ -14,12 +14,12 @@ entry:
declare float @fabsf(float)
; VFP2: test:
-; VFP2: vabs.f32 s1, s1
+; VFP2: vabs.f32 s2, s2
; NFP1: test:
; NFP1: vabs.f32 d1, d1
; NFP0: test:
-; NFP0: vabs.f32 s1, s1
+; NFP0: vabs.f32 s2, s2
; CORTEXA8: test:
; CORTEXA8: vadd.f32 [[D1:d[0-9]+]]
diff --git a/test/CodeGen/ARM/fadds.ll b/test/CodeGen/ARM/fadds.ll
index e35103c045eb..48ef5ed88fb0 100644
--- a/test/CodeGen/ARM/fadds.ll
+++ b/test/CodeGen/ARM/fadds.ll
@@ -10,14 +10,14 @@ entry:
}
; VFP2: test:
-; VFP2: vadd.f32 s0, s1, s0
+; VFP2: vadd.f32 s
; NFP1: test:
-; NFP1: vadd.f32 d0, d1, d0
+; NFP1: vadd.f32 d
; NFP0: test:
-; NFP0: vadd.f32 s0, s1, s0
+; NFP0: vadd.f32 s
; CORTEXA8: test:
-; CORTEXA8: vadd.f32 d0, d1, d0
+; CORTEXA8: vadd.f32 d
; CORTEXA9: test:
; CORTEXA9: vadd.f32 s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll
new file mode 100644
index 000000000000..867d53f973db
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-pic.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-none-linux-gnueabi | FileCheck %s --check-prefix=THUMB-ELF
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-none-linux-gnueabi | FileCheck %s --check-prefix=ARMv7-ELF
+
+@g = global i32 0, align 4
+
+define i32 @LoadGV() {
+entry:
+; THUMB: LoadGV
+; THUMB: movw [[reg0:r[0-9]+]],
+; THUMB: movt [[reg0]],
+; THUMB: add [[reg0]], pc
+; THUMB-ELF: LoadGV
+; THUMB-ELF: ldr.n r[[reg0:[0-9]+]],
+; THUMB-ELF: ldr.n r[[reg1:[0-9]+]],
+; THUMB-ELF: ldr r[[reg0]], [r[[reg1]], r[[reg0]]]
+; ARM: LoadGV
+; ARM: ldr [[reg1:r[0-9]+]],
+; ARM: add [[reg1]], pc, [[reg1]]
+; ARMv7: LoadGV
+; ARMv7: movw [[reg2:r[0-9]+]],
+; ARMv7: movt [[reg2]],
+; ARMv7: add [[reg2]], pc, [[reg2]]
+; ARMv7-ELF: LoadGV
+; ARMv7-ELF: ldr r[[reg2:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg3:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg2]], [r[[reg3]], r[[reg2]]]
+ %tmp = load i32* @g
+ ret i32 %tmp
+}
+
+@i = external global i32
+
+define i32 @LoadIndirectSymbol() {
+entry:
+; THUMB: LoadIndirectSymbol
+; THUMB: movw r[[reg3:[0-9]+]],
+; THUMB: movt r[[reg3]],
+; THUMB: add r[[reg3]], pc
+; THUMB: ldr r[[reg3]], [r[[reg3]]]
+; THUMB-ELF: LoadIndirectSymbol
+; THUMB-ELF: ldr.n r[[reg3:[0-9]+]],
+; THUMB-ELF: ldr.n r[[reg4:[0-9]+]],
+; THUMB-ELF: ldr r[[reg3]], [r[[reg4]], r[[reg3]]]
+; ARM: LoadIndirectSymbol
+; ARM: ldr [[reg4:r[0-9]+]],
+; ARM: ldr [[reg4]], [pc, [[reg4]]]
+; ARMv7: LoadIndirectSymbol
+; ARMv7: movw r[[reg5:[0-9]+]],
+; ARMv7: movt r[[reg5]],
+; ARMv7: add r[[reg5]], pc, r[[reg5]]
+; ARMv7: ldr r[[reg5]], [r[[reg5]]]
+; ARMv7-ELF: LoadIndirectSymbol
+; ARMv7-ELF: ldr r[[reg5:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg6:[0-9]+]],
+; ARMv7-ELF: ldr r[[reg5]], [r[[reg6]], r[[reg5]]]
+ %tmp = load i32* @i
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index ecd5fe27a4b7..41fda4132632 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
; Very basic fast-isel functionality.
define i32 @add(i32 %a, i32 %b) nounwind {
@@ -238,3 +240,67 @@ entry:
}
declare void @llvm.trap() nounwind
+
+define void @unaligned_i16_store(i16 %x, i16* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+ store i16 %x, i16* %y, align 1
+ ret void
+}
+
+define i16 @unaligned_i16_load(i16* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+ %0 = load i16* %x, align 1
+ ret i16 %0
+}
+
+define void @unaligned_i32_store(i32 %x, i32* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+ store i32 %x, i32* %y, align 1
+ ret void
+}
+
+define i32 @unaligned_i32_load(i32* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+ %0 = load i32* %x, align 1
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM/fdivs.ll b/test/CodeGen/ARM/fdivs.ll
index 31c1ca940502..8fab00213585 100644
--- a/test/CodeGen/ARM/fdivs.ll
+++ b/test/CodeGen/ARM/fdivs.ll
@@ -10,14 +10,14 @@ entry:
}
; VFP2: test:
-; VFP2: vdiv.f32 s0, s1, s0
+; VFP2: vdiv.f32 s0, s2, s0
; NFP1: test:
-; NFP1: vdiv.f32 s0, s1, s0
+; NFP1: vdiv.f32 s0, s2, s0
; NFP0: test:
-; NFP0: vdiv.f32 s0, s1, s0
+; NFP0: vdiv.f32 s0, s2, s0
; CORTEXA8: test:
-; CORTEXA8: vdiv.f32 s0, s1, s0
+; CORTEXA8: vdiv.f32 s0, s2, s0
; CORTEXA9: test:
; CORTEXA9: vdiv.f32 s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fmuls.ll b/test/CodeGen/ARM/fmuls.ll
index 3c3182bc6341..1566a9272db1 100644
--- a/test/CodeGen/ARM/fmuls.ll
+++ b/test/CodeGen/ARM/fmuls.ll
@@ -10,15 +10,15 @@ entry:
}
; VFP2: test:
-; VFP2: vmul.f32 s0, s1, s0
+; VFP2: vmul.f32 s
; NFP1: test:
-; NFP1: vmul.f32 d0, d1, d0
+; NFP1: vmul.f32 d
; NFP0: test:
-; NFP0: vmul.f32 s0, s1, s0
+; NFP0: vmul.f32 s
; CORTEXA8: test:
-; CORTEXA8: vmul.f32 d0, d1, d0
+; CORTEXA8: vmul.f32 d
; CORTEXA9: test:
; CORTEXA9: vmul.f32 s{{.}}, s{{.}}, s{{.}}
diff --git a/test/CodeGen/ARM/fp-fast.ll b/test/CodeGen/ARM/fp-fast.ll
new file mode 100644
index 000000000000..ec5718738177
--- /dev/null
+++ b/test/CodeGen/ARM/fp-fast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t1 = fmul float %x, 3.0
+ %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %t1)
+ ret float %t2
+}
+
+; CHECK: test2
+define float @test2(float %x, float %y) {
+; CHECK-NOT: vmul
+; CHECK: vfma.f32
+; CHECK-NOT: vmul
+ %t1 = fmul float %x, 3.0
+ %t2 = call float @llvm.fma.f32(float %t1, float 2.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test3
+define float @test3(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vadd.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float 1.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test4
+define float @test4(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vsub.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float -1.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %x)
+ ret float %t2
+}
+
+; CHECK: test6
+define float @test6(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t1 = fsub float -0.0, %x
+ %t2 = call float @llvm.fma.f32(float %x, float 5.0, float %t1)
+ ret float %t2
+}
+
+declare float @llvm.fma.f32(float, float, float)
diff --git a/test/CodeGen/ARM/fp_convert.ll b/test/CodeGen/ARM/fp_convert.ll
index 7002cecf3640..44298b9c5d8d 100644
--- a/test/CodeGen/ARM/fp_convert.ll
+++ b/test/CodeGen/ARM/fp_convert.ll
@@ -31,7 +31,7 @@ define float @test3(i32 %a, i32 %b) {
; VFP2: test3:
; VFP2: vcvt.f32.u32 s{{.}}, s{{.}}
; NEON: test3:
-; NEON: vcvt.f32.u32 d0, d0
+; NEON: vcvt.f32.u32 d
entry:
%0 = add i32 %a, %b
%1 = uitofp i32 %0 to float
@@ -42,7 +42,7 @@ define float @test4(i32 %a, i32 %b) {
; VFP2: test4:
; VFP2: vcvt.f32.s32 s{{.}}, s{{.}}
; NEON: test4:
-; NEON: vcvt.f32.s32 d0, d0
+; NEON: vcvt.f32.s32 d
entry:
%0 = add i32 %a, %b
%1 = sitofp i32 %0 to float
diff --git a/test/CodeGen/ARM/fsubs.ll b/test/CodeGen/ARM/fsubs.ll
index bea8d5f4f30b..f039e74c8ee6 100644
--- a/test/CodeGen/ARM/fsubs.ll
+++ b/test/CodeGen/ARM/fsubs.ll
@@ -8,6 +8,6 @@ entry:
ret float %0
}
-; VFP2: vsub.f32 s0, s1, s0
-; NFP1: vsub.f32 d0, d1, d0
-; NFP0: vsub.f32 s0, s1, s0
+; VFP2: vsub.f32 s
+; NFP1: vsub.f32 d
+; NFP0: vsub.f32 s
diff --git a/test/CodeGen/ARM/ifcvt1.ll b/test/CodeGen/ARM/ifcvt1.ll
index cd870bb5d4b2..fd831442c14b 100644
--- a/test/CodeGen/ARM/ifcvt1.ll
+++ b/test/CodeGen/ARM/ifcvt1.ll
@@ -1,17 +1,21 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s -check-prefix=SWIFT
define i32 @t1(i32 %a, i32 %b) {
-; CHECK: t1:
+; A8: t1:
+; SWIFT: t1:
%tmp2 = icmp eq i32 %a, 0
br i1 %tmp2, label %cond_false, label %cond_true
cond_true:
-; CHECK: subeq r0, r1, #1
+; A8: subeq r0, r1, #1
+; SWIFT: sub r0, r1, #1
%tmp5 = add i32 %b, 1
ret i32 %tmp5
cond_false:
-; CHECK: addne r0, r1, #1
+; A8: addne r0, r1, #1
+; SWIFT: addne r0, r1, #1
%tmp7 = add i32 %b, -1
ret i32 %tmp7
}
diff --git a/test/CodeGen/ARM/ifcvt12.ll b/test/CodeGen/ARM/ifcvt12.ll
new file mode 100644
index 000000000000..77bdca57e555
--- /dev/null
+++ b/test/CodeGen/ARM/ifcvt12.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+define i32 @f1(i32 %a, i32 %b, i32 %c) {
+; CHECK: f1:
+; CHECK: mlsne r0, r0, r1, r2
+ %tmp1 = icmp eq i32 %a, 0
+ br i1 %tmp1, label %cond_false, label %cond_true
+
+cond_true:
+ %tmp2 = mul i32 %a, %b
+ %tmp3 = sub i32 %c, %tmp2
+ ret i32 %tmp3
+
+cond_false:
+ ret i32 %a
+}
diff --git a/test/CodeGen/ARM/ifcvt5.ll b/test/CodeGen/ARM/ifcvt5.ll
index 95f5c97f2a9a..5081791bc257 100644
--- a/test/CodeGen/ARM/ifcvt5.ll
+++ b/test/CodeGen/ARM/ifcvt5.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios | FileCheck %s
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
+; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
+; rdar://8402126
@x = external global i32* ; <i32**> [#uses=1]
@@ -10,8 +12,12 @@ entry:
}
define i32 @t1(i32 %a, i32 %b) {
-; CHECK: t1:
-; CHECK: poplt {r7, pc}
+; A8: t1:
+; A8: poplt {r7, pc}
+
+; SWIFT: t1:
+; SWIFT: pop {r7, pc}
+; SWIFT: pop {r7, pc}
entry:
%tmp1 = icmp sgt i32 %a, 10 ; <i1> [#uses=1]
br i1 %tmp1, label %cond_true, label %UnifiedReturnBlock
diff --git a/test/CodeGen/ARM/indirectbr-2.ll b/test/CodeGen/ARM/indirectbr-2.ll
new file mode 100644
index 000000000000..084f520a8ee5
--- /dev/null
+++ b/test/CodeGen/ARM/indirectbr-2.ll
@@ -0,0 +1,46 @@
+; RUN: llc < %s -O0 -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s
+; <rdar://problem/12529625>
+
+@foo = global i32 34879, align 4
+@DWJumpTable2808 = global [2 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@func, %14) to i32), i32 ptrtoint (i8* blockaddress(@func, %4) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@func, %13) to i32), i32 ptrtoint (i8* blockaddress(@func, %4) to i32))]
+@0 = internal constant [45 x i8] c"func XXXXXXXXXXX :: bb xxxxxxxxxxxxxxxxxxxx\0A\00"
+
+; The indirect branch has the two destinations as successors. The lone PHI
+; statement shouldn't be implicitly defined.
+
+; CHECK: func:
+; CHECK: Ltmp1: @ Block address taken
+; CHECK-NOT: @ implicit-def: R0
+; CHECK: @ 4-byte Reload
+
+define i32 @func() nounwind ssp {
+ %1 = alloca i32, align 4
+ %2 = load i32* @foo, align 4
+ %3 = icmp eq i32 %2, 34879
+ br label %4
+
+; <label>:4 ; preds = %0
+ %5 = zext i1 %3 to i32
+ %6 = mul i32 %5, 287
+ %7 = add i32 %6, 2
+ %8 = getelementptr [2 x i32]* @DWJumpTable2808, i32 0, i32 %5
+ %9 = load i32* %8
+ %10 = add i32 %9, ptrtoint (i8* blockaddress(@func, %4) to i32)
+ %11 = inttoptr i32 %10 to i8*
+ %12 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([45 x i8]* @0, i32 0, i32 0))
+ indirectbr i8* %11, [label %13, label %14]
+
+; <label>:13 ; preds = %4
+ %tmp14 = phi i32 [ %7, %4 ]
+ store i32 23958, i32* @foo, align 4
+ %tmp15 = load i32* %1, align 4
+ %tmp16 = icmp eq i32 %tmp15, 0
+ %tmp17 = zext i1 %tmp16 to i32
+ %tmp21 = add i32 %tmp17, %tmp14
+ ret i32 %tmp21
+
+; <label>:14 ; preds = %4
+ ret i32 42
+}
+
+declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
new file mode 100644
index 000000000000..1d72afefb5b8
--- /dev/null
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -0,0 +1,35 @@
+; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s
+
+; This test checks that when inserting one (integer) element into a vector,
+; the vector is not spuriously copied. "vorr dX, dY, dY" is the instruction
+; that moves one DPR to another, so it is what we check for.
+
+; CHECK: @f
+; CHECK-NOT: vorr d
+; CHECK: vmov.32 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <4 x i32> @f(<4 x i32> %in) {
+ %1 = insertelement <4 x i32> %in, i32 255, i32 3
+ ret <4 x i32> %1
+}
+
+; CHECK: @g
+; CHECK-NOT: vorr d
+; CHECK: vmov.16 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <8 x i16> @g(<8 x i16> %in) {
+ %1 = insertelement <8 x i16> %in, i16 255, i32 7
+ ret <8 x i16> %1
+}
+
+; CHECK: @h
+; CHECK-NOT: vorr d
+; CHECK: vmov.8 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <16 x i8> @h(<16 x i8> %in) {
+ %1 = insertelement <16 x i8> %in, i8 255, i32 15
+ ret <16 x i8> %1
+}
diff --git a/test/CodeGen/ARM/ldr_post.ll b/test/CodeGen/ARM/ldr_post.ll
index 8ddf025dbf1b..a6ca43448380 100644
--- a/test/CodeGen/ARM/ldr_post.ll
+++ b/test/CodeGen/ARM/ldr_post.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
; CHECK: test1:
; CHECK: ldr {{.*, \[.*]}}, -r2
diff --git a/test/CodeGen/ARM/ldr_pre.ll b/test/CodeGen/ARM/ldr_pre.ll
index e904e5fd2cdb..6c40ad7326b6 100644
--- a/test/CodeGen/ARM/ldr_pre.ll
+++ b/test/CodeGen/ARM/ldr_pre.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s
; CHECK: test1:
; CHECK: ldr {{.*!}}
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
new file mode 100644
index 000000000000..e4a00e9ac303
--- /dev/null
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; Check that signed and unsigned multiply accumulate long instructions are generated.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+ %conv = zext i32 %a to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv1, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+ %conv = sext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %add = add nsw i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+ %conv = zext i32 %b to i64
+ %conv1 = zext i32 %a to i64
+ %mul = mul i64 %conv, %conv1
+ %conv2 = zext i32 %c to i64
+ %add = add i64 %mul, %conv2
+ ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+ %conv = sext i32 %b to i64
+ %conv1 = sext i32 %a to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %conv2 = sext i32 %c to i64
+ %add = add nsw i64 %mul, %conv2
+ ret i64 %add
+}
diff --git a/test/CodeGen/ARM/mls.ll b/test/CodeGen/ARM/mls.ll
index a6cdba445451..066bf98de651 100644
--- a/test/CodeGen/ARM/mls.ll
+++ b/test/CodeGen/ARM/mls.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=arm -mattr=+v6t2 | FileCheck %s
+; RUN: llc < %s -march=arm -mattr=+v6t2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
@@ -13,4 +14,15 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) {
ret i32 %tmp2
}
+; CHECK: f1:
; CHECK: mls r0, r0, r1, r2
+; NO_MULOPS: f1:
+; NO_MULOPS: mul r0, r0, r1
+; NO_MULOPS-NEXT: sub r0, r2, r0
+
+; CHECK: f2:
+; CHECK: mul r0, r0, r1
+; CHECK-NEXT: sub r0, r0, r2
+; NO_MULOPS: f2:
+; NO_MULOPS: mul r0, r0, r1
+; NO_MULOPS-NEXT: sub r0, r0, r2
diff --git a/test/CodeGen/ARM/neon-fma.ll b/test/CodeGen/ARM/neon-fma.ll
new file mode 100644
index 000000000000..d2cca5009d6b
--- /dev/null
+++ b/test/CodeGen/ARM/neon-fma.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-darwin10 -mcpu=swift | FileCheck %s
+
+; CHECK: test_v2f32
+; CHECK: vfma.f32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+
+define <2 x float> @test_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone ssp {
+entry:
+ %call = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone
+ ret <2 x float> %call
+}
+
+; CHECK: test_v4f32
+; CHECK: vfma.f32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
+
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone ssp {
+entry:
+ %call = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone
+ ret <4 x float> %call
+}
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
index 944bfe060298..497619ed746a 100644
--- a/test/CodeGen/ARM/neon_ld2.ll
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -1,10 +1,16 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -march=arm -mcpu=swift | FileCheck %s --check-prefix=SWIFT
; CHECK: t1
-; CHECK: vldmia
-; CHECK: vldmia
+; CHECK: vld1.64
+; CHECK: vld1.64
; CHECK: vadd.i64 q
-; CHECK: vstmia
+; CHECK: vst1.64
+; SWIFT: t1
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vadd.i64 q
+; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
@@ -16,11 +22,17 @@ entry:
}
; CHECK: t2
-; CHECK: vldmia
-; CHECK: vldmia
+; CHECK: vld1.64
+; CHECK: vld1.64
; CHECK: vsub.i64 q
; CHECK: vmov r0, r1, d
; CHECK: vmov r2, r3, d
+; SWIFT: t2
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+, :128\]}}
+; SWIFT: vsub.i64 q
+; SWIFT: vmov r0, r1, d
+; SWIFT: vmov r2, r3, d
define <4 x i32> @t2(<2 x i64>* %a, <2 x i64>* %b) nounwind readonly {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
@@ -30,3 +42,18 @@ entry:
ret <4 x i32> %3
}
+; Limited alignment.
+; SWIFT: t3
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+; SWIFT: vld1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+; SWIFT: vadd.i64 q
+; SWIFT: vst1.64 {{.d[0-9]+, d[0-9]+}, \[r[0-9]+}}
+define void @t3(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
+entry:
+ %0 = load <2 x i64>* %a, align 8
+ %1 = load <2 x i64>* %b, align 8
+ %2 = add <2 x i64> %0, %1
+ %3 = bitcast <2 x i64> %2 to <4 x i32>
+ store <4 x i32> %3, <4 x i32>* %r, align 8
+ ret void
+}
diff --git a/test/CodeGen/ARM/opt-shuff-tstore.ll b/test/CodeGen/ARM/opt-shuff-tstore.ll
index df98e231ccfd..74c9a21355d7 100644
--- a/test/CodeGen/ARM/opt-shuff-tstore.ll
+++ b/test/CodeGen/ARM/opt-shuff-tstore.ll
@@ -2,7 +2,7 @@
; CHECK: func_4_8
; CHECK: vst1.32
-; CHECK-NEXT: bx lr
+; CHECK: bx lr
define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
%r = add <4 x i8> %param, <i8 1, i8 2, i8 3, i8 4>
store <4 x i8> %r, <4 x i8>* %p
@@ -11,7 +11,7 @@ define void @func_4_8(<4 x i8> %param, <4 x i8>* %p) {
; CHECK: func_2_16
; CHECK: vst1.32
-; CHECK-NEXT: bx lr
+; CHECK: bx lr
define void @func_2_16(<2 x i16> %param, <2 x i16>* %p) {
%r = add <2 x i16> %param, <i16 1, i16 2>
store <2 x i16> %r, <2 x i16>* %p
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 05794e4ebddb..6d6586e4f283 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
%struct.int16x8_t = type { <8 x i16> }
@@ -124,7 +124,6 @@ return1:
return2:
; CHECK: %return2
; CHECK: vadd.i32
-; CHECK: vorr {{q[0-9]+}}, {{q[0-9]+}}
; CHECK-NOT: vmov
; CHECK: vst2.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}
%tmp100 = extractvalue %struct.__neon_int32x4x2_t %tmp2, 0 ; <<4 x i32>> [#uses=1]
@@ -137,7 +136,7 @@ return2:
define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
; CHECK: t5:
-; CHECK: vldmia
+; CHECK: vld1.32
; How can FileCheck match Q and D registers? We need a lisp interpreter.
; CHECK: vorr {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
; CHECK-NOT: vmov
@@ -243,8 +242,8 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
; CHECK: vldr
; CHECK-NOT: vmov d{{.*}}, d16
; CHECK: vmov.i32 d17
-; CHECK-NEXT: vstmia r0, {d16, d17}
-; CHECK-NEXT: vstmia r0, {d16, d17}
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128]
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128]
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
store <4 x float> %4, <4 x float>* undef, align 16
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 557556662892..62708ed53d05 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -80,7 +80,7 @@ define double @f7(double %a, double %b) {
; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0
; CHECK-NEON-NEXT: cmp r0, [[R3]]
; CHECK-NEON-NEXT: it eq
-; CHECK-NEON-NEXT: addeq.w {{r.*}}, [[R2]]
+; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
; CHECK-NEON-NEXT: ldr
; CHECK-NEON: bx
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index 26f7cb68901f..750780891261 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -9,7 +9,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
; T2: t1:
; T2: mvn r0, #-2147483648
-; T2: addle.w r1, r1
+; T2: addle r1, r0
; T2: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -23,7 +23,7 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; ARM: mov r0, r1
; T2: t2:
-; T2: suble.w r1, r1, #10
+; T2: suble r1, #10
; T2: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 10
@@ -33,12 +33,12 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
; ARM: t3:
-; ARM: mvnlt r2, #0
-; ARM: and r0, r2, r3
+; ARM: andge r3, r3, r2
+; ARM: mov r0, r3
; T2: t3:
-; T2: movlt.w r2, #-1
-; T2: and.w r0, r2, r3
+; T2: andge r3, r2
+; T2: mov r0, r3
%cond = icmp slt i32 %a, %b
%z = select i1 %cond, i32 -1, i32 %x
%s = and i32 %z, %y
@@ -47,12 +47,12 @@ define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
define i32 @t4(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
; ARM: t4:
-; ARM: movlt r2, #0
-; ARM: orr r0, r2, r3
+; ARM: orrge r3, r3, r2
+; ARM: mov r0, r3
; T2: t4:
-; T2: movlt r2, #0
-; T2: orr.w r0, r2, r3
+; T2: orrge r3, r2
+; T2: mov r0, r3
%cond = icmp slt i32 %a, %b
%z = select i1 %cond, i32 0, i32 %x
%s = or i32 %z, %y
@@ -81,7 +81,7 @@ define i32 @t6(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; T2: t6:
; T2-NOT: movge
-; T2: eorlt.w r3, r3, r2
+; T2: eorlt r3, r2
%cond = icmp slt i32 %a, %b
%tmp1 = select i1 %cond, i32 %c, i32 0
%tmp2 = xor i32 %tmp1, %d
@@ -179,3 +179,46 @@ define i32 @t12(i32 %a, i32 %b) nounwind {
%tmp1 = select i1 %cond, i32 %a, i32 %x
ret i32 %tmp1
}
+
+; Handle frame index operands.
+define void @pr13628() nounwind uwtable align 2 {
+ %x3 = alloca i8, i32 256, align 8
+ %x4 = load i8* undef, align 1
+ %x5 = icmp ne i8 %x4, 0
+ %x6 = select i1 %x5, i8* %x3, i8* null
+ call void @bar(i8* %x6) nounwind
+ ret void
+}
+declare void @bar(i8*)
+
+; Fold zext i1 into predicated add
+define i32 @t13(i32 %c, i32 %a) nounwind readnone ssp {
+entry:
+; ARM: t13
+; ARM: cmp r1, #10
+; ARM: addgt r0, r0, #1
+
+; T2: t13
+; T2: cmp r1, #10
+; T2: addgt r0, #1
+ %cmp = icmp sgt i32 %a, 10
+ %conv = zext i1 %cmp to i32
+ %add = add i32 %conv, %c
+ ret i32 %add
+}
+
+; Fold sext i1 into predicated sub
+define i32 @t14(i32 %c, i32 %a) nounwind readnone ssp {
+entry:
+; ARM: t14
+; ARM: cmp r1, #10
+; ARM: subgt r0, r0, #1
+
+; T2: t14
+; T2: cmp r1, #10
+; T2: subgt r0, #1
+ %cmp = icmp sgt i32 %a, 10
+ %conv = sext i1 %cmp to i32
+ %add = add i32 %conv, %c
+ ret i32 %add
+}
diff --git a/test/CodeGen/ARM/struct_byval.ll b/test/CodeGen/ARM/struct_byval.ll
index 99ba475ad7b1..e9541c278803 100644
--- a/test/CodeGen/ARM/struct_byval.ll
+++ b/test/CodeGen/ARM/struct_byval.ll
@@ -44,3 +44,47 @@ entry:
declare i32 @e1(%struct.SmallStruct* nocapture byval %in) nounwind
declare i32 @e2(%struct.LargeStruct* nocapture byval %in) nounwind
declare i32 @e3(%struct.LargeStruct* nocapture byval align 16 %in) nounwind
+
+; rdar://12442472
+; We can't do a tail call since the address of %s is passed to the callee and
+; part of %s is in the caller's local frame.
+define void @f3(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f3
+; CHECK: bl _consumestruct
+entry:
+ %0 = bitcast %struct.SmallStruct* %s to i8*
+ tail call void @consumestruct(i8* %0, i32 80) optsize
+ ret void
+}
+
+define void @f4(%struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f4
+; CHECK: bl _consumestruct
+entry:
+ %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
+ %0 = bitcast i32* %addr to i8*
+ tail call void @consumestruct(i8* %0, i32 80) optsize
+ ret void
+}
+
+; We can do a tail call here since s is in the incoming argument area.
+define void @f5(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f5
+; CHECK: b _consumestruct
+entry:
+ %0 = bitcast %struct.SmallStruct* %s to i8*
+ tail call void @consumestruct(i8* %0, i32 80) optsize
+ ret void
+}
+
+define void @f6(i32 %a, i32 %b, i32 %c, i32 %d, %struct.SmallStruct* nocapture byval %s) nounwind optsize {
+; CHECK: f6
+; CHECK: b _consumestruct
+entry:
+ %addr = getelementptr inbounds %struct.SmallStruct* %s, i32 0, i32 0
+ %0 = bitcast i32* %addr to i8*
+ tail call void @consumestruct(i8* %0, i32 80) optsize
+ ret void
+}
+
+declare void @consumestruct(i8* nocapture %structp, i32 %structsize) nounwind
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 6fcbdee30d34..2961b94d2c1e 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -63,3 +63,24 @@ if.then:
if.else:
ret i32 %sub
}
+
+; If the sub/rsb instruction is predicated, we can't use the flags.
+; <rdar://problem/12263428>
+; Test case from MultiSource/Benchmarks/Ptrdist/bc/number.s
+; CHECK: bc_raise
+; CHECK: rsbeq
+; CHECK: cmp
+define i32 @bc_raise() nounwind ssp {
+entry:
+ %val.2.i = select i1 undef, i32 0, i32 undef
+ %sub.i = sub nsw i32 0, %val.2.i
+ %retval.0.i = select i1 undef, i32 %val.2.i, i32 %sub.i
+ %cmp1 = icmp eq i32 %retval.0.i, 0
+ br i1 %cmp1, label %land.lhs.true, label %if.end11
+
+land.lhs.true: ; preds = %num2long.exit
+ ret i32 17
+
+if.end11: ; preds = %num2long.exit
+ ret i32 23
+}
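Note: a predicated rsb such as the rsbeq checked above executes only when its condition holds, so on the other path the flags would not reflect the subtraction; the peephole therefore has to keep the explicit cmp rather than reuse the sub/rsb flags.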
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 474043afc11d..7f82ca701261 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm < %s | FileCheck %s
+; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/ARM/subreg-remat.ll b/test/CodeGen/ARM/subreg-remat.ll
index 03ae12c6dea0..455bfce0f2e5 100644
--- a/test/CodeGen/ARM/subreg-remat.ll
+++ b/test/CodeGen/ARM/subreg-remat.ll
@@ -4,14 +4,14 @@ target triple = "thumbv7-apple-ios"
;
; The vector %v2 is built like this:
;
-; %vreg6:ssub_1<def> = VMOVSR %vreg0<kill>, pred:14, pred:%noreg, %vreg6<imp-def>; DPR_VFP2:%vreg6 GPR:%vreg0
+; %vreg6:ssub_1<def> = ...
; %vreg6:ssub_0<def> = VLDRS <cp#0>, 0, pred:14, pred:%noreg; mem:LD4[ConstantPool] DPR_VFP2:%vreg6
;
; When %vreg6 spills, the VLDRS constant pool load cannot be rematerialized
; since it implicitly reads the ssub_1 sub-register.
;
; CHECK: f1
-; CHECK: vmov s1, r0
+; CHECK: vmov d0, r0, r0
; CHECK: vldr s0, LCPI
; The vector must be spilled:
; CHECK: vstr d0,
diff --git a/test/CodeGen/ARM/trap.ll b/test/CodeGen/ARM/trap.ll
index 38842a9646ff..21865f8e4aed 100644
--- a/test/CodeGen/ARM/trap.ll
+++ b/test/CodeGen/ARM/trap.ll
@@ -14,4 +14,16 @@ entry:
unreachable
}
+define void @t2() nounwind {
+entry:
+; INSTR: t2:
+; INSTR: trap
+
+; FUNC: t2:
+; FUNC: bl __trap
+ call void @llvm.debugtrap()
+ unreachable
+}
+
declare void @llvm.trap() nounwind
+declare void @llvm.debugtrap() nounwind
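Note: @t2 exercises llvm.debugtrap; the INSTR and FUNC prefixes presumably correspond to RUN configurations that lower it to a hardware trap instruction and to a call to __trap, respectively.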
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index 4e227dd5be36..fc2aa1e568e2 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -4,18 +4,18 @@
define void @PR13378() nounwind {
; This was originally a crasher trying to schedule the instructions.
; CHECK: PR13378:
-; CHECK: vldmia
+; CHECK: vld1.32
+; CHECK-NEXT: vst1.32
+; CHECK-NEXT: vst1.32
; CHECK-NEXT: vmov.f32
-; CHECK-NEXT: vstmia
-; CHECK-NEXT: vstmia
; CHECK-NEXT: vmov.f32
-; CHECK-NEXT: vstmia
+; CHECK-NEXT: vst1.32
entry:
- %0 = load <4 x float>* undef
- store <4 x float> zeroinitializer, <4 x float>* undef
- store <4 x float> %0, <4 x float>* undef
+ %0 = load <4 x float>* undef, align 4
+ store <4 x float> zeroinitializer, <4 x float>* undef, align 4
+ store <4 x float> %0, <4 x float>* undef, align 4
%1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3
- store <4 x float> %1, <4 x float>* undef
+ store <4 x float> %1, <4 x float>* undef, align 4
unreachable
}
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll
index 869b92675def..3064202eb3fe 100644
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
-; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
+; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -mattr=-neon -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=UNALIGNED
; rdar://7113725
@@ -59,3 +59,19 @@ entry:
store double %tmp, double* %b, align 1
ret void
}
+
+define void @byte_word_ops(i32* %a, i32* %b) nounwind {
+entry:
+; EXPANDED: byte_word_ops:
+; EXPANDED: ldrb
+; EXPANDED: strb
+
+; UNALIGNED: byte_word_ops:
+; UNALIGNED-NOT: ldrb
+; UNALIGNED: ldr
+; UNALIGNED-NOT: strb
+; UNALIGNED: str
+ %tmp = load i32* %a, align 1
+ store i32 %tmp, i32* %b, align 1
+ ret void
+}
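Note: with -arm-strict-align (and NEON disabled) the unaligned i32 load and store above are expanded into byte accesses, hence the ldrb/strb checks under the EXPANDED prefix; without strict alignment the same IR keeps word-sized ldr/str, which is what the UNALIGNED prefix verifies.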
diff --git a/test/CodeGen/ARM/unaligned_load_store_vector.ll b/test/CodeGen/ARM/unaligned_load_store_vector.ll
new file mode 100644
index 000000000000..25ae6517937b
--- /dev/null
+++ b/test/CodeGen/ARM/unaligned_load_store_vector.ll
@@ -0,0 +1,487 @@
+;RUN: llc < %s -march=arm -mattr=+v7 -mattr=+neon | FileCheck %s
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.8
+ %v1 = load <8 x i8>* %vi, align 1
+;CHECK: vst1.8
+ store <8 x i8> %v1, <8 x i8>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.8
+ %v1 = load <4 x i16>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x i16> %v1, <4 x i16>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.8
+ %v1 = load <2 x i32>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x i32> %v1, <2 x i32>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.8
+ %v1 = load <2 x float>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x float> %v1, <2 x float>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.8
+ %v1 = load <16 x i8>* %vi, align 1
+;CHECK: vst1.8
+ store <16 x i8> %v1, <16 x i8>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.8
+ %v1 = load <8 x i16>* %vi, align 1
+;CHECK: vst1.8
+ store <8 x i16> %v1, <8 x i16>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.8
+ %v1 = load <4 x i32>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x i32> %v1, <4 x i32>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.8
+ %v1 = load <2 x i64>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x i64> %v1, <2 x i64>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.8
+ %v1 = load <4 x float>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x float> %v1, <4 x float>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.16
+ %v1 = load <8 x i8>* %vi, align 2
+;CHECK: vst1.16
+ store <8 x i8> %v1, <8 x i8>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.16
+ %v1 = load <4 x i16>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x i16> %v1, <4 x i16>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.16
+ %v1 = load <2 x i32>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x i32> %v1, <2 x i32>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.16
+ %v1 = load <2 x float>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x float> %v1, <2 x float>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.16
+ %v1 = load <16 x i8>* %vi, align 2
+;CHECK: vst1.16
+ store <16 x i8> %v1, <16 x i8>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.16
+ %v1 = load <8 x i16>* %vi, align 2
+;CHECK: vst1.16
+ store <8 x i16> %v1, <8 x i16>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.16
+ %v1 = load <4 x i32>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x i32> %v1, <4 x i32>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.16
+ %v1 = load <2 x i64>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x i64> %v1, <2 x i64>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.16
+ %v1 = load <4 x float>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x float> %v1, <4 x float>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vldr
+ %v1 = load <8 x i8>* %vi, align 4
+;CHECK: vstr
+ store <8 x i8> %v1, <8 x i8>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vldr
+ %v1 = load <4 x i16>* %vi, align 4
+;CHECK: vstr
+ store <4 x i16> %v1, <4 x i16>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vldr
+ %v1 = load <2 x i32>* %vi, align 4
+;CHECK: vstr
+ store <2 x i32> %v1, <2 x i32>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vldr
+ %v1 = load <2 x float>* %vi, align 4
+;CHECK: vstr
+ store <2 x float> %v1, <2 x float>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.32
+ %v1 = load <16 x i8>* %vi, align 4
+;CHECK: vst1.32
+ store <16 x i8> %v1, <16 x i8>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.32
+ %v1 = load <8 x i16>* %vi, align 4
+;CHECK: vst1.32
+ store <8 x i16> %v1, <8 x i16>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.32
+ %v1 = load <4 x i32>* %vi, align 4
+;CHECK: vst1.32
+ store <4 x i32> %v1, <4 x i32>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.32
+ %v1 = load <2 x i64>* %vi, align 4
+;CHECK: vst1.32
+ store <2 x i64> %v1, <2 x i64>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.32
+ %v1 = load <4 x float>* %vi, align 4
+;CHECK: vst1.32
+ store <4 x float> %v1, <4 x float>* %vo, align 4
+ ret void
+}
+
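Note: the ;ALIGN/;SIZE/;TYPE headers suggest this file is template-generated; the declared alignment selects the expected instruction, roughly vld1.8/vst1.8 for align 1, vld1.16/vst1.16 for align 2, and vldr/vstr (64-bit vectors) or vld1.32/vst1.32 (128-bit vectors) for align 4.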
diff --git a/test/CodeGen/ARM/vbsl-constant.ll b/test/CodeGen/ARM/vbsl-constant.ll
index f157dbdb970c..ffda0a51bdd0 100644
--- a/test/CodeGen/ARM/vbsl-constant.ll
+++ b/test/CodeGen/ARM/vbsl-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+neon | FileCheck %s
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: v_bsli8:
@@ -59,8 +59,8 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK: v_bslQi8:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
;CHECK: vbsl
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
@@ -73,8 +73,8 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK: v_bslQi16:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
;CHECK: vbsl
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
@@ -87,8 +87,8 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwin
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK: v_bslQi32:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
;CHECK: vbsl
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
@@ -101,9 +101,9 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwin
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
;CHECK: v_bslQi64:
-;CHECK: vldmia
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
+;CHECK: vld1.64
;CHECK: vbsl
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll
index 9f3bb4e1030c..750fb0de5383 100644
--- a/test/CodeGen/ARM/vbsl.ll
+++ b/test/CodeGen/ARM/vbsl.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; rdar://12471808
+
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: v_bsli8:
;CHECK: vbsl
@@ -103,3 +105,98 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwin
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}
+
+define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
+; CHECK: f1:
+; CHECK: vbsl
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
+ ret <8 x i8> %vbsl.i
+}
+
+define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK: f2:
+; CHECK: vbsl
+ %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
+ ret <4 x i16> %vbsl3.i
+}
+
+define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK: f3:
+; CHECK: vbsl
+ %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
+ ret <2 x i32> %vbsl3.i
+}
+
+define <2 x float> @f4(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind readnone optsize ssp {
+; CHECK: f4:
+; CHECK: vbsl
+ %vbsl4.i = tail call <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind
+ ret <2 x float> %vbsl4.i
+}
+
+define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
+; CHECK: g1:
+; CHECK: vbsl
+ %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
+ ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
+; CHECK: g2:
+; CHECK: vbsl
+ %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
+ ret <8 x i16> %vbsl3.i
+}
+
+define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK: g3:
+; CHECK: vbsl
+ %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
+ ret <4 x i32> %vbsl3.i
+}
+
+define <4 x float> @g4(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind readnone optsize ssp {
+; CHECK: g4:
+; CHECK: vbsl
+ %vbsl4.i = tail call <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) nounwind
+ ret <4 x float> %vbsl4.i
+}
+
+define <1 x i64> @test_vbsl_s64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbsl_s64:
+; CHECK: vbsl d
+ %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
+ ret <1 x i64> %vbsl3.i
+}
+
+define <1 x i64> @test_vbsl_u64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbsl_u64:
+; CHECK: vbsl d
+ %vbsl3.i = tail call <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64> %a, <1 x i64> %b, <1 x i64> %c) nounwind
+ ret <1 x i64> %vbsl3.i
+}
+
+define <2 x i64> @test_vbslq_s64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbslq_s64:
+; CHECK: vbsl q
+ %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
+ ret <2 x i64> %vbsl3.i
+}
+
+define <2 x i64> @test_vbslq_u64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp {
+; CHECK: test_vbslq_u64:
+; CHECK: vbsl q
+ %vbsl3.i = tail call <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c) nounwind
+ ret <2 x i64> %vbsl3.i
+}
+
+declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
+declare <2 x float> @llvm.arm.neon.vbsl.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vbsl.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vbsl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+declare <1 x i64> @llvm.arm.neon.vbsl.v1i64(<1 x i64>, <1 x i64>, <1 x i64>) nounwind readnone
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 05332e4d8c5b..2cf94d63ca14 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -261,3 +261,73 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind {
store <8 x i8> %2, <8 x i8>* %ptr, align 8
ret void
}
+
+define <4 x i32> @tdupi(i32 %x, i32 %y) {
+;CHECK: tdupi
+;CHECK: vdup.32
+ %1 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %x, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %x, i32 2
+ %4 = insertelement <4 x i32> %3, i32 %y, i32 3
+ ret <4 x i32> %4
+}
+
+define <4 x float> @tdupf(float %x, float %y) {
+;CHECK: tdupf
+;CHECK: vdup.32
+ %1 = insertelement <4 x float> undef, float %x, i32 0
+ %2 = insertelement <4 x float> %1, float %x, i32 1
+ %3 = insertelement <4 x float> %2, float %x, i32 2
+ %4 = insertelement <4 x float> %3, float %y, i32 3
+ ret <4 x float> %4
+}
+
+; This test checks that when splatting an element from one vector into another,
+; the value isn't moved out to GPRs first.
+define <4 x i32> @tduplane(<4 x i32> %invec) {
+;CHECK: tduplane
+;CHECK-NOT: vmov {{.*}}, d16[1]
+;CHECK: vdup.32 {{.*}}, d16[1]
+ %in = extractelement <4 x i32> %invec, i32 1
+ %1 = insertelement <4 x i32> undef, i32 %in, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %in, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %in, i32 2
+ %4 = insertelement <4 x i32> %3, i32 255, i32 3
+ ret <4 x i32> %4
+}
+
+define <2 x float> @check_f32(<4 x float> %v) nounwind {
+;CHECK: check_f32:
+;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+ %x = extractelement <4 x float> %v, i32 3
+ %1 = insertelement <2 x float> undef, float %x, i32 0
+ %2 = insertelement <2 x float> %1, float %x, i32 1
+ ret <2 x float> %2
+}
+
+define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
+;CHECK: check_i32:
+;CHECK: vdup.32 {{.*}}, d{{..}}[1]
+ %x = extractelement <4 x i32> %v, i32 3
+ %1 = insertelement <2 x i32> undef, i32 %x, i32 0
+ %2 = insertelement <2 x i32> %1, i32 %x, i32 1
+ ret <2 x i32> %2
+}
+
+define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
+;CHECK: check_i16:
+;CHECK: vdup.16 {{.*}}, d{{..}}[3]
+ %x = extractelement <8 x i16> %v, i32 3
+ %1 = insertelement <4 x i16> undef, i16 %x, i32 0
+ %2 = insertelement <4 x i16> %1, i16 %x, i32 1
+ ret <4 x i16> %2
+}
+
+define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
+;CHECK: check_i8:
+;CHECK: vdup.8 {{.*}}, d{{..}}[3]
+ %x = extractelement <16 x i8> %v, i32 3
+ %1 = insertelement <8 x i8> undef, i8 %x, i32 0
+ %2 = insertelement <8 x i8> %1, i8 %x, i32 1
+ ret <8 x i8> %2
+}
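Note: the check_* functions verify that extracting a lane from a wider vector and rebroadcasting it into a narrower one is matched as a single lane-indexed vdup of the appropriate element size.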
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index 8fd3db29197e..22af79762128 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -62,3 +62,14 @@ define <4 x i8> @i(<4 x i8>* %x) {
%2 = sdiv <4 x i8> zeroinitializer, %1
ret <4 x i8> %2
}
+; CHECK: j:
+define <4 x i32> @j(<4 x i8>* %in) nounwind {
+ ; CHECK: vld1
+ ; CHECK: vmovl.u8
+ ; CHECK: vmovl.u16
+ ; CHECK-NOT: vand
+ %1 = load <4 x i8>* %in, align 4
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll
index e224bdfe25a5..f404eb8be5b7 100644
--- a/test/CodeGen/ARM/vext.ll
+++ b/test/CodeGen/ARM/vext.ll
@@ -74,6 +74,39 @@ define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind {
ret <16 x i8> %tmp3
}
+define <16 x i8> @test_vextq_undef_op2(<16 x i8> %a) nounwind {
+;CHECK: test_vextq_undef_op2:
+;CHECK: vext
+entry:
+ %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @test_vextd_undef_op2(<8 x i8> %a) nounwind {
+;CHECK: test_vextd_undef_op2:
+;CHECK: vext
+entry:
+ %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
+ ret <8 x i8> %tmp1
+}
+
+
+define <16 x i8> @test_vextq_undef_op2_undef(<16 x i8> %a) nounwind {
+;CHECK: test_vextq_undef_op2_undef:
+;CHECK: vext
+entry:
+ %tmp1 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @test_vextd_undef_op2_undef(<8 x i8> %a) nounwind {
+;CHECK: test_vextd_undef_op2_undef:
+;CHECK: vext
+entry:
+ %tmp1 = shufflevector <8 x i8> %a, <8 x i8> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 1>
+ ret <8 x i8> %tmp1
+}
+
; Tests for the ReconstructShuffle function. Indices have to be carefully
; chosen to reach the lowering phase as a BUILD_VECTOR.
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index 1fc885d61372..c9ce3b7450b6 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
-;CHECK: vmov.32
+;CHECK: vmov.32 d{{.*}}[1], r1
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
diff --git a/test/CodeGen/ARM/vselect_imax.ll b/test/CodeGen/ARM/vselect_imax.ll
new file mode 100644
index 000000000000..f5994046de4b
--- /dev/null
+++ b/test/CodeGen/ARM/vselect_imax.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; Make sure that the ARM backend with NEON handles vselect.
+
+define void @vmax_v4i32(<4 x i32>* %m, <4 x i32> %a, <4 x i32> %b) {
+; CHECK: vcgt.s32 [[QR:q[0-9]+]], [[Q1:q[0-9]+]], [[Q2:q[0-9]+]]
+; CHECK: vbsl [[QR]], [[Q1]], [[Q2]]
+ %cmpres = icmp sgt <4 x i32> %a, %b
+ %maxres = select <4 x i1> %cmpres, <4 x i32> %a, <4 x i32> %b
+ store <4 x i32> %maxres, <4 x i32>* %m
+ ret void
+}
+
diff --git a/test/CodeGen/CellSPU/icmp16.ll b/test/CodeGen/CellSPU/icmp16.ll
index 2f9b091faea3..853ae1db160f 100644
--- a/test/CodeGen/CellSPU/icmp16.ll
+++ b/test/CodeGen/CellSPU/icmp16.ll
@@ -534,7 +534,7 @@ entry:
define i16 @icmp_slt_immed04_i16(i16 %arg1, i16 %val1, i16 %val2) nounwind {
; CHECK: icmp_slt_immed04_i16:
; CHECK: lr
-; CHECK-NETX: bi
+; CHECK-NEXT: bi
entry:
%A = icmp slt i16 %arg1, 32768
@@ -559,7 +559,7 @@ define i1 @icmp_sle_setcc_i16(i16 %arg1, i16 %arg2, i16 %val1, i16 %val2) nounwi
; CHECK: ilhu
; CHECK: xorhi
; CHECK: iohl
-; CHECK-NETX: bi
+; CHECK: bi
entry:
%A = icmp sle i16 %arg1, %arg2
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
new file mode 100644
index 000000000000..802ee2cb0558
--- /dev/null
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure we have the correct weight attached to each successor.
+define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Machine code for function test2:
+entry:
+ %conv = sext i32 %x to i64
+ switch i64 %conv, label %return [
+ i64 0, label %sw.bb
+ i64 1, label %sw.bb
+ i64 4, label %sw.bb
+ i64 5, label %sw.bb1
+ ], !prof !0
+; CHECK: BB#0: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#2(64) BB#4(14)
+; CHECK: BB#4: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(10) BB#5(4)
+; CHECK: BB#5: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(4) BB#3(7)
+
+sw.bb:
+ br label %return
+
+sw.bb1:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index e9ac8b67493e..8a6efb620ec0 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
; CHECK: r[[T0:[0-9]+]] = #7
; CHECK: memw(r29 + #0) = r[[T0]]
+; CHECK: r5 = #6
; CHECK: r0 = #1
; CHECK: r1 = #2
; CHECK: r2 = #3
; CHECK: r3 = #4
; CHECK: r4 = #5
-; CHECK: r5 = #6
define void @foo() nounwind {
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
index ab69b22df57c..186e39378854 100644
--- a/test/CodeGen/Hexagon/newvaluestore.ll
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
; Check that we generate a new-value store packet in V4.
@i = global i32 0, align 4
diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll
new file mode 100644
index 000000000000..79b5f4ae7c43
--- /dev/null
+++ b/test/CodeGen/Hexagon/remove_lsr.ll
@@ -0,0 +1,80 @@
+; Test fix for PR13709.
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: foo
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+
+; Convert the sequence
+; r17:16 = lsr(r11:10, #32)
+; .. = r16
+; into
+; r17:16 = lsr(r11:10, #32)
+; .. = r11
+; This makes the lsr instruction dead, and it is subsequently removed by a
+; dead-code removal pass.
+
+%union.vect64 = type { i64 }
+%union.vect32 = type { i32 }
+
+define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr,
+ %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd,
+ i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr)
+ nounwind {
+entry:
+ %scevgep = getelementptr %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1
+ %scevgep28 = getelementptr %union.vect32* %s_odd, i32 1
+ %scevgep32 = getelementptr %union.vect32* %s_even, i32 1
+ %scevgep36 = getelementptr i8* %scr_s_odd_code_ptr, i32 1
+ %scevgep39 = getelementptr i8* %scr_s_even_code_ptr, i32 1
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ]
+ %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ]
+ %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ]
+ %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ]
+ %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ]
+ %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ]
+ %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ]
+ %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ]
+ %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ]
+ %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32*
+ %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32*
+ %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64*
+ %0 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021)
+ %conv3 = sext i8 %predicate.022 to i32
+ %1 = trunc i64 %val.021 to i32
+ %2 = trunc i64 %0 to i32
+ %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2)
+ store i32 %3, i32* %lsr.iv3335, align 4, !tbaa !0
+ %conv8 = sext i8 %predicate_1.023 to i32
+ %4 = lshr i64 %val.021, 32
+ %5 = trunc i64 %4 to i32
+ %6 = lshr i64 %0, 32
+ %7 = trunc i64 %6 to i32
+ %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7)
+ store i32 %8, i32* %lsr.iv2931, align 4, !tbaa !0
+ %srcval = load i64* %lsr.iv27, align 8
+ %9 = load i8* %lsr.iv40, align 1, !tbaa !1
+ %10 = load i8* %lsr.iv37, align 1, !tbaa !1
+ %lftr.wideiv = trunc i32 %lsr.iv42 to i8
+ %exitcond = icmp eq i8 %lftr.wideiv, 32
+ %scevgep26 = getelementptr %union.vect64* %lsr.iv, i32 1
+ %scevgep30 = getelementptr %union.vect32* %lsr.iv29, i32 1
+ %scevgep34 = getelementptr %union.vect32* %lsr.iv33, i32 1
+ %scevgep38 = getelementptr i8* %lsr.iv37, i32 1
+ %scevgep41 = getelementptr i8* %lsr.iv40, i32 1
+ %lsr.iv.next = add i32 %lsr.iv42, 1
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone
+
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index 2e4ab633e415..683a4c21bcb8 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s
@num = external global i32
@acc = external global i32
diff --git a/test/CodeGen/MSP430/fp.ll b/test/CodeGen/MSP430/fp.ll
new file mode 100644
index 000000000000..c3273eff05cb
--- /dev/null
+++ b/test/CodeGen/MSP430/fp.ll
@@ -0,0 +1,17 @@
+; RUN: llc -O0 -disable-fp-elim < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+target triple = "msp430---elf"
+
+define void @fp() nounwind {
+entry:
+; CHECK: fp:
+; CHECK: push.w r4
+; CHECK: mov.w r1, r4
+; CHECK: sub.w #2, r1
+ %i = alloca i16, align 2
+; CHECK: mov.w #0, -2(r4)
+ store i16 0, i16* %i, align 2
+; CHECK: pop.w r4
+ ret void
+}
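Note: with -disable-fp-elim the prologue checked above establishes r4 as the frame pointer (push.w r4; mov.w r1, r4), sub.w #2, r1 reserves the two bytes for %i, the zero store then addresses the slot as -2(r4), and pop.w r4 restores the frame pointer on return.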
diff --git a/test/CodeGen/Mips/alloca16.ll b/test/CodeGen/Mips/alloca16.ll
new file mode 100644
index 000000000000..731edae43cbb
--- /dev/null
+++ b/test/CodeGen/Mips/alloca16.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 25, align 4
+@jjjj = global i32 35, align 4
+@kkkk = global i32 100, align 4
+@t = global i32 25, align 4
+@riii = common global i32 0, align 4
+@rjjj = common global i32 0, align 4
+@rkkk = common global i32 0, align 4
+
+define void @temp(i32 %foo) nounwind {
+entry:
+ %foo.addr = alloca i32, align 4
+ store i32 %foo, i32* %foo.addr, align 4
+ %0 = load i32* %foo.addr, align 4
+ store i32 %0, i32* @t, align 4
+ ret void
+}
+
+define void @test() nounwind {
+entry:
+; 16: .frame $16,24,$ra
+; 16: save $ra, $s0, $s1, 24
+; 16: move $16, $sp
+; 16: move ${{[0-9]+}}, $sp
+; 16: subu $[[REGISTER:[0-9]+]], ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move $sp, $[[REGISTER]]
+ %sssi = alloca i32, align 4
+ %ip = alloca i32*, align 4
+ %sssj = alloca i32, align 4
+ %0 = load i32* @iiii, align 4
+ store i32 %0, i32* %sssi, align 4
+ %1 = load i32* @kkkk, align 4
+ %mul = mul nsw i32 %1, 100
+ %2 = alloca i8, i32 %mul
+ %3 = bitcast i8* %2 to i32*
+ store i32* %3, i32** %ip, align 4
+ %4 = load i32* @jjjj, align 4
+ store i32 %4, i32* %sssj, align 4
+ %5 = load i32* @jjjj, align 4
+ %6 = load i32* @iiii, align 4
+ %7 = load i32** %ip, align 4
+ %arrayidx = getelementptr inbounds i32* %7, i32 %6
+ store i32 %5, i32* %arrayidx, align 4
+ %8 = load i32* @kkkk, align 4
+ %9 = load i32* @jjjj, align 4
+ %10 = load i32** %ip, align 4
+ %arrayidx1 = getelementptr inbounds i32* %10, i32 %9
+ store i32 %8, i32* %arrayidx1, align 4
+ %11 = load i32* @iiii, align 4
+ %12 = load i32* @kkkk, align 4
+ %13 = load i32** %ip, align 4
+ %arrayidx2 = getelementptr inbounds i32* %13, i32 %12
+ store i32 %11, i32* %arrayidx2, align 4
+ %14 = load i32** %ip, align 4
+ %arrayidx3 = getelementptr inbounds i32* %14, i32 25
+ %15 = load i32* %arrayidx3, align 4
+ store i32 %15, i32* @riii, align 4
+ %16 = load i32** %ip, align 4
+ %arrayidx4 = getelementptr inbounds i32* %16, i32 35
+ %17 = load i32* %arrayidx4, align 4
+ store i32 %17, i32* @rjjj, align 4
+ %18 = load i32** %ip, align 4
+ %arrayidx5 = getelementptr inbounds i32* %18, i32 100
+ %19 = load i32* %arrayidx5, align 4
+ store i32 %19, i32* @rkkk, align 4
+ %20 = load i32* @t, align 4
+ %21 = load i32** %ip, align 4
+ %arrayidx6 = getelementptr inbounds i32* %21, i32 %20
+ %22 = load i32* %arrayidx6, align 4
+; 16: save 16
+ call void @temp(i32 %22)
+; 16: restore 16
+ ret void
+}
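Note: the variable-sized alloca of %mul bytes is what forces the frame-pointer setup checked above: $16 is copied from $sp to serve as the frame pointer, and the dynamic allocation is carved out by the subu/move $sp sequence.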
diff --git a/test/CodeGen/Mips/atomic.ll b/test/CodeGen/Mips/atomic.ll
index 050689dcea6c..819f258c2a40 100644
--- a/test/CodeGen/Mips/atomic.ll
+++ b/test/CodeGen/Mips/atomic.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel --disable-machine-licm < %s | FileCheck %s
@x = common global i32 0, align 4
@@ -181,8 +181,9 @@ entry:
; CHECK: $[[BB0:[A-Z_0-9]+]]:
; CHECK: ll $[[R10:[0-9]+]], 0($[[R2]])
+; CHECK: and $[[R18:[0-9]+]], $[[R9]], $[[R6]]
; CHECK: and $[[R13:[0-9]+]], $[[R10]], $[[R7]]
-; CHECK: or $[[R14:[0-9]+]], $[[R13]], $[[R9]]
+; CHECK: or $[[R14:[0-9]+]], $[[R13]], $[[R18]]
; CHECK: sc $[[R14]], 0($[[R2]])
; CHECK: beq $[[R14]], $zero, $[[BB0]]
diff --git a/test/CodeGen/Mips/atomicops.ll b/test/CodeGen/Mips/atomicops.ll
new file mode 100644
index 000000000000..b9c3804e0d72
--- /dev/null
+++ b/test/CodeGen/Mips/atomicops.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@.str = private unnamed_addr constant [8 x i8] c"%d, %d\0A\00", align 1
+
+define i32 @foo(i32* %mem, i32 %val, i32 %c) nounwind {
+entry:
+ %0 = atomicrmw add i32* %mem, i32 %val seq_cst
+ %add = add nsw i32 %0, %c
+ ret i32 %add
+; 16: foo:
+; 16: lw ${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
+; 16: lw ${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
+}
+
+define i32 @main() nounwind {
+entry:
+ %x = alloca i32, align 4
+ store volatile i32 0, i32* %x, align 4
+ %0 = atomicrmw add i32* %x, i32 1 seq_cst
+ %add.i = add nsw i32 %0, 2
+ %1 = load volatile i32* %x, align 4
+ %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %add.i, i32 %1) nounwind
+ %2 = cmpxchg i32* %x, i32 1, i32 2 seq_cst
+ %3 = load volatile i32* %x, align 4
+ %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %2, i32 %3) nounwind
+ %4 = atomicrmw xchg i32* %x, i32 1 seq_cst
+ %5 = load volatile i32* %x, align 4
+ %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 %4, i32 %5) nounwind
+; 16: main:
+; 16: lw ${{[0-9]+}}, %call16(__sync_synchronize)(${{[0-9]+}})
+; 16: lw ${{[0-9]+}}, %call16(__sync_fetch_and_add_4)(${{[0-9]+}})
+; 16: lw ${{[0-9]+}}, %call16(__sync_val_compare_and_swap_4)(${{[0-9]+}})
+; 16: lw ${{[0-9]+}}, %call16(__sync_lock_test_and_set_4)(${{[0-9]+}})
+
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+
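Note: the checks expect calls through __sync_synchronize, __sync_fetch_and_add_4, __sync_val_compare_and_swap_4 and __sync_lock_test_and_set_4, i.e. under -mcpu=mips16 the atomicrmw, cmpxchg and xchg operations are lowered to __sync_* library calls rather than inline ll/sc sequences, presumably because MIPS16 mode lacks ll/sc.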
diff --git a/test/CodeGen/Mips/brconeq.ll b/test/CodeGen/Mips/brconeq.ll
new file mode 100644
index 000000000000..613391557efd
--- /dev/null
+++ b/test/CodeGen/Mips/brconeq.ll
@@ -0,0 +1,38 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp eq i32 %0, %1
+; 16: cmp ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/test/CodeGen/Mips/brconeqk.ll b/test/CodeGen/Mips/brconeqk.ll
new file mode 100644
index 000000000000..2c0e72dabd29
--- /dev/null
+++ b/test/CodeGen/Mips/brconeqk.ll
@@ -0,0 +1,22 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 10
+ br i1 %cmp, label %if.end, label %if.then
+; 16: cmpi ${{[0-9]+}}, {{[0-9]+}}
+; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconeqz.ll b/test/CodeGen/Mips/brconeqz.ll
new file mode 100644
index 000000000000..5586e7b976da
--- /dev/null
+++ b/test/CodeGen/Mips/brconeqz.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.end, label %if.then
+; 16: beqz ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/brconge.ll b/test/CodeGen/Mips/brconge.ll
new file mode 100644
index 000000000000..02f0a633b313
--- /dev/null
+++ b/test/CodeGen/Mips/brconge.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@k = global i32 5, align 4
+@result1 = global i32 0, align 4
+@result2 = global i32 1, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %if.then, label %if.end
+
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %2 = load i32* @k, align 4
+ %cmp1 = icmp slt i32 %0, %2
+ br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2: ; preds = %if.end
+ store i32 1, i32* @result1, align 4
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %if.end
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brcongt.ll b/test/CodeGen/Mips/brcongt.ll
new file mode 100644
index 000000000000..767b51b21b91
--- /dev/null
+++ b/test/CodeGen/Mips/brcongt.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@k = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp sgt i32 %0, %1
+ br i1 %cmp, label %if.end, label %if.then
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconle.ll b/test/CodeGen/Mips/brconle.ll
new file mode 100644
index 000000000000..854b2481c6e6
--- /dev/null
+++ b/test/CodeGen/Mips/brconle.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 -5, align 4
+@j = global i32 10, align 4
+@k = global i32 -5, align 4
+@result1 = global i32 0, align 4
+@result2 = global i32 1, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @i, align 4
+ %cmp = icmp sgt i32 %0, %1
+ br i1 %cmp, label %if.then, label %if.end
+
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result1, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %2 = load i32* @k, align 4
+ %cmp1 = icmp sgt i32 %1, %2
+ br i1 %cmp1, label %if.then2, label %if.end3
+
+if.then2: ; preds = %if.end
+ store i32 0, i32* @result1, align 4
+ br label %if.end3
+
+if.end3: ; preds = %if.then2, %if.end
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconlt.ll b/test/CodeGen/Mips/brconlt.ll
new file mode 100644
index 000000000000..931a3e8c7ba4
--- /dev/null
+++ b/test/CodeGen/Mips/brconlt.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 10, align 4
+@k = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @i, align 4
+ %cmp = icmp slt i32 %0, %1
+ br i1 %cmp, label %if.end, label %if.then
+
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: $[[LABEL]]:
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconne.ll b/test/CodeGen/Mips/brconne.ll
new file mode 100644
index 000000000000..5d5bde3fcf91
--- /dev/null
+++ b/test/CodeGen/Mips/brconne.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 5, align 4
+@j = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, %1
+ br i1 %cmp, label %if.then, label %if.end
+; 16: cmp ${{[0-9]+}}, ${{[0-9]+}}
+; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
+; 16: $[[LABEL]]:
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconnek.ll b/test/CodeGen/Mips/brconnek.ll
new file mode 100644
index 000000000000..6208d7c5a04b
--- /dev/null
+++ b/test/CodeGen/Mips/brconnek.ll
@@ -0,0 +1,25 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %cmp = icmp eq i32 %0, 5
+ br i1 %cmp, label %if.then, label %if.end
+
+; 16: cmpi ${{[0-9]+}}, {{[0-9]+}}
+; 16: btnez $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
+; 16: $[[LABEL]]:
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brconnez.ll b/test/CodeGen/Mips/brconnez.ll
new file mode 100644
index 000000000000..47db7901b517
--- /dev/null
+++ b/test/CodeGen/Mips/brconnez.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 0, align 4
+@result = global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %cmp = icmp eq i32 %0, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]]
+; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
+; 16: $[[LABEL]]:
+
+if.then: ; preds = %entry
+ store i32 1, i32* @result, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index b266ce61a8d1..2fdb736dc886 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -1,15 +1,37 @@
-; RUN: llc -march=mipsel -enable-mips-delay-filler < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -check-prefix=None
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=Default
define void @foo1() nounwind {
entry:
-; CHECK: jalr
-; CHECK-NOT: nop
-; CHECK: jr
-; CHECK-NOT: nop
-; CHECK: .end
+; Default: jalr
+; Default-NOT: nop
+; Default: jr
+; Default-NOT: nop
+; Default: .end
+; None: jalr
+; None: nop
+; None: jr
+; None: nop
+; None: .end
tail call void @foo2(i32 3) nounwind
ret void
}
declare void @foo2(i32)
+
+; Check that cvt.d.w goes into jalr's delay slot.
+;
+define void @foo3(i32 %a) nounwind {
+entry:
+; Default: foo3:
+; Default: jalr
+; Default: cvt.d.w
+
+ %conv = sitofp i32 %a to double
+ tail call void @foo4(double %conv) nounwind
+ ret void
+}
+
+declare void @foo4(double)
+
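Note: on MIPS the instruction in a branch delay slot executes before control transfers, so placing the argument conversion cvt.d.w in jalr's delay slot, as foo3 checks, is both correct and fills a slot that would otherwise be a nop.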
diff --git a/test/CodeGen/Mips/brind.ll b/test/CodeGen/Mips/brind.ll
new file mode 100644
index 000000000000..4c591fa1bba1
--- /dev/null
+++ b/test/CodeGen/Mips/brind.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@main.L = internal unnamed_addr constant [5 x i8*] [i8* blockaddress(@main, %L1), i8* blockaddress(@main, %L2), i8* blockaddress(@main, %L3), i8* blockaddress(@main, %L4), i8* null], align 4
+@str = private unnamed_addr constant [2 x i8] c"A\00"
+@str5 = private unnamed_addr constant [2 x i8] c"B\00"
+@str6 = private unnamed_addr constant [2 x i8] c"C\00"
+@str7 = private unnamed_addr constant [2 x i8] c"D\00"
+@str8 = private unnamed_addr constant [2 x i8] c"E\00"
+
+define i32 @main() nounwind {
+entry:
+ %puts = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str, i32 0, i32 0))
+ br label %L1
+
+L1: ; preds = %entry, %L3
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %L3 ]
+ %puts5 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str5, i32 0, i32 0))
+ br label %L2
+
+L2: ; preds = %L1, %L3
+ %i.1 = phi i32 [ %i.0, %L1 ], [ %inc, %L3 ]
+ %puts6 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str6, i32 0, i32 0))
+ br label %L3
+
+L3: ; preds = %L2, %L3
+ %i.2 = phi i32 [ %i.1, %L2 ], [ %inc, %L3 ]
+ %puts7 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str7, i32 0, i32 0))
+ %inc = add i32 %i.2, 1
+ %arrayidx = getelementptr inbounds [5 x i8*]* @main.L, i32 0, i32 %i.2
+ %0 = load i8** %arrayidx, align 4
+ indirectbr i8* %0, [label %L1, label %L2, label %L3, label %L4]
+; 16: jrc ${{[0-9]+}}
+L4: ; preds = %L3
+ %puts8 = tail call i32 @puts(i8* getelementptr inbounds ([2 x i8]* @str8, i32 0, i32 0))
+ ret i32 0
+}
+
+declare i32 @puts(i8* nocapture) nounwind
+
+
diff --git a/test/CodeGen/Mips/check-noat.ll b/test/CodeGen/Mips/check-noat.ll
new file mode 100644
index 000000000000..bfeff677b34d
--- /dev/null
+++ b/test/CodeGen/Mips/check-noat.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+define void @f() nounwind readnone {
+entry:
+; CHECK: f:
+; CHECK: .set noat
+; CHECK: .set at
+
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/div.ll b/test/CodeGen/Mips/div.ll
new file mode 100644
index 000000000000..00e2c1927459
--- /dev/null
+++ b/test/CodeGen/Mips/div.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 100, align 4
+@jjjj = global i32 -4, align 4
+@kkkk = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %div = sdiv i32 %0, %1
+; 16: div $zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+ store i32 %div, i32* @kkkk, align 4
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/div_rem.ll b/test/CodeGen/Mips/div_rem.ll
new file mode 100644
index 000000000000..950192eee169
--- /dev/null
+++ b/test/CodeGen/Mips/div_rem.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 -4, align 4
+@kkkk = common global i32 0, align 4
+@llll = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %div = sdiv i32 %0, %1
+ store i32 %div, i32* @kkkk, align 4
+ %rem = srem i32 %0, %1
+; 16: div $zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+; 16: mfhi ${{[0-9]+}}
+ store i32 %rem, i32* @llll, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/divu.ll b/test/CodeGen/Mips/divu.ll
new file mode 100644
index 000000000000..b96a439390ca
--- /dev/null
+++ b/test/CodeGen/Mips/divu.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 100, align 4
+@jjjj = global i32 4, align 4
+@kkkk = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %div = udiv i32 %0, %1
+; 16: divu $zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+ store i32 %div, i32* @kkkk, align 4
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/divu_remu.ll b/test/CodeGen/Mips/divu_remu.ll
new file mode 100644
index 000000000000..a6c1563ac195
--- /dev/null
+++ b/test/CodeGen/Mips/divu_remu.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 4, align 4
+@kkkk = common global i32 0, align 4
+@llll = common global i32 0, align 4
+
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %div = udiv i32 %0, %1
+ store i32 %div, i32* @kkkk, align 4
+ %rem = urem i32 %0, %1
+; 16: divu $zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+; 16: mfhi ${{[0-9]+}}
+ store i32 %rem, i32* @llll, align 4
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/dsp-r1.ll b/test/CodeGen/Mips/dsp-r1.ll
new file mode 100644
index 000000000000..c9dc8cfd0be0
--- /dev/null
+++ b/test/CodeGen/Mips/dsp-r1.ll
@@ -0,0 +1,1241 @@
+; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s
+
+define i32 @test__builtin_mips_extr_w1(i32 %i0, i32, i64 %a0) nounwind {
+entry:
+; CHECK: extr.w
+
+ %1 = tail call i32 @llvm.mips.extr.w(i64 %a0, i32 15)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.w(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: extrv.w
+
+ %1 = tail call i32 @llvm.mips.extr.w(i64 %a0, i32 %a1)
+ ret i32 %1
+}
+
+define i32 @test__builtin_mips_extr_r_w1(i32 %i0, i32, i64 %a0) nounwind {
+entry:
+; CHECK: extr_r.w
+
+ %1 = tail call i32 @llvm.mips.extr.r.w(i64 %a0, i32 15)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.r.w(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_s_h1(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: extrv_s.h
+
+ %1 = tail call i32 @llvm.mips.extr.s.h(i64 %a0, i32 %a1)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.s.h(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_rs_w1(i32 %i0, i32, i64 %a0) nounwind {
+entry:
+; CHECK: extr_rs.w
+
+ %1 = tail call i32 @llvm.mips.extr.rs.w(i64 %a0, i32 15)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.extr.rs.w(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extr_rs_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: extrv_rs.w
+
+ %1 = tail call i32 @llvm.mips.extr.rs.w(i64 %a0, i32 %a1)
+ ret i32 %1
+}
+
+define i32 @test__builtin_mips_extr_s_h2(i32 %i0, i32, i64 %a0) nounwind {
+entry:
+; CHECK: extr_s.h
+
+ %1 = tail call i32 @llvm.mips.extr.s.h(i64 %a0, i32 15)
+ ret i32 %1
+}
+
+define i32 @test__builtin_mips_extr_r_w2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: extrv_r.w
+
+ %1 = tail call i32 @llvm.mips.extr.r.w(i64 %a0, i32 %a1)
+ ret i32 %1
+}
+
+define i32 @test__builtin_mips_extp1(i32 %i0, i32, i64 %a0) nounwind {
+entry:
+; CHECK: extp ${{[0-9]+}}
+
+ %1 = tail call i32 @llvm.mips.extp(i64 %a0, i32 15)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.extp(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extp2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: extpv
+
+ %1 = tail call i32 @llvm.mips.extp(i64 %a0, i32 %a1)
+ ret i32 %1
+}
+
+define i32 @test__builtin_mips_extpdp1(i32 %i0, i32, i64 %a0) nounwind {
+entry:
+; CHECK: extpdp ${{[0-9]+}}
+
+ %1 = tail call i32 @llvm.mips.extpdp(i64 %a0, i32 15)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.extpdp(i64, i32) nounwind
+
+define i32 @test__builtin_mips_extpdp2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: extpdpv
+
+ %1 = tail call i32 @llvm.mips.extpdp(i64 %a0, i32 %a1)
+ ret i32 %1
+}
+
+define i64 @test__builtin_mips_dpau_h_qbl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpau.h.qbl
+
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = bitcast i32 %a2.coerce to <4 x i8>
+ %3 = tail call i64 @llvm.mips.dpau.h.qbl(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpau.h.qbl(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpau_h_qbr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpau.h.qbr
+
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = bitcast i32 %a2.coerce to <4 x i8>
+ %3 = tail call i64 @llvm.mips.dpau.h.qbr(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpau.h.qbr(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpsu_h_qbl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpsu.h.qbl
+
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = bitcast i32 %a2.coerce to <4 x i8>
+ %3 = tail call i64 @llvm.mips.dpsu.h.qbl(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsu.h.qbl(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpsu_h_qbr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpsu.h.qbr
+
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = bitcast i32 %a2.coerce to <4 x i8>
+ %3 = tail call i64 @llvm.mips.dpsu.h.qbr(i64 %a0, <4 x i8> %1, <4 x i8> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsu.h.qbr(i64, <4 x i8>, <4 x i8>) nounwind readnone
+
+define i64 @test__builtin_mips_dpaq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpaq_s.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpaq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpaq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpaq_sa_l_w1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind {
+entry:
+; CHECK: dpaq_sa.l.w
+
+ %1 = tail call i64 @llvm.mips.dpaq.sa.l.w(i64 %a0, i32 %a1, i32 %a2)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.dpaq.sa.l.w(i64, i32, i32) nounwind
+
+define i64 @test__builtin_mips_dpsq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpsq_s.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpsq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpsq_sa_l_w1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind {
+entry:
+; CHECK: dpsq_sa.l.w
+
+ %1 = tail call i64 @llvm.mips.dpsq.sa.l.w(i64 %a0, i32 %a1, i32 %a2)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.dpsq.sa.l.w(i64, i32, i32) nounwind
+
+define i64 @test__builtin_mips_mulsaq_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: mulsaq_s.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.mulsaq.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.mulsaq.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_s_w_phl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: maq_s.w.phl
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.maq.s.w.phl(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.s.w.phl(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_s_w_phr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: maq_s.w.phr
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.maq.s.w.phr(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.s.w.phr(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_sa_w_phl1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: maq_sa.w.phl
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.maq.sa.w.phl(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.sa.w.phl(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_maq_sa_w_phr1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: maq_sa.w.phr
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.maq.sa.w.phr(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.maq.sa.w.phr(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_shilo1(i32 %i0, i32, i64 %a0) nounwind readnone {
+entry:
+; CHECK: shilo $ac{{[0-9]}}
+
+ %1 = tail call i64 @llvm.mips.shilo(i64 %a0, i32 0)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.shilo(i64, i32) nounwind readnone
+
+define i64 @test__builtin_mips_shilo2(i32 %i0, i32, i64 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shilov
+
+ %1 = tail call i64 @llvm.mips.shilo(i64 %a0, i32 %a1)
+ ret i64 %1
+}
+
+define i64 @test__builtin_mips_mthlip1(i32 %i0, i32, i64 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: mthlip ${{[0-9]+}}
+
+ %1 = tail call i64 @llvm.mips.mthlip(i64 %a0, i32 %a1)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.mthlip(i64, i32) nounwind
+
+define i32 @test__builtin_mips_bposge321(i32 %i0) nounwind readonly {
+entry:
+; CHECK: bposge32 $BB{{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.bposge32()
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.bposge32() nounwind readonly
+
+define i64 @test__builtin_mips_madd1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry:
+; CHECK: madd $ac{{[0-9]}}
+
+ %1 = tail call i64 @llvm.mips.madd(i64 %a0, i32 %a1, i32 %a2)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.madd(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_maddu1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry:
+; CHECK: maddu $ac{{[0-9]}}
+
+ %1 = tail call i64 @llvm.mips.maddu(i64 %a0, i32 %a1, i32 %a2)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.maddu(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_msub1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry:
+; CHECK: msub $ac{{[0-9]}}
+
+ %1 = tail call i64 @llvm.mips.msub(i64 %a0, i32 %a1, i32 %a2)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.msub(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_msubu1(i32 %i0, i32, i64 %a0, i32 %a1, i32 %a2) nounwind readnone {
+entry:
+; CHECK: msubu $ac{{[0-9]}}
+
+ %1 = tail call i64 @llvm.mips.msubu(i64 %a0, i32 %a1, i32 %a2)
+ ret i64 %1
+}
+
+declare i64 @llvm.mips.msubu(i64, i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_mult1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: mult $ac{{[0-9]}}
+
+ %0 = tail call i64 @llvm.mips.mult(i32 %a0, i32 %a1)
+ ret i64 %0
+}
+
+declare i64 @llvm.mips.mult(i32, i32) nounwind readnone
+
+define i64 @test__builtin_mips_multu1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: multu $ac{{[0-9]}}
+
+ %0 = tail call i64 @llvm.mips.multu(i32 %a0, i32 %a1)
+ ret i64 %0
+}
+
+declare i64 @llvm.mips.multu(i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_addq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addq.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.addq.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addq.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_addq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addq_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.addq.s.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addq.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_addq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: addq_s.w
+
+ %0 = tail call i32 @llvm.mips.addq.s.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.addq.s.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_addu_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addu.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.addu.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.addu.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_addu_s_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addu_s.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.addu.s.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.addu.s.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_subq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subq.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.subq.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subq.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_subq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subq_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.subq.s.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subq.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_subq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: subq_s.w
+
+ %0 = tail call i32 @llvm.mips.subq.s.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.subq.s.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_subu_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subu.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.subu.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subu.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_subu_s_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subu_s.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.subu.s.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subu.s.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_addsc1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: addsc ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.addsc(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.addsc(i32, i32) nounwind
+
+define i32 @test__builtin_mips_addwc1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: addwc ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.addwc(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.addwc(i32, i32) nounwind
+
+define i32 @test__builtin_mips_modsub1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: modsub ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.modsub(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.modsub(i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_raddu_w_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: raddu.w.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call i32 @llvm.mips.raddu.w.qb(<4 x i8> %0)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.raddu.w.qb(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_muleu_s_ph_qbl1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: muleu_s.ph.qbl
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.muleu.s.ph.qbl(<4 x i8> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.muleu.s.ph.qbl(<4 x i8>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_muleu_s_ph_qbr1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: muleu_s.ph.qbr
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.muleu.s.ph.qbr(<4 x i8> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.muleu.s.ph.qbr(<4 x i8>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_mulq_rs_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mulq_rs.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.mulq.rs.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mulq.rs.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_muleq_s_w_phl1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: muleq_s.w.phl
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call i32 @llvm.mips.muleq.s.w.phl(<2 x i16> %0, <2 x i16> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.muleq.s.w.phl(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_muleq_s_w_phr1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: muleq_s.w.phr
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call i32 @llvm.mips.muleq.s.w.phr(<2 x i16> %0, <2 x i16> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.muleq.s.w.phr(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_precrq_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: precrq.qb.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <4 x i8> @llvm.mips.precrq.qb.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.precrq.qb.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define { i32 } @test__builtin_mips_precrq_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: precrq.ph.w
+
+ %0 = tail call <2 x i16> @llvm.mips.precrq.ph.w(i32 %a0, i32 %a1)
+ %1 = bitcast <2 x i16> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precrq.ph.w(i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_precrq_rs_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: precrq_rs.ph.w
+
+ %0 = tail call <2 x i16> @llvm.mips.precrq.rs.ph.w(i32 %a0, i32 %a1)
+ %1 = bitcast <2 x i16> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precrq.rs.ph.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_precrqu_s_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: precrqu_s.qb.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <4 x i8> @llvm.mips.precrqu.s.qb.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.precrqu.s.qb.ph(<2 x i16>, <2 x i16>) nounwind
+
+
+define i32 @test__builtin_mips_cmpu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpu.eq.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ tail call void @llvm.mips.cmpu.eq.qb(<4 x i8> %0, <4 x i8> %1)
+ %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %2
+}
+
+declare void @llvm.mips.cmpu.eq.qb(<4 x i8>, <4 x i8>) nounwind
+
+declare i32 @llvm.mips.rddsp(i32) nounwind readonly
+
+define i32 @test__builtin_mips_cmpu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpu.lt.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ tail call void @llvm.mips.cmpu.lt.qb(<4 x i8> %0, <4 x i8> %1)
+ %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %2
+}
+
+declare void @llvm.mips.cmpu.lt.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpu.le.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ tail call void @llvm.mips.cmpu.le.qb(<4 x i8> %0, <4 x i8> %1)
+ %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %2
+}
+
+declare void @llvm.mips.cmpu.le.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgu.eq.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call i32 @llvm.mips.cmpgu.eq.qb(<4 x i8> %0, <4 x i8> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgu.eq.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgu.lt.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call i32 @llvm.mips.cmpgu.lt.qb(<4 x i8> %0, <4 x i8> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgu.lt.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgu.le.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call i32 @llvm.mips.cmpgu.le.qb(<4 x i8> %0, <4 x i8> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgu.le.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmp_eq_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmp.eq.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ tail call void @llvm.mips.cmp.eq.ph(<2 x i16> %0, <2 x i16> %1)
+ %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %2
+}
+
+declare void @llvm.mips.cmp.eq.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_cmp_lt_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmp.lt.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ tail call void @llvm.mips.cmp.lt.ph(<2 x i16> %0, <2 x i16> %1)
+ %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %2
+}
+
+declare void @llvm.mips.cmp.lt.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_cmp_le_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmp.le.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ tail call void @llvm.mips.cmp.le.ph(<2 x i16> %0, <2 x i16> %1)
+ %2 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %2
+}
+
+declare void @llvm.mips.cmp.le.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_pick_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readonly {
+entry:
+; CHECK: pick.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.pick.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.pick.qb(<4 x i8>, <4 x i8>) nounwind readonly
+
+define { i32 } @test__builtin_mips_pick_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readonly {
+entry:
+; CHECK: pick.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.pick.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.pick.ph(<2 x i16>, <2 x i16>) nounwind readonly
+
+define { i32 } @test__builtin_mips_packrl_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: packrl.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.packrl.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.packrl.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define i32 @test__builtin_mips_rddsp1(i32 %i0) nounwind readonly {
+entry:
+; CHECK: rddsp ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %0
+}
+
+define { i32 } @test__builtin_mips_shll_qb1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: shll.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shll.qb(<4 x i8> %0, i32 3)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shll.qb(<4 x i8>, i32) nounwind
+
+define { i32 } @test__builtin_mips_shll_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind {
+entry:
+; CHECK: shllv.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shll.qb(<4 x i8> %0, i32 %a1)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_shll_ph1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: shll.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shll.ph(<2 x i16> %0, i32 7)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shll.ph(<2 x i16>, i32) nounwind
+
+define { i32 } @test__builtin_mips_shll_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind {
+entry:
+; CHECK: shllv.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shll.ph(<2 x i16> %0, i32 %a1)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_shll_s_ph1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: shll_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shll.s.ph(<2 x i16> %0, i32 7)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shll.s.ph(<2 x i16>, i32) nounwind
+
+define { i32 } @test__builtin_mips_shll_s_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind {
+entry:
+; CHECK: shllv_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shll.s.ph(<2 x i16> %0, i32 %a1)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define i32 @test__builtin_mips_shll_s_w1(i32 %i0, i32 %a0) nounwind {
+entry:
+; CHECK: shll_s.w
+
+ %0 = tail call i32 @llvm.mips.shll.s.w(i32 %a0, i32 15)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.shll.s.w(i32, i32) nounwind
+
+define i32 @test__builtin_mips_shll_s_w2(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: shllv_s.w
+
+ %0 = tail call i32 @llvm.mips.shll.s.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+define { i32 } @test__builtin_mips_shrl_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shrl.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shrl.qb(<4 x i8> %0, i32 3)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shrl.qb(<4 x i8>, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shrl_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrlv.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shrl.qb(<4 x i8> %0, i32 %a1)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_shra_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shra.ph(<2 x i16> %0, i32 7)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shra.ph(<2 x i16>, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shra_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shra.ph(<2 x i16> %0, i32 %a1)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_shra_r_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra_r.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shra.r.ph(<2 x i16> %0, i32 7)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shra.r.ph(<2 x i16>, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shra_r_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav_r.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shra.r.ph(<2 x i16> %0, i32 %a1)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define i32 @test__builtin_mips_shra_r_w1(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: shra_r.w
+
+ %0 = tail call i32 @llvm.mips.shra.r.w(i32 %a0, i32 15)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.shra.r.w(i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_shra_r_w2(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav_r.w
+
+ %0 = tail call i32 @llvm.mips.shra.r.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+define { i32 } @test__builtin_mips_absq_s_ph1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: absq_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.absq.s.ph(<2 x i16> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.absq.s.ph(<2 x i16>) nounwind
+
+define i32 @test__builtin_mips_absq_s_w1(i32 %i0, i32 %a0) nounwind {
+entry:
+; CHECK: absq_s.w
+
+ %0 = tail call i32 @llvm.mips.absq.s.w(i32 %a0)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.absq.s.w(i32) nounwind
+
+define i32 @test__builtin_mips_preceq_w_phl1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceq.w.phl
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call i32 @llvm.mips.preceq.w.phl(<2 x i16> %0)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.preceq.w.phl(<2 x i16>) nounwind readnone
+
+define i32 @test__builtin_mips_preceq_w_phr1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceq.w.phr
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call i32 @llvm.mips.preceq.w.phr(<2 x i16> %0)
+ ret i32 %1
+}
+
+declare i32 @llvm.mips.preceq.w.phr(<2 x i16>) nounwind readnone
+
+define { i32 } @test__builtin_mips_precequ_ph_qbl1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbl
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbl(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbl(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_precequ_ph_qbr1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbr
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbr(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbr(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_precequ_ph_qbla1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbla
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbla(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbla(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_precequ_ph_qbra1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: precequ.ph.qbra
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.precequ.ph.qbra(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precequ.ph.qbra(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_preceu_ph_qbl1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbl
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbl(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbl(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_preceu_ph_qbr1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbr
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbr(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbr(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_preceu_ph_qbla1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbla
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbla(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbla(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_preceu_ph_qbra1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: preceu.ph.qbra
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <2 x i16> @llvm.mips.preceu.ph.qbra(<4 x i8> %0)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.preceu.ph.qbra(<4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_repl_qb1(i32 %i0) nounwind readnone {
+entry:
+; CHECK: repl.qb
+
+ %0 = tail call <4 x i8> @llvm.mips.repl.qb(i32 127)
+ %1 = bitcast <4 x i8> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.repl.qb(i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_repl_qb2(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: replv.qb
+
+ %0 = tail call <4 x i8> @llvm.mips.repl.qb(i32 %a0)
+ %1 = bitcast <4 x i8> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_repl_ph1(i32 %i0) nounwind readnone {
+entry:
+; CHECK: repl.ph
+
+ %0 = tail call <2 x i16> @llvm.mips.repl.ph(i32 0)
+ %1 = bitcast <2 x i16> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.repl.ph(i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_repl_ph2(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: replv.ph
+
+ %0 = tail call <2 x i16> @llvm.mips.repl.ph(i32 %a0)
+ %1 = bitcast <2 x i16> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define i32 @test__builtin_mips_bitrev1(i32 %i0, i32 %a0) nounwind readnone {
+entry:
+; CHECK: bitrev ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.bitrev(i32 %a0)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.bitrev(i32) nounwind readnone
+
+define i32 @test__builtin_mips_lbux1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly {
+entry:
+; CHECK: lbux ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.lbux(i8* %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.lbux(i8*, i32) nounwind readonly
+
+define i32 @test__builtin_mips_lhx1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly {
+entry:
+; CHECK: lhx ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.lhx(i8* %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.lhx(i8*, i32) nounwind readonly
+
+define i32 @test__builtin_mips_lwx1(i32 %i0, i8* %a0, i32 %a1) nounwind readonly {
+entry:
+; CHECK: lwx ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.lwx(i8* %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.lwx(i8*, i32) nounwind readonly
+
+define i32 @test__builtin_mips_wrdsp1(i32 %i0, i32 %a0) nounwind {
+entry:
+; CHECK: wrdsp ${{[0-9]+}}
+
+ tail call void @llvm.mips.wrdsp(i32 %a0, i32 31)
+ %0 = tail call i32 @llvm.mips.rddsp(i32 31)
+ ret i32 %0
+}
+
+declare void @llvm.mips.wrdsp(i32, i32) nounwind
diff --git a/test/CodeGen/Mips/dsp-r2.ll b/test/CodeGen/Mips/dsp-r2.ll
new file mode 100644
index 000000000000..631f9e43c23a
--- /dev/null
+++ b/test/CodeGen/Mips/dsp-r2.ll
@@ -0,0 +1,568 @@
+; RUN: llc -march=mipsel -mattr=+dspr2 < %s | FileCheck %s
+
+define i64 @test__builtin_mips_dpa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpa.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+define i64 @test__builtin_mips_dps_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dps.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dps.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dps.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+define i64 @test__builtin_mips_mulsa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: mulsa.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.mulsa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.mulsa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+define i64 @test__builtin_mips_dpax_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpax.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpax.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpax.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+define i64 @test__builtin_mips_dpsx_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind readnone {
+entry:
+; CHECK: dpsx.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpsx.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsx.w.ph(i64, <2 x i16>, <2 x i16>) nounwind readnone
+
+define i64 @test__builtin_mips_dpaqx_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpaqx_s.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpaqx.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpaqx.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpaqx_sa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpaqx_sa.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpaqx.sa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpaqx.sa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpsqx_s_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpsqx_s.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpsqx.s.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsqx.s.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define i64 @test__builtin_mips_dpsqx_sa_w_ph1(i32 %i0, i32, i64 %a0, i32 %a1.coerce, i32 %a2.coerce) nounwind {
+entry:
+; CHECK: dpsqx_sa.w.ph
+
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = bitcast i32 %a2.coerce to <2 x i16>
+ %3 = tail call i64 @llvm.mips.dpsqx.sa.w.ph(i64 %a0, <2 x i16> %1, <2 x i16> %2)
+ ret i64 %3
+}
+
+declare i64 @llvm.mips.dpsqx.sa.w.ph(i64, <2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_addu_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addu.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.addu.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addu.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_addu_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: addu_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.addu.s.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addu.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_mulq_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mulq_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.mulq.s.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mulq.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_subu_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subu.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.subu.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subu.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_subu_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: subu_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.subu.s.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subu.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_cmpgdu_eq_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgdu.eq.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call i32 @llvm.mips.cmpgdu.eq.qb(<4 x i8> %0, <4 x i8> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgdu.eq.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgdu_lt_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgdu.lt.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call i32 @llvm.mips.cmpgdu.lt.qb(<4 x i8> %0, <4 x i8> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgdu.lt.qb(<4 x i8>, <4 x i8>) nounwind
+
+define i32 @test__builtin_mips_cmpgdu_le_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: cmpgdu.le.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call i32 @llvm.mips.cmpgdu.le.qb(<4 x i8> %0, <4 x i8> %1)
+ ret i32 %2
+}
+
+declare i32 @llvm.mips.cmpgdu.le.qb(<4 x i8>, <4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_precr_qb_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: precr.qb.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <4 x i8> @llvm.mips.precr.qb.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.precr.qb.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_precr_sra_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: precr_sra.ph.w
+
+ %0 = tail call <2 x i16> @llvm.mips.precr.sra.ph.w(i32 %a0, i32 %a1, i32 15)
+ %1 = bitcast <2 x i16> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precr.sra.ph.w(i32, i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_precr_sra_r_ph_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: precr_sra_r.ph.w
+
+ %0 = tail call <2 x i16> @llvm.mips.precr.sra.r.ph.w(i32 %a0, i32 %a1, i32 15)
+ %1 = bitcast <2 x i16> %0 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %1, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.precr.sra.r.ph.w(i32, i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shra_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shra.qb(<4 x i8> %0, i32 3)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shra.qb(<4 x i8>, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shra_r_qb1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shra_r.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shra.r.qb(<4 x i8> %0, i32 3)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.shra.r.qb(<4 x i8>, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shra_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shra.qb(<4 x i8> %0, i32 %a1)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_shra_r_qb2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrav_r.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.shra.r.qb(<4 x i8> %0, i32 %a1)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_shrl_ph1(i32 %i0, i32 %a0.coerce) nounwind readnone {
+entry:
+; CHECK: shrl.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shrl.ph(<2 x i16> %0, i32 7)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.shrl.ph(<2 x i16>, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_shrl_ph2(i32 %i0, i32 %a0.coerce, i32 %a1) nounwind readnone {
+entry:
+; CHECK: shrlv.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = tail call <2 x i16> @llvm.mips.shrl.ph(<2 x i16> %0, i32 %a1)
+ %2 = bitcast <2 x i16> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+define { i32 } @test__builtin_mips_absq_s_qb1(i32 %i0, i32 %a0.coerce) nounwind {
+entry:
+; CHECK: absq_s.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = tail call <4 x i8> @llvm.mips.absq.s.qb(<4 x i8> %0)
+ %2 = bitcast <4 x i8> %1 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %2, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.absq.s.qb(<4 x i8>) nounwind
+
+define { i32 } @test__builtin_mips_mul_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mul.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.mul.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mul.ph(<2 x i16>, <2 x i16>) nounwind
+
+define { i32 } @test__builtin_mips_mul_s_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind {
+entry:
+; CHECK: mul_s.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.mul.s.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.mul.s.ph(<2 x i16>, <2 x i16>) nounwind
+
+define i32 @test__builtin_mips_mulq_rs_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: mulq_rs.w
+
+ %0 = tail call i32 @llvm.mips.mulq.rs.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.mulq.rs.w(i32, i32) nounwind
+
+define i32 @test__builtin_mips_mulq_s_w1(i32 %i0, i32 %a0, i32 %a1) nounwind {
+entry:
+; CHECK: mulq_s.w
+
+ %0 = tail call i32 @llvm.mips.mulq.s.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.mulq.s.w(i32, i32) nounwind
+
+define { i32 } @test__builtin_mips_adduh_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: adduh.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.adduh.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.adduh.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_adduh_r_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: adduh_r.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.adduh.r.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.adduh.r.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_subuh_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subuh.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.subuh.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subuh.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_subuh_r_qb1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subuh_r.qb
+
+ %0 = bitcast i32 %a0.coerce to <4 x i8>
+ %1 = bitcast i32 %a1.coerce to <4 x i8>
+ %2 = tail call <4 x i8> @llvm.mips.subuh.r.qb(<4 x i8> %0, <4 x i8> %1)
+ %3 = bitcast <4 x i8> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <4 x i8> @llvm.mips.subuh.r.qb(<4 x i8>, <4 x i8>) nounwind readnone
+
+define { i32 } @test__builtin_mips_addqh_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: addqh.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.addqh.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addqh.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define { i32 } @test__builtin_mips_addqh_r_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: addqh_r.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.addqh.r.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.addqh.r.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define i32 @test__builtin_mips_addqh_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: addqh.w
+
+ %0 = tail call i32 @llvm.mips.addqh.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.addqh.w(i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_addqh_r_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: addqh_r.w
+
+ %0 = tail call i32 @llvm.mips.addqh.r.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.addqh.r.w(i32, i32) nounwind readnone
+
+define { i32 } @test__builtin_mips_subqh_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subqh.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.subqh.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subqh.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define { i32 } @test__builtin_mips_subqh_r_ph1(i32 %i0, i32 %a0.coerce, i32 %a1.coerce) nounwind readnone {
+entry:
+; CHECK: subqh_r.ph
+
+ %0 = bitcast i32 %a0.coerce to <2 x i16>
+ %1 = bitcast i32 %a1.coerce to <2 x i16>
+ %2 = tail call <2 x i16> @llvm.mips.subqh.r.ph(<2 x i16> %0, <2 x i16> %1)
+ %3 = bitcast <2 x i16> %2 to i32
+ %.fca.0.insert = insertvalue { i32 } undef, i32 %3, 0
+ ret { i32 } %.fca.0.insert
+}
+
+declare <2 x i16> @llvm.mips.subqh.r.ph(<2 x i16>, <2 x i16>) nounwind readnone
+
+define i32 @test__builtin_mips_subqh_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: subqh.w
+
+ %0 = tail call i32 @llvm.mips.subqh.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.subqh.w(i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_subqh_r_w1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: subqh_r.w
+
+ %0 = tail call i32 @llvm.mips.subqh.r.w(i32 %a0, i32 %a1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.subqh.r.w(i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_append1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: append ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.append(i32 %a0, i32 %a1, i32 15)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.append(i32, i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_balign1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: balign ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.balign(i32 %a0, i32 %a1, i32 1)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.balign(i32, i32, i32) nounwind readnone
+
+define i32 @test__builtin_mips_prepend1(i32 %i0, i32 %a0, i32 %a1) nounwind readnone {
+entry:
+; CHECK: prepend ${{[0-9]+}}
+
+ %0 = tail call i32 @llvm.mips.prepend(i32 %a0, i32 %a1, i32 15)
+ ret i32 %0
+}
+
+declare i32 @llvm.mips.prepend(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/Mips/eh-dwarf-cfa.ll b/test/CodeGen/Mips/eh-dwarf-cfa.ll
new file mode 100644
index 000000000000..3a21332b5c5a
--- /dev/null
+++ b/test/CodeGen/Mips/eh-dwarf-cfa.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-MIPS64
+
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+define i8* @f1() nounwind {
+entry:
+ %x = alloca [32 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+
+; CHECK: addiu $sp, $sp, -32
+; CHECK: addiu $2, $sp, 32
+}
+
+
+define i8* @f2() nounwind {
+entry:
+ %x = alloca [65536 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+
+; check stack size (65536 + 8)
+; CHECK: lui $[[R0:[a-z0-9]+]], 65535
+; CHECK: addiu $[[R0]], $[[R0]], -8
+; CHECK: addu $sp, $sp, $[[R0]]
+
+; check return value ($sp + stack size)
+; CHECK: lui $[[R1:[a-z0-9]+]], 1
+; CHECK: addu $[[R1]], $sp, $[[R1]]
+; CHECK: addiu $2, $[[R1]], 8
+}
+
+
+define i32 @f3() nounwind {
+entry:
+ %x = alloca [32 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ %1 = ptrtoint i8* %0 to i32
+ %2 = call i8* @llvm.frameaddress(i32 0)
+ %3 = ptrtoint i8* %2 to i32
+ %add = add i32 %1, %3
+ ret i32 %add
+
+; CHECK: addiu $sp, $sp, -40
+
+; check return value ($fp + stack size + $fp)
+; CHECK: addiu $[[R0:[a-z0-9]+]], $fp, 40
+; CHECK: addu $2, $[[R0]], $fp
+}
+
+
+define i8* @f4() nounwind {
+entry:
+ %x = alloca [32 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+
+; CHECK-MIPS64: daddiu $sp, $sp, -32
+; CHECK-MIPS64: daddiu $2, $sp, 32
+}
diff --git a/test/CodeGen/Mips/helloworld.ll b/test/CodeGen/Mips/helloworld.ll
index bee93accd428..aee58b650e7a 100644
--- a/test/CodeGen/Mips/helloworld.ll
+++ b/test/CodeGen/Mips/helloworld.ll
@@ -24,10 +24,10 @@ entry:
; C1: addiu ${{[0-9]+}}, %lo($.str)
; C2: move $25, ${{[0-9]+}}
; C1: move $gp, ${{[0-9]+}}
-; C1: jalr ${{[0-9]+}}
+; C1: jalrc ${{[0-9]+}}
; SR: restore $ra, [[FS]]
; PE: li $2, 0
-; PE: jr $ra
+; PE: jrc $ra
}
diff --git a/test/CodeGen/Mips/i32k.ll b/test/CodeGen/Mips/i32k.ll
new file mode 100644
index 000000000000..c6da8b1ac9a0
--- /dev/null
+++ b/test/CodeGen/Mips/i32k.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16a
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16b
+
+@.str = private unnamed_addr constant [4 x i8] c"%i\0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 1075344593) nounwind
+; 16a: li ${{[0-9]+}}, 29905
+; 16b: li ${{[0-9]+}}, 16408
+ %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 -1075344593) nounwind
+; 16a: li ${{[0-9]+}}, 49127
+; 16b: li ${{[0-9]+}}, 35631
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll
new file mode 100644
index 000000000000..f96ce2647289
--- /dev/null
+++ b/test/CodeGen/Mips/init-array.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck %s
+
+target triple = "mipsel-unknown-linux"
+
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @test }]
+; CHECK: .section
+; CHECK: .init_array
+; CHECK-NOT: .ctors
+; CHECK: .4byte test
+
+define internal void @test() section ".text.startup" {
+entry:
+ ret void
+}
diff --git a/test/CodeGen/Mips/largeimm1.ll b/test/CodeGen/Mips/largeimm1.ll
index d65cc025d085..1c0f69c59011 100644
--- a/test/CodeGen/Mips/largeimm1.ll
+++ b/test/CodeGen/Mips/largeimm1.ll
@@ -1,7 +1,7 @@
; RUN: llc -march=mipsel < %s | FileCheck %s
-; CHECK: lui $at, 49152
-; CHECK: lui $at, 16384
+; CHECK: lui ${{[0-9]+}}, 49152
+; CHECK: lui ${{[0-9]+}}, 16384
define void @f() nounwind {
entry:
%a1 = alloca [1073741824 x i8], align 1
diff --git a/test/CodeGen/Mips/largeimmprinting.ll b/test/CodeGen/Mips/largeimmprinting.ll
index 2e548790cd39..1e96346d1dd7 100644
--- a/test/CodeGen/Mips/largeimmprinting.ll
+++ b/test/CodeGen/Mips/largeimmprinting.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=mipsel -mcpu=mips32r2 < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | \
+; RUN: FileCheck %s -check-prefix=64
%struct.S1 = type { [65536 x i8] }
@@ -6,9 +8,21 @@
define void @f() nounwind {
entry:
-; CHECK: lui $at, 65535
-; CHECK: addiu $at, $at, -16
-; CHECK: addu $sp, $sp, $at
+; 32: lui $[[R0:[0-9]+]], 65535
+; 32: addiu $[[R0]], $[[R0]], -24
+; 32: addu $sp, $sp, $[[R0]]
+; 32: lui $[[R1:[0-9]+]], 1
+; 32: addu $[[R1]], $sp, $[[R1]]
+; 32: sw $ra, 20($[[R1]])
+; 64: daddiu $[[R0:[0-9]+]], $zero, 1
+; 64: dsll $[[R0]], $[[R0]], 48
+; 64: daddiu $[[R0]], $[[R0]], -1
+; 64: dsll $[[R0]], $[[R0]], 16
+; 64: daddiu $[[R0]], $[[R0]], -48
+; 64: daddu $sp, $sp, $[[R0]]
+; 64: lui $[[R1:[0-9]+]], 1
+; 64: daddu $[[R1]], $sp, $[[R1]]
+; 64: sd $ra, 40($[[R1]])
%agg.tmp = alloca %struct.S1, align 1
%tmp = getelementptr inbounds %struct.S1* %agg.tmp, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/Mips/llcarry.ll b/test/CodeGen/Mips/llcarry.ll
new file mode 100644
index 000000000000..7763daec3b32
--- /dev/null
+++ b/test/CodeGen/Mips/llcarry.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i64 4294967295, align 8
+@j = global i64 15, align 8
+@ii = global i64 4294967295, align 8
+@k = common global i64 0, align 8
+@l = common global i64 0, align 8
+@m = common global i64 0, align 8
+
+define void @test1() nounwind {
+entry:
+ %0 = load i64* @i, align 8
+ %1 = load i64* @j, align 8
+ %add = add nsw i64 %1, %0
+ store i64 %add, i64* @k, align 8
+; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $t8
+; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+ ret void
+}
+
+define void @test2() nounwind {
+entry:
+ %0 = load i64* @i, align 8
+ %1 = load i64* @j, align 8
+ %sub = sub nsw i64 %0, %1
+; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $t8
+; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: subu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+ store i64 %sub, i64* @l, align 8
+ ret void
+}
+
+define void @test3() nounwind {
+entry:
+ %0 = load i64* @ii, align 8
+ %add = add nsw i64 %0, 15
+; 16: addiu ${{[0-9]+}}, 15
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $t8
+; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: addu ${{[0-9]+}}, ${{[0-9]+}}, ${{[0-9]+}}
+ store i64 %add, i64* @m, align 8
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 0227b88fbc86..1a4f79c191e1 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -6,9 +6,15 @@
define void @foo1(i32 %s) nounwind {
entry:
; O32: bal
+; O32: lui $1, 0
+; O32: addiu $1, $1, {{[0-9]+}}
+; N64: lui $1, 0
+; N64: daddiu $1, $1, 0
+; N64: dsll $1, $1, 16
+; N64: daddiu $1, $1, 0
; N64: bal
-; N64: highest
-; N64: higher
+; N64: dsll $1, $1, 16
+; N64: daddiu $1, $1, {{[0-9]+}}
%tobool = icmp eq i32 %s, 0
br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll
new file mode 100644
index 000000000000..e26b0223b447
--- /dev/null
+++ b/test/CodeGen/Mips/mips64-sret.ll
@@ -0,0 +1,16 @@
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O3 < %s | FileCheck %s
+
+%struct.S = type { [8 x i32] }
+
+@g = common global %struct.S zeroinitializer, align 4
+
+define void @f(%struct.S* noalias sret %agg.result) nounwind {
+entry:
+; CHECK: daddu $2, $zero, $4
+
+ %0 = bitcast %struct.S* %agg.result to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/Mips/misha.ll b/test/CodeGen/Mips/misha.ll
new file mode 100644
index 000000000000..80637edb1674
--- /dev/null
+++ b/test/CodeGen/Mips/misha.ll
@@ -0,0 +1,69 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+define i32 @sumc(i8* nocapture %to, i8* nocapture %from, i32) nounwind {
+entry:
+ %sext = shl i32 %0, 16
+ %conv = ashr exact i32 %sext, 16
+ %cmp8 = icmp eq i32 %conv, 0
+ br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %.pre = load i8* %to, align 1
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %1 = phi i8 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ]
+ %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %from.addr.09 = phi i8* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+ %incdec.ptr = getelementptr inbounds i8* %from.addr.09, i32 1
+ %2 = load i8* %from.addr.09, align 1
+ %conv27 = zext i8 %2 to i32
+ %conv36 = zext i8 %1 to i32
+ %add = add nsw i32 %conv36, %conv27
+ %conv4 = trunc i32 %add to i8
+ store i8 %conv4, i8* %to, align 1
+ %inc = add nsw i32 %i.010, 1
+ %cmp = icmp eq i32 %inc, %conv
+ br i1 %cmp, label %for.end, label %for.body
+; 16: sumc:
+; 16: lbu ${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: lbu ${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: sum:
+; 16: lhu ${{[0-9]+}}, 0(${{[0-9]+}})
+; 16: lhu ${{[0-9]+}}, 0(${{[0-9]+}})
+
+for.end: ; preds = %for.body, %entry
+ ret i32 undef
+}
+
+define i32 @sum(i16* nocapture %to, i16* nocapture %from, i32) nounwind {
+entry:
+ %sext = shl i32 %0, 16
+ %conv = ashr exact i32 %sext, 16
+ %cmp8 = icmp eq i32 %conv, 0
+ br i1 %cmp8, label %for.end, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ %.pre = load i16* %to, align 2
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %1 = phi i16 [ %.pre, %for.body.lr.ph ], [ %conv4, %for.body ]
+ %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
+ %from.addr.09 = phi i16* [ %from, %for.body.lr.ph ], [ %incdec.ptr, %for.body ]
+ %incdec.ptr = getelementptr inbounds i16* %from.addr.09, i32 1
+ %2 = load i16* %from.addr.09, align 2
+ %conv27 = zext i16 %2 to i32
+ %conv36 = zext i16 %1 to i32
+ %add = add nsw i32 %conv36, %conv27
+ %conv4 = trunc i32 %add to i16
+ store i16 %conv4, i16* %to, align 2
+ %inc = add nsw i32 %i.010, 1
+ %cmp = icmp eq i32 %inc, %conv
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i32 undef
+}
+
+
diff --git a/test/CodeGen/Mips/mul.ll b/test/CodeGen/Mips/mul.ll
new file mode 100644
index 000000000000..4ce801b1c9f4
--- /dev/null
+++ b/test/CodeGen/Mips/mul.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 5, align 4
+@jjjj = global i32 -6, align 4
+@kkkk = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %mul = mul nsw i32 %1, %0
+; 16: mult ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+
+ store i32 %mul, i32* @kkkk, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/mulll.ll b/test/CodeGen/Mips/mulll.ll
new file mode 100644
index 000000000000..e37b9197df82
--- /dev/null
+++ b/test/CodeGen/Mips/mulll.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i64 5, align 8
+@jjjj = global i64 -6, align 8
+@kkkk = common global i64 0, align 8
+
+define void @test() nounwind {
+entry:
+ %0 = load i64* @iiii, align 8
+ %1 = load i64* @jjjj, align 8
+ %mul = mul nsw i64 %1, %0
+ store i64 %mul, i64* @kkkk, align 8
+; 16: multu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mfhi ${{[0-9]+}}
+; 16: mult ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+; 16: mult ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+
+ ret void
+}
diff --git a/test/CodeGen/Mips/mulull.ll b/test/CodeGen/Mips/mulull.ll
new file mode 100644
index 000000000000..4d23c693184b
--- /dev/null
+++ b/test/CodeGen/Mips/mulull.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i64 5, align 8
+@jjjj = global i64 6, align 8
+@kkkk = common global i64 0, align 8
+@.str = private unnamed_addr constant [20 x i8] c"%lld * %lld = %lld\0A\00", align 1
+
+define void @test() nounwind {
+entry:
+ %0 = load i64* @iiii, align 8
+ %1 = load i64* @jjjj, align 8
+ %mul = mul nsw i64 %1, %0
+ store i64 %mul, i64* @kkkk, align 8
+; 16: multu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mfhi ${{[0-9]+}}
+; 16: mult ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+; 16: mult ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mflo ${{[0-9]+}}
+ ret void
+}
diff --git a/test/CodeGen/Mips/null.ll b/test/CodeGen/Mips/null.ll
index 7beae99c4557..00c66a9928f6 100644
--- a/test/CodeGen/Mips/null.ll
+++ b/test/CodeGen/Mips/null.ll
@@ -8,6 +8,6 @@ entry:
; 16: .set mips16 # @main
-; 16: jr $ra
+; 16: jrc $ra
}
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index eac0d80c1c57..5558ba6e10f4 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -119,6 +119,16 @@ entry:
ret void
}
+%struct.S4 = type { [4 x i32] }
+
+define void @f5(i64 %a0, %struct.S4* nocapture byval %a1) nounwind {
+entry:
+ tail call void @f6(%struct.S4* byval %a1, i64 %a0) nounwind
+ ret void
+}
+
+declare void @f6(%struct.S4* nocapture byval, i64)
+
!0 = metadata !{metadata !"int", metadata !1}
!1 = metadata !{metadata !"omnipotent char", metadata !2}
!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Mips/rem.ll b/test/CodeGen/Mips/rem.ll
new file mode 100644
index 000000000000..b18f85dcbecf
--- /dev/null
+++ b/test/CodeGen/Mips/rem.ll
@@ -0,0 +1,19 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 -4, align 4
+@kkkk = common global i32 0, align 4
+
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %rem = srem i32 %0, %1
+; 16: div $zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mfhi ${{[0-9]+}}
+ store i32 %rem, i32* @kkkk, align 4
+ ret void
+}
+
+
diff --git a/test/CodeGen/Mips/remat-immed-load.ll b/test/CodeGen/Mips/remat-immed-load.ll
new file mode 100644
index 000000000000..d93964bcaef6
--- /dev/null
+++ b/test/CodeGen/Mips/remat-immed-load.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=32
+; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 < %s | FileCheck %s -check-prefix=64
+
+define void @f0() nounwind {
+entry:
+; 32: addiu $4, $zero, 1
+; 32: addiu $4, $zero, 1
+
+ tail call void @foo1(i32 1) nounwind
+ tail call void @foo1(i32 1) nounwind
+ ret void
+}
+
+declare void @foo1(i32)
+
+define void @f3() nounwind {
+entry:
+; 64: daddiu $4, $zero, 1
+; 64: daddiu $4, $zero, 1
+
+ tail call void @foo2(i64 1) nounwind
+ tail call void @foo2(i64 1) nounwind
+ ret void
+}
+
+declare void @foo2(i64)
+
+define void @f5() nounwind {
+entry:
+; 32: lui $4, 1
+; 32: lui $4, 1
+
+ tail call void @f6(i32 65536) nounwind
+ tail call void @f6(i32 65536) nounwind
+ ret void
+}
+
+declare void @f6(i32)
+
+define void @f7() nounwind {
+entry:
+; 64: lui $4, 1
+; 64: lui $4, 1
+
+ tail call void @f8(i64 65536) nounwind
+ tail call void @f8(i64 65536) nounwind
+ ret void
+}
+
+declare void @f8(i64)
+
diff --git a/test/CodeGen/Mips/remu.ll b/test/CodeGen/Mips/remu.ll
new file mode 100644
index 000000000000..472503c38403
--- /dev/null
+++ b/test/CodeGen/Mips/remu.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@iiii = global i32 103, align 4
+@jjjj = global i32 4, align 4
+@kkkk = common global i32 0, align 4
+@.str = private unnamed_addr constant [15 x i8] c"%u = %u %% %u\0A\00", align 1
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @iiii, align 4
+ %1 = load i32* @jjjj, align 4
+ %rem = urem i32 %0, %1
+; 16: divu $zero, ${{[0-9]+}}, ${{[0-9]+}}
+; 16: mfhi ${{[0-9]+}}
+ store i32 %rem, i32* @kkkk, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/return-vector.ll b/test/CodeGen/Mips/return-vector.ll
new file mode 100644
index 000000000000..739c43c68a55
--- /dev/null
+++ b/test/CodeGen/Mips/return-vector.ll
@@ -0,0 +1,244 @@
+; RUN: llc -march=mipsel < %s | FileCheck %s
+
+
+; Check that the function accesses the vector return value from the stack when
+; the vector can't be returned in registers. Also check that the caller passes
+; the stack address where the vector should be placed in register $4.
+
+
+declare <8 x i32> @i8(...)
+declare <4 x float> @f4(...)
+declare <4 x double> @d4(...)
+
+define i32 @call_i8() {
+entry:
+ %call = call <8 x i32> (...)* @i8()
+ %v0 = extractelement <8 x i32> %call, i32 0
+ %v1 = extractelement <8 x i32> %call, i32 1
+ %v2 = extractelement <8 x i32> %call, i32 2
+ %v3 = extractelement <8 x i32> %call, i32 3
+ %v4 = extractelement <8 x i32> %call, i32 4
+ %v5 = extractelement <8 x i32> %call, i32 5
+ %v6 = extractelement <8 x i32> %call, i32 6
+ %v7 = extractelement <8 x i32> %call, i32 7
+ %add1 = add i32 %v0, %v1
+ %add2 = add i32 %v2, %v3
+ %add3 = add i32 %v4, %v5
+ %add4 = add i32 %v6, %v7
+ %add5 = add i32 %add1, %add2
+ %add6 = add i32 %add3, %add4
+ %add7 = add i32 %add5, %add6
+ ret i32 %add7
+
+; CHECK: call_i8:
+; CHECK: call16(i8)
+; CHECK: addiu $4, $sp, 32
+; CHECK: lw $[[R0:[a-z0-9]+]], 60($sp)
+; CHECK: lw $[[R1:[a-z0-9]+]], 56($sp)
+; CHECK: lw $[[R2:[a-z0-9]+]], 52($sp)
+; CHECK: lw $[[R3:[a-z0-9]+]], 48($sp)
+; CHECK: lw $[[R4:[a-z0-9]+]], 44($sp)
+; CHECK: lw $[[R5:[a-z0-9]+]], 40($sp)
+; CHECK: lw $[[R6:[a-z0-9]+]], 36($sp)
+; CHECK: lw $[[R7:[a-z0-9]+]], 32($sp)
+}
+
+
+define float @call_f4() {
+entry:
+ %call = call <4 x float> (...)* @f4()
+ %v0 = extractelement <4 x float> %call, i32 0
+ %v1 = extractelement <4 x float> %call, i32 1
+ %v2 = extractelement <4 x float> %call, i32 2
+ %v3 = extractelement <4 x float> %call, i32 3
+ %add1 = fadd float %v0, %v1
+ %add2 = fadd float %v2, %v3
+ %add3 = fadd float %add1, %add2
+ ret float %add3
+
+; CHECK: call_f4:
+; CHECK: call16(f4)
+; CHECK: addiu $4, $sp, 16
+; CHECK: lwc1 $[[R0:[a-z0-9]+]], 28($sp)
+; CHECK: lwc1 $[[R1:[a-z0-9]+]], 24($sp)
+; CHECK: lwc1 $[[R3:[a-z0-9]+]], 20($sp)
+; CHECK: lwc1 $[[R4:[a-z0-9]+]], 16($sp)
+}
+
+
+define double @call_d4() {
+entry:
+ %call = call <4 x double> (...)* @d4()
+ %v0 = extractelement <4 x double> %call, i32 0
+ %v1 = extractelement <4 x double> %call, i32 1
+ %v2 = extractelement <4 x double> %call, i32 2
+ %v3 = extractelement <4 x double> %call, i32 3
+ %add1 = fadd double %v0, %v1
+ %add2 = fadd double %v2, %v3
+ %add3 = fadd double %add1, %add2
+ ret double %add3
+
+; CHECK: call_d4:
+; CHECK: call16(d4)
+; CHECK: addiu $4, $sp, 32
+; CHECK: ldc1 $[[R0:[a-z0-9]+]], 56($sp)
+; CHECK: ldc1 $[[R1:[a-z0-9]+]], 48($sp)
+; CHECK: ldc1 $[[R3:[a-z0-9]+]], 40($sp)
+; CHECK: ldc1 $[[R4:[a-z0-9]+]], 32($sp)
+}
+
+
+
+; Check that the function accesses the vector return value from registers when
+; the vector can be returned in registers.
+
+
+declare <4 x i32> @i4(...)
+declare <2 x float> @f2(...)
+declare <2 x double> @d2(...)
+
+define i32 @call_i4() {
+entry:
+ %call = call <4 x i32> (...)* @i4()
+ %v0 = extractelement <4 x i32> %call, i32 0
+ %v1 = extractelement <4 x i32> %call, i32 1
+ %v2 = extractelement <4 x i32> %call, i32 2
+ %v3 = extractelement <4 x i32> %call, i32 3
+ %add1 = add i32 %v0, %v1
+ %add2 = add i32 %v2, %v3
+ %add3 = add i32 %add1, %add2
+ ret i32 %add3
+
+; CHECK: call_i4:
+; CHECK: call16(i4)
+; CHECK-NOT: lw
+; CHECK: addu $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+; CHECK: addu $[[R5:[a-z0-9]+]], $[[R3:[a-z0-9]+]], $[[R4:[a-z0-9]+]]
+; CHECK: addu $[[R6:[a-z0-9]+]], $[[R5]], $[[R2]]
+}
+
+
+define float @call_f2() {
+entry:
+ %call = call <2 x float> (...)* @f2()
+ %v0 = extractelement <2 x float> %call, i32 0
+ %v1 = extractelement <2 x float> %call, i32 1
+ %add1 = fadd float %v0, %v1
+ ret float %add1
+
+; CHECK: call_f2:
+; CHECK: call16(f2)
+; CHECK-NOT: lwc1
+; CHECK: add.s $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+}
+
+
+define double @call_d2() {
+entry:
+ %call = call <2 x double> (...)* @d2()
+ %v0 = extractelement <2 x double> %call, i32 0
+ %v1 = extractelement <2 x double> %call, i32 1
+ %add1 = fadd double %v0, %v1
+ ret double %add1
+
+; CHECK: call_d2:
+; CHECK: call16(d2)
+; CHECK-NOT: ldc1
+; CHECK: add.d $[[R2:[a-z0-9]+]], $[[R0:[a-z0-9]+]], $[[R1:[a-z0-9]+]]
+}
+
+
+
+; Check that the function returns the vector on the stack when the vector
+; can't be returned in registers. Also check that the vector is placed on the
+; stack starting from the address in register $4.
+
+
+define <8 x i32> @return_i8() {
+entry:
+ ret <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+
+; CHECK: return_i8:
+; CHECK: sw $[[R0:[a-z0-9]+]], 28($4)
+; CHECK: sw $[[R1:[a-z0-9]+]], 24($4)
+; CHECK: sw $[[R2:[a-z0-9]+]], 20($4)
+; CHECK: sw $[[R3:[a-z0-9]+]], 16($4)
+; CHECK: sw $[[R4:[a-z0-9]+]], 12($4)
+; CHECK: sw $[[R5:[a-z0-9]+]], 8($4)
+; CHECK: sw $[[R6:[a-z0-9]+]], 4($4)
+; CHECK: sw $[[R7:[a-z0-9]+]], 0($4)
+}
+
+
+define <4 x float> @return_f4(float %a, float %b, float %c, float %d) {
+entry:
+ %vecins1 = insertelement <4 x float> undef, float %a, i32 0
+ %vecins2 = insertelement <4 x float> %vecins1, float %b, i32 1
+ %vecins3 = insertelement <4 x float> %vecins2, float %c, i32 2
+ %vecins4 = insertelement <4 x float> %vecins3, float %d, i32 3
+ ret <4 x float> %vecins4
+
+; CHECK: return_f4:
+; CHECK: lwc1 $[[R0:[a-z0-9]+]], 16($sp)
+; CHECK: swc1 $[[R0]], 12($4)
+; CHECK: sw $7, 8($4)
+; CHECK: sw $6, 4($4)
+; CHECK: sw $5, 0($4)
+}
+
+
+define <4 x double> @return_d4(double %a, double %b, double %c, double %d) {
+entry:
+ %vecins1 = insertelement <4 x double> undef, double %a, i32 0
+ %vecins2 = insertelement <4 x double> %vecins1, double %b, i32 1
+ %vecins3 = insertelement <4 x double> %vecins2, double %c, i32 2
+ %vecins4 = insertelement <4 x double> %vecins3, double %d, i32 3
+ ret <4 x double> %vecins4
+
+; CHECK: return_d4:
+; CHECK: sdc1 $[[R0:[a-z0-9]+]], 24($4)
+; CHECK: sdc1 $[[R1:[a-z0-9]+]], 16($4)
+; CHECK: sdc1 $[[R2:[a-z0-9]+]], 8($4)
+; CHECK: sdc1 $[[R3:[a-z0-9]+]], 0($4)
+}
+
+
+
+; Check that the function returns the vector in registers when it can be
+; returned in registers.
+
+
+define <4 x i32> @return_i4() {
+entry:
+ ret <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+
+; CHECK: return_i4:
+; CHECK: addiu $2, $zero, 0
+; CHECK: addiu $3, $zero, 1
+; CHECK: addiu $4, $zero, 2
+; CHECK: addiu $5, $zero, 3
+}
+
+
+define <2 x float> @return_f2(float %a, float %b) {
+entry:
+ %vecins1 = insertelement <2 x float> undef, float %a, i32 0
+ %vecins2 = insertelement <2 x float> %vecins1, float %b, i32 1
+ ret <2 x float> %vecins2
+
+; CHECK: return_f2:
+; CHECK: mov.s $f0, $f12
+; CHECK: mov.s $f2, $f14
+}
+
+
+define <2 x double> @return_d2(double %a, double %b) {
+entry:
+ %vecins1 = insertelement <2 x double> undef, double %a, i32 0
+ %vecins2 = insertelement <2 x double> %vecins1, double %b, i32 1
+ ret <2 x double> %vecins2
+
+; CHECK: return_d2:
+; CHECK: mov.d $f0, $f12
+; CHECK: mov.d $f2, $f14
+}
diff --git a/test/CodeGen/Mips/selpat.ll b/test/CodeGen/Mips/selpat.ll
new file mode 100644
index 000000000000..cda0c96ef4be
--- /dev/null
+++ b/test/CodeGen/Mips/selpat.ll
@@ -0,0 +1,350 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@t = global i32 10, align 4
+@f = global i32 199, align 4
+@a = global i32 1, align 4
+@b = global i32 10, align 4
+@c = global i32 1, align 4
+@z1 = common global i32 0, align 4
+@z2 = common global i32 0, align 4
+@z3 = common global i32 0, align 4
+@z4 = common global i32 0, align 4
+
+define void @calc_seleq() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp eq i32 %0, %1
+ %2 = load i32* @f, align 4
+ %3 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: cmp ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ store i32 %cond, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp eq i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %3, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ store i32 %cond10, i32* @z4, align 4
+ ret void
+}
+
+
+define void @calc_seleqk() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp eq i32 %0, 1
+ %1 = load i32* @t, align 4
+ %2 = load i32* @f, align 4
+ %cond = select i1 %cmp, i32 %1, i32 %2
+ store i32 %cond, i32* @z1, align 4
+; 16: cmpi ${{[0-9]+}}, 1
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp eq i32 %0, 10
+ %cond5 = select i1 %cmp1, i32 %2, i32 %1
+ store i32 %cond5, i32* @z2, align 4
+ %3 = load i32* @b, align 4
+ %cmp6 = icmp eq i32 %3, 3
+ %cond10 = select i1 %cmp6, i32 %2, i32 %1
+ store i32 %cond10, i32* @z3, align 4
+; 16: cmpi ${{[0-9]+}}, 10
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp11 = icmp eq i32 %3, 10
+ %cond15 = select i1 %cmp11, i32 %1, i32 %2
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_seleqz() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp eq i32 %0, 0
+ %1 = load i32* @t, align 4
+ %2 = load i32* @f, align 4
+ %cond = select i1 %cmp, i32 %1, i32 %2
+ store i32 %cond, i32* @z1, align 4
+; 16: beqz ${{[0-9]+}}, .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %3 = load i32* @b, align 4
+ %cmp1 = icmp eq i32 %3, 0
+ %cond5 = select i1 %cmp1, i32 %2, i32 %1
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp eq i32 %4, 0
+ %cond10 = select i1 %cmp6, i32 %1, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ store i32 %cond, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selge() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp sge i32 %0, %1
+ %2 = load i32* @f, align 4
+ %3 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp sge i32 %1, %0
+ %cond5 = select i1 %cmp1, i32 %3, i32 %2
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp sge i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %3, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp sge i32 %0, %4
+ %cond15 = select i1 %cmp11, i32 %3, i32 %2
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+define i32 @calc_selgt() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp sgt i32 %0, %1
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: btnez .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %2 = load i32* @f, align 4
+ %3 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+ %cmp1 = icmp sgt i32 %1, %0
+ %cond5 = select i1 %cmp1, i32 %3, i32 %2
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp sgt i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %2, i32 %3
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp sgt i32 %0, %4
+ %cond15 = select i1 %cmp11, i32 %2, i32 %3
+ store i32 %cond15, i32* @z4, align 4
+ ret i32 undef
+}
+
+define void @calc_selle() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp sle i32 %0, %1
+ %2 = load i32* @t, align 4
+ %3 = load i32* @f, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp sle i32 %1, %0
+ %cond5 = select i1 %cmp1, i32 %3, i32 %2
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp sle i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %2, i32 %3
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp sle i32 %0, %4
+ %cond15 = select i1 %cmp11, i32 %2, i32 %3
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selltk() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp slt i32 %0, 10
+ %1 = load i32* @t, align 4
+ %2 = load i32* @f, align 4
+ %cond = select i1 %cmp, i32 %1, i32 %2
+ store i32 %cond, i32* @z1, align 4
+; 16: slti ${{[0-9]+}}, {{[0-9]+}}
+; 16: btnez .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %3 = load i32* @b, align 4
+ %cmp1 = icmp slt i32 %3, 2
+ %cond5 = select i1 %cmp1, i32 %2, i32 %1
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp sgt i32 %4, 2
+ %cond10 = select i1 %cmp6, i32 %2, i32 %1
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp sgt i32 %0, 2
+ %cond15 = select i1 %cmp11, i32 %2, i32 %1
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+
+define void @calc_selne() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp ne i32 %0, %1
+ %2 = load i32* @t, align 4
+ %3 = load i32* @f, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: cmp ${{[0-9]+}}, ${{[0-9]+}}
+; 16: btnez .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ store i32 %cond, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp ne i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %3, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ store i32 %cond10, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selnek() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp ne i32 %0, 1
+ %1 = load i32* @f, align 4
+ %2 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %1, i32 %2
+ store i32 %cond, i32* @z1, align 4
+; 16: cmpi ${{[0-9]+}}, 1
+; 16: btnez .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp ne i32 %0, 10
+ %cond5 = select i1 %cmp1, i32 %2, i32 %1
+ store i32 %cond5, i32* @z2, align 4
+ %3 = load i32* @b, align 4
+ %cmp6 = icmp ne i32 %3, 3
+ %cond10 = select i1 %cmp6, i32 %2, i32 %1
+ store i32 %cond10, i32* @z3, align 4
+; 16: cmpi ${{[0-9]+}}, 10
+; 16: btnez .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp11 = icmp ne i32 %3, 10
+ %cond15 = select i1 %cmp11, i32 %1, i32 %2
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selnez() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %cmp = icmp ne i32 %0, 0
+ %1 = load i32* @f, align 4
+ %2 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %1, i32 %2
+ store i32 %cond, i32* @z1, align 4
+; 16: bnez ${{[0-9]+}}, .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %3 = load i32* @b, align 4
+ %cmp1 = icmp ne i32 %3, 0
+ %cond5 = select i1 %cmp1, i32 %2, i32 %1
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp ne i32 %4, 0
+ %cond10 = select i1 %cmp6, i32 %1, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ store i32 %cond, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selnez2() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %tobool = icmp ne i32 %0, 0
+ %1 = load i32* @f, align 4
+ %2 = load i32* @t, align 4
+ %cond = select i1 %tobool, i32 %1, i32 %2
+ store i32 %cond, i32* @z1, align 4
+; 16: bnez ${{[0-9]+}}, .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %3 = load i32* @b, align 4
+ %tobool1 = icmp ne i32 %3, 0
+ %cond5 = select i1 %tobool1, i32 %2, i32 %1
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %tobool6 = icmp ne i32 %4, 0
+ %cond10 = select i1 %tobool6, i32 %1, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ store i32 %cond, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_seluge() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp uge i32 %0, %1
+ %2 = load i32* @f, align 4
+ %3 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp uge i32 %1, %0
+ %cond5 = select i1 %cmp1, i32 %3, i32 %2
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp uge i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %3, i32 %2
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp uge i32 %0, %4
+ %cond15 = select i1 %cmp11, i32 %3, i32 %2
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selugt() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp ugt i32 %0, %1
+ %2 = load i32* @f, align 4
+ %3 = load i32* @t, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: btnez .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp ugt i32 %1, %0
+ %cond5 = select i1 %cmp1, i32 %3, i32 %2
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp ugt i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %2, i32 %3
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp ugt i32 %0, %4
+ %cond15 = select i1 %cmp11, i32 %2, i32 %3
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
+
+define void @calc_selule() nounwind {
+entry:
+ %0 = load i32* @a, align 4
+ %1 = load i32* @b, align 4
+ %cmp = icmp ule i32 %0, %1
+ %2 = load i32* @t, align 4
+ %3 = load i32* @f, align 4
+ %cond = select i1 %cmp, i32 %2, i32 %3
+ store i32 %cond, i32* @z1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: bteqz .+4
+; 16: move ${{[0-9]+}}, ${{[0-9]+}}
+ %cmp1 = icmp ule i32 %1, %0
+ %cond5 = select i1 %cmp1, i32 %3, i32 %2
+ store i32 %cond5, i32* @z2, align 4
+ %4 = load i32* @c, align 4
+ %cmp6 = icmp ule i32 %4, %0
+ %cond10 = select i1 %cmp6, i32 %2, i32 %3
+ store i32 %cond10, i32* @z3, align 4
+ %cmp11 = icmp ule i32 %0, %4
+ %cond15 = select i1 %cmp11, i32 %2, i32 %3
+ store i32 %cond15, i32* @z4, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/seteq.ll b/test/CodeGen/Mips/seteq.ll
new file mode 100644
index 000000000000..da840c83a2b4
--- /dev/null
+++ b/test/CodeGen/Mips/seteq.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 1, align 4
+@j = global i32 10, align 4
+@k = global i32 1, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @k, align 4
+ %cmp = icmp eq i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: xor $[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}}
+; 16: sltiu $[[REGISTER:[0-9A-Ba-b_]+]], 1
+; 16: move ${{[0-9]+}}, $t8
+ ret void
+}
+
diff --git a/test/CodeGen/Mips/seteqz.ll b/test/CodeGen/Mips/seteqz.ll
new file mode 100644
index 000000000000..d445be6aedb0
--- /dev/null
+++ b/test/CodeGen/Mips/seteqz.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 0, align 4
+@j = global i32 99, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %cmp = icmp eq i32 %0, 0
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: sltiu ${{[0-9]+}}, 1
+; 16: move ${{[0-9]+}}, $t8
+ %1 = load i32* @j, align 4
+ %cmp1 = icmp eq i32 %1, 99
+ %conv2 = zext i1 %cmp1 to i32
+ store i32 %conv2, i32* @r2, align 4
+; 16: xor $[[REGISTER:[0-9A-Ba-b_]+]], ${{[0-9]+}}
+; 16: sltiu $[[REGISTER:[0-9A-Ba-b_]+]], 1
+; 16: move ${{[0-9]+}}, $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/setge.ll b/test/CodeGen/Mips/setge.ll
new file mode 100644
index 000000000000..94b499bc31e9
--- /dev/null
+++ b/test/CodeGen/Mips/setge.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+@.str = private unnamed_addr constant [22 x i8] c"1 = %i\0A1 = %i\0A0 = %i\0A\00", align 1
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @k, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp sge i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move $[[REGISTER:[0-9]+]], $t8
+; 16: xor $[[REGISTER]], ${{[0-9]+}}
+ %2 = load i32* @m, align 4
+ %cmp1 = icmp sge i32 %0, %2
+ %conv2 = zext i1 %cmp1 to i32
+ store i32 %conv2, i32* @r2, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/setgek.ll b/test/CodeGen/Mips/setgek.ll
new file mode 100644
index 000000000000..b6bae09bcb5b
--- /dev/null
+++ b/test/CodeGen/Mips/setgek.ll
@@ -0,0 +1,18 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@k = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @k, align 4
+ %cmp = icmp sgt i32 %0, -32769
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: slti ${{[0-9]+}}, -32768
+; 16: move ${{[0-9]+}}, $t8
+; 16: xor ${{[0-9]+}}, ${{[0-9]+}}
+ ret void
+}
diff --git a/test/CodeGen/Mips/setle.ll b/test/CodeGen/Mips/setle.ll
new file mode 100644
index 000000000000..f36fb4392d76
--- /dev/null
+++ b/test/CodeGen/Mips/setle.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @k, align 4
+ %cmp = icmp sle i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move $[[REGISTER:[0-9]+]], $t8
+; 16: xor $[[REGISTER]], ${{[0-9]+}}
+ %2 = load i32* @m, align 4
+ %cmp1 = icmp sle i32 %2, %1
+ %conv2 = zext i1 %cmp1 to i32
+ store i32 %conv2, i32* @r2, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/setlt.ll b/test/CodeGen/Mips/setlt.ll
new file mode 100644
index 000000000000..435be8e2334a
--- /dev/null
+++ b/test/CodeGen/Mips/setlt.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @k, align 4
+ %cmp = icmp slt i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: slt ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/setltk.ll b/test/CodeGen/Mips/setltk.ll
new file mode 100644
index 000000000000..c0b610e37784
--- /dev/null
+++ b/test/CodeGen/Mips/setltk.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 -5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %cmp = icmp slt i32 %0, 10
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: slti $[[REGISTER:[0-9]+]], 10
+; 16: move $[[REGISTER]], $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/setne.ll b/test/CodeGen/Mips/setne.ll
new file mode 100644
index 000000000000..6460c83c7b0b
--- /dev/null
+++ b/test/CodeGen/Mips/setne.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@i = global i32 1, align 4
+@j = global i32 10, align 4
+@k = global i32 1, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @i, align 4
+ %1 = load i32* @k, align 4
+ %cmp = icmp ne i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: xor $[[REGISTER:[0-9]+]], ${{[0-9]+}}
+; 16: sltu ${{[0-9]+}}, $[[REGISTER]]
+; 16: move ${{[0-9]+}}, $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/setuge.ll b/test/CodeGen/Mips/setuge.ll
new file mode 100644
index 000000000000..ac72b66e9fb0
--- /dev/null
+++ b/test/CodeGen/Mips/setuge.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @k, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp uge i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move $[[REGISTER:[0-9]+]], $t8
+; 16: xor $[[REGISTER]], ${{[0-9]+}}
+ %2 = load i32* @m, align 4
+ %cmp1 = icmp uge i32 %0, %2
+ %conv2 = zext i1 %cmp1 to i32
+ store i32 %conv2, i32* @r2, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/setugt.ll b/test/CodeGen/Mips/setugt.ll
new file mode 100644
index 000000000000..328f0e3be34a
--- /dev/null
+++ b/test/CodeGen/Mips/setugt.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @k, align 4
+ %1 = load i32* @j, align 4
+ %cmp = icmp ugt i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/setule.ll b/test/CodeGen/Mips/setule.ll
new file mode 100644
index 000000000000..792f2ae0fa29
--- /dev/null
+++ b/test/CodeGen/Mips/setule.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @k, align 4
+ %cmp = icmp ule i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move $[[REGISTER:[0-9]+]], $t8
+; 16: xor $[[REGISTER]], ${{[0-9]+}}
+ %2 = load i32* @m, align 4
+ %cmp1 = icmp ule i32 %2, %1
+ %conv2 = zext i1 %cmp1 to i32
+ store i32 %conv2, i32* @r2, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/setult.ll b/test/CodeGen/Mips/setult.ll
new file mode 100644
index 000000000000..56d2e8daa3e0
--- /dev/null
+++ b/test/CodeGen/Mips/setult.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %1 = load i32* @k, align 4
+ %cmp = icmp ult i32 %0, %1
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: sltu ${{[0-9]+}}, ${{[0-9]+}}
+; 16: move ${{[0-9]+}}, $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/setultk.ll b/test/CodeGen/Mips/setultk.ll
new file mode 100644
index 000000000000..75b270ed8428
--- /dev/null
+++ b/test/CodeGen/Mips/setultk.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@j = global i32 5, align 4
+@k = global i32 10, align 4
+@l = global i32 20, align 4
+@m = global i32 10, align 4
+@r1 = common global i32 0, align 4
+@r2 = common global i32 0, align 4
+@r3 = common global i32 0, align 4
+
+define void @test() nounwind {
+entry:
+ %0 = load i32* @j, align 4
+ %cmp = icmp ult i32 %0, 10
+ %conv = zext i1 %cmp to i32
+ store i32 %conv, i32* @r1, align 4
+; 16: sltiu $[[REGISTER:[0-9]+]], 10
+; 16: move $[[REGISTER]], $t8
+ ret void
+}
diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll
new file mode 100644
index 000000000000..03503fb2ae18
--- /dev/null
+++ b/test/CodeGen/Mips/small-section-reserve-gp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \
+; RUN: | FileCheck %s
+
+@i = internal unnamed_addr global i32 0, align 4
+
+define i32 @geti() nounwind readonly {
+entry:
+; CHECK: lw ${{[0-9]+}}, %gp_rel(i)($gp)
+ %0 = load i32* @i, align 4
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll
new file mode 100644
index 000000000000..c00c9fd9d2a1
--- /dev/null
+++ b/test/CodeGen/Mips/stchar.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_h
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16_b
+
+@.str = private unnamed_addr constant [9 x i8] c"%hd %c \0A\00", align 1
+@sp = common global i16* null, align 4
+@cp = common global i8* null, align 4
+
+define void @p1(i16 signext %s, i8 signext %c) nounwind {
+entry:
+ %conv = sext i16 %s to i32
+ %conv1 = sext i8 %c to i32
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv, i32 %conv1) nounwind
+ ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
+
+define void @p2() nounwind {
+entry:
+ %0 = load i16** @sp, align 4
+ %1 = load i16* %0, align 2
+ %2 = load i8** @cp, align 4
+ %3 = load i8* %2, align 1
+ %conv.i = sext i16 %1 to i32
+ %conv1.i = sext i8 %3 to i32
+ %call.i = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+ %4 = load i16** @sp, align 4
+ store i16 32, i16* %4, align 2
+ %5 = load i8** @cp, align 4
+ store i8 97, i8* %5, align 1
+ ret void
+}
+
+define void @test() nounwind {
+entry:
+ %s = alloca i16, align 4
+ %c = alloca i8, align 4
+ store i16 16, i16* %s, align 4
+ store i8 99, i8* %c, align 4
+ store i16* %s, i16** @sp, align 4
+ store i8* %c, i8** @cp, align 4
+ %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+ %0 = load i16** @sp, align 4
+ store i16 32, i16* %0, align 2
+ %1 = load i8** @cp, align 4
+ store i8 97, i8* %1, align 1
+ %2 = load i16* %s, align 4
+ %3 = load i8* %c, align 4
+ %conv.i = sext i16 %2 to i32
+ %conv1.i = sext i8 %3 to i32
+ %call.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i, i32 %conv1.i) nounwind
+ ret void
+; 16_b: test:
+; 16_h: test:
+; 16_b: sb ${{[0-9]+}}, [[offset1:[0-9]+]](${{[0-9]+}})
+; 16_b: lb ${{[0-9]+}}, [[offset1]](${{[0-9]+}})
+; 16_h: sh ${{[0-9]+}}, [[offset2:[0-9]+]](${{[0-9]+}})
+; 16_h: lh ${{[0-9]+}}, [[offset2]](${{[0-9]+}})
+}
+
+define i32 @main() nounwind {
+entry:
+ %s.i = alloca i16, align 4
+ %c.i = alloca i8, align 4
+ %0 = bitcast i16* %s.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %c.i) nounwind
+ store i16 16, i16* %s.i, align 4
+ store i8 99, i8* %c.i, align 4
+ store i16* %s.i, i16** @sp, align 4
+ store i8* %c.i, i8** @cp, align 4
+ %call.i.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 16, i32 99) nounwind
+ %1 = load i16** @sp, align 4
+ store i16 32, i16* %1, align 2
+ %2 = load i8** @cp, align 4
+ store i8 97, i8* %2, align 1
+ %3 = load i16* %s.i, align 4
+ %4 = load i8* %c.i, align 4
+ %conv.i.i = sext i16 %3 to i32
+ %conv1.i.i = sext i8 %4 to i32
+ %call.i.i = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), i32 %conv.i.i, i32 %conv1.i.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %c.i) nounwind
+ ret i32 0
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
diff --git a/test/CodeGen/Mips/stldst.ll b/test/CodeGen/Mips/stldst.ll
new file mode 100644
index 000000000000..4182b9e76d63
--- /dev/null
+++ b/test/CodeGen/Mips/stldst.ll
@@ -0,0 +1,41 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+
+@kkkk = global i32 67, align 4
+@llll = global i32 33, align 4
+@mmmm = global i32 44, align 4
+@nnnn = global i32 55, align 4
+@oooo = global i32 32, align 4
+@pppp = global i32 41, align 4
+@qqqq = global i32 59, align 4
+@rrrr = global i32 60, align 4
+@.str = private unnamed_addr constant [32 x i8] c"%i %i %i %i %i %i %i %i %i %i \0A\00", align 1
+
+define i32 @main() nounwind {
+entry:
+ %0 = load i32* @kkkk, align 4
+ %1 = load i32* @llll, align 4
+ %add = add nsw i32 %0, 10
+ %add1 = add nsw i32 %1, 10
+ %2 = load i32* @mmmm, align 4
+ %sub = add nsw i32 %2, -3
+ %3 = load i32* @nnnn, align 4
+ %add2 = add nsw i32 %3, 10
+ %4 = load i32* @oooo, align 4
+ %add3 = add nsw i32 %4, 4
+ %5 = load i32* @pppp, align 4
+ %sub4 = add nsw i32 %5, -5
+ %6 = load i32* @qqqq, align 4
+ %sub5 = add nsw i32 %6, -10
+ %7 = load i32* @rrrr, align 4
+ %add6 = add nsw i32 %7, 6
+
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str, i32 0, i32 0), i32 %sub5, i32 %add6, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7) nounwind
+ %call7 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([32 x i8]* @.str, i32 0, i32 0), i32 %0, i32 %1, i32 %add, i32 %add1, i32 %sub, i32 %add2, i32 %add3, i32 %sub4, i32 %sub5, i32 %add6) nounwind
+ ret i32 0
+}
+; 16: sw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Spill
+; 16: lw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Reload
+; 16: sw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Spill
+; 16: lw ${{[0-9]+}}, {{[0-9]+}} ( $sp ); # 4-byte Folded Reload
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Mips/tailcall.ll b/test/CodeGen/Mips/tailcall.ll
new file mode 100644
index 000000000000..bcd33fca70ed
--- /dev/null
+++ b/test/CodeGen/Mips/tailcall.ll
@@ -0,0 +1,245 @@
+; RUN: llc -march=mipsel -relocation-model=pic -enable-mips-tail-calls < %s | \
+; RUN: FileCheck %s -check-prefix=PIC32
+; RUN: llc -march=mipsel -relocation-model=static \
+; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=STATIC32
+; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=+n64 -enable-mips-tail-calls \
+; RUN: < %s | FileCheck %s -check-prefix=N64
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic \
+; RUN: -enable-mips-tail-calls < %s | FileCheck %s -check-prefix=PIC16
+
+@g0 = common global i32 0, align 4
+@g1 = common global i32 0, align 4
+@g2 = common global i32 0, align 4
+@g3 = common global i32 0, align 4
+@g4 = common global i32 0, align 4
+@g5 = common global i32 0, align 4
+@g6 = common global i32 0, align 4
+@g7 = common global i32 0, align 4
+@g8 = common global i32 0, align 4
+@g9 = common global i32 0, align 4
+
+define i32 @caller1(i32 %a0) nounwind {
+entry:
+; PIC32-NOT: jalr
+; STATIC32-NOT: jal
+; N64-NOT: jalr
+; PIC16: jalrc
+
+ %call = tail call i32 @callee1(i32 1, i32 1, i32 1, i32 %a0) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee1(i32, i32, i32, i32)
+
+define i32 @caller2(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
+entry:
+; PIC32: jalr
+; STATIC32: jal
+; N64-NOT: jalr
+; PIC16: jalrc
+
+ %call = tail call i32 @callee2(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee2(i32, i32, i32, i32, i32)
+
+define i32 @caller3(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind {
+entry:
+; PIC32: jalr
+; STATIC32: jal
+; N64-NOT: jalr
+; PIC16: jalrc
+
+ %call = tail call i32 @callee3(i32 1, i32 1, i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee3(i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller4(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
+entry:
+; PIC32: jalr
+; STATIC32: jal
+; N64: jalr
+; PIC16: jalrc
+
+ %call = tail call i32 @callee4(i32 1, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee4(i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller5() nounwind readonly {
+entry:
+; PIC32: .ent caller5
+; PIC32-NOT: jalr
+; PIC32: .end caller5
+; STATIC32: .ent caller5
+; STATIC32-NOT: jal
+; STATIC32: .end caller5
+; N64: .ent caller5
+; N64-NOT: jalr
+; N64: .end caller5
+; PIC16: .ent caller5
+; PIC16: jalrc
+; PIC16: .end caller5
+
+ %0 = load i32* @g0, align 4
+ %1 = load i32* @g1, align 4
+ %2 = load i32* @g2, align 4
+ %3 = load i32* @g3, align 4
+ %4 = load i32* @g4, align 4
+ %5 = load i32* @g5, align 4
+ %6 = load i32* @g6, align 4
+ %7 = load i32* @g7, align 4
+ %8 = load i32* @g8, align 4
+ %9 = load i32* @g9, align 4
+ %call = tail call fastcc i32 @callee5(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9)
+ ret i32 %call
+}
+
+define internal fastcc i32 @callee5(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9) nounwind readnone noinline {
+entry:
+ %add = add nsw i32 %a1, %a0
+ %add1 = add nsw i32 %add, %a2
+ %add2 = add nsw i32 %add1, %a3
+ %add3 = add nsw i32 %add2, %a4
+ %add4 = add nsw i32 %add3, %a5
+ %add5 = add nsw i32 %add4, %a6
+ %add6 = add nsw i32 %add5, %a7
+ %add7 = add nsw i32 %add6, %a8
+ %add8 = add nsw i32 %add7, %a9
+ ret i32 %add8
+}
+
+declare i32 @callee8(i32, ...)
+
+define i32 @caller8_0() nounwind {
+entry:
+ %call = tail call fastcc i32 @caller8_1()
+ ret i32 %call
+}
+
+define internal fastcc i32 @caller8_1() nounwind noinline {
+entry:
+; PIC32: .ent caller8_1
+; PIC32: jalr
+; PIC32: .end caller8_1
+; STATIC32: .ent caller8_1
+; STATIC32: jal
+; STATIC32: .end caller8_1
+; N64: .ent caller8_1
+; N64-NOT: jalr
+; N64: .end caller8_1
+; PIC16: .ent caller8_1
+; PIC16: jalrc
+; PIC16: .end caller8_1
+
+ %call = tail call i32 (i32, ...)* @callee8(i32 2, i32 1) nounwind
+ ret i32 %call
+}
+
+%struct.S = type { [2 x i32] }
+
+@gs1 = external global %struct.S
+
+declare i32 @callee9(%struct.S* byval)
+
+define i32 @caller9_0() nounwind {
+entry:
+ %call = tail call fastcc i32 @caller9_1()
+ ret i32 %call
+}
+
+define internal fastcc i32 @caller9_1() nounwind noinline {
+entry:
+; PIC32: .ent caller9_1
+; PIC32: jalr
+; PIC32: .end caller9_1
+; STATIC32: .ent caller9_1
+; STATIC32: jal
+; STATIC32: .end caller9_1
+; N64: .ent caller9_1
+; N64: jalr
+; N64: .end caller9_1
+; PIC16: .ent caller9_1
+; PIC16: jalrc
+; PIC16: .end caller9_1
+
+ %call = tail call i32 @callee9(%struct.S* byval @gs1) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee10(i32, i32, i32, i32, i32, i32, i32, i32, i32)
+
+define i32 @caller10(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) nounwind {
+entry:
+; PIC32: .ent caller10
+; PIC32-NOT: jalr
+; STATIC32: .ent caller10
+; STATIC32-NOT: jal
+; N64: .ent caller10
+; N64-NOT: jalr
+; PIC16: .ent caller10
+; PIC16: jalrc
+
+ %call = tail call i32 @callee10(i32 %a8, i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee11(%struct.S* byval)
+
+define i32 @caller11() nounwind noinline {
+entry:
+; PIC32: .ent caller11
+; PIC32: jalr
+; STATIC32: .ent caller11
+; STATIC32: jal
+; N64: .ent caller11
+; N64: jalr
+; PIC16: .ent caller11
+; PIC16: jalrc
+
+ %call = tail call i32 @callee11(%struct.S* byval @gs1) nounwind
+ ret i32 %call
+}
+
+declare i32 @callee12()
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+
+define i32 @caller12(%struct.S* nocapture byval %a0) nounwind {
+entry:
+; PIC32: .ent caller12
+; PIC32: jalr
+; STATIC32: .ent caller12
+; STATIC32: jal
+; N64: .ent caller12
+; N64: jalr
+; PIC16: .ent caller12
+; PIC16: jalrc
+
+ %0 = bitcast %struct.S* %a0 to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast (%struct.S* @gs1 to i8*), i8* %0, i32 8, i32 4, i1 false)
+ %call = tail call i32 @callee12() nounwind
+ ret i32 %call
+}
+
+declare i32 @callee13(i32, ...)
+
+define i32 @caller13() nounwind {
+entry:
+; PIC32: .ent caller13
+; PIC32-NOT: jalr
+; STATIC32: .ent caller13
+; STATIC32-NOT: jal
+; N64: .ent caller13
+; N64-NOT: jalr
+; PIC16: .ent caller13
+; PIC16: jalrc
+
+ %call = tail call i32 (i32, ...)* @callee13(i32 1, i32 2) nounwind
+ ret i32 %call
+}
+
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
index d681091f4c14..ce98cc826223 100644
--- a/test/CodeGen/Mips/tls-alias.ll
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -disable-mips-delay-filler < %s | FileCheck %s
@foo = thread_local global i32 42
@bar = hidden alias i32* @foo
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index a7ddb96e4338..72d30dc36912 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=PIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN: | FileCheck %s -check-prefix=STATIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN: -mips-fix-global-base-reg=false | FileCheck %s -check-prefix=STATICGP
+; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
+; RUN: FileCheck %s -check-prefix=PIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \
+; RUN: %s | FileCheck %s -check-prefix=STATIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \
+; RUN: -mips-fix-global-base-reg=false < %s | \
+; RUN: FileCheck %s -check-prefix=STATICGP
@t1 = thread_local global i32 0, align 4
diff --git a/test/CodeGen/Mips/tls16.ll b/test/CodeGen/Mips/tls16.ll
new file mode 100644
index 000000000000..861864bcfe0f
--- /dev/null
+++ b/test/CodeGen/Mips/tls16.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16
+
+@a = thread_local global i32 4, align 4
+
+define i32 @foo() nounwind readonly {
+entry:
+ %0 = load i32* @a, align 4
+; PIC16: lw ${{[0-9]+}}, %call16(__tls_get_addr)(${{[0-9]+}})
+; PIC16: addiu ${{[0-9]+}}, %tlsgd(a)
+ ret i32 %0
+}
+
+
diff --git a/test/CodeGen/Mips/tls16_2.ll b/test/CodeGen/Mips/tls16_2.ll
new file mode 100644
index 000000000000..b33e3c3766b6
--- /dev/null
+++ b/test/CodeGen/Mips/tls16_2.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=PIC16
+
+@f.i = internal thread_local unnamed_addr global i32 1, align 4
+
+define i8* @f(i8* nocapture %a) nounwind {
+entry:
+ %0 = load i32* @f.i, align 4
+ %inc = add nsw i32 %0, 1
+ store i32 %inc, i32* @f.i, align 4
+ %1 = inttoptr i32 %inc to i8*
+; PIC16: addiu ${{[0-9]+}}, %tlsldm(f.i)
+ ret i8* %1
+}
+
+
diff --git a/test/CodeGen/Mips/uitofp.ll b/test/CodeGen/Mips/uitofp.ll
new file mode 100644
index 000000000000..aff70c24f07c
--- /dev/null
+++ b/test/CodeGen/Mips/uitofp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips -mattr=+single-float < %s
+
+define void @f0() nounwind {
+entry:
+ %b = alloca i32, align 4
+ %a = alloca float, align 4
+ store volatile i32 1, i32* %b, align 4
+ %0 = load volatile i32* %b, align 4
+ %conv = uitofp i32 %0 to float
+ store float %conv, float* %a, align 4
+ ret void
+}
diff --git a/test/CodeGen/Mips/ul1.ll b/test/CodeGen/Mips/ul1.ll
new file mode 100644
index 000000000000..7e64ff4d90fd
--- /dev/null
+++ b/test/CodeGen/Mips/ul1.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+%struct.ua = type <{ i16, i32 }>
+
+@foo = common global %struct.ua zeroinitializer, align 1
+
+define i32 @main() nounwind {
+entry:
+ store i32 10, i32* getelementptr inbounds (%struct.ua* @foo, i32 0, i32 1), align 1
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+ ret i32 0
+}
+
diff --git a/test/CodeGen/Mips/vector-load-store.ll b/test/CodeGen/Mips/vector-load-store.ll
new file mode 100644
index 000000000000..d88996309908
--- /dev/null
+++ b/test/CodeGen/Mips/vector-load-store.ll
@@ -0,0 +1,27 @@
+; RUN: llc -march=mipsel -mattr=+dsp < %s | FileCheck %s
+
+@g1 = common global <2 x i16> zeroinitializer, align 4
+@g0 = common global <2 x i16> zeroinitializer, align 4
+@g3 = common global <4 x i8> zeroinitializer, align 4
+@g2 = common global <4 x i8> zeroinitializer, align 4
+
+define void @func_v2i16() nounwind {
+entry:
+; CHECK: lw
+; CHECK: sw
+
+ %0 = load <2 x i16>* @g1, align 4
+ store <2 x i16> %0, <2 x i16>* @g0, align 4
+ ret void
+}
+
+define void @func_v4i8() nounwind {
+entry:
+; CHECK: lw
+; CHECK: sw
+
+ %0 = load <4 x i8>* @g3, align 4
+ store <4 x i8> %0, <4 x i8>* @g2, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/NVPTX/global-ordering.ll b/test/CodeGen/NVPTX/global-ordering.ll
new file mode 100644
index 000000000000..43394a79e912
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-ordering.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; Make sure we emit these globals in def-use order
+
+
+; PTX32: .visible .global .align 1 .u8 a = 2;
+; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a;
+; PTX64: .visible .global .align 1 .u8 a = 2;
+; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a;
+@a2 = addrspace(1) global i8 addrspace(1)* @a
+@a = addrspace(1) global i8 2
+
+
+; PTX32: .visible .global .align 1 .u8 b = 1;
+; PTX32-NEXT: .visible .global .align 4 .u32 b2[2] = {b, b};
+; PTX64: .visible .global .align 1 .u8 b = 1;
+; PTX64-NEXT: .visible .global .align 8 .u64 b2[2] = {b, b};
+@b2 = addrspace(1) global [2 x i8 addrspace(1)*] [i8 addrspace(1)* @b, i8 addrspace(1)* @b]
+@b = addrspace(1) global i8 1
diff --git a/test/CodeGen/NVPTX/param-align.ll b/test/CodeGen/NVPTX/param-align.ll
new file mode 100644
index 000000000000..84ccb650d40d
--- /dev/null
+++ b/test/CodeGen/NVPTX/param-align.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+;;; Need 4-byte alignment on float* passed byval
+define ptx_device void @t1(float* byval %x) {
+; CHECK: .func t1
+; CHECK: .param .align 4 .b8 t1_param_0[4]
+ ret void
+}
+
+
+;;; Need 8-byte alignment on double* passed byval
+define ptx_device void @t2(double* byval %x) {
+; CHECK: .func t2
+; CHECK: .param .align 8 .b8 t2_param_0[8]
+ ret void
+}
+
+
+;;; Need 4-byte alignment on float2* passed byval
+%struct.float2 = type { float, float }
+define ptx_device void @t3(%struct.float2* byval %x) {
+; CHECK: .func t3
+; CHECK: .param .align 4 .b8 t3_param_0[8]
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
new file mode 100644
index 000000000000..779f7798d883
--- /dev/null
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+define ptx_kernel void @t1(i1* %a) {
+; PTX32: mov.u16 %rc{{[0-9]+}}, 0;
+; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}};
+; PTX64: mov.u16 %rc{{[0-9]+}}, 0;
+; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}};
+ store i1 false, i1* %a
+ ret void
+}
+
+
+define ptx_kernel void @t2(i1* %a, i8* %b) {
+; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1;
+; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1;
+; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1;
+; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1;
+
+ %t1 = load i1* %a
+ %t2 = select i1 %t1, i8 1, i8 2
+ store i8 %t2, i8* %b
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll
new file mode 100644
index 000000000000..0422b01f4ee3
--- /dev/null
+++ b/test/CodeGen/NVPTX/ptx-version-30.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
+
+
+; CHECK: .version 3.0
+
diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll
new file mode 100644
index 000000000000..d6e57301a371
--- /dev/null
+++ b/test/CodeGen/NVPTX/ptx-version-31.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
+
+
+; CHECK: .version 3.1
+
diff --git a/test/CodeGen/NVPTX/sm-version-10.ll b/test/CodeGen/NVPTX/sm-version-10.ll
new file mode 100644
index 000000000000..9324a3780986
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-10.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+
+
+; CHECK: .target sm_10
+
diff --git a/test/CodeGen/NVPTX/sm-version-11.ll b/test/CodeGen/NVPTX/sm-version-11.ll
new file mode 100644
index 000000000000..9033a4eba5e4
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-11.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_11 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_11 | FileCheck %s
+
+
+; CHECK: .target sm_11
+
diff --git a/test/CodeGen/NVPTX/sm-version-12.ll b/test/CodeGen/NVPTX/sm-version-12.ll
new file mode 100644
index 000000000000..d8ee85c9010e
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-12.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_12 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_12 | FileCheck %s
+
+
+; CHECK: .target sm_12
+
diff --git a/test/CodeGen/NVPTX/sm-version-13.ll b/test/CodeGen/NVPTX/sm-version-13.ll
new file mode 100644
index 000000000000..ad67d642ce30
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-13.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_13 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_13 | FileCheck %s
+
+
+; CHECK: .target sm_13
+
diff --git a/test/CodeGen/NVPTX/sm-version-20.ll b/test/CodeGen/NVPTX/sm-version-20.ll
new file mode 100644
index 000000000000..c21f49e6aeb9
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-20.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: .target sm_20
+
diff --git a/test/CodeGen/NVPTX/sm-version-21.ll b/test/CodeGen/NVPTX/sm-version-21.ll
new file mode 100644
index 000000000000..4fb6de3e6323
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-21.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_21 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_21 | FileCheck %s
+
+
+; CHECK: .target sm_21
+
diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll
new file mode 100644
index 000000000000..692b49a0d6b3
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-30.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+
+; CHECK: .target sm_30
+
diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll
new file mode 100644
index 000000000000..25368a01335e
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-35.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+
+; CHECK: .target sm_35
+
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index 0003a17c2284..b95ac6880758 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -9,9 +9,8 @@ target triple = "powerpc-apple-darwin11.0"
define void @foo() nounwind ssp {
entry:
-; Better: mtctr r12
-; CHECK: mr r12, [[REG:r[0-9]+]]
-; CHECK: mtctr [[REG]]
+; CHECK: mtctr r12
+; CHECK: bctrl
%0 = load void (...)** @p, align 4 ; <void (...)*> [#uses=1]
call void (...)* %0() nounwind
br label %return
diff --git a/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll
new file mode 100644
index 000000000000..9d2e390c1c97
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks that the TOC entry symbol names won't clash with the
+; global .LC0 and .LC2 symbols defined in the module.
+
+@.LC0 = internal global [5 x i8] c".LC0\00"
+@.LC2 = internal global [5 x i8] c".LC2\00"
+
+define i32 @foo(double %X, double %Y) nounwind readnone {
+ ; The 1.0 and 3.0 constants generate two TOC entries
+ %cmp = fcmp oeq double %X, 1.000000e+00
+ %conv = zext i1 %cmp to i32
+ %cmp1 = fcmp oeq double %Y, 3.000000e+00
+ %conv2 = zext i1 %cmp1 to i32
+ %add = add nsw i32 %conv2, %conv
+ ret i32 %add
+}
+
+; Check the creation of two .tc entries for the two double constants. They
+; should be .LC1 and .LC3 to avoid a name clash with the global constants
+; .LC0 and .LC2.
+; CHECK: .LC{{[13]}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK: .LC{{[13]}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
diff --git a/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll b/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll
new file mode 100644
index 000000000000..41533a8f322b
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-10-11-dynalloc.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @test(i64 %n) nounwind {
+entry:
+ %0 = alloca i8, i64 %n, align 1
+ %1 = alloca i8, i64 %n, align 1
+ call void @use(i8* %0, i8* %1) nounwind
+ ret void
+}
+
+declare void @use(i8*, i8*)
+
+; Check we actually have two instances of dynamic stack allocation,
+; identified by the stdux used to update the back-chain link.
+; CHECK: stdux
+; CHECK: stdux
diff --git a/test/CodeGen/PowerPC/2012-10-12-bitcast.ll b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
new file mode 100644
index 000000000000..f841c5fb92e4
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-10-12-bitcast.ll
@@ -0,0 +1,20 @@
+; RUN: llc -mattr=+altivec < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @test(<16 x i8> %v) nounwind {
+entry:
+ %0 = bitcast <16 x i8> %v to i128
+ %1 = lshr i128 %0, 96
+ %2 = trunc i128 %1 to i32
+ ret i32 %2
+}
+
+; Verify that bitcast handles big-endian platforms correctly
+; by checking we load the result from the correct offset
+
+; CHECK: addi [[REGISTER:[0-9]+]], 1, -16
+; CHECK: stvx 2, 0, [[REGISTER]]
+; CHECK: lwz 3, -16(1)
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/asm-Zy.ll b/test/CodeGen/PowerPC/asm-Zy.ll
new file mode 100644
index 000000000000..691165f23788
--- /dev/null
+++ b/test/CodeGen/PowerPC/asm-Zy.ll
@@ -0,0 +1,14 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s
+
+define i32 @zytest(i32 %a) nounwind {
+entry:
+; CHECK: @zytest
+ %r = call i32 asm "lwbrx $0, ${1:y}", "=r,Z"(i32 %a) nounwind, !srcloc !0
+ ret i32 %r
+; CHECK: lwbrx 3, 0,
+}
+
+!0 = metadata !{i32 101688}
+
diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll
index 9a456b6ecc51..638059a38ef5 100644
--- a/test/CodeGen/PowerPC/big-endian-formal-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll
@@ -2,10 +2,10 @@
declare void @bar(i64 %x, i64 %y)
-; CHECK: li {{[53]}}, 0
+; CHECK: li 3, 0
; CHECK: li 4, 2
+; CHECK: li 5, 0
; CHECK: li 6, 3
-; CHECK: mr {{[53]}}, {{[53]}}
define void @foo() {
call void @bar(i64 2, i64 3)
diff --git a/test/CodeGen/PowerPC/bl8_elf_nop.ll b/test/CodeGen/PowerPC/bl8_elf_nop.ll
deleted file mode 100644
index 386c59e32238..000000000000
--- a/test/CodeGen/PowerPC/bl8_elf_nop.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-declare i32 @clock() nounwind
-
-define i32 @func() {
-entry:
- %call = call i32 @clock() nounwind
- %call2 = add i32 %call, 7
- ret i32 %call2
-}
-
-; CHECK: bl clock
-; CHECK-NEXT: nop
-
diff --git a/test/CodeGen/PowerPC/coalesce-ext.ll b/test/CodeGen/PowerPC/coalesce-ext.ll
index cc80f8330798..f19175c9beaa 100644
--- a/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -13,5 +13,6 @@ define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
store volatile i32 %D, i32* %P
; Reuse low bits of extended register, don't extend live range of SUM.
; CHECK: stw [[EXT]]
- ret i32 %D
+ %R = add i32 %D, %D
+ ret i32 %R
}
diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
new file mode 100644
index 000000000000..afa1ea8e75a1
--- /dev/null
+++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [3 x i8] c"%i\00", align 1
+
+define void @test(i32 %count) nounwind {
+entry:
+; CHECK: crxor 6, 6, 6
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+ %cmp2 = icmp sgt i32 %count, 0
+ br i1 %cmp2, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+; CHECK: crxor 6, 6, 6
+ %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+ %inc = add nsw i32 %i.03, 1
+ %exitcond = icmp eq i32 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/PowerPC/crsave.ll b/test/CodeGen/PowerPC/crsave.ll
new file mode 100644
index 000000000000..3e98dbd254d9
--- /dev/null
+++ b/test/CodeGen/PowerPC/crsave.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC32
+; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC64
+
+declare void @foo()
+
+define i32 @test_cr2() nounwind {
+entry:
+ %ret = alloca i32, align 4
+ %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind
+ store i32 %0, i32* %ret, align 4
+ call void @foo()
+ %1 = load i32* %ret, align 4
+ ret i32 %1
+}
+
+; PPC32: mfcr 12
+; PPC32-NEXT: stw 12, {{[0-9]+}}(31)
+; PPC32: lwz 12, {{[0-9]+}}(31)
+; PPC32-NEXT: mtcrf 32, 12
+
+; PPC64: mfcr 12
+; PPC64-NEXT: stw 12, 8(1)
+; PPC64: lwz 12, 8(1)
+; PPC64-NEXT: mtcrf 32, 12
+
+define i32 @test_cr234() nounwind {
+entry:
+ %ret = alloca i32, align 4
+ %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09cmp 3,$2,$2\0A\09cmp 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind
+ store i32 %0, i32* %ret, align 4
+ call void @foo()
+ %1 = load i32* %ret, align 4
+ ret i32 %1
+}
+
+; PPC32: mfcr 12
+; PPC32-NEXT: stw 12, {{[0-9]+}}(31)
+; PPC32: lwz 12, {{[0-9]+}}(31)
+; PPC32-NEXT: mtcrf 32, 12
+; PPC32-NEXT: mtcrf 16, 12
+; PPC32-NEXT: mtcrf 8, 12
+
+; PPC64: mfcr 12
+; PPC64-NEXT: stw 12, 8(1)
+; PPC64: lwz 12, 8(1)
+; PPC64-NEXT: mtcrf 32, 12
+; PPC64-NEXT: mtcrf 16, 12
+; PPC64-NEXT: mtcrf 8, 12
+
diff --git a/test/CodeGen/PowerPC/emptystruct.ll b/test/CodeGen/PowerPC/emptystruct.ll
new file mode 100644
index 000000000000..36b4abd2bfad
--- /dev/null
+++ b/test/CodeGen/PowerPC/emptystruct.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; This tests correct handling of empty aggregate parameters and return values.
+; An empty parameter passed by value does not consume a parameter register or
+; a parameter save area doubleword. An empty parameter passed by reference
+; is treated as any other pointer parameter. An empty aggregate return value
+; is treated as any other aggregate return value, passed via address as a
+; hidden parameter in GPR3. In this example, GPR3 contains the return value
+; address, GPR4 contains the address of e2, and e1 and e3 are not passed or
+; received.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.empty = type {}
+
+define void @callee(%struct.empty* noalias sret %agg.result, %struct.empty* byval %a1, %struct.empty* %a2, %struct.empty* byval %a3) nounwind {
+entry:
+ %a2.addr = alloca %struct.empty*, align 8
+ store %struct.empty* %a2, %struct.empty** %a2.addr, align 8
+ %0 = load %struct.empty** %a2.addr, align 8
+ %1 = bitcast %struct.empty* %agg.result to i8*
+ %2 = bitcast %struct.empty* %0 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 0, i32 1, i1 false)
+ ret void
+}
+
+; CHECK: callee:
+; CHECK: std 4,
+; CHECK: std 3,
+; CHECK-NOT: std 5,
+; CHECK-NOT: std 6,
+; CHECK: blr
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @caller(%struct.empty* noalias sret %agg.result) nounwind {
+entry:
+ %e1 = alloca %struct.empty, align 1
+ %e2 = alloca %struct.empty, align 1
+ %e3 = alloca %struct.empty, align 1
+ call void @callee(%struct.empty* sret %agg.result, %struct.empty* byval %e1, %struct.empty* %e2, %struct.empty* byval %e3)
+ ret void
+}
+
+; CHECK: caller:
+; CHECK: addi 4,
+; CHECK: std 3,
+; CHECK-NOT: std 5,
+; CHECK-NOT: std 6,
+; CHECK: bl callee
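
For illustration, a hypothetical C-level sketch of what the emptystruct.ll test above exercises (assumes the GNU C zero-sized-struct extension; the source below is illustrative and not part of the patch):

    /* Per the comment in the test: by-value empty parameters (a1, a3) consume
       no register and no parameter-save-area doubleword; the sret return
       address arrives in GPR3 and the pointer to a2 in GPR4. */
    struct empty {};                      /* zero-sized (GNU C extension) */

    struct empty callee(struct empty a1, struct empty *a2, struct empty a3) {
        return *a2;
    }

    struct empty caller(void) {
        struct empty e1, e2, e3;
        return callee(e1, &e2, e3);
    }
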
diff --git a/test/CodeGen/PowerPC/floatPSA.ll b/test/CodeGen/PowerPC/floatPSA.ll
new file mode 100644
index 000000000000..b5631a160561
--- /dev/null
+++ b/test/CodeGen/PowerPC/floatPSA.ll
@@ -0,0 +1,97 @@
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; This verifies that single-precision floating point values that can't
+; be passed in registers are stored in the rightmost word of the parameter
+; save area slot. Only 13 floating-point registers (f1-f13) are used for
+; argument passing, so the 14th is passed in storage. Its address is
+; 48 (fixed size of the linkage area) + 13 * 8 (first 13 args) + 4
+; (offset to second word) = 156.
+
+define float @bar(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i, float %j, float %k, float %l, float %m, float %n) nounwind {
+entry:
+ %a.addr = alloca float, align 4
+ %b.addr = alloca float, align 4
+ %c.addr = alloca float, align 4
+ %d.addr = alloca float, align 4
+ %e.addr = alloca float, align 4
+ %f.addr = alloca float, align 4
+ %g.addr = alloca float, align 4
+ %h.addr = alloca float, align 4
+ %i.addr = alloca float, align 4
+ %j.addr = alloca float, align 4
+ %k.addr = alloca float, align 4
+ %l.addr = alloca float, align 4
+ %m.addr = alloca float, align 4
+ %n.addr = alloca float, align 4
+ store float %a, float* %a.addr, align 4
+ store float %b, float* %b.addr, align 4
+ store float %c, float* %c.addr, align 4
+ store float %d, float* %d.addr, align 4
+ store float %e, float* %e.addr, align 4
+ store float %f, float* %f.addr, align 4
+ store float %g, float* %g.addr, align 4
+ store float %h, float* %h.addr, align 4
+ store float %i, float* %i.addr, align 4
+ store float %j, float* %j.addr, align 4
+ store float %k, float* %k.addr, align 4
+ store float %l, float* %l.addr, align 4
+ store float %m, float* %m.addr, align 4
+ store float %n, float* %n.addr, align 4
+ %0 = load float* %n.addr, align 4
+ ret float %0
+}
+
+; CHECK: lfs {{[0-9]+}}, 156(1)
+
+define float @foo() nounwind {
+entry:
+ %a = alloca float, align 4
+ %b = alloca float, align 4
+ %c = alloca float, align 4
+ %d = alloca float, align 4
+ %e = alloca float, align 4
+ %f = alloca float, align 4
+ %g = alloca float, align 4
+ %h = alloca float, align 4
+ %i = alloca float, align 4
+ %j = alloca float, align 4
+ %k = alloca float, align 4
+ %l = alloca float, align 4
+ %m = alloca float, align 4
+ %n = alloca float, align 4
+ store float 1.000000e+00, float* %a, align 4
+ store float 2.000000e+00, float* %b, align 4
+ store float 3.000000e+00, float* %c, align 4
+ store float 4.000000e+00, float* %d, align 4
+ store float 5.000000e+00, float* %e, align 4
+ store float 6.000000e+00, float* %f, align 4
+ store float 7.000000e+00, float* %g, align 4
+ store float 8.000000e+00, float* %h, align 4
+ store float 9.000000e+00, float* %i, align 4
+ store float 1.000000e+01, float* %j, align 4
+ store float 1.100000e+01, float* %k, align 4
+ store float 1.200000e+01, float* %l, align 4
+ store float 1.300000e+01, float* %m, align 4
+ store float 1.400000e+01, float* %n, align 4
+ %0 = load float* %a, align 4
+ %1 = load float* %b, align 4
+ %2 = load float* %c, align 4
+ %3 = load float* %d, align 4
+ %4 = load float* %e, align 4
+ %5 = load float* %f, align 4
+ %6 = load float* %g, align 4
+ %7 = load float* %h, align 4
+ %8 = load float* %i, align 4
+ %9 = load float* %j, align 4
+ %10 = load float* %k, align 4
+ %11 = load float* %l, align 4
+ %12 = load float* %m, align 4
+ %13 = load float* %n, align 4
+ %call = call float @bar(float %0, float %1, float %2, float %3, float %4, float %5, float %6, float %7, float %8, float %9, float %10, float %11, float %12, float %13)
+ ret float %call
+}
+
+; Note that stw is used instead of stfs because the value is a simple
+; constant that can be created with a load-immediate in a GPR.
+; CHECK: stw {{[0-9]+}}, 156(1)
+
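
A hypothetical C-level view of the floatPSA.ll test above (illustration only, not taken from the patch):

    /* Only the 13 parameter FPRs (f1-f13) carry float arguments, so the 14th
       argument lands in the parameter save area.  Its low word sits at offset
       48 (linkage area) + 13 * 8 (first 13 doublewords) + 4 = 156 from r1,
       which is what the stw/lfs CHECK lines above verify. */
    float bar(float a, float b, float c, float d, float e, float f, float g,
              float h, float i, float j, float k, float l, float m, float n) {
        return n;
    }

    float foo(void) {
        return bar(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
                   8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f);
    }
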
diff --git a/test/CodeGen/PowerPC/fsl-e500mc.ll b/test/CodeGen/PowerPC/fsl-e500mc.ll
new file mode 100644
index 000000000000..09b7e41b1899
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e500mc.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e500mc and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e500mc < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-fsl-linux"
+
+%struct.teststruct = type { [12 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+ %0 = bitcast %struct.teststruct* %agg.result to i8*
+ %1 = bitcast %struct.teststruct* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 52, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/fsl-e5500.ll b/test/CodeGen/PowerPC/fsl-e5500.ll
new file mode 100644
index 000000000000..d47d8c8ed4f3
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e5500.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e5500 and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e5500 < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-fsl-linux"
+
+%struct.teststruct = type { [24 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+ %0 = bitcast %struct.teststruct* %agg.result to i8*
+ %1 = bitcast %struct.teststruct* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 100, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll
new file mode 100644
index 000000000000..5a0c072c9c52
--- /dev/null
+++ b/test/CodeGen/PowerPC/i64_fp_round.ll
@@ -0,0 +1,27 @@
+; RUN: llc -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define float @test(i64 %x) nounwind readnone {
+entry:
+ %conv = sitofp i64 %x to float
+ ret float %conv
+}
+
+; Verify that we get the code sequence needed to avoid double-rounding.
+; Note that only parts of the sequence are checked for here, to allow
+; for minor code generation differences.
+
+; CHECK: sradi [[REGISTER:[0-9]+]], 3, 53
+; CHECK: addi [[REGISTER:[0-9]+]], [[REGISTER]], 1
+; CHECK: cmpldi 0, [[REGISTER]], 1
+; CHECK: isel [[REGISTER:[0-9]+]], {{[0-9]+}}, 3, 1
+; CHECK: std [[REGISTER]], -{{[0-9]+}}(1)
+
+
+; Also check that with -enable-unsafe-fp-math we do not get that extra
+; code sequence. Simply verify that there is no "isel" present.
+
+; RUN: llc -mcpu=pwr7 -enable-unsafe-fp-math < %s | FileCheck %s -check-prefix=UNSAFE
+; UNSAFE-NOT: isel
+
diff --git a/test/CodeGen/PowerPC/inlineasm-copy.ll b/test/CodeGen/PowerPC/inlineasm-copy.ll
index e1ff82d5f9b7..59c338883561 100644
--- a/test/CodeGen/PowerPC/inlineasm-copy.ll
+++ b/test/CodeGen/PowerPC/inlineasm-copy.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=ppc32 | not grep mr
+; RUN: llc < %s -march=ppc32 -verify-machineinstrs | FileCheck %s
+; CHECK-NOT: mr
define i32 @test(i32 %Y, i32 %X) {
entry:
%tmp = tail call i32 asm "foo $0", "=r"( ) ; <i32> [#uses=1]
@@ -12,3 +13,9 @@ entry:
ret i32 %tmp1
}
+; CHECK: test3
+define i32 @test3(i32 %Y, i32 %X) {
+entry:
+ %tmp1 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "foo $0, $1", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"( i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y ) ; <i32> [#uses=1]
+ ret i32 1
+}
diff --git a/test/CodeGen/PowerPC/int-fp-conv-1.ll b/test/CodeGen/PowerPC/int-fp-conv-1.ll
index 6c8272351924..d2887b9b947e 100644
--- a/test/CodeGen/PowerPC/int-fp-conv-1.ll
+++ b/test/CodeGen/PowerPC/int-fp-conv-1.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=ppc64 | grep __floatditf
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+; CHECK-NOT: __floatditf
define i64 @__fixunstfdi(ppc_fp128 %a) nounwind {
entry:
diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll
new file mode 100644
index 000000000000..62aa7cf929f8
--- /dev/null
+++ b/test/CodeGen/PowerPC/jaggedstructs.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; This tests receiving and re-passing parameters consisting of structures
+; of size 3, 5, 6, and 7 bytes. They are expected to be found and placed
+; right-adjusted in the parameter registers.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S3 = type { [3 x i8] }
+%struct.S5 = type { [5 x i8] }
+%struct.S6 = type { [6 x i8] }
+%struct.S7 = type { [7 x i8] }
+
+define void @test(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7) nounwind {
+entry:
+ call void @check(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7)
+ ret void
+}
+
+; CHECK: std 6, 216(1)
+; CHECK: std 5, 208(1)
+; CHECK: std 4, 200(1)
+; CHECK: std 3, 192(1)
+; CHECK: lbz {{[0-9]+}}, 199(1)
+; CHECK: stb {{[0-9]+}}, 55(1)
+; CHECK: lhz {{[0-9]+}}, 197(1)
+; CHECK: sth {{[0-9]+}}, 53(1)
+; CHECK: lbz {{[0-9]+}}, 207(1)
+; CHECK: stb {{[0-9]+}}, 63(1)
+; CHECK: lwz {{[0-9]+}}, 203(1)
+; CHECK: stw {{[0-9]+}}, 59(1)
+; CHECK: lhz {{[0-9]+}}, 214(1)
+; CHECK: sth {{[0-9]+}}, 70(1)
+; CHECK: lwz {{[0-9]+}}, 210(1)
+; CHECK: stw {{[0-9]+}}, 66(1)
+; CHECK: lbz {{[0-9]+}}, 223(1)
+; CHECK: stb {{[0-9]+}}, 79(1)
+; CHECK: lhz {{[0-9]+}}, 221(1)
+; CHECK: sth {{[0-9]+}}, 77(1)
+; CHECK: lwz {{[0-9]+}}, 217(1)
+; CHECK: stw {{[0-9]+}}, 73(1)
+; CHECK: ld 6, 72(1)
+; CHECK: ld 5, 64(1)
+; CHECK: ld 4, 56(1)
+; CHECK: ld 3, 48(1)
+
+declare void @check(%struct.S3* byval, %struct.S5* byval, %struct.S6* byval, %struct.S7* byval)
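
A hypothetical C counterpart of the jaggedstructs.ll test above (illustration only; the type and function names simply mirror the IR):

    /* Structs of 3, 5, 6 and 7 bytes travel right-adjusted within their
       8-byte parameter slots; the callee spills the doublewords and re-reads
       the payload from the high-address end of each slot, which is what the
       lbz/lhz/lwz offsets in the CHECK lines above pin down. */
    struct S3 { char c[3]; };
    struct S5 { char c[5]; };
    struct S6 { char c[6]; };
    struct S7 { char c[7]; };

    void check(struct S3, struct S5, struct S6, struct S7);

    void test(struct S3 s3, struct S5 s5, struct S6 s6, struct S7 s7) {
        check(s3, s5, s6, s7);   /* received by value and re-passed unchanged */
    }
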
diff --git a/test/CodeGen/PowerPC/misched.ll b/test/CodeGen/PowerPC/misched.ll
new file mode 100644
index 000000000000..d6fb3b30464f
--- /dev/null
+++ b/test/CodeGen/PowerPC/misched.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -enable-misched -verify-machineinstrs
+; PR14302
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+@b = external global [16000 x double], align 32
+
+define void @pr14302() nounwind {
+entry:
+ tail call void @putchar() nounwind
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ br i1 undef, label %for.body, label %for.body24.i
+
+for.body24.i: ; preds = %for.body24.i, %for.body
+ store double 1.000000e+00, double* undef, align 8
+ br i1 undef, label %for.body24.i58, label %for.body24.i
+
+for.body24.i58: ; preds = %for.body24.i58, %for.body24.i
+ %arrayidx26.i55.1 = getelementptr inbounds [16000 x double]* @b, i64 0, i64 undef
+ store double 1.000000e+00, double* %arrayidx26.i55.1, align 8
+ br i1 undef, label %for.body24.i64, label %for.body24.i58
+
+for.body24.i64: ; preds = %for.body24.i64, %for.body24.i58
+ %exitcond.2489 = icmp eq i32 0, 16000
+ br i1 %exitcond.2489, label %for.body24.i70, label %for.body24.i64
+
+for.body24.i70: ; preds = %for.body24.i70, %for.body24.i64
+ br i1 undef, label %for.body24.i76, label %for.body24.i70
+
+for.body24.i76: ; preds = %for.body24.i76, %for.body24.i70
+ br i1 undef, label %set1d.exit77, label %for.body24.i76
+
+set1d.exit77: ; preds = %for.body24.i76
+ br label %for.body29
+
+for.body29: ; preds = %for.body29, %set1d.exit77
+ br i1 undef, label %for.end35, label %for.body29
+
+for.end35: ; preds = %for.body29
+ ret void
+}
+
+declare void @putchar()
diff --git a/test/CodeGen/PowerPC/novrsave.ll b/test/CodeGen/PowerPC/novrsave.ll
new file mode 100644
index 000000000000..a70576a291e9
--- /dev/null
+++ b/test/CodeGen/PowerPC/novrsave.ll
@@ -0,0 +1,15 @@
+; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; This verifies that the code to update VRSAVE has been removed for SVR4.
+
+define <4 x float> @bar(<4 x float> %v) nounwind {
+entry:
+ %v.addr = alloca <4 x float>, align 16
+ store <4 x float> %v, <4 x float>* %v.addr, align 16
+ %0 = load <4 x float>* %v.addr, align 16
+ ret <4 x float> %0
+}
+
+; CHECK-NOT: mfspr
+; CHECK-NOT: mtspr
diff --git a/test/CodeGen/PowerPC/ppc64-abi-extend.ll b/test/CodeGen/PowerPC/ppc64-abi-extend.ll
new file mode 100644
index 000000000000..8baf1c613e78
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-abi-extend.ll
@@ -0,0 +1,97 @@
+; Verify that i32 argument/return values are extended to i64
+
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@si = common global i32 0, align 4
+@ui = common global i32 0, align 4
+
+declare void @arg_si(i32 signext)
+declare void @arg_ui(i32 zeroext)
+
+declare signext i32 @ret_si()
+declare zeroext i32 @ret_ui()
+
+define void @pass_arg_si() nounwind {
+entry:
+ %0 = load i32* @si, align 4
+ tail call void @arg_si(i32 signext %0) nounwind
+ ret void
+}
+; CHECK: @pass_arg_si
+; CHECK: lwa 3,
+; CHECK: bl arg_si
+
+define void @pass_arg_ui() nounwind {
+entry:
+ %0 = load i32* @ui, align 4
+ tail call void @arg_ui(i32 zeroext %0) nounwind
+ ret void
+}
+; CHECK: @pass_arg_ui
+; CHECK: lwz 3,
+; CHECK: bl arg_ui
+
+define i64 @use_arg_si(i32 signext %x) nounwind readnone {
+entry:
+ %conv = sext i32 %x to i64
+ ret i64 %conv
+}
+; CHECK: @use_arg_si
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define i64 @use_arg_ui(i32 zeroext %x) nounwind readnone {
+entry:
+ %conv = zext i32 %x to i64
+ ret i64 %conv
+}
+; CHECK: @use_arg_ui
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define signext i32 @pass_ret_si() nounwind readonly {
+entry:
+ %0 = load i32* @si, align 4
+ ret i32 %0
+}
+; CHECK: @pass_ret_si
+; CHECK: lwa 3,
+; CHECK: blr
+
+define zeroext i32 @pass_ret_ui() nounwind readonly {
+entry:
+ %0 = load i32* @ui, align 4
+ ret i32 %0
+}
+; CHECK: @pass_ret_ui
+; CHECK: lwz 3,
+; CHECK: blr
+
+define i64 @use_ret_si() nounwind {
+entry:
+ %call = tail call signext i32 @ret_si() nounwind
+ %conv = sext i32 %call to i64
+ ret i64 %conv
+}
+; CHECK: @use_ret_si
+; CHECK: bl ret_si
+; This verifies that the return register (3) set up by the ret_si
+; call is passed on unmodified as the return value of use_ret_si.
+; CHECK-NOT: 3
+; CHECK: blr
+
+define i64 @use_ret_ui() nounwind {
+entry:
+ %call = tail call zeroext i32 @ret_ui() nounwind
+ %conv = zext i32 %call to i64
+ ret i64 %conv
+}
+; CHECK: @use_ret_ui
+; CHECK: bl ret_ui
+; This verifies that the return register (3) set up by the ret_ui
+; call is passed on unmodified as the return value of use_ret_ui.
+; CHECK-NOT: 3
+; CHECK: blr
+
diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
new file mode 100644
index 000000000000..10b70d02e5cc
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@@ -0,0 +1,26 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+; Verify internal alignment of long double in a struct. The double
+; argument comes in GPR3; GPR4 is skipped; GPRs 5 and 6 contain
+; the long double. Check that these are stored to proper locations
+; in the parameter save area and loaded from there for return in FPR1/2.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.S = type { double, ppc_fp128 }
+
+define ppc_fp128 @test(%struct.S* byval %x) nounwind {
+entry:
+ %b = getelementptr inbounds %struct.S* %x, i32 0, i32 1
+ %0 = load ppc_fp128* %b, align 16
+ ret ppc_fp128 %0
+}
+
+; CHECK: std 6, 72(1)
+; CHECK: std 5, 64(1)
+; CHECK: std 4, 56(1)
+; CHECK: std 3, 48(1)
+; CHECK: lfd 1, 64(1)
+; CHECK: lfd 2, 72(1)
+
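
A hypothetical C-level equivalent of the ppc64-align-long-double.ll test above (illustration only):

    /* The 16-byte alignment of the long double (ppc_fp128) forces GPR4 to be
       skipped: the double travels in GPR3 and the long double in GPR5/GPR6.
       The callee stores them to the parameter save area and reloads the long
       double into FPR1/FPR2 for the return, per the CHECK lines above. */
    struct S {
        double      d;    /* arrives in GPR3          */
        long double ld;   /* arrives in GPR5 and GPR6 */
    };

    long double test(struct S x) {
        return x.ld;
    }
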
diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll
new file mode 100644
index 000000000000..c382edbbce4e
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind readnone noinline {
+ ret void
+}
+
+define weak void @foo_weak() nounwind {
+ ret void
+}
+
+; Calls to a local function do not require the TOC restore 'nop'
+define void @test_direct() nounwind readnone {
+; CHECK: test_direct:
+ tail call void @foo() nounwind
+; CHECK: bl foo
+; CHECK-NOT: nop
+ ret void
+}
+
+; Calls to a weak function require a TOC restore 'nop' because it
+; may be overridden in a different module.
+define void @test_weak() nounwind readnone {
+; CHECK: test_weak:
+ tail call void @foo_weak() nounwind
+; CHECK: bl foo
+; CHECK-NEXT: nop
+ ret void
+}
+
+; Indirect calls require full stub creation
+define void @test_indirect(void ()* nocapture %fp) nounwind {
+; CHECK: test_indirect:
+ tail call void %fp() nounwind
+; CHECK: ld [[FP:[0-9]+]], 0(3)
+; CHECK: ld 11, 16(3)
+; CHECK: ld 2, 8(3)
+; CHECK-NEXT: mtctr [[FP]]
+; CHECK-NEXT: bctrl
+; CHECK-NEXT: ld 2, 40(1)
+ ret void
+}
+
+; Calls to absolute addresses should also have the TOC restore 'nop'
+define void @test_abs() nounwind {
+; CHECK: test_abs:
+ tail call void inttoptr (i64 1024 to void ()*)() nounwind
+; CHECK: bla 1024
+; CHECK-NEXT: nop
+ ret void
+}
+
+declare double @sin(double) nounwind
+
+; Calls to external functions should also have a 'nop'
+define double @test_external(double %x) nounwind {
+; CHECK: test_external:
+ %call = tail call double @sin(double %x) nounwind
+; CHECK: bl sin
+; CHECK-NEXT: nop
+ ret double %call
+}
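
A hypothetical C-level sketch of the call categories that ppc64-calls.ll above distinguishes (illustration only; the comments restate what the CHECK lines verify):

    void local_fn(void) {}                       /* like @foo: defined, not overridable */
    __attribute__((weak)) void weak_fn(void) {}  /* like @foo_weak                      */
    extern double sin(double);

    void test(void (*fp)(void), double x) {
        local_fn();   /* bl local_fn             -- no TOC-restore nop needed   */
        weak_fn();    /* bl weak_fn; nop         -- may be overridden elsewhere */
        sin(x);       /* bl sin; nop             -- external call               */
        fp();         /* ld/mtctr/bctrl sequence -- full indirect-call stub     */
    }
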
diff --git a/test/CodeGen/PowerPC/ppc64-ind-call.ll b/test/CodeGen/PowerPC/ppc64-ind-call.ll
deleted file mode 100644
index d5c4d468c656..000000000000
--- a/test/CodeGen/PowerPC/ppc64-ind-call.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-define void @test1() {
-entry:
- %call.i75 = call zeroext i8 undef(i8* undef, i8 zeroext 10)
- unreachable
-}
-
-; CHECK: @test1
-; CHECK: ld 11, 0(3)
-; CHECK: ld 2, 8(3)
-; CHECK: bctrl
-; CHECK: ld 2, 40(1)
-
diff --git a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
index e5aa1f169f64..e1d50bac51a2 100644
--- a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
+++ b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
@@ -5,6 +5,7 @@
; CHECK-NEXT: .align 3
; CHECK-NEXT: .quad .L.test1
; CHECK-NEXT: .quad .TOC.@tocbase
+; CHECK-NEXT: .quad 0
; CHECK-NEXT: .text
; CHECK-NEXT: .L.test1:
diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll
new file mode 100644
index 000000000000..a29bdcb25031
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-toc.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@double_array = global [32 x double] zeroinitializer, align 8
+@number64 = global i64 10, align 8
+@internal_static_var.x = internal unnamed_addr global i64 0, align 8
+
+define i64 @access_int64(i64 %a) nounwind readonly {
+entry:
+; CHECK: access_int64:
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: .quad .L.access_int64
+; CHECK-NEXT: .quad .TOC.@tocbase
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .text
+ %0 = load i64* @number64, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+define i64 @internal_static_var(i64 %a) nounwind {
+entry:
+; CHECK: internal_static_var:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %0 = load i64* @internal_static_var.x, align 8
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+define i32 @access_double(double %a) nounwind readnone {
+entry:
+; CHECK: access_double:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = fcmp oeq double %a, 2.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+
+define i32 @access_double_array(double %a, i32 %i) nounwind readonly {
+entry:
+; CHECK: access_double_array:
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [32 x double]* @double_array, i64 0, i64 %idxprom
+ %0 = load double* %arrayidx, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = fcmp oeq double %0, %a
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Check the creation of 4 .tc entries:
+; * int64_t global 'number64'
+; * double constant 2.0
+; * double array 'double_array'
+; * static int64_t 'x' accessed within '@internal_static_var'
+; CHECK: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
diff --git a/test/CodeGen/PowerPC/ppc64-zext.ll b/test/CodeGen/PowerPC/ppc64-zext.ll
new file mode 100644
index 000000000000..eb55445cc6c9
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-zext.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux"
+
+define i64 @fun(i32 %arg32) nounwind {
+entry:
+; CHECK: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+ %o = zext i32 %arg32 to i64
+ ret i64 %o
+}
+
diff --git a/test/CodeGen/PowerPC/pr12757.ll b/test/CodeGen/PowerPC/pr12757.ll
new file mode 100644
index 000000000000..c344656d2983
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr12757.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @__flt_rounds() nounwind {
+entry:
+ %0 = tail call i64 asm sideeffect "mffs $0", "=f"() nounwind
+ %conv = trunc i64 %0 to i32
+ ret i32 %conv
+}
+
+; CHECK: @__flt_rounds
+; CHECK: mffs
+
diff --git a/test/CodeGen/PowerPC/pr13641.ll b/test/CodeGen/PowerPC/pr13641.ll
new file mode 100644
index 000000000000..c4d3f3a9dc60
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr13641.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind {
+ ret void
+}
+
+; CHECK: blr
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/PowerPC/pr13891.ll b/test/CodeGen/PowerPC/pr13891.ll
new file mode 100644
index 000000000000..3ae73850a342
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr13891.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.foo = type { i8, i8 }
+
+define void @_Z5check3foos(%struct.foo* nocapture byval %f, i16 signext %i) noinline {
+; CHECK: _Z5check3foos:
+; CHECK: sth 3, {{[0-9]+}}(1)
+; CHECK: lha {{[0-9]+}}, {{[0-9]+}}(1)
+entry:
+ %0 = bitcast %struct.foo* %f to i16*
+ %1 = load i16* %0, align 2
+ %bf.val.sext = ashr i16 %1, 8
+ %cmp = icmp eq i16 %bf.val.sext, %i
+ br i1 %cmp, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %conv = sext i16 %bf.val.sext to i32
+ tail call void @exit(i32 %conv)
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ ret void
+}
+
+declare void @exit(i32)
diff --git a/test/CodeGen/PowerPC/remat-imm.ll b/test/CodeGen/PowerPC/remat-imm.ll
new file mode 100644
index 000000000000..520921f57a93
--- /dev/null
+++ b/test/CodeGen/PowerPC/remat-imm.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+; ModuleID = 'test.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [6 x i8] c"%d,%d\00", align 1
+
+define i32 @main() nounwind {
+entry:
+; CHECK: li 4, 128
+; CHECK-NOT: mr 4, {{.*}}
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll
new file mode 100644
index 000000000000..884d3a89d15a
--- /dev/null
+++ b/test/CodeGen/PowerPC/structsinmem.ll
@@ -0,0 +1,227 @@
+; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+
+; FIXME: The code generation for packed structs is very poor because the
+; PowerPC target wrongly rejects all unaligned loads. This test case will
+; need to be revised when that is fixed.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s1 = type { i8 }
+%struct.s2 = type { i16 }
+%struct.s4 = type { i32 }
+%struct.t1 = type { i8 }
+%struct.t3 = type <{ i16, i8 }>
+%struct.t5 = type <{ i32, i8 }>
+%struct.t6 = type <{ i32, i16 }>
+%struct.t7 = type <{ i32, i16, i8 }>
+%struct.s3 = type { i16, i8 }
+%struct.s5 = type { i32, i8 }
+%struct.s6 = type { i32, i16 }
+%struct.s7 = type { i32, i16, i8 }
+%struct.t2 = type <{ i16 }>
+%struct.t4 = type <{ i32 }>
+
+@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1
+@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2
+@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2
+@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4
+@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4
+@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4
+@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4
+@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1
+@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1
+@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1
+@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1
+@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1
+@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1
+@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1
+
+define i32 @caller1() nounwind {
+entry:
+ %p1 = alloca %struct.s1, align 1
+ %p2 = alloca %struct.s2, align 2
+ %p3 = alloca %struct.s3, align 2
+ %p4 = alloca %struct.s4, align 4
+ %p5 = alloca %struct.s5, align 4
+ %p6 = alloca %struct.s6, align 4
+ %p7 = alloca %struct.s7, align 4
+ %0 = bitcast %struct.s1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.s2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false)
+ %2 = bitcast %struct.s3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false)
+ %3 = bitcast %struct.s4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false)
+ %4 = bitcast %struct.s5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false)
+ %5 = bitcast %struct.s6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false)
+ %6 = bitcast %struct.s7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false)
+ %call = call i32 @callee1(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
+ ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 119(1)
+; CHECK: sth {{[0-9]+}}, 126(1)
+; CHECK: stw {{[0-9]+}}, 132(1)
+; CHECK: stw {{[0-9]+}}, 140(1)
+; CHECK: std {{[0-9]+}}, 144(1)
+; CHECK: std {{[0-9]+}}, 152(1)
+; CHECK: std {{[0-9]+}}, 160(1)
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define internal i32 @callee1(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind {
+entry:
+ %z1.addr = alloca i32, align 4
+ %z2.addr = alloca i32, align 4
+ %z3.addr = alloca i32, align 4
+ %z4.addr = alloca i32, align 4
+ %z5.addr = alloca i32, align 4
+ %z6.addr = alloca i32, align 4
+ %z7.addr = alloca i32, align 4
+ %z8.addr = alloca i32, align 4
+ store i32 %z1, i32* %z1.addr, align 4
+ store i32 %z2, i32* %z2.addr, align 4
+ store i32 %z3, i32* %z3.addr, align 4
+ store i32 %z4, i32* %z4.addr, align 4
+ store i32 %z5, i32* %z5.addr, align 4
+ store i32 %z6, i32* %z6.addr, align 4
+ store i32 %z7, i32* %z7.addr, align 4
+ store i32 %z8, i32* %z8.addr, align 4
+ %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 2
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 2
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 4
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 4
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 4
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 4
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: lha {{[0-9]+}}, 126(1)
+; CHECK: lbz {{[0-9]+}}, 119(1)
+; CHECK: lha {{[0-9]+}}, 132(1)
+; CHECK: lwz {{[0-9]+}}, 140(1)
+; CHECK: lwz {{[0-9]+}}, 144(1)
+; CHECK: lwz {{[0-9]+}}, 152(1)
+; CHECK: lwz {{[0-9]+}}, 160(1)
+}
+
+define i32 @caller2() nounwind {
+entry:
+ %p1 = alloca %struct.t1, align 1
+ %p2 = alloca %struct.t2, align 1
+ %p3 = alloca %struct.t3, align 1
+ %p4 = alloca %struct.t4, align 1
+ %p5 = alloca %struct.t5, align 1
+ %p6 = alloca %struct.t6, align 1
+ %p7 = alloca %struct.t7, align 1
+ %0 = bitcast %struct.t1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.t2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false)
+ %2 = bitcast %struct.t3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false)
+ %3 = bitcast %struct.t4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false)
+ %4 = bitcast %struct.t5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false)
+ %5 = bitcast %struct.t6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false)
+ %6 = bitcast %struct.t7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false)
+ %call = call i32 @callee2(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
+ ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 119(1)
+; CHECK: sth {{[0-9]+}}, 126(1)
+; CHECK: stb {{[0-9]+}}, 135(1)
+; CHECK: sth {{[0-9]+}}, 133(1)
+; CHECK: stw {{[0-9]+}}, 140(1)
+; CHECK: stb {{[0-9]+}}, 151(1)
+; CHECK: stw {{[0-9]+}}, 147(1)
+; CHECK: sth {{[0-9]+}}, 158(1)
+; CHECK: stw {{[0-9]+}}, 154(1)
+; CHECK: stb {{[0-9]+}}, 167(1)
+; CHECK: sth {{[0-9]+}}, 165(1)
+; CHECK: stw {{[0-9]+}}, 161(1)
+}
+
+define internal i32 @callee2(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind {
+entry:
+ %z1.addr = alloca i32, align 4
+ %z2.addr = alloca i32, align 4
+ %z3.addr = alloca i32, align 4
+ %z4.addr = alloca i32, align 4
+ %z5.addr = alloca i32, align 4
+ %z6.addr = alloca i32, align 4
+ %z7.addr = alloca i32, align 4
+ %z8.addr = alloca i32, align 4
+ store i32 %z1, i32* %z1.addr, align 4
+ store i32 %z2, i32* %z2.addr, align 4
+ store i32 %z3, i32* %z3.addr, align 4
+ store i32 %z4, i32* %z4.addr, align 4
+ store i32 %z5, i32* %z5.addr, align 4
+ store i32 %z6, i32* %z6.addr, align 4
+ store i32 %z7, i32* %z7.addr, align 4
+ store i32 %z8, i32* %z8.addr, align 4
+ %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 1
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 1
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 1
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 1
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 1
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 1
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: lbz {{[0-9]+}}, 149(1)
+; CHECK: lbz {{[0-9]+}}, 150(1)
+; CHECK: lbz {{[0-9]+}}, 147(1)
+; CHECK: lbz {{[0-9]+}}, 148(1)
+; CHECK: lbz {{[0-9]+}}, 133(1)
+; CHECK: lbz {{[0-9]+}}, 134(1)
+; CHECK: lha {{[0-9]+}}, 126(1)
+; CHECK: lbz {{[0-9]+}}, 119(1)
+; CHECK: lwz {{[0-9]+}}, 140(1)
+; CHECK: lhz {{[0-9]+}}, 154(1)
+; CHECK: lhz {{[0-9]+}}, 156(1)
+; CHECK: lbz {{[0-9]+}}, 163(1)
+; CHECK: lbz {{[0-9]+}}, 164(1)
+; CHECK: lbz {{[0-9]+}}, 161(1)
+; CHECK: lbz {{[0-9]+}}, 162(1)
+}
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
new file mode 100644
index 000000000000..ef706af95d65
--- /dev/null
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -0,0 +1,213 @@
+; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+
+; FIXME: The code generation for packed structs is very poor because the
+; PowerPC target wrongly rejects all unaligned loads. This test case will
+; need to be revised when that is fixed.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s1 = type { i8 }
+%struct.s2 = type { i16 }
+%struct.s4 = type { i32 }
+%struct.t1 = type { i8 }
+%struct.t3 = type <{ i16, i8 }>
+%struct.t5 = type <{ i32, i8 }>
+%struct.t6 = type <{ i32, i16 }>
+%struct.t7 = type <{ i32, i16, i8 }>
+%struct.s3 = type { i16, i8 }
+%struct.s5 = type { i32, i8 }
+%struct.s6 = type { i32, i16 }
+%struct.s7 = type { i32, i16, i8 }
+%struct.t2 = type <{ i16 }>
+%struct.t4 = type <{ i32 }>
+
+@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1
+@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2
+@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2
+@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4
+@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4
+@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4
+@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4
+@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1
+@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1
+@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1
+@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1
+@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1
+@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1
+@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1
+
+define i32 @caller1() nounwind {
+entry:
+ %p1 = alloca %struct.s1, align 1
+ %p2 = alloca %struct.s2, align 2
+ %p3 = alloca %struct.s3, align 2
+ %p4 = alloca %struct.s4, align 4
+ %p5 = alloca %struct.s5, align 4
+ %p6 = alloca %struct.s6, align 4
+ %p7 = alloca %struct.s7, align 4
+ %0 = bitcast %struct.s1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.s2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false)
+ %2 = bitcast %struct.s3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false)
+ %3 = bitcast %struct.s4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false)
+ %4 = bitcast %struct.s5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false)
+ %5 = bitcast %struct.s6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false)
+ %6 = bitcast %struct.s7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false)
+ %call = call i32 @callee1(%struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
+ ret i32 %call
+
+; CHECK: ld 9, 128(31)
+; CHECK: ld 8, 136(31)
+; CHECK: ld 7, 144(31)
+; CHECK: lwz 6, 152(31)
+; CHECK: lwz 5, 160(31)
+; CHECK: lhz 4, 168(31)
+; CHECK: lbz 3, 176(31)
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define internal i32 @callee1(%struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind {
+entry:
+ %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 2
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 2
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 4
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 4
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 4
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 4
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: std 9, 96(1)
+; CHECK: std 8, 88(1)
+; CHECK: std 7, 80(1)
+; CHECK: stw 6, 76(1)
+; CHECK: stw 5, 68(1)
+; CHECK: sth 4, 62(1)
+; CHECK: stb 3, 55(1)
+; CHECK: lha {{[0-9]+}}, 62(1)
+; CHECK: lbz {{[0-9]+}}, 55(1)
+; CHECK: lha {{[0-9]+}}, 68(1)
+; CHECK: lwz {{[0-9]+}}, 76(1)
+; CHECK: lwz {{[0-9]+}}, 80(1)
+; CHECK: lwz {{[0-9]+}}, 88(1)
+; CHECK: lwz {{[0-9]+}}, 96(1)
+}
+
+define i32 @caller2() nounwind {
+entry:
+ %p1 = alloca %struct.t1, align 1
+ %p2 = alloca %struct.t2, align 1
+ %p3 = alloca %struct.t3, align 1
+ %p4 = alloca %struct.t4, align 1
+ %p5 = alloca %struct.t5, align 1
+ %p6 = alloca %struct.t6, align 1
+ %p7 = alloca %struct.t7, align 1
+ %0 = bitcast %struct.t1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.t2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false)
+ %2 = bitcast %struct.t3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false)
+ %3 = bitcast %struct.t4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false)
+ %4 = bitcast %struct.t5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false)
+ %5 = bitcast %struct.t6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false)
+ %6 = bitcast %struct.t7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false)
+ %call = call i32 @callee2(%struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
+ ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 71(1)
+; CHECK: sth {{[0-9]+}}, 69(1)
+; CHECK: stb {{[0-9]+}}, 87(1)
+; CHECK: stw {{[0-9]+}}, 83(1)
+; CHECK: sth {{[0-9]+}}, 94(1)
+; CHECK: stw {{[0-9]+}}, 90(1)
+; CHECK: stb {{[0-9]+}}, 103(1)
+; CHECK: sth {{[0-9]+}}, 101(1)
+; CHECK: stw {{[0-9]+}}, 97(1)
+; CHECK: ld 9, 96(1)
+; CHECK: ld 8, 88(1)
+; CHECK: ld 7, 80(1)
+; CHECK: lwz 6, 152(31)
+; CHECK: ld 5, 64(1)
+; CHECK: lhz 4, 168(31)
+; CHECK: lbz 3, 176(31)
+}
+
+define internal i32 @callee2(%struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind {
+entry:
+ %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 1
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 1
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 1
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 1
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 1
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 1
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: std 9, 96(1)
+; CHECK: std 8, 88(1)
+; CHECK: std 7, 80(1)
+; CHECK: stw 6, 76(1)
+; CHECK: std 5, 64(1)
+; CHECK: sth 4, 62(1)
+; CHECK: stb 3, 55(1)
+; CHECK: lbz {{[0-9]+}}, 85(1)
+; CHECK: lbz {{[0-9]+}}, 86(1)
+; CHECK: lbz {{[0-9]+}}, 83(1)
+; CHECK: lbz {{[0-9]+}}, 84(1)
+; CHECK: lbz {{[0-9]+}}, 69(1)
+; CHECK: lbz {{[0-9]+}}, 70(1)
+; CHECK: lha {{[0-9]+}}, 62(1)
+; CHECK: lbz {{[0-9]+}}, 55(1)
+; CHECK: lwz {{[0-9]+}}, 76(1)
+; CHECK: lhz {{[0-9]+}}, 90(1)
+; CHECK: lhz {{[0-9]+}}, 92(1)
+; CHECK: lbz {{[0-9]+}}, 99(1)
+; CHECK: lbz {{[0-9]+}}, 100(1)
+; CHECK: lbz {{[0-9]+}}, 97(1)
+; CHECK: lbz {{[0-9]+}}, 98(1)
+}
diff --git a/test/CodeGen/PowerPC/varargs-struct-float.ll b/test/CodeGen/PowerPC/varargs-struct-float.ll
new file mode 100644
index 000000000000..fb1835f580b2
--- /dev/null
+++ b/test/CodeGen/PowerPC/varargs-struct-float.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.Sf1 = type { float }
+
+define void @foo(float inreg %s.coerce) nounwind {
+entry:
+ %s = alloca %struct.Sf1, align 4
+ %coerce.dive = getelementptr %struct.Sf1* %s, i32 0, i32 0
+ store float %s.coerce, float* %coerce.dive, align 1
+ %coerce.dive1 = getelementptr %struct.Sf1* %s, i32 0, i32 0
+ %0 = load float* %coerce.dive1, align 1
+ call void (i32, ...)* @testvaSf1(i32 1, float inreg %0)
+ ret void
+}
+
+; CHECK: stfs {{[0-9]+}}, 60(1)
+; CHECK: ld 4, 56(1)
+; CHECK: bl
+
+declare void @testvaSf1(i32, ...)
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
new file mode 100644
index 000000000000..3180f464d125
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -0,0 +1,527 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+; Check vector comparisons using altivec. For non-native types, only a basic
+; comparison instruction check is done. For altivec-supported types (v16i8,
+; v8i16, v4i32, and v4f32) all comparison operators (==, !=, >, >=, <, <=)
+; are checked.
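+;
+; Sketch of the lowering these CHECK patterns encode: equality on a native
+; type maps directly to vcmpequ{b,h,w}; != is vcmpequ* followed by vnor;
+; <= and >= have no single instruction and are composed as
+; (a == b) | (b > a) and (a == b) | (a > b) via vcmpgts*/vcmpgtu* plus vor.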
+
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define <2 x i8> @v2si8_cmp(<2 x i8> %x, <2 x i8> %y) nounwind readnone {
+ %cmp = icmp eq <2 x i8> %x, %y
+ %sext = sext <2 x i1> %cmp to <2 x i8>
+ ret <2 x i8> %sext
+}
+; CHECK: v2si8_cmp:
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <4 x i8> @v4si8_cmp(<4 x i8> %x, <4 x i8> %y) nounwind readnone {
+ %cmp = icmp eq <4 x i8> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i8>
+ ret <4 x i8> %sext
+}
+; CHECK: v4si8_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <8 x i8> @v8si8_cmp(<8 x i8> %x, <8 x i8> %y) nounwind readnone {
+ %cmp = icmp eq <8 x i8> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i8>
+ ret <8 x i8> %sext
+}
+; CHECK: v8si8_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v16i8 since it is an altivec native type
+
+define <16 x i8> @v16si8_cmp_eq(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+ %cmp = icmp eq <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_eq:
+; CHECK: vcmpequb 2, 2, 3
+
+define <16 x i8> @v16si8_cmp_ne(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp ne <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_ne:
+; CHECK: vcmpequb [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor 2, [[RET]], [[RET]]
+
+define <16 x i8> @v16si8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp sle <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_le:
+; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsb [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp ule <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_le:
+; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtub [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp slt <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_lt:
+; CHECK: vcmpgtsb 2, 3, 2
+
+define <16 x i8> @v16ui8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp ult <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_lt:
+; CHECK: vcmpgtub 2, 3, 2
+
+define <16 x i8> @v16si8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp sgt <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_gt:
+; CHECK: vcmpgtsb 2, 2, 3
+
+define <16 x i8> @v16ui8_cmp_gt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp ugt <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_gt:
+; CHECK: vcmpgtub 2, 2, 3
+
+define <16 x i8> @v16si8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp sge <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16si8_cmp_ge:
+; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsb [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]]
+
+define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
+entry:
+ %cmp = icmp uge <16 x i8> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+; CHECK: v16ui8_cmp_ge:
+; CHECK: vcmpequb [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtub [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]]
+
+
+define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone {
+ %cmp = icmp eq <32 x i8> %x, %y
+ %sext = sext <32 x i1> %cmp to <32 x i8>
+ ret <32 x i8> %sext
+}
+; CHECK: v32si8_cmp:
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequb {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <2 x i16> @v2si16_cmp(<2 x i16> %x, <2 x i16> %y) nounwind readnone {
+ %cmp = icmp eq <2 x i16> %x, %y
+ %sext = sext <2 x i1> %cmp to <2 x i16>
+ ret <2 x i16> %sext
+}
+; CHECK: v2si16_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <4 x i16> @v4si16_cmp(<4 x i16> %x, <4 x i16> %y) nounwind readnone {
+ %cmp = icmp eq <4 x i16> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i16>
+ ret <4 x i16> %sext
+}
+; CHECK: v4si16_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v8i16 since it is an altivec native type
+
+define <8 x i16> @v8si16_cmp_eq(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp eq <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_eq:
+; CHECK: vcmpequh 2, 2, 3
+
+define <8 x i16> @v8si16_cmp_ne(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp ne <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_ne:
+; CHECK: vcmpequh [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor 2, [[RET]], [[RET]]
+
+define <8 x i16> @v8si16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp sle <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_le:
+; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsh [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp ule <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_le:
+; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuh [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp slt <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_lt:
+; CHECK: vcmpgtsh 2, 3, 2
+
+define <8 x i16> @v8ui16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp ult <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_lt:
+; CHECK: vcmpgtuh 2, 3, 2
+
+define <8 x i16> @v8si16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp sgt <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_gt:
+; CHECK: vcmpgtsh 2, 2, 3
+
+define <8 x i16> @v8ui16_cmp_gt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp ugt <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_gt:
+; CHECK: vcmpgtuh 2, 2, 3
+
+define <8 x i16> @v8si16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp sge <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8si16_cmp_ge:
+; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsh [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]]
+
+define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
+entry:
+ %cmp = icmp uge <8 x i16> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i16>
+ ret <8 x i16> %sext
+}
+; CHECK: v8ui16_cmp_ge:
+; CHECK: vcmpequh [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuh [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]]
+
+
+define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone {
+ %cmp = icmp eq <16 x i16> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i16>
+ ret <16 x i16> %sext
+}
+; CHECK: v16si16_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <32 x i16> @v32si16_cmp(<32 x i16> %x, <32 x i16> %y) nounwind readnone {
+ %cmp = icmp eq <32 x i16> %x, %y
+ %sext = sext <32 x i1> %cmp to <32 x i16>
+ ret <32 x i16> %sext
+}
+; CHECK: v32si16_cmp:
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequh {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <2 x i32> @v2si32_cmp(<2 x i32> %x, <2 x i32> %y) nounwind readnone {
+ %cmp = icmp eq <2 x i32> %x, %y
+ %sext = sext <2 x i1> %cmp to <2 x i32>
+ ret <2 x i32> %sext
+}
+; CHECK: v2si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v4si32 since it is an altivec native type
+
+define <4 x i32> @v4si32_cmp_eq(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp eq <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_eq:
+; CHECK: vcmpequw 2, 2, 3
+
+define <4 x i32> @v4si32_cmp_ne(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp ne <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_ne:
+; CHECK: vcmpequw [[RCMP:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor 2, [[RCMP]], [[RCMP]]
+
+define <4 x i32> @v4si32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp sle <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_le:
+; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsw [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp ule <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_le:
+; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuw [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp slt <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_lt:
+; CHECK: vcmpgtsw 2, 3, 2
+
+define <4 x i32> @v4ui32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp ult <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_lt:
+; CHECK: vcmpgtuw 2, 3, 2
+
+define <4 x i32> @v4si32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp sgt <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_gt:
+; CHECK: vcmpgtsw 2, 2, 3
+
+define <4 x i32> @v4ui32_cmp_gt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp ugt <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_gt:
+; CHECK: vcmpgtuw 2, 2, 3
+
+define <4 x i32> @v4si32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp sge <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4si32_cmp_ge:
+; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtsw [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]]
+
+define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
+entry:
+ %cmp = icmp uge <4 x i32> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ ret <4 x i32> %sext
+}
+; CHECK: v4ui32_cmp_ge:
+; CHECK: vcmpequw [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtuw [[RCMPGT:[0-9]+]], 2, 3
+; CHECK-NEXT: vor 2, [[RCMPGT]], [[RCMPEQ]]
+
+
+define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone {
+ %cmp = icmp eq <8 x i32> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+; CHECK: v8si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <16 x i32> @v16si32_cmp(<16 x i32> %x, <16 x i32> %y) nounwind readnone {
+ %cmp = icmp eq <16 x i32> %x, %y
+ %sext = sext <16 x i1> %cmp to <16 x i32>
+ ret <16 x i32> %sext
+}
+; CHECK: v16si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <32 x i32> @v32si32_cmp(<32 x i32> %x, <32 x i32> %y) nounwind readnone {
+ %cmp = icmp eq <32 x i32> %x, %y
+ %sext = sext <32 x i1> %cmp to <32 x i32>
+ ret <32 x i32> %sext
+}
+; CHECK: v32si32_cmp:
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpequw {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+define <2 x float> @v2f32_cmp(<2 x float> %x, <2 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp oeq <2 x float> %x, %y
+ %sext = sext <2 x i1> %cmp to <2 x i32>
+ %0 = bitcast <2 x i32> %sext to <2 x float>
+ ret <2 x float> %0
+}
+; CHECK: v2f32_cmp:
+; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+
+
+; Additional tests for v4f32 since it is an altivec native type
+
+define <4 x float> @v4f32_cmp_eq(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp oeq <4 x float> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %0 = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_eq:
+; CHECK: vcmpeqfp 2, 2, 3
+
+define <4 x float> @v4f32_cmp_ne(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp une <4 x float> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %0 = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_ne:
+; CHECK: vcmpeqfp [[RET:[0-9]+]], 2, 3
+; CHECK-NEXT: vnor 2, [[RET]], [[RET]]
+
+define <4 x float> @v4f32_cmp_le(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp ole <4 x float> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %0 = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_le:
+; CHECK: vcmpeqfp [[RCMPEQ:[0-9]+]], 2, 3
+; CHECK-NEXT: vcmpgtfp [[RCMPLE:[0-9]+]], 3, 2
+; CHECK-NEXT: vor 2, [[RCMPLE]], [[RCMPEQ]]
+
+define <4 x float> @v4f32_cmp_lt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp olt <4 x float> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %0 = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_lt:
+; CHECK: vcmpgtfp 2, 3, 2
+
+define <4 x float> @v4f32_cmp_ge(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp oge <4 x float> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %0 = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_ge:
+; CHECK: vcmpgefp 2, 2, 3
+
+define <4 x float> @v4f32_cmp_gt(<4 x float> %x, <4 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp ogt <4 x float> %x, %y
+ %sext = sext <4 x i1> %cmp to <4 x i32>
+ %0 = bitcast <4 x i32> %sext to <4 x float>
+ ret <4 x float> %0
+}
+; CHECK: v4f32_cmp_gt:
+; CHECK: vcmpgtfp 2, 2, 3
+
+
+define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone {
+entry:
+ %cmp = fcmp oeq <8 x float> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ %0 = bitcast <8 x i32> %sext to <8 x float>
+ ret <8 x float> %0
+}
+; CHECK: v8f32_cmp:
+; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: vcmpeqfp {{[0-9]+}}, {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/PowerPC/vec_conv.ll b/test/CodeGen/PowerPC/vec_conv.ll
new file mode 100644
index 000000000000..a475e9499df2
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_conv.ll
@@ -0,0 +1,57 @@
+; RUN: llc -mattr=+altivec < %s | FileCheck %s
+
+; Check vector float/int conversion using altivec.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@cte_float = global <4 x float> <float 6.5e+00, float 6.5e+00, float 6.5e+00, float 6.5e+00>, align 16
+@cte_int = global <4 x i32> <i32 6, i32 6, i32 6, i32 6>, align 16
+
+
+define void @v4f32_to_v4i32(<4 x float> %x, <4 x i32>* nocapture %y) nounwind {
+entry:
+ %0 = load <4 x float>* @cte_float, align 16
+ %mul = fmul <4 x float> %0, %x
+ %1 = fptosi <4 x float> %mul to <4 x i32>
+ store <4 x i32> %1, <4 x i32>* %y, align 16
+ ret void
+}
+;CHECK: v4f32_to_v4i32:
+;CHECK: vctsxs {{[0-9]+}}, {{[0-9]+}}, 0
+
+
+define void @v4f32_to_v4u32(<4 x float> %x, <4 x i32>* nocapture %y) nounwind {
+entry:
+ %0 = load <4 x float>* @cte_float, align 16
+ %mul = fmul <4 x float> %0, %x
+ %1 = fptoui <4 x float> %mul to <4 x i32>
+ store <4 x i32> %1, <4 x i32>* %y, align 16
+ ret void
+}
+;CHECK: v4f32_to_v4u32:
+;CHECK: vctuxs {{[0-9]+}}, {{[0-9]+}}, 0
+
+
+define void @v4i32_to_v4f32(<4 x i32> %x, <4 x float>* nocapture %y) nounwind {
+entry:
+ %0 = load <4 x i32>* @cte_int, align 16
+ %mul = mul <4 x i32> %0, %x
+ %1 = sitofp <4 x i32> %mul to <4 x float>
+ store <4 x float> %1, <4 x float>* %y, align 16
+ ret void
+}
+;CHECK: v4i32_to_v4f32:
+;CHECK: vcfsx {{[0-9]+}}, {{[0-9]+}}, 0
+
+
+define void @v4u32_to_v4f32(<4 x i32> %x, <4 x float>* nocapture %y) nounwind {
+entry:
+ %0 = load <4 x i32>* @cte_int, align 16
+ %mul = mul <4 x i32> %0, %x
+ %1 = uitofp <4 x i32> %mul to <4 x float>
+ store <4 x float> %1, <4 x float>* %y, align 16
+ ret void
+}
+;CHECK: v4u32_to_v4f32:
+;CHECK: vcfux {{[0-9]+}}, {{[0-9]+}}, 0
diff --git a/test/CodeGen/PowerPC/vec_extload.ll b/test/CodeGen/PowerPC/vec_extload.ll
new file mode 100644
index 000000000000..201c15b9c735
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_extload.ll
@@ -0,0 +1,155 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec < %s | FileCheck %s
+
+; Check vector extend load expansion with altivec enabled.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Altivec does not provide a vector sext instruction, so the operation expands
+; into a sequence of vector stores (stvx), per-byte load/sign-extend/store
+; (lbz/stb), and a final vector load (lvx) to reload the resulting
+; sign-extended vector.
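+; (Per lane, sign-extending the low 4 bits of a byte is equivalent to the
+; arithmetic shift pair (x << 4) >> 4; the lbz/stb round trip described above
+; performs that fixup one element at a time through memory.)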
+define <16 x i8> @v16si8_sext_in_reg(<16 x i8> %a) {
+ %b = trunc <16 x i8> %a to <16 x i4>
+ %c = sext <16 x i4> %b to <16 x i8>
+ ret <16 x i8> %c
+}
+; CHECK: v16si8_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lbz
+; CHECK: stb
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; The zero extend uses a cleverer approach: a vector splat
+; and a logical AND to clear the high bits.
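+; Keeping only the low 4 bits of each byte is just a mask with 0x0f, so a
+; vspltisb of 15 followed by vand is sufficient, as checked below.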
+define <16 x i8> @v16si8_zext_in_reg(<16 x i8> %a) {
+ %b = trunc <16 x i8> %a to <16 x i4>
+ %c = zext <16 x i4> %b to <16 x i8>
+ ret <16 x i8> %c
+}
+; CHECK: v16si8_zext_in_reg:
+; CHECK: vspltisb [[VMASK:[0-9]+]], 15
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
+
+; Same as v16si8_sext_in_reg, expands to load/store halfwords (lhz/sth).
+define <8 x i16> @v8si16_sext_in_reg(<8 x i16> %a) {
+ %b = trunc <8 x i16> %a to <8 x i8>
+ %c = sext <8 x i8> %b to <8 x i16>
+ ret <8 x i16> %c
+}
+; CHECK: v8si16_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lhz
+; CHECK: sth
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; Same as v8si16_sext_in_reg, but instead of creating the mask
+; with a splat, loads it from memory.
+define <8 x i16> @v8si16_zext_in_reg(<8 x i16> %a) {
+ %b = trunc <8 x i16> %a to <8 x i8>
+ %c = zext <8 x i8> %b to <8 x i16>
+ ret <8 x i16> %c
+}
+; CHECK: v8si16_zext_in_reg:
+; CHECK: ld [[RMASKTOC:[0-9]+]], .LC{{[0-9]+}}@toc(2)
+; CHECK-NEXT: lvx [[VMASK:[0-9]+]], {{[0-9]+}}, [[RMASKTOC]]
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
+
+; Same as v16si8_sext_in_reg, but expands to halfword loads (lha) and
+; word stores (stw).
+define <4 x i32> @v4si32_sext_in_reg(<4 x i32> %a) {
+ %b = trunc <4 x i32> %a to <4 x i16>
+ %c = sext <4 x i16> %b to <4 x i32>
+ ret <4 x i32> %c
+}
+; CHECK: v4si32_sext_in_reg:
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: stvx 2, {{[0-9]+}}, {{[0-9]+}}
+; CHECK: lha
+; CHECK: stw
+; CHECK: lvx 2, {{[0-9]+}}, {{[0-9]+}}
+
+; Same as v8si16_sext_in_reg.
+define <4 x i32> @v4si32_zext_in_reg(<4 x i32> %a) {
+ %b = trunc <4 x i32> %a to <4 x i16>
+ %c = zext <4 x i16> %b to <4 x i32>
+ ret <4 x i32> %c
+}
+; CHECK: v4si32_zext_in_reg:
+; CHECK: vspltisw [[VMASK:[0-9]+]], -16
+; CHECK-NEXT: vsrw [[VMASK]], [[VMASK]], [[VMASK]]
+; CHECK-NEXT: vand 2, 2, [[VMASK]]
diff --git a/test/CodeGen/PowerPC/vec_sqrt.ll b/test/CodeGen/PowerPC/vec_sqrt.ll
new file mode 100644
index 000000000000..055da1a229d1
--- /dev/null
+++ b/test/CodeGen/PowerPC/vec_sqrt.ll
@@ -0,0 +1,71 @@
+; RUN: llc -mcpu=pwr6 -mattr=+altivec,+fsqrt < %s | FileCheck %s
+
+; Check vector sqrt expansion for floating-point types, since altivec
+; does not provide a vector fsqrt instruction.
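+; The expansion is therefore scalarized: each element goes through the scalar
+; fsqrts (single precision) or fsqrt (double precision) instruction, which is
+; what the per-element CHECK lines below expect.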
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %val)
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float> %val)
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double> %val)
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double> %val)
+
+define <2 x float> @v2f32_sqrt(<2 x float> %x) nounwind readnone {
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32 (<2 x float> %x)
+ ret <2 x float> %sqrt
+}
+; sqrt (<2 x float>) is promoted to sqrt (<4 x float>)
+; CHECK: v2f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x float> @v4f32_sqrt(<4 x float> %x) nounwind readnone {
+entry:
+ %sqrt = call <4 x float> @llvm.sqrt.v4f32 (<4 x float> %x)
+ ret <4 x float> %sqrt
+}
+; CHECK: v4f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <8 x float> @v8f32_sqrt(<8 x float> %x) nounwind readnone {
+entry:
+ %sqrt = call <8 x float> @llvm.sqrt.v8f32 (<8 x float> %x)
+ ret <8 x float> %sqrt
+}
+; CHECK: v8f32_sqrt:
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrts {{[0-9]+}}, {{[0-9]+}}
+
+define <2 x double> @v2f64_sqrt(<2 x double> %x) nounwind readnone {
+entry:
+ %sqrt = call <2 x double> @llvm.sqrt.v2f64 (<2 x double> %x)
+ ret <2 x double> %sqrt
+}
+; CHECK: v2f64_sqrt:
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+
+define <4 x double> @v4f64_sqrt(<4 x double> %x) nounwind readnone {
+entry:
+ %sqrt = call <4 x double> @llvm.sqrt.v4f64 (<4 x double> %x)
+ ret <4 x double> %sqrt
+}
+; CHECK: v4f64_sqrt:
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
+; CHECK: fsqrt {{[0-9]+}}, {{[0-9]+}}
diff --git a/test/CodeGen/PowerPC/vrspill.ll b/test/CodeGen/PowerPC/vrspill.ll
new file mode 100644
index 000000000000..7641017c434e
--- /dev/null
+++ b/test/CodeGen/PowerPC/vrspill.ll
@@ -0,0 +1,19 @@
+; RUN: llc -O0 -mtriple=powerpc-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=powerpc64-unknown-linux-gnu -mattr=+altivec -verify-machineinstrs < %s | FileCheck %s
+
+; This verifies that we generate correct spill/reload code for vector regs.
+
+define void @addrtaken(i32 %i, <4 x float> %w) nounwind {
+entry:
+ %i.addr = alloca i32, align 4
+ %w.addr = alloca <4 x float>, align 16
+ store i32 %i, i32* %i.addr, align 4
+ store <4 x float> %w, <4 x float>* %w.addr, align 16
+ call void @foo(i32* %i.addr)
+ ret void
+}
+
+; CHECK: stvx 2, 0, 0
+; CHECK: lvx 2, 0, 0
+
+declare void @foo(i32*)
diff --git a/test/CodeGen/SPARC/2011-01-11-CC.ll b/test/CodeGen/SPARC/2011-01-11-CC.ll
index 3ceda958de6e..f676fd836947 100755
--- a/test/CodeGen/SPARC/2011-01-11-CC.ll
+++ b/test/CodeGen/SPARC/2011-01-11-CC.ll
@@ -54,7 +54,7 @@ entry:
; V8: {{be|bne}}
; V9: test_select_dfp_icc
; V9: subcc
-; V9=NOT: {{be|bne}}
+; V9-NOT: {{be|bne}}
; V9: fmovd{{e|ne}} %icc
%0 = icmp eq i32 %a, 0
%1 = select i1 %0, double %f1, double %f2
diff --git a/test/CodeGen/Thumb2/buildvector-crash.ll b/test/CodeGen/Thumb2/buildvector-crash.ll
index 01ef472d3104..ce42f4b3773d 100644
--- a/test/CodeGen/Thumb2/buildvector-crash.ll
+++ b/test/CodeGen/Thumb2/buildvector-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
; Formerly crashed, 3573915.
define void @RotateStarsFP_Vec() nounwind {
@@ -13,5 +13,5 @@ bb8: ; preds = %bb8, %bb.nph372
store <4 x float> %3, <4 x float>* undef, align 4
br label %bb8
; CHECK: RotateStarsFP_Vec:
-; CHECK: vldmia
+; CHECK: vld1.64
}
diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll
index de6f6e260de3..85b4370fa599 100644
--- a/test/CodeGen/Thumb2/carry.ll
+++ b/test/CodeGen/Thumb2/carry.ll
@@ -20,3 +20,16 @@ entry:
%tmp2 = sub i64 %tmp1, %b
ret i64 %tmp2
}
+
+; rdar://12559385
+define i64 @f3(i32 %vi) {
+entry:
+; CHECK: f3:
+; CHECK: movw [[REG:r[0-9]+]], #36102
+; CHECK: sbcs r{{[0-9]+}}, [[REG]]
+ %v0 = zext i32 %vi to i64
+ %v1 = xor i64 %v0, -155057456198619
+ %v4 = add i64 %v1, 155057456198619
+ %v5 = add i64 %v4, %v1
+ ret i64 %v5
+}
diff --git a/test/CodeGen/Thumb2/cortex-fp.ll b/test/CodeGen/Thumb2/cortex-fp.ll
index d06f8a7beeb0..b7df2fbf546c 100644
--- a/test/CodeGen/Thumb2/cortex-fp.ll
+++ b/test/CodeGen/Thumb2/cortex-fp.ll
@@ -7,8 +7,8 @@ define float @foo(float %a, float %b) {
entry:
; CHECK: foo
; CORTEXM3: blx ___mulsf3
-; CORTEXM4: vmul.f32 s0, s1, s0
-; CORTEXA8: vmul.f32 d0, d1, d0
+; CORTEXM4: vmul.f32 s0, s2, s0
+; CORTEXA8: vmul.f32 d
%0 = fmul float %a, %b
ret float %0
}
@@ -19,6 +19,6 @@ entry:
%0 = fmul double %a, %b
; CORTEXM3: blx ___muldf3
; CORTEXM4: blx ___muldf3
-; CORTEXA8: vmul.f64 d16, d17, d16
+; CORTEXA8: vmul.f64 d
ret double %0
}
diff --git a/test/CodeGen/Thumb2/div.ll b/test/CodeGen/Thumb2/div.ll
index 2c00c70c0db6..f89746a30327 100644
--- a/test/CodeGen/Thumb2/div.ll
+++ b/test/CodeGen/Thumb2/div.ll
@@ -2,6 +2,8 @@
; RUN: | FileCheck %s -check-prefix=CHECK-THUMB
; RUN: llc < %s -march=thumb -mcpu=cortex-m3 -mattr=+thumb2 \
; RUN: | FileCheck %s -check-prefix=CHECK-THUMBV7M
+; RUN: llc < %s -march=thumb -mcpu=swift \
+; RUN: | FileCheck %s -check-prefix=CHECK-SWIFT-T2
define i32 @f1(i32 %a, i32 %b) {
entry:
@@ -9,6 +11,8 @@ entry:
; CHECK-THUMB: __divsi3
; CHECK-THUMBV7M: f1
; CHECK-THUMBV7M: sdiv
+; CHECK-SWIFT-T2: f1
+; CHECK-SWIFT-T2: sdiv
%tmp1 = sdiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -19,6 +23,8 @@ entry:
; CHECK-THUMB: __udivsi3
; CHECK-THUMBV7M: f2
; CHECK-THUMBV7M: udiv
+; CHECK-SWIFT-T2: f2
+; CHECK-SWIFT-T2: udiv
%tmp1 = udiv i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -29,6 +35,8 @@ entry:
; CHECK-THUMB: __modsi3
; CHECK-THUMBV7M: f3
; CHECK-THUMBV7M: sdiv
+; CHECK-SWIFT-T2: f3
+; CHECK-SWIFT-T2: sdiv
%tmp1 = srem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
@@ -39,6 +47,8 @@ entry:
; CHECK-THUMB: __umodsi3
; CHECK-THUMBV7M: f4
; CHECK-THUMBV7M: udiv
+; CHECK-SWIFT-T2: f4
+; CHECK-SWIFT-T2: udiv
%tmp1 = urem i32 %a, %b ; <i32> [#uses=1]
ret i32 %tmp1
}
diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll
new file mode 100644
index 000000000000..beefd6044cf4
--- /dev/null
+++ b/test/CodeGen/Thumb2/longMACt.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; Check that signed and unsigned long multiply-accumulate (smlal/umlal) are generated.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+ %conv = zext i32 %a to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv1, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+ %conv = sext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %add = add nsw i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+ %conv = zext i32 %b to i64
+ %conv1 = zext i32 %a to i64
+ %mul = mul i64 %conv, %conv1
+ %conv2 = zext i32 %c to i64
+ %add = add i64 %mul, %conv2
+ ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+ %conv = sext i32 %b to i64
+ %conv1 = sext i32 %a to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %conv2 = sext i32 %c to i64
+ %add = add nsw i64 %mul, %conv2
+ ret i64 %add
+}
diff --git a/test/CodeGen/Thumb2/thumb2-mla.ll b/test/CodeGen/Thumb2/thumb2-mla.ll
index c4cc749ea5c7..594d9742b0f9 100644
--- a/test/CodeGen/Thumb2/thumb2-mla.ll
+++ b/test/CodeGen/Thumb2/thumb2-mla.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2 -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f1(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
@@ -7,6 +8,9 @@ define i32 @f1(i32 %a, i32 %b, i32 %c) {
}
; CHECK: f1:
; CHECK: mla r0, r0, r1, r2
+; NO_MULOPS: f1:
+; NO_MULOPS: muls r0, r1, r0
+; NO_MULOPS-NEXT: add r0, r2
define i32 @f2(i32 %a, i32 %b, i32 %c) {
%tmp1 = mul i32 %a, %b
@@ -15,3 +19,6 @@ define i32 @f2(i32 %a, i32 %b, i32 %c) {
}
; CHECK: f2:
; CHECK: mla r0, r0, r1, r2
+; NO_MULOPS: f2:
+; NO_MULOPS: muls r0, r1, r0
+; NO_MULOPS-NEXT: add r0, r2
diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll
index ead198f21624..ed4d26d746cb 100644
--- a/test/CodeGen/Thumb2/thumb2-select_xform.ll
+++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll
@@ -5,7 +5,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK: mvn r0, #-2147483648
; CHECK: cmp r2, #10
; CHECK: it le
-; CHECK: addle.w r1, r1, r0
+; CHECK: addle r1, r0
; CHECK: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -30,7 +30,7 @@ define i32 @t3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; CHECK: t3
; CHECK: cmp r2, #10
; CHECK: it le
-; CHECK: suble.w r1, r1, #10
+; CHECK: suble r1, #10
; CHECK: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 10
diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll
index c128eccd662f..aaaedfa42e74 100644
--- a/test/CodeGen/Thumb2/thumb2-smla.ll
+++ b/test/CodeGen/Thumb2/thumb2-smla.ll
@@ -1,8 +1,12 @@
; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp | FileCheck %s
+; RUN: llc < %s -march=thumb -mattr=+thumb2,+t2xtpk,+t2dsp -arm-use-mulops=false | FileCheck %s -check-prefix=NO_MULOPS
define i32 @f3(i32 %a, i16 %x, i32 %y) {
; CHECK: f3
; CHECK: smlabt r0, r1, r2, r0
+; NO_MULOPS: f3
+; NO_MULOPS: smultb r1, r2, r1
+; NO_MULOPS-NEXT: add r0, r1
%tmp = sext i16 %x to i32 ; <i32> [#uses=1]
%tmp2 = ashr i32 %y, 16 ; <i32> [#uses=1]
%tmp3 = mul i32 %tmp2, %tmp ; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll
index 35914b16790a..2074f98cb608 100644
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) {
; ARMv7M: test10
; ARMv7M: mov.w r1, #16253176
-; ARMv7M: mov.w r2, #458759
; ARMv7M: and.w r0, r1, r0, lsr #7
-; ARMv7M: and.w r1, r2, r0, lsr #5
+; ARMv7M: mov.w r1, #458759
+; ARMv7M: and.w r1, r1, r0, lsr #5
; ARMv7M: orrs r0, r1
%tmp1 = lshr i32 %p0, 7 ; <i32> [#uses=1]
%tmp2 = and i32 %tmp1, 16253176 ; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
index 8b55bd79aaa5..3d058bc28965 100644
--- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
+++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://r7512579
; PHI defs in the atomic loop should be used by the add / adc
@@ -7,17 +7,16 @@
define void @t(i64* nocapture %p) nounwind ssp {
entry:
; CHECK: t:
-; CHECK: movl $1
-; CHECK: movl (%ebp), %eax
-; CHECK: movl 4(%ebp), %edx
+; CHECK: movl ([[REG:%[a-z]+]]), %eax
+; CHECK: movl 4([[REG]]), %edx
; CHECK: LBB0_1:
-; CHECK-NOT: movl $1
-; CHECK-NOT: movl $0
-; CHECK: addl
-; CHECK: adcl
+; CHECK: movl %eax, %ebx
+; CHECK: addl {{%[a-z]+}}, %ebx
+; CHECK: movl %edx, %ecx
+; CHECK: adcl {{%[a-z]+}}, %ecx
; CHECK: lock
-; CHECK: cmpxchg8b
-; CHECK: jne
+; CHECK-NEXT: cmpxchg8b ([[REG]])
+; CHECK-NEXT: jne
%0 = atomicrmw add i64* %p, i64 1 seq_cst
ret void
}
diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll
index 8a3ccc8dfda5..3ce7db6e4138 100644
--- a/test/CodeGen/X86/2012-01-18-vbitcast.ll
+++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll
@@ -2,8 +2,8 @@
;CHECK: vcast
define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) {
-;CHECK: pshufd
-;CHECK: pshufd
+;CHECK: pmovzxdq
+;CHECK: pmovzxdq
%af = bitcast <2 x float> %a to <2 x i32>
%bf = bitcast <2 x float> %b to <2 x i32>
%x = sub <2 x i32> %af, %bf
diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
index fec17e9f4aca..c4b307e5a5d3 100644
--- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
+++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll
@@ -4,7 +4,7 @@
define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone {
entry:
%out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK: shufb
+; CHECK: pmovzxbd
ret <4 x i8> %out
; CHECK: ret
}
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b670c7af..04659522d360 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index 906b748fa420..4abdded38d8c 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -3,7 +3,7 @@
; CHECK: load_store
define void @load_store(<4 x i16>* %in) {
entry:
-; CHECK: movsd
+; CHECK: pmovzxwd
%A27 = load <4 x i16>* %in, align 4
%A28 = add <4 x i16> %A27, %A27
; CHECK: movlpd
@@ -27,6 +27,6 @@ define <2 x i32> @load_64(<2 x i32>* %ptr) {
BB:
%t = load <2 x i32>* %ptr
ret <2 x i32> %t
-;CHECK: movsd
+;CHECK: pmovzxdq
;CHECK: ret
}
diff --git a/test/CodeGen/X86/2012-08-16-setcc.ll b/test/CodeGen/X86/2012-08-16-setcc.ll
new file mode 100644
index 000000000000..ed511567c32b
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx | FileCheck %s
+
+; rdar://12081007
+
+; CHECK: and_1:
+; CHECK: andb
+; CHECK-NEXT: cmovnel
+; CHECK: ret
+define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+ %1 = and i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ %3 = select i1 %2, i32 %x, i32 0
+ ret i32 %3
+}
+
+; CHECK: and_2:
+; CHECK: andb
+; CHECK-NEXT: setne
+; CHECK: ret
+define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
+ %1 = and i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ ret i1 %2
+}
+
+; CHECK: xor_1:
+; CHECK: xorb
+; CHECK-NEXT: cmovnel
+; CHECK: ret
+define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
+ %1 = xor i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ %3 = select i1 %2, i32 %x, i32 0
+ ret i32 %3
+}
+
+; CHECK: xor_2:
+; CHECK: xorb
+; CHECK-NEXT: setne
+; CHECK: ret
+define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
+ %1 = xor i8 %b, %a
+ %2 = icmp ne i8 %1, 0
+ ret i1 %2
+}
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 000000000000..6ebbb2e97d13
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+ %cmp = fcmp olt float %mean, -3.000000e+00
+ %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+ %cmp2 = fcmp ult float %f.0, 3.000000e+00
+ %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+ %add = fadd float %f.1, 3.000000e+00
+ %div = fdiv float %add, 2.343750e-02
+ %0 = fpext float %div to double
+ %conv = select i1 undef, double 2.550000e+02, double %0
+ %add8 = fadd double %conv, 5.000000e-01
+ %conv9 = fptosi double %add8 to i32
+ %.conv9 = select i1 undef, i32 255, i32 %conv9
+ ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
new file mode 100644
index 000000000000..7b9bab97be6f
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: foo
+; Make sure we are not trying to use scalar xor on the high bits of the vector.
+; CHECK-NOT: xorq
+; CHECK: xorl
+; CHECK-NEXT: ret
+
+define i32 @foo() {
+bb:
+ %tmp44.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>
+ %0 = bitcast <4 x float> %tmp44.i to i128
+ %1 = zext i128 %0 to i512
+ %2 = shl nuw nsw i512 %1, 256
+ %ins = or i512 %2, 3325764857622480139933400731976840738652108318779753826115024029985671937147149347761402413803120180680770390816681124225944317364750115981129923635970048
+ store i512 %ins, i512* undef, align 64
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/2012-09-28-CGPBug.ll b/test/CodeGen/X86/2012-09-28-CGPBug.ll
new file mode 100644
index 000000000000..32d7d012dd14
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-28-CGPBug.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=i386-apple-macosx < %s | FileCheck %s
+; rdar://12396696
+
+@JT = global [4 x i32] [i32 sub (i32 ptrtoint (i8* blockaddress(@h, %18) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %17) to i32), i32 ptrtoint (i8* blockaddress(@h, %11) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %18) to i32)), i32 sub (i32 ptrtoint (i8* blockaddress(@h, %22) to i32), i32 ptrtoint (i8* blockaddress(@h, %17) to i32))]
+@gGlobalLock = external global i8*
+@.str40 = external global [35 x i8]
+
+; CHECK: _JT:
+; CHECK-NOT: .long Ltmp{{[0-9]+}}-1
+; CHECK-NOT: .long 1-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+; CHECK: .long Ltmp{{[0-9]+}}-Ltmp{{[0-9]+}}
+
+define void @h(i8*) nounwind ssp {
+ %2 = alloca i8*
+ store i8* %0, i8** %2
+ %3 = load i8** %2
+ %4 = bitcast i8* %3 to { i32, i32 }*
+ %5 = getelementptr { i32, i32 }* %4, i32 0, i32 0
+ %6 = load i32* %5
+ %7 = srem i32 %6, 2
+ %8 = icmp slt i32 %6, 2
+ %9 = select i1 %8, i32 %6, i32 %7
+ %10 = icmp eq i32 %9, 0
+ br label %11
+
+; <label>:11 ; preds = %1
+ %12 = zext i1 %10 to i32
+ %13 = getelementptr [4 x i32]* @JT, i32 0, i32 %12
+ %14 = load i32* %13
+ %15 = add i32 %14, ptrtoint (i8* blockaddress(@h, %11) to i32)
+ %16 = inttoptr i32 %15 to i8*
+ indirectbr i8* %16, [label %17, label %18]
+
+; <label>:17 ; preds = %11
+ tail call void (i8*, ...)* @g(i8* getelementptr inbounds ([35 x i8]* @.str40, i32 0, i32 0))
+ br label %22
+
+; <label>:18 ; preds = %11
+ %19 = call i32 @f(i32 -1037694186) nounwind
+ %20 = inttoptr i32 %19 to i32 (i8**)*
+ %21 = tail call i32 %20(i8** @gGlobalLock)
+ br label %22
+
+; <label>:22 ; preds = %18, %17
+ ret void
+}
+
+declare i32 @f(i32)
+
+declare void @g(i8*, ...)
diff --git a/test/CodeGen/X86/2012-10-02-DAGCycle.ll b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
new file mode 100644
index 000000000000..8d914db3315f
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-02-DAGCycle.ll
@@ -0,0 +1,52 @@
+; RUN: llc -mtriple=i386-apple-macosx -relocation-model=pic < %s
+; RUN: llc -mtriple=x86_64-apple-macosx -relocation-model=pic < %s
+
+; rdar://12393897
+
+%TRp = type { i32, %TRH*, i32, i32 }
+%TRH = type { i8*, i8*, i8*, i8*, {}* }
+
+define i32 @t(%TRp* inreg %rp) nounwind optsize ssp {
+entry:
+ %handler = getelementptr inbounds %TRp* %rp, i32 0, i32 1
+ %0 = load %TRH** %handler, align 4
+ %sync = getelementptr inbounds %TRH* %0, i32 0, i32 4
+ %sync12 = load {}** %sync, align 4
+ %1 = bitcast {}* %sync12 to i32 (%TRp*)*
+ %call = tail call i32 %1(%TRp* inreg %rp) nounwind optsize
+ ret i32 %call
+}
+
+%btConeShape = type { %btConvexInternalShape, float, float, float, [3 x i32] }
+%btConvexInternalShape = type { %btConvexShape, %btVector, %btVector, float, float }
+%btConvexShape = type { %btCollisionShape }
+%btCollisionShape = type { i32 (...)**, i32, i8* }
+%btVector = type { [4 x float] }
+
+define { <2 x float>, <2 x float> } @t2(%btConeShape* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+ %0 = getelementptr inbounds %btConeShape* %this, i64 0, i32 0
+ br i1 undef, label %if.then, label %if.end17
+
+if.then: ; preds = %entry
+ %vecnorm.sroa.2.8.copyload = load float* undef, align 4
+ %cmp4 = fcmp olt float undef, 0x3D10000000000000
+ %vecnorm.sroa.2.8.copyload36 = select i1 %cmp4, float -1.000000e+00, float %vecnorm.sroa.2.8.copyload
+ %call.i.i.i = tail call float @sqrtf(float 0.000000e+00) nounwind readnone
+ %div.i.i = fdiv float 1.000000e+00, %call.i.i.i
+ %mul7.i.i.i = fmul float %div.i.i, %vecnorm.sroa.2.8.copyload36
+ %1 = load float (%btConvexInternalShape*)** undef, align 8
+ %call12 = tail call float %1(%btConvexInternalShape* %0)
+ %mul7.i.i = fmul float %call12, %mul7.i.i.i
+ %retval.sroa.0.4.insert = insertelement <2 x float> zeroinitializer, float undef, i32 1
+ %add13.i = fadd float undef, %mul7.i.i
+ %retval.sroa.1.8.insert = insertelement <2 x float> undef, float %add13.i, i32 0
+ br label %if.end17
+
+if.end17: ; preds = %if.then, %entry
+ %retval.sroa.1.8.load3338 = phi <2 x float> [ %retval.sroa.1.8.insert, %if.then ], [ undef, %entry ]
+ %retval.sroa.0.0.load3137 = phi <2 x float> [ %retval.sroa.0.4.insert, %if.then ], [ undef, %entry ]
+ ret { <2 x float>, <2 x float> } undef
+}
+
+declare float @sqrtf(float) nounwind readnone
diff --git a/test/CodeGen/X86/2012-10-03-DAGCycle.ll b/test/CodeGen/X86/2012-10-03-DAGCycle.ll
new file mode 100644
index 000000000000..72083c7115e4
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-03-DAGCycle.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=corei7 < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.pluto.0 = type { %struct.bar.1, %struct.hoge.368* }
+%struct.bar.1 = type { %i8* }
+%i8 = type { i8 }
+%struct.hoge.368 = type { i32, i32 }
+%struct.widget.375 = type { i32, i32, %i8*, %struct.hoge.368* }
+
+define fastcc void @bar(%struct.pluto.0* %arg) nounwind uwtable ssp align 2 {
+bb:
+ %tmp1 = alloca %struct.widget.375, align 8
+ %tmp2 = getelementptr inbounds %struct.pluto.0* %arg, i64 0, i32 1
+ %tmp3 = load %struct.hoge.368** %tmp2, align 8
+ store %struct.pluto.0* %arg, %struct.pluto.0** undef, align 8
+ %tmp = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 2
+ %tmp4 = getelementptr %struct.pluto.0* %arg, i64 0, i32 0, i32 0
+ %tmp5 = load %i8** %tmp4, align 8
+ store %i8* %tmp5, %i8** %tmp, align 8
+ %tmp6 = getelementptr inbounds %struct.widget.375* %tmp1, i64 0, i32 3
+ store %struct.hoge.368* %tmp3, %struct.hoge.368** %tmp6, align 8
+ br i1 undef, label %bb8, label %bb7
+
+bb7: ; preds = %bb
+ unreachable
+
+bb8: ; preds = %bb
+ unreachable
+}
diff --git a/test/CodeGen/X86/2012-10-18-crash-dagco.ll b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
new file mode 100644
index 000000000000..5b98624a37b8
--- /dev/null
+++ b/test/CodeGen/X86/2012-10-18-crash-dagco.ll
@@ -0,0 +1,61 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -disable-cgp-select2branch < %s
+
+; We should not crash on this test.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+target triple = "i386-apple-darwin9.0.0"
+
+@global = external constant [411 x i8], align 1
+
+define void @snork() nounwind {
+bb:
+ br i1 undef, label %bb26, label %bb27
+
+bb26: ; preds = %bb48, %bb26, %bb
+ switch i32 undef, label %bb26 [
+ i32 142771596, label %bb28
+ ]
+
+bb27: ; preds = %bb48, %bb
+ switch i32 undef, label %bb49 [
+ i32 142771596, label %bb28
+ ]
+
+bb28: ; preds = %bb27, %bb26
+ %tmp = load i32* null
+ %tmp29 = trunc i32 %tmp to i8
+ store i8* undef, i8** undef
+ %tmp30 = load i32* null
+ %tmp31 = icmp eq i32 %tmp30, 0
+ %tmp32 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 undef
+ %tmp33 = load i8* %tmp32, align 1
+ %tmp34 = getelementptr inbounds [411 x i8]* @global, i32 0, i32 0
+ %tmp35 = load i8* %tmp34, align 1
+ %tmp36 = select i1 %tmp31, i8 %tmp35, i8 %tmp33
+ %tmp37 = select i1 undef, i8 %tmp29, i8 %tmp36
+ %tmp38 = zext i8 %tmp37 to i32
+ %tmp39 = select i1 undef, i32 0, i32 %tmp38
+ %tmp40 = getelementptr inbounds i32* null, i32 %tmp39
+ %tmp41 = load i32* %tmp40, align 4
+ %tmp42 = load i32* undef, align 4
+ %tmp43 = load i32* undef
+ %tmp44 = xor i32 %tmp42, %tmp43
+ %tmp45 = lshr i32 %tmp44, 8
+ %tmp46 = lshr i32 %tmp44, 7
+ call void @spam()
+ unreachable
+
+bb47: ; No predecessors!
+ ret void
+
+bb48: ; No predecessors!
+ br i1 undef, label %bb27, label %bb26
+
+bb49: ; preds = %bb49, %bb27
+ br label %bb49
+
+bb50: ; preds = %bb50
+ br label %bb50
+}
+
+declare void @spam() noreturn nounwind
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
new file mode 100644
index 000000000000..64825bac9719
--- /dev/null
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -0,0 +1,305 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+%struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
+
+; CHECK: merge_const_store
+; Save the constants 1, 2, 3, ... as one big integer.
+; CHECK: movabsq $578437695752307201
+; CHECK: ret
+define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+ %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+ %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 1, i8* %2, align 1
+ %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ store i8 2, i8* %3, align 1
+ %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+ store i8 3, i8* %4, align 1
+ %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+ store i8 4, i8* %5, align 1
+ %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+ store i8 5, i8* %6, align 1
+ %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+ store i8 6, i8* %7, align 1
+ %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+ store i8 7, i8* %8, align 1
+ %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+ store i8 8, i8* %9, align 1
+ %10 = add nsw i32 %i.02, 1
+ %11 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %10, %count
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+ ret void
+}
+
+; Move the constants using a single vector store.
+; CHECK: merge_const_store_vec
+; CHECK: vmovups %ymm0, (%rsi)
+; CHECK: ret
+define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+ %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi %struct.B* [ %11, %.lr.ph ], [ %p, %0 ]
+ %2 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ store i32 0, i32* %2, align 4
+ %3 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ store i32 0, i32* %3, align 4
+ %4 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+ store i32 0, i32* %4, align 4
+ %5 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+ store i32 0, i32* %5, align 4
+ %6 = getelementptr inbounds %struct.B* %.01, i64 0, i32 4
+ store i32 0, i32* %6, align 4
+ %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 5
+ store i32 0, i32* %7, align 4
+ %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 6
+ store i32 0, i32* %8, align 4
+ %9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 7
+ store i32 0, i32* %9, align 4
+ %10 = add nsw i32 %i.02, 1
+ %11 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %10, %count
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+ ret void
+}
+
+; Move the first 4 constants as a single vector. Move the rest as scalars.
+; CHECK: merge_nonconst_store
+; CHECK: movl $67305985
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+ %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+ %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+ %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 1, i8* %2, align 1
+ %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ store i8 2, i8* %3, align 1
+ %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+ store i8 3, i8* %4, align 1
+ %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+ store i8 4, i8* %5, align 1
+ %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+ store i8 %zz, i8* %6, align 1 ; <----------- Not a const;
+ %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+ store i8 6, i8* %7, align 1
+ %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+ store i8 7, i8* %8, align 1
+ %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+ store i8 8, i8* %9, align 1
+ %10 = add nsw i32 %i.02, 1
+ %11 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %10, %count
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+ ret void
+}
+
+
+;CHECK: merge_loads_i16
+; load:
+;CHECK: movw
+; store:
+;CHECK: movw
+;CHECK: ret
+define void @merge_loads_i16(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+ br label %4
+
+; <label>:4 ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+ %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
+ %5 = load i8* %2, align 1
+ %6 = load i8* %3, align 1
+ %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 %5, i8* %7, align 1
+ %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ store i8 %6, i8* %8, align 1
+ %9 = add nsw i32 %i.02, 1
+ %10 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %9, %count
+ br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+; The loads and the stores are interleaved. Can't merge them.
+;CHECK: no_merge_loads
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: ret
+define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+ br label %a4
+
+a4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+ %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
+ %a5 = load i8* %2, align 1
+ %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+ store i8 %a5, i8* %a7, align 1
+ %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+ %a6 = load i8* %3, align 1
+ store i8 %a6, i8* %a8, align 1
+ %a9 = add nsw i32 %i.02, 1
+ %a10 = getelementptr inbounds %struct.A* %.01, i64 1
+ %exitcond = icmp eq i32 %a9, %count
+ br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+
+;CHECK: merge_loads_integer
+; load:
+;CHECK: movq
+; store:
+;CHECK: movq
+;CHECK: ret
+define void @merge_loads_integer(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %count, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+ %3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+ br label %4
+
+; <label>:4 ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %10, %4 ]
+ %5 = load i32* %2
+ %6 = load i32* %3
+ %7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ store i32 %5, i32* %7
+ %8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ store i32 %6, i32* %8
+ %9 = add nsw i32 %i.02, 1
+ %10 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %9, %count
+ br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+
+;CHECK: merge_loads_vector
+; load:
+;CHECK: movups
+; store:
+;CHECK: movups
+;CHECK: ret
+define void @merge_loads_vector(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %a1 = icmp sgt i32 %count, 0
+ br i1 %a1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+ %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+ %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
+ %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
+ br label %block4
+
+block4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
+ %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+ %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+ %b1 = load i32* %a2
+ %b2 = load i32* %a3
+ %b3 = load i32* %a4
+ %b4 = load i32* %a5
+ store i32 %b1, i32* %a7
+ store i32 %b2, i32* %a8
+ store i32 %b3, i32* %a9
+ store i32 %b4, i32* %a10
+ %c9 = add nsw i32 %i.02, 1
+ %c10 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %c9, %count
+ br i1 %exitcond, label %._crit_edge, label %block4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
+;CHECK: merge_loads_no_align
+; load:
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+; store:
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: movl
+;CHECK: ret
+define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
+ %a1 = icmp sgt i32 %count, 0
+ br i1 %a1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %a2 = getelementptr inbounds %struct.B* %q, i64 0, i32 0
+ %a3 = getelementptr inbounds %struct.B* %q, i64 0, i32 1
+ %a4 = getelementptr inbounds %struct.B* %q, i64 0, i32 2
+ %a5 = getelementptr inbounds %struct.B* %q, i64 0, i32 3
+ br label %block4
+
+block4: ; preds = %4, %.lr.ph
+ %i.02 = phi i32 [ 0, %.lr.ph ], [ %c9, %block4 ]
+ %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %c10, %block4 ]
+ %a7 = getelementptr inbounds %struct.B* %.01, i64 0, i32 0
+ %a8 = getelementptr inbounds %struct.B* %.01, i64 0, i32 1
+ %a9 = getelementptr inbounds %struct.B* %.01, i64 0, i32 2
+ %a10 = getelementptr inbounds %struct.B* %.01, i64 0, i32 3
+ %b1 = load i32* %a2, align 1
+ %b2 = load i32* %a3, align 1
+ %b3 = load i32* %a4, align 1
+ %b4 = load i32* %a5, align 1
+ store i32 %b1, i32* %a7, align 1
+ store i32 %b2, i32* %a8, align 1
+ store i32 %b3, i32* %a9, align 1
+ store i32 %b4, i32* %a10, align 1
+ %c9 = add nsw i32 %i.02, 1
+ %c10 = getelementptr inbounds %struct.B* %.01, i64 1
+ %exitcond = icmp eq i32 %c9, %count
+ br i1 %exitcond, label %._crit_edge, label %block4
+
+._crit_edge: ; preds = %4, %0
+ ret void
+}
+
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
new file mode 100644
index 000000000000..5982544f7a8c
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s
+
+; Make sure that we don't crash when dbg values are used.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define void @foo() nounwind uwtable ssp {
+entry:
+ %x.i = alloca i8, align 1
+ %y.i = alloca [256 x i8], align 16
+ %0 = getelementptr inbounds [256 x i8]* %y.i, i64 0, i64 0
+ br label %for.body
+
+for.body:
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
+ call void @llvm.dbg.declare(metadata !{i8* %x.i}, metadata !22) nounwind
+ br label %for.body
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!16 = metadata !{i32 786468, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6}
+!2 = metadata !{i32 0}
+!22 = metadata !{i32 786688, metadata !2, metadata !"x", metadata !2, i32 16, metadata !16, i32 0, i32 0}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 000000000000..f8ae74f292d2
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,410 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq $136, %rsp
+;NOCOLOR: subq $264, %rsp
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 %t7
+bb3:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+;YESCOLOR: subq $208, %rsp
+;NOCOLOR: subq $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+;YESCOLOR: subq $112, %rsp
+;NOCOLOR: subq $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ br i1 undef, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Adopted from the test in Transforms/Inline/array_merge.ll
+;YESCOLOR: subq $816, %rsp
+;NOCOLOR: subq $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @func_phi_lifetime(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb0, label %bb1
+
+bb0:
+ %I1 = bitcast [17 x i8*]* %a to i8*
+ br label %bb2
+
+bb1:
+ %I2 = bitcast [16 x i8*]* %a2 to i8*
+ br label %bb2
+
+bb2:
+ %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
+ call void @llvm.lifetime.start(i64 -1, i8* %split)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %split)
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: multi_region_bb
+;NOCOLOR: multi_region_bb
+define void @multi_region_bb() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #2
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Check that we don't assert and crash even when allocas are used
+; outside the declared lifetime regions.
+;YESCOLOR: bad_range
+;NOCOLOR: bad_range
+define void @bad_range() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ br label %block2
+
+block2:
+ ; I am used outside the marked lifetime.
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ ret void
+}
+
+
+; Check that we don't assert and crash even when allocas have uses outside
+; the declared lifetime regions that do not read or write memory.
+;YESCOLOR: shady_range
+;NOCOLOR: shady_range
+
+%struct.Klass = type { i32, i32 }
+
+define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
+ %a.i = alloca [4 x %struct.Klass], align 16
+ %b.i = alloca [4 x %struct.Klass], align 16
+ %a8 = bitcast [4 x %struct.Klass]* %a.i to i8*
+ %b8 = bitcast [4 x %struct.Klass]* %b.i to i8*
+ ; I am used outside the lifetime zone below:
+ %z2 = getelementptr inbounds [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0
+ call void @llvm.lifetime.start(i64 -1, i8* %a8)
+ call void @llvm.lifetime.start(i64 -1, i8* %b8)
+ %z3 = load i32* %z2, align 16
+ %r = call i32 @foo(i32 %z3, i8* %a8)
+ %r2 = call i32 @foo(i32 %z3, i8* %b8)
+ call void @llvm.lifetime.end(i64 -1, i8* %a8)
+ call void @llvm.lifetime.end(i64 -1, i8* %b8)
+ ret i32 9
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index a4abccba7e68..4e30f2b05a89 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -30,4 +30,17 @@ entry:
ret i32 %z.0
}
+; <rdar://problem/12579915>
+define i32 @test3(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
+entry:
+ %cmp = icmp ugt i32 %x, %y
+ %dec = sext i1 %cmp to i32
+ %dec.res = add nsw i32 %dec, %res
+ ret i32 %dec.res
+; CHECK: test3:
+; CHECK: cmpl
+; CHECK: sbbl
+; CHECK: ret
+}
+
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 000000000000..e7c9605d3e88
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CHECK-NOT: idivl
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
diff --git a/test/CodeGen/X86/atom-shuf.ll b/test/CodeGen/X86/atom-shuf.ll
new file mode 100644
index 000000000000..4c3f2f67c54b
--- /dev/null
+++ b/test/CodeGen/X86/atom-shuf.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=atom | FileCheck %s
+
+define <16 x i8> @foo(<16 x i8> %in) {
+ %r = shufflevector <16 x i8> %in, <16 x i8> undef, <16 x i32> < i32 7, i32 3, i32 2, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i8> %r
+; CHECK: foo
+; CHECK: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/atomic-minmax-i6432.ll b/test/CodeGen/X86/atomic-minmax-i6432.ll
new file mode 100644
index 000000000000..e3ef605f7f1c
--- /dev/null
+++ b/test/CodeGen/X86/atomic-minmax-i6432.ll
@@ -0,0 +1,67 @@
+; RUN: llc -march=x86 -mattr=+cmov -mtriple=i386-pc-linux < %s | FileCheck %s -check-prefix=LINUX
+; RUN: llc -march=x86 -mtriple=i386-macosx -relocation-model=pic < %s | FileCheck %s -check-prefix=PIC
+
+@sc64 = external global i64
+
+define void @atomic_maxmin_i6432() {
+; LINUX: atomic_maxmin_i6432
+ %1 = atomicrmw max i64* @sc64, i64 5 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setl
+; LINUX: cmpl
+; LINUX: setl
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ %2 = atomicrmw min i64* @sc64, i64 6 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setg
+; LINUX: cmpl
+; LINUX: setg
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ %3 = atomicrmw umax i64* @sc64, i64 7 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: setb
+; LINUX: cmpl
+; LINUX: setb
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ %4 = atomicrmw umin i64* @sc64, i64 8 acquire
+; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
+; LINUX: cmpl
+; LINUX: seta
+; LINUX: cmpl
+; LINUX: seta
+; LINUX: cmovne
+; LINUX: cmovne
+; LINUX: lock
+; LINUX-NEXT: cmpxchg8b
+; LINUX: jne [[LABEL]]
+ ret void
+}
+
+; rdar://12453106
+@id = internal global i64 0, align 8
+
+define void @tf_bug(i8* %ptr) nounwind {
+; PIC: tf_bug:
+; PIC: movl _id-L1$pb(
+; PIC: movl (_id-L1$pb)+4(
+ %tmp1 = atomicrmw add i64* @id, i64 1 seq_cst
+ %tmp2 = add i64 %tmp1, 1
+ %tmp3 = bitcast i8* %ptr to i64*
+ store i64 %tmp2, i64* %tmp3, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/atomic-pointer.ll b/test/CodeGen/X86/atomic-pointer.ll
new file mode 100644
index 000000000000..a455277be4db
--- /dev/null
+++ b/test/CodeGen/X86/atomic-pointer.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i686-none-linux | FileCheck %s
+
+define i32* @test_atomic_ptr_load(i32** %a0) {
+; CHECK: test_atomic_ptr_load
+; CHECK: movl
+; CHECK: movl
+; CHECK: ret
+0:
+ %0 = load atomic i32** %a0 seq_cst, align 4
+ ret i32* %0
+}
+
+define void @test_atomic_ptr_store(i32* %a0, i32** %a1) {
+; CHECK: test_atomic_ptr_store
+; CHECK: movl
+; CHECK: movl
+; CHECK: xchgl
+; CHECK: ret
+0:
+ store atomic i32* %a0, i32** %a1 seq_cst, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
new file mode 100644
index 000000000000..824995d6cb98
--- /dev/null
+++ b/test/CodeGen/X86/atomic16.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mcpu=corei7 -show-mc-encoding | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc16 = external global i16
+
+define void @atomic_fetch_add16() nounwind {
+; X64: atomic_fetch_add16
+; X32: atomic_fetch_add16
+entry:
+; 32-bit
+ %t1 = atomicrmw add i16* @sc16, i16 1 acquire
+; X64: lock
+; X64: incw
+; X32: lock
+; X32: incw
+ %t2 = atomicrmw add i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: addw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: addw $3
+ %t3 = atomicrmw add i16* @sc16, i16 5 acquire
+; X64: lock
+; X64: xaddw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xaddw
+ %t4 = atomicrmw add i16* @sc16, i16 %t3 acquire
+; X64: lock
+; X64: addw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: addw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub16() nounwind {
+; X64: atomic_fetch_sub16
+; X32: atomic_fetch_sub16
+ %t1 = atomicrmw sub i16* @sc16, i16 1 acquire
+; X64: lock
+; X64: decw
+; X32: lock
+; X32: decw
+ %t2 = atomicrmw sub i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: subw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: subw $3
+ %t3 = atomicrmw sub i16* @sc16, i16 5 acquire
+; X64: lock
+; X64: xaddw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xaddw
+ %t4 = atomicrmw sub i16* @sc16, i16 %t3 acquire
+; X64: lock
+; X64: subw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: subw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and16() nounwind {
+; X64: atomic_fetch_and16
+; X32: atomic_fetch_and16
+ %t1 = atomicrmw and i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: andw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: andw $3
+ %t2 = atomicrmw and i16* @sc16, i16 5 acquire
+; X64: andw
+; X64: lock
+; X64: cmpxchgw
+; X32: andw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: andw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: andw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or16() nounwind {
+; X64: atomic_fetch_or16
+; X32: atomic_fetch_or16
+ %t1 = atomicrmw or i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: orw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: orw $3
+ %t2 = atomicrmw or i16* @sc16, i16 5 acquire
+; X64: orw
+; X64: lock
+; X64: cmpxchgw
+; X32: orw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: orw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: orw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor16() nounwind {
+; X64: atomic_fetch_xor16
+; X32: atomic_fetch_xor16
+ %t1 = atomicrmw xor i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xorw $3
+ %t2 = atomicrmw xor i16* @sc16, i16 5 acquire
+; X64: xorw
+; X64: lock
+; X64: cmpxchgw
+; X32: xorw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: xorw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xorw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand16(i16 %x) nounwind {
+; X64: atomic_fetch_nand16
+; X32: atomic_fetch_nand16
+ %t1 = atomicrmw nand i16* @sc16, i16 %x acquire
+; X64: andw
+; X64: notw
+; X64: lock
+; X64: cmpxchgw
+; X32: andw
+; X32: notw
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max16(i16 %x) nounwind {
+ %t1 = atomicrmw max i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min16(i16 %x) nounwind {
+ %t1 = atomicrmw min i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax16(i16 %x) nounwind {
+ %t1 = atomicrmw umax i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin16(i16 %x) nounwind {
+ %t1 = atomicrmw umin i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg16() nounwind {
+ %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
+; X64: lock
+; X64: cmpxchgw
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store16(i16 %x) nounwind {
+ store atomic i16 %x, i16* @sc16 release, align 4
+; X64-NOT: lock
+; X64: movw
+; X32-NOT: lock
+; X32: movw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap16(i16 %x) nounwind {
+ %t1 = atomicrmw xchg i16* @sc16, i16 %x acquire
+; X64-NOT: lock
+; X64: xchgw
+; X32-NOT: lock
+; X32: xchgw
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
new file mode 100644
index 000000000000..dc927d8cb6f6
--- /dev/null
+++ b/test/CodeGen/X86/atomic32.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc32 = external global i32
+
+define void @atomic_fetch_add32() nounwind {
+; X64: atomic_fetch_add32
+; X32: atomic_fetch_add32
+entry:
+; 32-bit
+ %t1 = atomicrmw add i32* @sc32, i32 1 acquire
+; X64: lock
+; X64: incl
+; X32: lock
+; X32: incl
+ %t2 = atomicrmw add i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: addl $3
+; X32: lock
+; X32: addl $3
+ %t3 = atomicrmw add i32* @sc32, i32 5 acquire
+; X64: lock
+; X64: xaddl
+; X32: lock
+; X32: xaddl
+ %t4 = atomicrmw add i32* @sc32, i32 %t3 acquire
+; X64: lock
+; X64: addl
+; X32: lock
+; X32: addl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub32() nounwind {
+; X64: atomic_fetch_sub32
+; X32: atomic_fetch_sub32
+ %t1 = atomicrmw sub i32* @sc32, i32 1 acquire
+; X64: lock
+; X64: decl
+; X32: lock
+; X32: decl
+ %t2 = atomicrmw sub i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: subl $3
+; X32: lock
+; X32: subl $3
+ %t3 = atomicrmw sub i32* @sc32, i32 5 acquire
+; X64: lock
+; X64: xaddl
+; X32: lock
+; X32: xaddl
+ %t4 = atomicrmw sub i32* @sc32, i32 %t3 acquire
+; X64: lock
+; X64: subl
+; X32: lock
+; X32: subl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and32() nounwind {
+; X64: atomic_fetch_and32
+; X32: atomic_fetch_and32
+ %t1 = atomicrmw and i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: andl $3
+; X32: lock
+; X32: andl $3
+ %t2 = atomicrmw and i32* @sc32, i32 5 acquire
+; X64: andl
+; X64: lock
+; X64: cmpxchgl
+; X32: andl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw and i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: andl
+; X32: lock
+; X32: andl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or32() nounwind {
+; X64: atomic_fetch_or32
+; X32: atomic_fetch_or32
+ %t1 = atomicrmw or i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: orl $3
+; X32: lock
+; X32: orl $3
+ %t2 = atomicrmw or i32* @sc32, i32 5 acquire
+; X64: orl
+; X64: lock
+; X64: cmpxchgl
+; X32: orl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw or i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: orl
+; X32: lock
+; X32: orl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor32() nounwind {
+; X64: atomic_fetch_xor32
+; X32: atomic_fetch_xor32
+ %t1 = atomicrmw xor i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: xorl $3
+; X32: lock
+; X32: xorl $3
+ %t2 = atomicrmw xor i32* @sc32, i32 5 acquire
+; X64: xorl
+; X64: lock
+; X64: cmpxchgl
+; X32: xorl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw xor i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: xorl
+; X32: lock
+; X32: xorl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand32(i32 %x) nounwind {
+; X64: atomic_fetch_nand32
+; X32: atomic_fetch_nand32
+ %t1 = atomicrmw nand i32* @sc32, i32 %x acquire
+; X64: andl
+; X64: notl
+; X64: lock
+; X64: cmpxchgl
+; X32: andl
+; X32: notl
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max32(i32 %x) nounwind {
+ %t1 = atomicrmw max i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min32(i32 %x) nounwind {
+ %t1 = atomicrmw min i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax32(i32 %x) nounwind {
+ %t1 = atomicrmw umax i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin32(i32 %x) nounwind {
+ %t1 = atomicrmw umin i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg32() nounwind {
+ %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
+; X64: lock
+; X64: cmpxchgl
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store32(i32 %x) nounwind {
+ store atomic i32 %x, i32* @sc32 release, align 4
+; X64-NOT: lock
+; X64: movl
+; X32-NOT: lock
+; X32: movl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap32(i32 %x) nounwind {
+ %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
+; X64-NOT: lock
+; X64: xchgl
+; X32-NOT: lock
+; X32: xchgl
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
new file mode 100644
index 000000000000..45785cc8fe52
--- /dev/null
+++ b/test/CodeGen/X86/atomic64.ll
@@ -0,0 +1,216 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X64: atomic_fetch_add64
+entry:
+ %t1 = atomicrmw add i64* @sc64, i64 1 acquire
+; X64: lock
+; X64: incq
+ %t2 = atomicrmw add i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: addq $3
+ %t3 = atomicrmw add i64* @sc64, i64 5 acquire
+; X64: lock
+; X64: xaddq
+ %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire
+; X64: lock
+; X64: addq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X64: atomic_fetch_sub64
+ %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
+; X64: lock
+; X64: decq
+ %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: subq $3
+ %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
+; X64: lock
+; X64: xaddq
+ %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
+; X64: lock
+; X64: subq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X64: atomic_fetch_and64
+ %t1 = atomicrmw and i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: andq $3
+ %t2 = atomicrmw and i64* @sc64, i64 5 acquire
+; X64: andq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: andq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X64: atomic_fetch_or64
+ %t1 = atomicrmw or i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: orq $3
+ %t2 = atomicrmw or i64* @sc64, i64 5 acquire
+; X64: orq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: orq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X64: atomic_fetch_xor64
+ %t1 = atomicrmw xor i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: xorq $3
+ %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
+; X64: xorq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: xorq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X64: atomic_fetch_nand64
+; X32: atomic_fetch_nand64
+ %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X64: andq
+; X64: notq
+; X64: lock
+; X64: cmpxchgq
+; X32: andl
+; X32: andl
+; X32: notl
+; X32: notl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+ %t1 = atomicrmw max i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+ %t1 = atomicrmw min i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+ %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+ %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X64: lock
+; X64: cmpxchgq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+ store atomic i64 %x, i64* @sc64 release, align 8
+; X64-NOT: lock
+; X64: movq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+ %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X64-NOT: lock
+; X64: xchgq
+; X32: lock
+; X32: xchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
new file mode 100644
index 000000000000..f9b21c5bc75e
--- /dev/null
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -0,0 +1,208 @@
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X32: atomic_fetch_add64
+entry:
+ %t1 = atomicrmw add i64* @sc64, i64 1 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw add i64* @sc64, i64 3 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw add i64* @sc64, i64 5 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X32: atomic_fetch_sub64
+ %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X32: atomic_fetch_and64
+ %t1 = atomicrmw and i64* @sc64, i64 3 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw and i64* @sc64, i64 5 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X32: atomic_fetch_or64
+ %t1 = atomicrmw or i64* @sc64, i64 3 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw or i64* @sc64, i64 5 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X32: atomic_fetch_xor64
+ %t1 = atomicrmw xor i64* @sc64, i64 3 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X32: atomic_fetch_nand64
+ %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X32: andl
+; X32: andl
+; X32: notl
+; X32: notl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+ %t1 = atomicrmw max i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+ %t1 = atomicrmw min i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+ %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+ %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+ store atomic i64 %x, i64* @sc64 release, align 8
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+ %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X32: lock
+; X32: xchg8b
+ ret void
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
new file mode 100644
index 000000000000..412428406dcf
--- /dev/null
+++ b/test/CodeGen/X86/atomic8.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc8 = external global i8
+
+define void @atomic_fetch_add8() nounwind {
+; X64: atomic_fetch_add8
+; X32: atomic_fetch_add8
+entry:
+; 32-bit
+ %t1 = atomicrmw add i8* @sc8, i8 1 acquire
+; X64: lock
+; X64: incb
+; X32: lock
+; X32: incb
+ %t2 = atomicrmw add i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: addb $3
+; X32: lock
+; X32: addb $3
+ %t3 = atomicrmw add i8* @sc8, i8 5 acquire
+; X64: lock
+; X64: xaddb
+; X32: lock
+; X32: xaddb
+ %t4 = atomicrmw add i8* @sc8, i8 %t3 acquire
+; X64: lock
+; X64: addb
+; X32: lock
+; X32: addb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub8() nounwind {
+; X64: atomic_fetch_sub8
+; X32: atomic_fetch_sub8
+ %t1 = atomicrmw sub i8* @sc8, i8 1 acquire
+; X64: lock
+; X64: decb
+; X32: lock
+; X32: decb
+ %t2 = atomicrmw sub i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: subb $3
+; X32: lock
+; X32: subb $3
+ %t3 = atomicrmw sub i8* @sc8, i8 5 acquire
+; X64: lock
+; X64: xaddb
+; X32: lock
+; X32: xaddb
+ %t4 = atomicrmw sub i8* @sc8, i8 %t3 acquire
+; X64: lock
+; X64: subb
+; X32: lock
+; X32: subb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and8() nounwind {
+; X64: atomic_fetch_and8
+; X32: atomic_fetch_and8
+ %t1 = atomicrmw and i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: andb $3
+; X32: lock
+; X32: andb $3
+ %t2 = atomicrmw and i8* @sc8, i8 5 acquire
+; X64: andb
+; X64: lock
+; X64: cmpxchgb
+; X32: andb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw and i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: andb
+; X32: lock
+; X32: andb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or8() nounwind {
+; X64: atomic_fetch_or8
+; X32: atomic_fetch_or8
+ %t1 = atomicrmw or i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: orb $3
+; X32: lock
+; X32: orb $3
+ %t2 = atomicrmw or i8* @sc8, i8 5 acquire
+; X64: orb
+; X64: lock
+; X64: cmpxchgb
+; X32: orb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw or i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: orb
+; X32: lock
+; X32: orb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor8() nounwind {
+; X64: atomic_fetch_xor8
+; X32: atomic_fetch_xor8
+ %t1 = atomicrmw xor i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: xorb $3
+; X32: lock
+; X32: xorb $3
+ %t2 = atomicrmw xor i8* @sc8, i8 5 acquire
+; X64: xorb
+; X64: lock
+; X64: cmpxchgb
+; X32: xorb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw xor i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: xorb
+; X32: lock
+; X32: xorb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand8(i8 %x) nounwind {
+; X64: atomic_fetch_nand8
+; X32: atomic_fetch_nand8
+ %t1 = atomicrmw nand i8* @sc8, i8 %x acquire
+; X64: andb
+; X64: notb
+; X64: lock
+; X64: cmpxchgb
+; X32: andb
+; X32: notb
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max8(i8 %x) nounwind {
+ %t1 = atomicrmw max i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min8(i8 %x) nounwind {
+ %t1 = atomicrmw min i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax8(i8 %x) nounwind {
+ %t1 = atomicrmw umax i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin8(i8 %x) nounwind {
+ %t1 = atomicrmw umin i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg8() nounwind {
+ %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
+; X64: lock
+; X64: cmpxchgb
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store8(i8 %x) nounwind {
+ store atomic i8 %x, i8* @sc8 release, align 4
+; X64-NOT: lock
+; X64: movb
+; X32-NOT: lock
+; X32: movb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap8(i8 %x) nounwind {
+ %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire
+; X64-NOT: lock
+; X64: xchgb
+; X32-NOT: lock
+; X32: xchgb
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index 1fce256a8a24..d94499889de4 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -178,7 +178,8 @@ entry:
define void @sub2(i16* nocapture %p, i32 %v) nounwind ssp {
entry:
; CHECK: sub2:
-; CHECK: negl
+; CHECK-NOT: negl
+; CHECK: subw
%0 = trunc i32 %v to i16 ; <i16> [#uses=1]
%1 = atomicrmw sub i16* %p, i16 %0 monotonic
ret void
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 152bece4240f..c5fa07d07d80 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -107,13 +107,12 @@ entry:
; CHECK: cmpxchgl
%17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
store i32 %17, i32* %old
+ ; CHECK: movl [[R17atomic:.*]], %eax
; CHECK: movl $1401, %[[R17mask:[a-z]*]]
- ; CHECK: movl [[R17atomic:.*]], %eax
- ; CHECK: movl %eax, %[[R17newval:[a-z]*]]
- ; CHECK: andl %[[R17mask]], %[[R17newval]]
- ; CHECK: notl %[[R17newval]]
+ ; CHECK: andl %eax, %[[R17mask]]
+ ; CHECK: notl %[[R17mask]]
; CHECK: lock
- ; CHECK: cmpxchgl %[[R17newval]], [[R17atomic]]
+ ; CHECK: cmpxchgl %[[R17mask]], [[R17atomic]]
; CHECK: jne
; CHECK: movl %eax,
%18 = atomicrmw nand i32* %val2, i32 1401 monotonic
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 8ad0fa82b58f..95854c7960e7 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -109,8 +109,8 @@ allocas:
; rdar://10566486
; CHECK: fneg
; CHECK: vxorps
-define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind {
- %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+define <16 x float> @fneg(<16 x float> %a) nounwind {
+ %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
new file mode 100644
index 000000000000..1446b36a0fb4
--- /dev/null
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl %eax, (%esp)
+; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: leaq {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved ymm6-ymm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0
+; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: ret
+
+; preserved ymm8-ymm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
+; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: call
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: call
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+}
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index c44beb4bc2b8..88ecd5a5d34f 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1140,9 +1140,9 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: vpcmpestri
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestri $7
; CHECK: movl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
@@ -1150,6 +1150,18 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a2
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
@@ -1216,8 +1228,19 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestrm $7,
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a2
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcmpistri
+ ; CHECK: vpcmpistri $7
; CHECK: movl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
@@ -1225,6 +1248,16 @@ define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+ ; CHECK: vpcmpistri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a1
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
; CHECK: seta
@@ -1271,7 +1304,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcmpistrm
+ ; CHECK: vpcmpistrm $7
; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1279,6 +1312,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+ ; CHECK: vpcmpistrm $7, (
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a1
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vaddss
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709a3b1b..ec11654b3556 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define <8 x float> @test17(<4 x float> %y) {
}
; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
}
; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6caed36a..ff56a454996e 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
}
; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
store <4 x float> %0, <4 x float>* %1, align 16
ret void
@@ -32,27 +32,13 @@ entry:
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
- %1 = bitcast float* %addr to i8*
- tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
- ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
store <2 x double> %0, <2 x double>* %1, align 16
ret void
@@ -60,28 +46,14 @@ entry:
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
- %1 = bitcast double* %addr to i8*
- tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
- ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
- %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
%2 = bitcast <4 x i32> %1 to <2 x i64>
store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
@@ -90,17 +62,43 @@ entry:
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 16
+ ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
- %2 = bitcast <2 x i64>* %addr to i8*
- %3 = bitcast <4 x i32> %1 to <16 x i8>
- tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+ %0 = bitcast <4 x i64> %a to <8 x i32>
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa27426..a414e6880c32 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,37 @@ entry:
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle.i
}
+
+; CHECK: vpshufb_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+; CHECK: vpshufb1_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+
+; CHECK: vpshufb2_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
diff --git a/test/CodeGen/X86/bitcast-i256.ll b/test/CodeGen/X86/bitcast-i256.ll
new file mode 100644
index 000000000000..85ac2fed6faa
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-i256.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i < %s | FileCheck %s --check-prefix CHECK
+
+define i256 @foo(<8 x i32> %a) {
+ %r = bitcast <8 x i32> %a to i256
+ ret i256 %r
+; CHECK: foo
+; CHECK: vextractf128
+; CHECK: vpextrq
+; CHECK: vpextrq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 0cb9fd9bc533..09eb5d1038f7 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
@@ -39,4 +39,20 @@ define i32 @bax(<2 x i64> %c) {
; CHECK: ret
}
+define i32 @rnd(i32 %arg) nounwind uwtable {
+ %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = icmp eq i32 %3, 0
+ %5 = select i1 %4, i32 0, i32 %arg
+ %6 = add i32 %5, %2
+ ret i32 %6
+; CHECK: rnd
+; CHECK: rdrand
+; CHECK: cmov
+; CHECK-NOT: cmov
+; CHECK: ret
+}
+
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare { i32, i32 } @llvm.x86.rdrand.32() nounwind
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
new file mode 100644
index 000000000000..3fb69a48b3c7
--- /dev/null
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
+ %t0 = fptoui <3 x float> %in to <3 x i8>
+ %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
+ %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3
+ store <4 x i8> %t2, <4 x i8>* %out, align 4
+ ret void
+; CHECK: foo
+; CHECK: cvttps2dq
+; CHECK-NOT: pextrd
+; CHECK: pinsrd
+; CHECK-NEXT: pshufb
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/cmov-fp.ll b/test/CodeGen/X86/cmov-fp.ll
new file mode 100644
index 000000000000..ca91f9ea2c2b
--- /dev/null
+++ b/test/CodeGen/X86/cmov-fp.ll
@@ -0,0 +1,451 @@
+; RUN: llc -march x86 -mcpu pentium4 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -march x86 -mcpu pentium3 < %s | FileCheck %s -check-prefix=NOSSE2
+; RUN: llc -march x86 -mcpu pentium2 < %s | FileCheck %s -check-prefix=NOSSE1
+; RUN: llc -march x86 -mcpu pentium < %s | FileCheck %s -check-prefix=NOCMOV
+; PR14035
+
+define double @test1(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test1:
+; SSE: movsd
+
+; NOSSE2: test1:
+; NOSSE2: fcmovnbe
+
+; NOSSE1: test1:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test1:
+; NOCMOV: fstp
+
+}
+
+define double @test2(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test2:
+; SSE: movsd
+
+; NOSSE2: test2:
+; NOSSE2: fcmovnb
+
+; NOSSE1: test2:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test2:
+; NOCMOV: fstp
+}
+
+define double @test3(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test3:
+; SSE: movsd
+
+; NOSSE2: test3:
+; NOSSE2: fcmovb
+
+; NOSSE1: test3:
+; NOSSE1: fcmovb
+
+; NOCMOV: test3:
+; NOCMOV: fstp
+}
+
+define double @test4(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test4:
+; SSE: movsd
+
+; NOSSE2: test4:
+; NOSSE2: fcmovbe
+
+; NOSSE1: test4:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test4:
+; NOCMOV: fstp
+}
+
+define double @test5(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test5:
+; SSE: movsd
+
+; NOSSE2: test5:
+; NOSSE2: fstp
+
+; NOSSE1: test5:
+; NOSSE1: fstp
+
+; NOCMOV: test5:
+; NOCMOV: fstp
+}
+
+define double @test6(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test6:
+; SSE: movsd
+
+; NOSSE2: test6:
+; NOSSE2: fstp
+
+; NOSSE1: test6:
+; NOSSE1: fstp
+
+; NOCMOV: test6:
+; NOCMOV: fstp
+}
+
+define double @test7(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test7:
+; SSE: movsd
+
+; NOSSE2: test7:
+; NOSSE2: fstp
+
+; NOSSE1: test7:
+; NOSSE1: fstp
+
+; NOCMOV: test7:
+; NOCMOV: fstp
+}
+
+define double @test8(i32 %a, i32 %b, double %x) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %sel = select i1 %cmp, double 99.0, double %x
+ ret double %sel
+
+; SSE: test8:
+; SSE: movsd
+
+; NOSSE2: test8:
+; NOSSE2: fstp
+
+; NOSSE1: test8:
+; NOSSE1: fstp
+
+; NOCMOV: test8:
+; NOCMOV: fstp
+}
+
+define float @test9(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test9:
+; SSE: movss
+
+; NOSSE2: test9:
+; NOSSE2: movss
+
+; NOSSE1: test9:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test9:
+; NOCMOV: fstp
+}
+
+define float @test10(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test10:
+; SSE: movss
+
+; NOSSE2: test10:
+; NOSSE2: movss
+
+; NOSSE1: test10:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test10:
+; NOCMOV: fstp
+}
+
+define float @test11(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test11:
+; SSE: movss
+
+; NOSSE2: test11:
+; NOSSE2: movss
+
+; NOSSE1: test11:
+; NOSSE1: fcmovb
+
+; NOCMOV: test11:
+; NOCMOV: fstp
+}
+
+define float @test12(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test12:
+; SSE: movss
+
+; NOSSE2: test12:
+; NOSSE2: movss
+
+; NOSSE1: test12:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test12:
+; NOCMOV: fstp
+}
+
+define float @test13(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test13:
+; SSE: movss
+
+; NOSSE2: test13:
+; NOSSE2: movss
+
+; NOSSE1: test13:
+; NOSSE1: fstp
+
+; NOCMOV: test13:
+; NOCMOV: fstp
+}
+
+define float @test14(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test14:
+; SSE: movss
+
+; NOSSE2: test14:
+; NOSSE2: movss
+
+; NOSSE1: test14:
+; NOSSE1: fstp
+
+; NOCMOV: test14:
+; NOCMOV: fstp
+}
+
+define float @test15(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test15:
+; SSE: movss
+
+; NOSSE2: test15:
+; NOSSE2: movss
+
+; NOSSE1: test15:
+; NOSSE1: fstp
+
+; NOCMOV: test15:
+; NOCMOV: fstp
+}
+
+define float @test16(i32 %a, i32 %b, float %x) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %sel = select i1 %cmp, float 99.0, float %x
+ ret float %sel
+
+; SSE: test16:
+; SSE: movss
+
+; NOSSE2: test16:
+; NOSSE2: movss
+
+; NOSSE1: test16:
+; NOSSE1: fstp
+
+; NOCMOV: test16:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test17(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp ugt i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test17:
+; SSE: fcmovnbe
+
+; NOSSE2: test17:
+; NOSSE2: fcmovnbe
+
+; NOSSE1: test17:
+; NOSSE1: fcmovnbe
+
+; NOCMOV: test17:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test18(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp uge i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test18:
+; SSE: fcmovnb
+
+; NOSSE2: test18:
+; NOSSE2: fcmovnb
+
+; NOSSE1: test18:
+; NOSSE1: fcmovnb
+
+; NOCMOV: test18:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test19(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp ult i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test19:
+; SSE: fcmovb
+
+; NOSSE2: test19:
+; NOSSE2: fcmovb
+
+; NOSSE1: test19:
+; NOSSE1: fcmovb
+
+; NOCMOV: test19:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test20(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp ule i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test20:
+; SSE: fcmovbe
+
+; NOSSE2: test20:
+; NOSSE2: fcmovbe
+
+; NOSSE1: test20:
+; NOSSE1: fcmovbe
+
+; NOCMOV: test20:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test21(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp sgt i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; We don't emit a branch for fp80, why?
+; SSE: test21:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test21:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test21:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test21:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test22(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp sge i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test22:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test22:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test22:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test22:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test23(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp slt i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test23:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test23:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test23:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test23:
+; NOCMOV: fstp
+}
+
+define x86_fp80 @test24(i32 %a, i32 %b, x86_fp80 %x) nounwind {
+ %cmp = icmp sle i32 %a, %b
+ %sel = select i1 %cmp, x86_fp80 0xK4005C600000000000000, x86_fp80 %x
+ ret x86_fp80 %sel
+
+; SSE: test24:
+; SSE: testb
+; SSE: fcmovne
+
+; NOSSE2: test24:
+; NOSSE2: testb
+; NOSSE2: fcmovne
+
+; NOSSE1: test24:
+; NOSSE1: testb
+; NOSSE1: fcmovne
+
+; NOCMOV: test24:
+; NOCMOV: fstp
+}
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 9badfc82e99c..276d0db9a4f3 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -442,3 +442,150 @@ entry:
ret void
}
declare void @_Z6PrintFz(...)
+
+@a = external global i32, align 4
+@fn1.g = private unnamed_addr constant [9 x i32*] [i32* null, i32* @a, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null, i32* null], align 16
+@e = external global i32, align 4
+
+define void @pr13943() nounwind uwtable ssp {
+entry:
+ %srcval = load i576* bitcast ([9 x i32*]* @fn1.g to i576*), align 16
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %g.0 = phi i576 [ %srcval, %entry ], [ %ins, %for.inc ]
+ %0 = load i32* @e, align 4
+ %1 = lshr i576 %g.0, 64
+ %2 = trunc i576 %1 to i64
+ %3 = inttoptr i64 %2 to i32*
+ %cmp = icmp eq i32* undef, %3
+ %conv2 = zext i1 %cmp to i32
+ %and = and i32 %conv2, %0
+ tail call void (...)* @fn3(i32 %and) nounwind
+ %tobool = icmp eq i32 undef, 0
+ br i1 %tobool, label %for.inc, label %if.then
+
+if.then: ; preds = %for.cond
+ ret void
+
+for.inc: ; preds = %for.cond
+ %4 = shl i576 %1, 384
+ %mask = and i576 %g.0, -726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307841
+ %5 = and i576 %4, 726838724295606890509921801691610055141362320587174446476410459910173841445449629921945328942266354949348255351381262292727973638307840
+ %ins = or i576 %5, %mask
+ br label %for.cond
+}
+
+declare void @fn3(...)
+
+; Check coalescing of IMPLICIT_DEF instructions:
+;
+; %vreg1 = IMPLICIT_DEF
+; %vreg2 = MOV32r0
+;
+; When coalescing %vreg1 and %vreg2, the IMPLICIT_DEF instruction should be
+; erased along with its value number.
+;
+define void @rdar12474033() nounwind ssp {
+bb:
+ br i1 undef, label %bb21, label %bb1
+
+bb1: ; preds = %bb
+ switch i32 undef, label %bb10 [
+ i32 4, label %bb2
+ i32 1, label %bb9
+ i32 5, label %bb3
+ i32 6, label %bb3
+ i32 2, label %bb9
+ ]
+
+bb2: ; preds = %bb1
+ unreachable
+
+bb3: ; preds = %bb1, %bb1
+ br i1 undef, label %bb4, label %bb5
+
+bb4: ; preds = %bb3
+ unreachable
+
+bb5: ; preds = %bb3
+ %tmp = load <4 x float>* undef, align 1
+ %tmp6 = bitcast <4 x float> %tmp to i128
+ %tmp7 = load <4 x float>* undef, align 1
+ %tmp8 = bitcast <4 x float> %tmp7 to i128
+ br label %bb10
+
+bb9: ; preds = %bb1, %bb1
+ unreachable
+
+bb10: ; preds = %bb5, %bb1
+ %tmp11 = phi i128 [ undef, %bb1 ], [ %tmp6, %bb5 ]
+ %tmp12 = phi i128 [ 0, %bb1 ], [ %tmp8, %bb5 ]
+ switch i32 undef, label %bb21 [
+ i32 2, label %bb18
+ i32 3, label %bb13
+ i32 5, label %bb16
+ i32 6, label %bb17
+ i32 1, label %bb18
+ ]
+
+bb13: ; preds = %bb10
+ br i1 undef, label %bb15, label %bb14
+
+bb14: ; preds = %bb13
+ br label %bb21
+
+bb15: ; preds = %bb13
+ unreachable
+
+bb16: ; preds = %bb10
+ unreachable
+
+bb17: ; preds = %bb10
+ unreachable
+
+bb18: ; preds = %bb10, %bb10
+ %tmp19 = bitcast i128 %tmp11 to <4 x float>
+ %tmp20 = bitcast i128 %tmp12 to <4 x float>
+ br label %bb21
+
+bb21: ; preds = %bb18, %bb14, %bb10, %bb
+ %tmp22 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp20, %bb18 ]
+ %tmp23 = phi <4 x float> [ undef, %bb ], [ undef, %bb10 ], [ undef, %bb14 ], [ %tmp19, %bb18 ]
+ store <4 x float> %tmp23, <4 x float>* undef, align 16
+ store <4 x float> %tmp22, <4 x float>* undef, align 16
+ switch i32 undef, label %bb29 [
+ i32 5, label %bb27
+ i32 1, label %bb24
+ i32 2, label %bb25
+ i32 14, label %bb28
+ i32 4, label %bb26
+ ]
+
+bb24: ; preds = %bb21
+ unreachable
+
+bb25: ; preds = %bb21
+ br label %bb29
+
+bb26: ; preds = %bb21
+ br label %bb29
+
+bb27: ; preds = %bb21
+ unreachable
+
+bb28: ; preds = %bb21
+ br label %bb29
+
+bb29: ; preds = %bb28, %bb26, %bb25, %bb21
+ unreachable
+}
+
+define void @pr14194() nounwind uwtable {
+ %tmp = load i64* undef, align 16
+ %tmp1 = trunc i64 %tmp to i32
+ %tmp2 = lshr i64 %tmp, 32
+ %tmp3 = trunc i64 %tmp2 to i32
+ %tmp4 = call { i32, i32 } asm sideeffect "", "=&r,=&r,r,r,0,1,~{dirflag},~{fpsr},~{flags}"(i32 %tmp3, i32 undef, i32 %tmp3, i32 %tmp1) nounwind
+ ret void
+}
diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll
new file mode 100644
index 000000000000..466b09606786
--- /dev/null
+++ b/test/CodeGen/X86/cvtv2f32.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s
+
+define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) {
+ %t1 = uitofp i32 %x to float
+ %t2 = insertelement <2 x float> undef, float %t1, i32 0
+ %t3 = uitofp i32 %y to float
+ %t4 = insertelement <2 x float> %t2, float %t3, i32 1
+ %t5 = fmul <2 x float> %v, %t4
+ ret <2 x float> %t5
+; CHECK: foo
+; CHECK: or
+; CHECK: subpd
+; CHECK: cvtpd2ps
+; CHECK: ret
+}
+
+define <2 x float> @bar(<2 x i32> %in) {
+ %r = uitofp <2 x i32> %in to <2 x float>
+ ret <2 x float> %r
+; CHECK: bar
+; CHECK: or
+; CHECK: subpd
+; CHECK: cvtpd2ps
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/early-ifcvt-crash.ll b/test/CodeGen/X86/early-ifcvt-crash.ll
new file mode 100644
index 000000000000..c8280269689d
--- /dev/null
+++ b/test/CodeGen/X86/early-ifcvt-crash.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -x86-early-ifcvt -verify-machineinstrs
+; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt -verify-machineinstrs
+;
+; Run these tests with and without -stress-early-ifcvt to exercise heuristics.
+;
+target triple = "x86_64-apple-macosx10.8.0"
+
+; MachineTraceMetrics::Ensemble::addLiveIns crashes because the first operand
+; on an inline asm instruction is not a vreg def.
+; <rdar://problem/12472811>
+define void @f1() nounwind {
+entry:
+ br i1 undef, label %if.then6.i, label %if.end.i
+
+if.then6.i:
+ br label %if.end.i
+
+if.end.i:
+ br i1 undef, label %if.end25.i, label %if.else17.i
+
+if.else17.i:
+ %shl24.i = shl i32 undef, undef
+ br label %if.end25.i
+
+if.end25.i:
+ %storemerge31.i = phi i32 [ %shl24.i, %if.else17.i ], [ 0, %if.end.i ]
+ store i32 %storemerge31.i, i32* undef, align 4
+ %0 = tail call i32 asm sideeffect "", "=r,r,i,i"(i32 undef, i32 15, i32 1) nounwind
+ %conv = trunc i32 %0 to i8
+ store i8 %conv, i8* undef, align 1
+ unreachable
+}
diff --git a/test/CodeGen/X86/early-ifcvt.ll b/test/CodeGen/X86/early-ifcvt.ll
index 7883ffabd565..2e1852d3e3ae 100644
--- a/test/CodeGen/X86/early-ifcvt.ll
+++ b/test/CodeGen/X86/early-ifcvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-early-ifcvt -stress-early-ifcvt | FileCheck %s
+; RUN: llc < %s -x86-early-ifcvt -stress-early-ifcvt | FileCheck %s
target triple = "x86_64-apple-macosx10.8.0"
; CHECK: mm2
@@ -67,3 +67,78 @@ if.end41:
}
declare void @fprintf(...) nounwind
+
+; CHECK: BZ2_decompress
+; This test case contains irreducible control flow, so MachineLoopInfo doesn't
+; recognize the cycle in the CFG. This would confuse MachineTraceMetrics.
+define void @BZ2_decompress(i8* %s) nounwind ssp {
+entry:
+ switch i32 undef, label %sw.default [
+ i32 39, label %if.end.sw.bb2050_crit_edge
+ i32 36, label %sw.bb1788
+ i32 37, label %if.end.sw.bb1855_crit_edge
+ i32 40, label %sw.bb2409
+ i32 38, label %sw.bb1983
+ i32 44, label %if.end.sw.bb3058_crit_edge
+ ]
+
+if.end.sw.bb3058_crit_edge: ; preds = %entry
+ br label %save_state_and_return
+
+if.end.sw.bb1855_crit_edge: ; preds = %entry
+ br label %save_state_and_return
+
+if.end.sw.bb2050_crit_edge: ; preds = %entry
+ br label %sw.bb2050
+
+sw.bb1788: ; preds = %entry
+ br label %save_state_and_return
+
+sw.bb1983: ; preds = %entry
+ br i1 undef, label %save_state_and_return, label %if.then1990
+
+if.then1990: ; preds = %sw.bb1983
+ br label %while.body2038
+
+while.body2038: ; preds = %sw.bb2050, %if.then1990
+ %groupPos.8 = phi i32 [ 0, %if.then1990 ], [ %groupPos.9, %sw.bb2050 ]
+ br i1 undef, label %save_state_and_return, label %if.end2042
+
+if.end2042: ; preds = %while.body2038
+ br i1 undef, label %if.end2048, label %while.end2104
+
+if.end2048: ; preds = %if.end2042
+ %bsLive2054.pre = getelementptr inbounds i8* %s, i32 8
+ br label %sw.bb2050
+
+sw.bb2050: ; preds = %if.end2048, %if.end.sw.bb2050_crit_edge
+ %groupPos.9 = phi i32 [ 0, %if.end.sw.bb2050_crit_edge ], [ %groupPos.8, %if.end2048 ]
+ %and2064 = and i32 undef, 1
+ br label %while.body2038
+
+while.end2104: ; preds = %if.end2042
+ br i1 undef, label %save_state_and_return, label %if.end2117
+
+if.end2117: ; preds = %while.end2104
+ br i1 undef, label %while.body2161.lr.ph, label %while.body2145.lr.ph
+
+while.body2145.lr.ph: ; preds = %if.end2117
+ br label %save_state_and_return
+
+while.body2161.lr.ph: ; preds = %if.end2117
+ br label %save_state_and_return
+
+sw.bb2409: ; preds = %entry
+ br label %save_state_and_return
+
+sw.default: ; preds = %entry
+ call void @BZ2_bz__AssertH__fail() nounwind
+ br label %save_state_and_return
+
+save_state_and_return:
+ %groupPos.14 = phi i32 [ 0, %sw.default ], [ %groupPos.8, %while.body2038 ], [ %groupPos.8, %while.end2104 ], [ 0, %if.end.sw.bb3058_crit_edge ], [ 0, %if.end.sw.bb1855_crit_edge ], [ %groupPos.8, %while.body2161.lr.ph ], [ %groupPos.8, %while.body2145.lr.ph ], [ 0, %sw.bb2409 ], [ 0, %sw.bb1788 ], [ 0, %sw.bb1983 ]
+ store i32 %groupPos.14, i32* undef, align 4
+ ret void
+}
+
+declare void @BZ2_bz__AssertH__fail()
diff --git a/test/CodeGen/X86/extract-concat.ll b/test/CodeGen/X86/extract-concat.ll
new file mode 100644
index 000000000000..704309eb6507
--- /dev/null
+++ b/test/CodeGen/X86/extract-concat.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @foo(<4 x float> %in, <4 x i8>* %out) {
+ %t0 = fptosi <4 x float> %in to <4 x i32>
+ %t1 = trunc <4 x i32> %t0 to <4 x i16>
+ %t2 = shufflevector <4 x i16> %t1, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %t3 = trunc <8 x i16> %t2 to <8 x i8>
+ %t4 = shufflevector <8 x i8> %t3, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %t5 = insertelement <4 x i8> %t4, i8 -1, i32 3
+ store <4 x i8> %t5, <4 x i8>* %out
+ ret void
+; CHECK: foo
+; CHECK: cvttps2dq
+; CHECK-NOT: pextrd
+; CHECK: pshufb
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/fast-cc-callee-pops.ll b/test/CodeGen/X86/fast-cc-callee-pops.ll
index ea10897c7357..2c5b80ac4af0 100644
--- a/test/CodeGen/X86/fast-cc-callee-pops.ll
+++ b/test/CodeGen/X86/fast-cc-callee-pops.ll
@@ -2,12 +2,12 @@
; Check that a fastcc function pops its stack variables before returning.
-define x86_fastcallcc void @func(i64 %X, i64 %Y, float %G, double %Z) nounwind {
+define x86_fastcallcc void @func(i64 inreg %X, i64 %Y, float %G, double %Z) nounwind {
ret void
; CHECK: ret{{.*}}20
}
-define x86_thiscallcc void @func2(i32 %X, i64 %Y, float %G, double %Z) nounwind {
+define x86_thiscallcc void @func2(i32 inreg %X, i64 %Y, float %G, double %Z) nounwind {
ret void
; CHECK: ret{{.*}}20
}
diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
index 14cb136f89de..d591f9408b14 100644
--- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
+++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll
@@ -3,7 +3,7 @@
target triple = "i686-pc-linux-gnu"
-declare x86_fastcallcc void @func(i32*, i64)
+declare x86_fastcallcc void @func(i32*, i64 inreg)
define x86_fastcallcc void @caller(i32, i64) {
%X = alloca i32 ; <i32*> [#uses=1]
diff --git a/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
index a96e5043fed4..b60b68bd388d 100644
--- a/test/CodeGen/X86/fast-cc-pass-in-regs.ll
+++ b/test/CodeGen/X86/fast-cc-pass-in-regs.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s
; check that fastcc is passing stuff in regs.
-declare x86_fastcallcc i64 @callee(i64)
+declare x86_fastcallcc i64 @callee(i64 inreg)
define i64 @caller() {
%X = call x86_fastcallcc i64 @callee( i64 4294967299 ) ; <i64> [#uses=1]
@@ -9,7 +9,7 @@ define i64 @caller() {
ret i64 %X
}
-define x86_fastcallcc i64 @caller2(i64 %X) {
+define x86_fastcallcc i64 @caller2(i64 inreg %X) {
ret i64 %X
; CHECK: mov{{.*}}EAX, ECX
}
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663c94e6..cdfaf7f4c134 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
; CHECK: cvtsi2sdq {{.*}} %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
call void (...)* @test16callee(double 1.000000e+00)
ret void
}
@@ -285,3 +291,16 @@ entry:
}
declare void @foo22(i32)
+
+; PR13563
+define void @test23(i8* noalias sret %result) {
+ %a = alloca i8
+ %b = call i8* @foo23()
+ ret void
+; CHECK: test23:
+; CHECK: call
+; CHECK: movq %rdi, %rax
+; CHECK: ret
+}
+
+declare i8* @foo23()
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a0dd1c..bd3514cc3f73 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -15,7 +17,7 @@ entry:
; CHECK: test_f64
; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
@@ -24,7 +26,7 @@ entry:
}
; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e09d75b..e3910a6935c4 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b346e2b..2fe1ecd40e0c 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87b3bbf..6d98d59b3822 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
%x = fmul <2 x double> %a0, %a1
%res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
}
; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
}
; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
%x = fsub float -0.000000e+00, %a0
%y = fmul float %x, %a1
%res = fsub float %y, %a2
ret float %res
}
+
+; CHECK: test_x86_fmadd_ps
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fadd <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index c961f7576f93..d8366654c01c 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -57,13 +57,13 @@ entry:
%0 = load i32* %P, align 4
%1 = load i32* %Q, align 4
%2 = xor i32 %0, %1
- %3 = and i32 %2, 65535
+ %3 = and i32 %2, 89947
%4 = icmp eq i32 %3, 0
br i1 %4, label %exit, label %land.end
exit:
%shr.i.i19 = xor i32 %1, %0
- %5 = and i32 %shr.i.i19, 2147418112
+ %5 = and i32 %shr.i.i19, 3456789123
%6 = icmp eq i32 %5, 0
br label %land.end
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 000000000000..d70aa7d79f00
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fadd float %a, %a
+ %r = fadd float %t1, %t1
+ ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 4.0, %a
+ %t2 = fadd float %a, %a
+ %r = fadd float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 2.0, %a
+ %t2 = fadd float %a, %a
+ %r = fsub float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test4
+define float @test4(float %a) {
+; CHECK-NOT: fma
+; CHECK-NOT: mul
+; CHECK-NOT: add
+; CHECK: ret
+ %t1 = fmul float %a, 0.0
+ %t2 = fadd float %a, %t1
+ ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %a) {
+; CHECK-NOT: add
+; CHECK: vxorps
+; CHECK: ret
+ %t1 = fsub float -0.0, %a
+ %t2 = fadd float %a, %t1
+ ret float %t2
+}
diff --git a/test/CodeGen/X86/fp-load-trunc.ll b/test/CodeGen/X86/fp-load-trunc.ll
new file mode 100644
index 000000000000..2ae65c97d97a
--- /dev/null
+++ b/test/CodeGen/X86/fp-load-trunc.ll
@@ -0,0 +1,61 @@
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
+
+define <1 x float> @test1(<1 x double>* %p) nounwind {
+; CHECK: test1
+; CHECK: cvtsd2ss
+; CHECK: ret
+; AVX: test1
+; AVX: vcvtsd2ss
+; AVX: ret
+ %x = load <1 x double>* %p
+ %y = fptrunc <1 x double> %x to <1 x float>
+ ret <1 x float> %y
+}
+
+define <2 x float> @test2(<2 x double>* %p) nounwind {
+; CHECK: test2
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: ret
+; AVX: test2
+; AVX: vcvtpd2psx {{[0-9]*}}(%{{.*}})
+; AVX: ret
+ %x = load <2 x double>* %p
+ %y = fptrunc <2 x double> %x to <2 x float>
+ ret <2 x float> %y
+}
+
+define <4 x float> @test3(<4 x double>* %p) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test3
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: ret
+ %x = load <4 x double>* %p
+ %y = fptrunc <4 x double> %x to <4 x float>
+ ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double>* %p) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: cvtpd2ps {{[0-9]*}}(%{{.*}})
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test4
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vcvtpd2psy {{[0-9]*}}(%{{.*}})
+; AVX: vinsertf128
+; AVX: ret
+ %x = load <8 x double>* %p
+ %y = fptrunc <8 x double> %x to <8 x float>
+ ret <8 x float> %y
+}
+
+
diff --git a/test/CodeGen/X86/fp-trunc.ll b/test/CodeGen/X86/fp-trunc.ll
index 170637a40ee2..25442fcadd23 100644
--- a/test/CodeGen/X86/fp-trunc.ll
+++ b/test/CodeGen/X86/fp-trunc.ll
@@ -1,33 +1,56 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx-i | FileCheck %s --check-prefix=AVX
define <1 x float> @test1(<1 x double> %x) nounwind {
+; CHECK: test1
; CHECK: cvtsd2ss
; CHECK: ret
+; AVX: test1
+; AVX: vcvtsd2ss
+; AVX: ret
%y = fptrunc <1 x double> %x to <1 x float>
ret <1 x float> %y
}
-
define <2 x float> @test2(<2 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+; CHECK: test2
+; CHECK: cvtpd2ps
; CHECK: ret
+; AVX: test2
+; AVX-NOT: vcvtpd2psy
+; AVX: vcvtpd2ps
+; AVX: ret
%y = fptrunc <2 x double> %x to <2 x float>
ret <2 x float> %y
}
-define <8 x float> @test3(<8 x double> %x) nounwind {
-; FIXME: It would be nice if this compiled down to a series of cvtpd2ps
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
-; CHECK: cvtsd2ss
+define <4 x float> @test3(<4 x double> %x) nounwind {
+; CHECK: test3
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: ret
+; AVX: test3
+; AVX: vcvtpd2psy
+; AVX: ret
+ %y = fptrunc <4 x double> %x to <4 x float>
+ ret <4 x float> %y
+}
+
+define <8 x float> @test4(<8 x double> %x) nounwind {
+; CHECK: test4
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
+; CHECK: cvtpd2ps
+; CHECK: cvtpd2ps
+; CHECK: movlhps
; CHECK: ret
+; AVX: test4
+; AVX: vcvtpd2psy
+; AVX: vcvtpd2psy
+; AVX: vinsertf128
+; AVX: ret
%y = fptrunc <8 x double> %x to <8 x float>
ret <8 x float> %y
}
diff --git a/test/CodeGen/X86/handle-move.ll b/test/CodeGen/X86/handle-move.ll
new file mode 100644
index 000000000000..e9f7a962e20d
--- /dev/null
+++ b/test/CodeGen/X86/handle-move.ll
@@ -0,0 +1,74 @@
+; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-bottomup -verify-machineinstrs < %s
+; RUN: llc -march=x86-64 -mcpu=core2 -fast-isel -enable-misched -misched=shuffle -misched-topdown -verify-machineinstrs < %s
+; REQUIRES: asserts
+;
+; Test the LiveIntervals::handleMove() function.
+;
+; Moving the DIV32r instruction exercises the regunit update code because
+; %EDX has a live range into the function and is used by the DIV32r.
+;
+; Here sinking a kill + dead def:
+; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def>, %EDX<imp-def,dead>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
+; %vreg4: [48r,144r:0) 0@48r
+; --> [48r,180r:0) 0@48r
+; DH: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r
+; DL: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r
+;
+define i32 @f1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %y = add i32 %c, 1
+ %x = udiv i32 %b, %a
+ %add = add nsw i32 %y, %x
+ ret i32 %add
+}
+
+; Same as above, but moving a kill + live def:
+; 144B -> 180B: DIV32r %vreg4, %EAX<imp-def,dead>, %EDX<imp-def>, %EFLAGS<imp-def,dead>, %EAX<imp-use,kill>, %EDX<imp-use>
+; %vreg4: [48r,144r:0) 0@48r
+; --> [48r,180r:0) 0@48r
+; DH: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r
+; DL: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r
+; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r
+;
+define i32 @f2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %y = sub i32 %c, %d
+ %x = urem i32 %b, %a
+ %add = add nsw i32 %x, %y
+ ret i32 %add
+}
+
+; Moving a use below the existing kill (%vreg5):
+; Moving a tied virtual register def (%vreg11):
+;
+; 96B -> 120B: %vreg11<def,tied1> = SUB32rr %vreg11<tied0>, %vreg5
+; %vreg11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r
+; --> [80r,120r:1)[120r,144r:0) 0@120r 1@80r
+; %vreg5: [16r,112r:0) 0@16r
+; --> [16r,120r:0) 0@16r
+;
+define i32 @f3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %y = sub i32 %a, %b
+ %x = add i32 %a, %b
+ %r = mul i32 %x, %y
+ ret i32 %r
+}
+
+; Move EFLAGS dead def across another def:
+; handleMove 208B -> 36B: %EDX<def> = MOV32r0 %EFLAGS<imp-def,dead>
+; EFLAGS: [20r,20d:4)[160r,160d:3)[208r,208d:0)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@208r 1@224r 2@272r 3@160r 4@20r 5@304r
+; --> [20r,20d:4)[36r,36d:0)[160r,160d:3)[224r,224d:1)[272r,272d:2)[304r,304d:5) 0@36r 1@224r 2@272r 3@160r 4@20r 5@304r
+;
+define i32 @f4(i32 %a, i32 %b, i32 %c, i32 %d) nounwind uwtable readnone ssp {
+entry:
+ %x = sub i32 %a, %b
+ %y = sub i32 %b, %c
+ %z = sub i32 %c, %d
+ %r1 = udiv i32 %x, %y
+ %r2 = mul i32 %z, %r1
+ ret i32 %r2
+}
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb09ec2..597236e36281 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@ entry:
%1 = load i64* %retval ; <i64> [#uses=1]
ret i64 %1
}
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+ %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+ %x1 = extractvalue { i64, i64 } %x0, 0
+ ret i64 %x1
+}
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index e6eb9efd8c78..d201ebdc85d1 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -52,3 +52,10 @@ entry:
%0 = call { i32, i32, i32, i32, i32 } asm sideeffect "", "=&r,=&r,=&r,=&r,=&q,r,~{ecx},~{memory},~{dirflag},~{fpsr},~{flags}"(i8* %h) nounwind
ret void
}
+
+; Mix normal and EC defs of the same register.
+define i32 @pr14376() nounwind noinline {
+entry:
+ %asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind
+ ret i32 %asm
+}
diff --git a/test/CodeGen/X86/inlineasm-sched-bug.ll b/test/CodeGen/X86/inlineasm-sched-bug.ll
new file mode 100644
index 000000000000..08de0c02d293
--- /dev/null
+++ b/test/CodeGen/X86/inlineasm-sched-bug.ll
@@ -0,0 +1,13 @@
+; PR13504
+; RUN: llc -march=x86 -mcpu=atom <%s | FileCheck %s
+; CHECK: bsfl
+; CHECK-NOT: movl
+
+define i32 @foo(i32 %treemap) nounwind uwtable {
+entry:
+ %sub = sub i32 0, %treemap
+ %and = and i32 %treemap, %sub
+ %0 = tail call i32 asm "bsfl $1,$0\0A\09", "=r,rm,~{dirflag},~{fpsr},~{flags}"(i32 %and) nounwind
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 48e21061d209..0e34222b945f 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro -verify-machineinstrs | FileCheck %s
define i32 @f(i32 %X) {
entry:
@@ -219,7 +219,6 @@ entry:
; by sbb, we should not optimize cmp away.
define i32 @q(i32 %j.4, i32 %w, i32 %el) {
; CHECK: q:
-; CHECK: sub
; CHECK: cmp
; CHECK-NEXT: sbb
%tmp532 = add i32 %j.4, %w
@@ -253,3 +252,56 @@ return:
%retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
ret i8* %retval.0
}
+
+; Test optimizations of dec/inc.
+define i32 @dec(i32 %a) nounwind {
+entry:
+; CHECK: dec:
+; CHECK: decl
+; CHECK-NOT: test
+; CHECK: cmovsl
+ %sub = sub nsw i32 %a, 1
+ %cmp = icmp sgt i32 %sub, 0
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+
+define i32 @inc(i32 %a) nounwind {
+entry:
+; CHECK: inc:
+; CHECK: incl
+; CHECK-NOT: test
+; CHECK: cmovsl
+ %add = add nsw i32 %a, 1
+ %cmp = icmp sgt i32 %add, 0
+ %cond = select i1 %cmp, i32 %add, i32 0
+ ret i32 %cond
+}
+
+; PR13966
+@b = common global i32 0, align 4
+@a = common global i32 0, align 4
+define i32 @test1(i32 %p1) nounwind uwtable {
+entry:
+; CHECK: test1:
+; CHECK: testb
+; CHECK: j
+; CHECK: ret
+ %0 = load i32* @b, align 4
+ %cmp = icmp ult i32 %0, %p1
+ %conv = zext i1 %cmp to i32
+ %1 = load i32* @a, align 4
+ %and = and i32 %conv, %1
+ %conv1 = trunc i32 %and to i8
+ %2 = urem i8 %conv1, 3
+ %tobool = icmp eq i8 %2, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+ %dec = add nsw i32 %1, -1
+ store i32 %dec, i32* @a, align 4
+ br label %if.end
+
+if.end:
+ ret i32 undef
+}
diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll
new file mode 100644
index 000000000000..2184d9e96036
--- /dev/null
+++ b/test/CodeGen/X86/misched-balance.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN: -verify-machineinstrs | FileCheck %s
+;
+; Verify that misched resource/latency balancing heuristics are sane.
+
+define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+ i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+ nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+; imull folded loads should be in order and interleaved with addl, never
+; adjacent. Also check that we have no spilling.
+;
+; Since mmult1 IR is already in good order, this effectively ensures
+; the scheduler maintains source order.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+for.body:
+ %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+ %tmp57 = load i32* %tmp56, align 4
+ %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+ %tmp58 = load i32* %arrayidx12.us.i61, align 4
+ %mul.us.i = mul nsw i32 %tmp58, %tmp57
+ %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+ %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+ %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+ %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+ %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+ %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+ %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+ %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+ %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+ %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+ %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+ %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+ %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+ %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+ %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+ %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+ %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+ %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+ %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+ %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+ %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+ %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+ %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+ %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+ %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+ %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+ %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+ %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+ %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+ %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+ %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+ %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+ %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+ %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+ %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+ %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+ %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+ %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+ %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+ %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+ %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+ %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+ %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+ %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+ %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+ %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+ %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+ %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+ %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+ %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+ %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+ %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+ %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+ %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+ %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+ store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+ %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %end, label %for.body
+
+end:
+ ret void
+}
+
+; Unlike the above loop, this IR starts out bad and must be
+; rescheduled.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+ i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+ nounwind uwtable ssp {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+ %tmp57 = load i32* %tmp56, align 4
+ %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+ %tmp58 = load i32* %arrayidx12.us.i61, align 4
+ %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+ %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+ %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+ %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+ %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+ %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+ %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+ %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+ %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+ %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+ %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+ %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+ %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+ %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+ %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+ %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+ %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+ %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+ %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+ %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+ %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+ %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+ %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+ %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+ %mul.us.i = mul nsw i32 %tmp58, %tmp57
+ %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+ %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+ %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+ %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+ %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+ %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+ %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+ %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+ %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+ %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+ %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+ %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+ %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+ %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+ %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+ %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+ %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+ %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+ %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+ %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+ %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+ %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+ %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+ %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+ %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+ %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+ %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+ %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+ %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+ %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+ %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+ store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+ %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %end, label %for.body
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/X86/misched-ilp.ll b/test/CodeGen/X86/misched-ilp.ll
new file mode 100644
index 000000000000..c6cedb7be871
--- /dev/null
+++ b/test/CodeGen/X86/misched-ilp.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmax | FileCheck -check-prefix=MAX %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=core2 -enable-misched -misched=ilpmin | FileCheck -check-prefix=MIN %s
+;
+; Basic verification of the ScheduleDAGILP metric.
+;
+; MAX: addss
+; MAX: addss
+; MAX: addss
+; MAX: subss
+; MAX: addss
+;
+; MIN: addss
+; MIN: addss
+; MIN: subss
+; MIN: addss
+; MIN: addss
+define float @ilpsched(float %a, float %b, float %c, float %d, float %e, float %f) nounwind uwtable readnone ssp {
+entry:
+ %add = fadd float %a, %b
+ %add1 = fadd float %c, %d
+ %add2 = fadd float %e, %f
+ %add3 = fsub float %add1, %add2
+ %add4 = fadd float %add, %add3
+ ret float %add4
+}
diff --git a/test/CodeGen/X86/misched-new.ll b/test/CodeGen/X86/misched-new.ll
index 8f2f6f7697df..cec04b534fba 100644
--- a/test/CodeGen/X86/misched-new.ll
+++ b/test/CodeGen/X86/misched-new.ll
@@ -1,4 +1,6 @@
-; RUN: llc -march=x86-64 -mcpu=core2 -enable-misched -misched=shuffle -misched-bottomup < %s
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -x86-early-ifcvt -enable-misched \
+; RUN: -misched=shuffle -misched-bottomup -verify-machineinstrs \
+; RUN: | FileCheck %s
; REQUIRES: asserts
;
; Interesting MachineScheduler cases.
@@ -25,3 +27,27 @@ for.cond.preheader: ; preds = %entry
if.end: ; preds = %entry
ret void
}
+
+; The machine verifier checks that EFLAGS kill flags are updated when
+; the scheduler reorders cmovel instructions.
+;
+; CHECK: test
+; CHECK: cmovel
+; CHECK: cmovel
+; CHECK: call
+define void @foo(i32 %b) nounwind uwtable ssp {
+entry:
+ %tobool = icmp ne i32 %b, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ %v1 = phi i32 [1, %entry], [2, %if.then]
+ %v2 = phi i32 [3, %entry], [4, %if.then]
+ call void @bar(i32 %v1, i32 %v2)
+ ret void
+}
+
+declare void @bar(i32,i32)
diff --git a/test/CodeGen/X86/mmx-builtins.ll b/test/CodeGen/X86/mmx-builtins.ll
index 8b7200d2f78f..a8d33f43da01 100644
--- a/test/CodeGen/X86/mmx-builtins.ll
+++ b/test/CodeGen/X86/mmx-builtins.ll
@@ -1043,6 +1043,20 @@ entry:
ret i64 %5
}
+define i32 @test21_2(<1 x i64> %a) nounwind readnone optsize ssp {
+; CHECK: test21_2
+; CHECK: pshufw
+; CHECK: movd
+entry:
+ %0 = bitcast <1 x i64> %a to <4 x i16>
+ %1 = bitcast <4 x i16> %0 to x86_mmx
+ %2 = tail call x86_mmx @llvm.x86.sse.pshuf.w(x86_mmx %1, i8 3) nounwind readnone
+ %3 = bitcast x86_mmx %2 to <4 x i16>
+ %4 = bitcast <4 x i16> %3 to <2 x i32>
+ %5 = extractelement <2 x i32> %4, i32 0
+ ret i32 %5
+}
+
declare x86_mmx @llvm.x86.mmx.pmulu.dq(x86_mmx, x86_mmx) nounwind readnone
define i64 @test20(<1 x i64> %a, <1 x i64> %b) nounwind readnone optsize ssp {
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
new file mode 100644
index 000000000000..24d28adda894
--- /dev/null
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+define i32 @t1() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+ ret i32 %0
+; CHECK: t1
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, ecx
+; CHECK: mov ecx, eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+define void @t2() nounwind {
+entry:
+ call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+ ret void
+; CHECK: t2
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, 1
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+define void @t3(i32 %V) nounwind {
+entry:
+ %V.addr = alloca i32, align 4
+ store i32 %V, i32* %V.addr, align 4
+ call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind
+ ret void
+; CHECK: t3
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, DWORD PTR {{[[esp]}}
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
+
+%struct.t18_type = type { i32, i32 }
+
+define i32 @t18() nounwind {
+entry:
+ %foo = alloca %struct.t18_type, align 4
+ %a = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 0
+ store i32 1, i32* %a, align 4
+ %b = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1
+ store i32 2, i32* %b, align 4
+ call void asm sideeffect inteldialect "lea ebx, foo\0A\09mov eax, [ebx].0\0A\09mov [ebx].4, ecx", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+ %b1 = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1
+ %0 = load i32* %b1, align 4
+ ret i32 %0
+; CHECK: t18
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: lea ebx, foo
+; CHECK: mov eax, [ebx].0
+; CHECK: mov [ebx].4, ecx
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll
new file mode 100644
index 000000000000..b75ac009e76d
--- /dev/null
+++ b/test/CodeGen/X86/mulx32.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=core-avx2 -march=x86 < %s | FileCheck %s
+
+define i64 @f1(i32 %a, i32 %b) {
+ %x = zext i32 %a to i64
+ %y = zext i32 %b to i64
+ %r = mul i64 %x, %y
+; CHECK: f1
+; CHECK: mulxl
+; CHECK: ret
+ ret i64 %r
+}
+
+define i64 @f2(i32 %a, i32* %p) {
+ %b = load i32* %p
+ %x = zext i32 %a to i64
+ %y = zext i32 %b to i64
+ %r = mul i64 %x, %y
+; CHECK: f2
+; CHECK: mulxl ({{.+}}), %{{.+}}, %{{.+}}
+; CHECK: ret
+ ret i64 %r
+}
diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll
new file mode 100644
index 000000000000..d5730282a137
--- /dev/null
+++ b/test/CodeGen/X86/mulx64.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=core-avx2 -march=x86-64 < %s | FileCheck %s
+
+define i128 @f1(i64 %a, i64 %b) {
+ %x = zext i64 %a to i128
+ %y = zext i64 %b to i128
+ %r = mul i128 %x, %y
+; CHECK: f1
+; CHECK: mulxq
+; CHECK: ret
+ ret i128 %r
+}
+
+define i128 @f2(i64 %a, i64* %p) {
+ %b = load i64* %p
+ %x = zext i64 %a to i128
+ %y = zext i64 %b to i128
+ %r = mul i128 %x, %y
+; CHECK: f2
+; CHECK: mulxq ({{.+}}), %{{.+}}, %{{.+}}
+; CHECK: ret
+ ret i128 %r
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e57e0c6..2a20e7ad6f15 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pic_jumptable.ll b/test/CodeGen/X86/pic_jumptable.ll
index 8c16dc68b291..bdd885935842 100644
--- a/test/CodeGen/X86/pic_jumptable.ll
+++ b/test/CodeGen/X86/pic_jumptable.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -relocation-model=pic -mtriple=i386-linux-gnu -asm-verbose=false \
; RUN: | FileCheck %s --check-prefix=CHECK-LINUX
+; RUN: llc < %s -relocation-model=pic -mark-data-regions -mtriple=i686-apple-darwin -asm-verbose=false \
+; RUN: | FileCheck %s --check-prefix=CHECK-DATA
; RUN: llc < %s -relocation-model=pic -mtriple=i686-apple-darwin -asm-verbose=false \
; RUN: | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-apple-darwin | not grep 'lJTI'
@@ -16,6 +18,16 @@ entry:
; CHECK: Ltmp0 = LJTI0_0-L0$pb
; CHECK-NEXT: addl Ltmp0(%eax,%ecx,4)
; CHECK-NEXT: jmpl *%eax
+
+;; When data-in-code markers are enabled, we should see them around the jump
+;; table.
+; CHECK-DATA: .data_region jt32
+; CHECK-DATA: LJTI0_0
+; CHECK-DATA: .end_data_region
+
+;; When they're not enabled, make sure we don't see them at all.
+; CHECK-NOT: .data_region
+; CHECK-LINUX-NOT: .data_region
%Y_addr = alloca i32 ; <i32*> [#uses=2]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
store i32 %Y, i32* %Y_addr
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
new file mode 100644
index 000000000000..16e9c28fcdef
--- /dev/null
+++ b/test/CodeGen/X86/pmovext.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; rdar://11897677
+
+;CHECK: intrin_pmov
+;CHECK: pmovzxbw (%{{.*}}), %xmm0
+;CHECK-NEXT: movdqu
+;CHECK-NEXT: ret
+define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable ssp {
+ %1 = bitcast i8* %src to <2 x i64>*
+ %2 = load <2 x i64>* %1, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %3) nounwind
+ %5 = bitcast i16* %dest to i8*
+ %6 = bitcast <8 x i16> %4 to <16 x i8>
+ tail call void @llvm.x86.sse2.storeu.dq(i8* %5, <16 x i8> %6) nounwind
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 800fbedb4f99..58423d195964 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind {
entry:
%G = load <4 x i8>* %p
;CHECK: movl
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
;CHECK: pand
%K = inttoptr <4 x i8> %G to <4 x i32*>
;CHECK: ret
@@ -105,7 +104,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind {
entry:
%G = load <2 x i8*>* %p
;CHECK: movl
-;CHECK: movsd
+;CHECK: pmovzxdq
%T = bitcast <2 x i8*> %G to <2 x i32*>
;CHECK: ret
ret <2 x i32*> %T
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index 5b7b5eab87ec..e7e29e0d609c 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -54,3 +54,11 @@ entry:
%f1 = fpext <8 x float> %v1 to <8 x double>
ret <8 x double> %f1
}
+
+define void @test_vector_creation() nounwind {
+ %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2
+ %2 = load double addrspace(1)* null
+ %3 = insertelement <4 x double> %1, double %2, i32 3
+ store <4 x double> %3, <4 x double>* undef
+ ret void
+}
diff --git a/test/CodeGen/X86/pr11985.ll b/test/CodeGen/X86/pr11985.ll
new file mode 100644
index 000000000000..fa378502f724
--- /dev/null
+++ b/test/CodeGen/X86/pr11985.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=prescott | FileCheck %s
+
+define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* blockaddress(@foo, %out), i64 22, i32 1, i1 false)
+ br label %out
+
+out: ; preds = %entry
+ %add = fadd float %a, %b
+ ret float %add
+; CHECK: foo
+; CHECK: movw .L{{.*}}+20(%rip), %{{.*}}
+; CHECK: movl .L{{.*}}+16(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}+8(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}(%rip), %{{.*}}
+; CHECK: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 000000000000..087b8d7539ec
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,155 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
+
+define i32 @veccond128(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond128
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @veccond256(<8 x i32> %input) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @veccond512(<16 x i32> %input) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest128(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest128
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest256(<8 x i32> %input) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest512(<16 x i32> %input) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel128
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 000000000000..024b163fa718
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
+define <16 x i8> @shuf(<16 x i8> %inval1) {
+entry:
+ %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
+ ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/pr13458.ll b/test/CodeGen/X86/pr13458.ll
new file mode 100644
index 000000000000..55548b3c3b45
--- /dev/null
+++ b/test/CodeGen/X86/pr13458.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin11.4.2"
+
+%v8_uniform_Stats.0.2.4.10 = type { i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, [7 x i32], [7 x i64] }
+
+@globalStats = external global %v8_uniform_Stats.0.2.4.10
+
+define void @MergeStats() nounwind {
+allocas:
+ %r.i.i720 = atomicrmw max i64* getelementptr inbounds (%v8_uniform_Stats.0.2.4.10* @globalStats, i64 0, i32 30), i64 0 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/X86/pr13859.ll b/test/CodeGen/X86/pr13859.ll
new file mode 100644
index 000000000000..719721dfd87b
--- /dev/null
+++ b/test/CodeGen/X86/pr13859.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.7.0"
+
+define void @_Z17FilterYUVRows_MMXi(i32 %af) nounwind ssp {
+entry:
+ %aMyAlloca = alloca i32, align 32
+ %dest = alloca <1 x i64>, align 32
+
+ %a32 = load i32* %aMyAlloca, align 4
+ %aconv = trunc i32 %a32 to i16
+ %a36 = insertelement <4 x i16> undef, i16 %aconv, i32 0
+ %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
+ %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
+ %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
+ %a40 = bitcast <4 x i16> %a39 to x86_mmx
+ %a41 = bitcast x86_mmx %a40 to <1 x i64>
+
+ %a47 = trunc i32 %a32 to i1
+ br i1 %a47, label %a48, label %a49
+
+a48:
+ unreachable
+
+a49:
+ store <1 x i64> %a41, <1 x i64>* %dest, align 8 ; !!!
+ ret void
+}
diff --git a/test/CodeGen/X86/pr13899.ll b/test/CodeGen/X86/pr13899.ll
new file mode 100644
index 000000000000..bc81e34d67e3
--- /dev/null
+++ b/test/CodeGen/X86/pr13899.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=X64
+
+; ModuleID = 'a.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S32"
+target triple = "i386-pc-win32"
+
+%v4_varying_big_struct = type { [4 x <4 x i32>] }
+
+declare <4 x i32> @"foo"(%v4_varying_big_struct, <4 x i32>) nounwind
+
+define <4 x i32> @"bar"(%v4_varying_big_struct %s, <4 x i32> %__mask) nounwind {
+allocas:
+ %calltmp = call <4 x i32> @"foo"(%v4_varying_big_struct %s, <4 x i32> %__mask)
+ ret <4 x i32> %calltmp
+; CHECK: bar
+; CHECK: andl
+; CHECK: call
+; CHECK: ret
+}
+
+declare <8 x float> @bar64(<8 x float> %i0, <8 x float> %i1,
+ <8 x float> %i2, <8 x float> %i3,
+ <8 x float> %i4, <8 x float> %i5,
+ <8 x float> %i6, <8 x float> %i7,
+ <8 x float> %i8, <8 x float> %i9)
+
+define <8 x float> @foo64(<8 x float>* %p) {
+ %1 = load <8 x float>* %p
+ %idx1 = getelementptr inbounds <8 x float>* %p, i64 1
+ %2 = load <8 x float>* %idx1
+ %idx2 = getelementptr inbounds <8 x float>* %p, i64 2
+ %3 = load <8 x float>* %idx2
+ %idx3 = getelementptr inbounds <8 x float>* %p, i64 3
+ %4 = load <8 x float>* %idx3
+ %idx4 = getelementptr inbounds <8 x float>* %p, i64 4
+ %5 = load <8 x float>* %idx4
+ %idx5 = getelementptr inbounds <8 x float>* %p, i64 5
+ %6 = load <8 x float>* %idx5
+ %idx6 = getelementptr inbounds <8 x float>* %p, i64 6
+ %7 = load <8 x float>* %idx6
+ %idx7 = getelementptr inbounds <8 x float>* %p, i64 7
+ %8 = load <8 x float>* %idx7
+ %idx8 = getelementptr inbounds <8 x float>* %p, i64 8
+ %9 = load <8 x float>* %idx8
+ %idx9 = getelementptr inbounds <8 x float>* %p, i64 9
+ %10 = load <8 x float>* %idx9
+ %r = tail call <8 x float> @bar64(<8 x float> %1, <8 x float> %2,
+ <8 x float> %3, <8 x float> %4,
+ <8 x float> %5, <8 x float> %6,
+ <8 x float> %7, <8 x float> %8,
+ <8 x float> %9, <8 x float> %10)
+ ret <8 x float> %r
+; X64: foo
+; X64: and
+; X64: call
+; X64: ret
+}
diff --git a/test/CodeGen/X86/pr14088.ll b/test/CodeGen/X86/pr14088.ll
new file mode 100644
index 000000000000..505e3b5cf262
--- /dev/null
+++ b/test/CodeGen/X86/pr14088.ll
@@ -0,0 +1,25 @@
+; RUN: llc -mtriple x86_64-linux -mcpu core2 -verify-machineinstrs %s -o - | FileCheck %s
+define i32 @f(i1 %foo, i16* %tm_year2, i8* %bar, i16 %zed, i32 %zed2) {
+entry:
+ br i1 %foo, label %return, label %if.end
+
+if.end:
+ %rem = srem i32 %zed2, 100
+ %conv3 = trunc i32 %rem to i16
+ store i16 %conv3, i16* %tm_year2
+ %sext = shl i32 %rem, 16
+ %conv5 = ashr exact i32 %sext, 16
+ %div = sdiv i32 %conv5, 10
+ %conv6 = trunc i32 %div to i8
+ store i8 %conv6, i8* %bar
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 0, %if.end ], [ -1, %entry ]
+ ret i32 %retval.0
+}
+
+; We were miscompiling this and using %ax instead of %cx in the movw.
+; CHECK: movswl %cx, %ecx
+; CHECK: movw %cx, (%rsi)
+; CHECK: movslq %ecx, %rcx
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
new file mode 100644
index 000000000000..d76b912fd8e2
--- /dev/null
+++ b/test/CodeGen/X86/pr14090.ll
@@ -0,0 +1,76 @@
+; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s
+
+define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 {
+entry:
+ %_Tmp.i39 = alloca i64, align 8
+ %retval.i33 = alloca i64, align 8
+ %_Tmp.i = alloca i64, align 8
+ %retval.i.i = alloca i64, align 8
+ %_First.i = alloca i64, align 8
+
+ %0 = load i64* %retval.i, align 8
+
+ %1 = load i64* %retval.i, align 8
+
+ %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8*
+ call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73)
+ store i64 %1, i64* %_Tmp.i39, align 8
+ %cmp.i.i.i40 = icmp slt i32 %call, 0
+ %2 = lshr i64 %1, 32
+ %3 = trunc i64 %2 to i32
+ %sub.i.i.i44 = sub i32 0, %call
+ %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44
+ %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45
+ %add.i.i.i47 = add i32 %3, %call
+ %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5
+ %trunc.i50 = trunc i64 %1 to i32
+ %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32*
+ %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728
+ %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48
+ %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53
+ %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32**
+ store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8
+ %storemerge.i.i55 = and i32 %add.i.i.i47, 31
+ %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4
+ %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to i32*
+ store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4
+ %srcval.i56 = load i64* %_Tmp.i39, align 8
+ call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73)
+
+; CHECK: Before Merge disjoint stack slots
+; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen]
+; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87]
+
+; CHECK: After Merge disjoint stack slots
+; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39]
+; CHECK: [[PREFIX87]] mem:ST4[<unknown>]
+
+ %fifteen = bitcast i64* %retval.i.i to i32**
+ %sixteen = bitcast i64* %retval.i.i to i8*
+ call void @llvm.lifetime.start(i64 8, i8* %sixteen)
+ store i32* %.ph.i80, i32** %fifteen, align 8, !tbaa !0
+ %sunkaddr = ptrtoint i64* %retval.i.i to i32
+ %sunkaddr86 = add i32 %sunkaddr, 4
+ %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
+ store i32 %fourteen, i32* %sunkaddr87, align 4, !tbaa !3
+ %seventeen = load i64* %retval.i.i, align 8
+ call void @llvm.lifetime.end(i64 8, i8* %sixteen)
+ %eighteen = lshr i64 %seventeen, 32
+ %nineteen = trunc i64 %eighteen to i32
+ %shl.i.i.i = shl i32 1, %nineteen
+
+ store i32 %shl.i.i.i, i32* %out.lo, align 8
+ store i32 %nineteen, i32* %out.hi, align 8
+
+ ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !"any pointer", metadata !1}
+!4 = metadata !{metadata !"vtable pointer", metadata !2}
diff --git a/test/CodeGen/X86/pr14098.ll b/test/CodeGen/X86/pr14098.ll
new file mode 100644
index 000000000000..6ce2449ab6a6
--- /dev/null
+++ b/test/CodeGen/X86/pr14098.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple i386-unknown-linux-gnu -relocation-model=pic -verify-machineinstrs < %s
+; We used to crash on this.
+
+declare void @foo()
+declare void @foo3(i1 %x)
+define void @bar(i1 %a1, i16 %a2) nounwind align 2 {
+bb0:
+ %a3 = trunc i16 %a2 to i8
+ %a4 = lshr i16 %a2, 8
+ %a5 = trunc i16 %a4 to i8
+ br i1 %a1, label %bb1, label %bb2
+bb1:
+ br label %bb2
+bb2:
+ %a6 = phi i8 [ 3, %bb0 ], [ %a5, %bb1 ]
+ %a7 = phi i8 [ 9, %bb0 ], [ %a3, %bb1 ]
+ %a8 = icmp eq i8 %a6, 1
+ call void @foo()
+ %a9 = icmp eq i8 %a7, 0
+ call void @foo3(i1 %a9)
+ call void @foo3(i1 %a8)
+ ret void
+}
diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll
new file mode 100644
index 000000000000..ff4532eac3ac
--- /dev/null
+++ b/test/CodeGen/X86/pr14161.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=corei7 | FileCheck %s
+
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>)
+
+define <2 x i16> @good(<4 x i32>*, <4 x i8>*) {
+entry:
+ %2 = load <4 x i32>* %0, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+ %4 = extractelement <4 x i32> %3, i32 0
+ %5 = extractelement <4 x i32> %3, i32 1
+ %6 = extractelement <4 x i32> %3, i32 2
+ %7 = extractelement <4 x i32> %3, i32 3
+ %8 = bitcast i32 %4 to <2 x i16>
+ %9 = bitcast i32 %5 to <2 x i16>
+ ret <2 x i16> %8
+; CHECK: good
+; CHECK: pminud
+; CHECK-NEXT: pmovzxwq
+; CHECK: ret
+}
+
+define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) {
+entry:
+ %2 = load <4 x i32>* %0, align 16
+ %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>)
+ %4 = extractelement <4 x i32> %3, i32 0
+ %5 = extractelement <4 x i32> %3, i32 1
+ %6 = extractelement <4 x i32> %3, i32 2
+ %7 = extractelement <4 x i32> %3, i32 3
+ %8 = bitcast i32 %4 to <2 x i16>
+ %9 = bitcast i32 %5 to <2 x i16>
+ ret <2 x i16> %9
+; CHECK: bad
+; CHECK: pminud
+; CHECK: pextrd
+; CHECK: pmovzxwq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14204.ll b/test/CodeGen/X86/pr14204.ll
new file mode 100644
index 000000000000..42e362bf3b9b
--- /dev/null
+++ b/test/CodeGen/X86/pr14204.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=core-avx2 | FileCheck %s
+
+; FIXME: vpmovsxwd should be generated instead of vpmovzxwd followed by
+; SLL/SRA.
+
+define <8 x i32> @foo(<8 x i1> %bar) nounwind readnone {
+entry:
+ %s = sext <8 x i1> %bar to <8 x i32>
+ ret <8 x i32> %s
+; CHECK: foo
+; CHECK: vpmovzxwd
+; CHECK: vpslld
+; CHECK: vpsrad
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll
new file mode 100644
index 000000000000..5388a4b01b65
--- /dev/null
+++ b/test/CodeGen/X86/pr14314.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 | FileCheck %s
+
+define i64 @atomicSub(i64* %a, i64 %b) nounwind {
+entry:
+ %0 = atomicrmw sub i64* %a, i64 %b seq_cst
+ ret i64 %0
+; CHECK: atomicSub
+; movl %eax, %ebx
+; subl {{%[a-z]+}}, %ebx
+; movl %edx, %ecx
+; sbbl {{%[a-z]+}}, %ecx
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/pr14333.ll b/test/CodeGen/X86/pr14333.ll
new file mode 100644
index 000000000000..86c12ef6b547
--- /dev/null
+++ b/test/CodeGen/X86/pr14333.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s
+%foo = type { i64, i64 }
+define void @bar(%foo* %zed) {
+ %tmp = getelementptr inbounds %foo* %zed, i64 0, i32 0
+ store i64 0, i64* %tmp, align 8
+ %tmp2 = getelementptr inbounds %foo* %zed, i64 0, i32 1
+ store i64 0, i64* %tmp2, align 8
+ %tmp3 = bitcast %foo* %zed to i8*
+ call void @llvm.memset.p0i8.i64(i8* %tmp3, i8 0, i64 16, i32 8, i1 false)
+ ret void
+}
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
new file mode 100644
index 000000000000..d048db8a850d
--- /dev/null
+++ b/test/CodeGen/X86/pr5145.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+@sc8 = external global i8
+
+define void @atomic_maxmin_i8() {
+; CHECK: atomic_maxmin_i8
+ %1 = atomicrmw max i8* @sc8, i8 5 acquire
+; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovl
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL1]]
+ %2 = atomicrmw min i8* @sc8, i8 6 acquire
+; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovg
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL3]]
+ %3 = atomicrmw umax i8* @sc8, i8 7 acquire
+; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovb
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL5]]
+ %4 = atomicrmw umin i8* @sc8, i8 8 acquire
+; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmova
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL7]]
+ ret void
+}
diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll
index 8b30dc718b08..283f48cd37b4 100644
--- a/test/CodeGen/X86/promote.ll
+++ b/test/CodeGen/X86/promote.ll
@@ -20,7 +20,7 @@ entry:
; CHECK: shuff_f
define i32 @shuff_f(<4 x i8>* %A) {
entry:
-; CHECK: pshufb
+; CHECK: pmovzxbd
; CHECK: paddd
; CHECK: pshufb
%0 = load <4 x i8>* %A, align 8
diff --git a/test/CodeGen/X86/ptr-rotate.ll b/test/CodeGen/X86/ptr-rotate.ll
index 6debd16ba5dd..fbd13b503644 100644
--- a/test/CodeGen/X86/ptr-rotate.ll
+++ b/test/CodeGen/X86/ptr-rotate.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin -o - < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin -mcpu=corei7 -o - < %s | FileCheck %s
define i32 @func(i8* %A) nounwind readnone {
entry:
diff --git a/test/CodeGen/X86/red-zone2.ll b/test/CodeGen/X86/red-zone2.ll
index f09216319e8d..3e9c7909a366 100644
--- a/test/CodeGen/X86/red-zone2.ll
+++ b/test/CodeGen/X86/red-zone2.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 > %t
-; RUN: grep subq %t | count 1
-; RUN: grep addq %t | count 1
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+; CHECK: f0:
+; CHECK: subq
+; CHECK: addq
define x86_fp80 @f0(float %f) nounwind readnone noredzone {
entry:
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index 99602fd64ff5..e95a734e048d 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
@@ -48,12 +49,25 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xfoo:
; CHECK: roll $7
+; BMI2: xfoo:
+; BMI2: rorxl $25
%0 = lshr i32 %x, 25
%1 = shl i32 %x, 7
%2 = or i32 %0, %1
ret i32 %2
}
+define i32 @xfoop(i32* %p) nounwind readnone {
+entry:
+; BMI2: xfoop:
+; BMI2: rorxl $25, ({{.+}}), %{{.+}}
+ %x = load i32* %p
+ %a = lshr i32 %x, 25
+ %b = shl i32 %x, 7
+ %c = or i32 %a, %b
+ ret i32 %c
+}
+
define i32 @xbar(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xbar:
@@ -68,12 +82,25 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xun:
; CHECK: roll $25
+; BMI2: xun:
+; BMI2: rorxl $7
%0 = lshr i32 %x, 7
%1 = shl i32 %x, 25
%2 = or i32 %0, %1
ret i32 %2
}
+define i32 @xunp(i32* %p) nounwind readnone {
+entry:
+; BMI2: xunp:
+; BMI2: rorxl $7, ({{.+}}), %{{.+}}
+ %x = load i32* %p
+ %a = lshr i32 %x, 7
+ %b = shl i32 %x, 25
+ %c = or i32 %a, %b
+ ret i32 %c
+}
+
define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK: xbu:
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index 4e082bb860b4..7fa982d83b61 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -1,8 +1,9 @@
-; RUN: llc < %s -march=x86-64 > %t
-; RUN: grep rol %t | count 3
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 > %t
+; RUN: grep rol %t | count 5
; RUN: grep ror %t | count 1
; RUN: grep shld %t | count 2
; RUN: grep shrd %t | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
@@ -42,12 +43,25 @@ entry:
define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
+; BMI2: xfoo:
+; BMI2: rorxq $57
%0 = lshr i64 %x, 57
%1 = shl i64 %x, 7
%2 = or i64 %0, %1
ret i64 %2
}
+define i64 @xfoop(i64* %p) nounwind readnone {
+entry:
+; BMI2: xfoop:
+; BMI2: rorxq $57, ({{.+}}), %{{.+}}
+ %x = load i64* %p
+ %a = lshr i64 %x, 57
+ %b = shl i64 %x, 7
+ %c = or i64 %a, %b
+ ret i64 %c
+}
+
define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
%0 = shl i64 %y, 7
@@ -58,12 +72,25 @@ entry:
define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
+; BMI2: xun:
+; BMI2: rorxq $7
%0 = lshr i64 %x, 7
%1 = shl i64 %x, 57
%2 = or i64 %0, %1
ret i64 %2
}
+define i64 @xunp(i64* %p) nounwind readnone {
+entry:
+; BMI2: xunp:
+; BMI2: rorxq $7, ({{.+}}), %{{.+}}
+ %x = load i64* %p
+ %a = lshr i64 %x, 7
+ %b = shl i64 %x, 57
+ %c = or i64 %a, %b
+ ret i64 %c
+}
+
define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
%0 = lshr i64 %y, 7
diff --git a/test/CodeGen/X86/rotate2.ll b/test/CodeGen/X86/rotate2.ll
index 2eea3999e7b8..2316c708507a 100644
--- a/test/CodeGen/X86/rotate2.ll
+++ b/test/CodeGen/X86/rotate2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 | grep rol | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | grep rol | count 2
define i64 @test1(i64 %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
new file mode 100644
index 000000000000..76eb9514f02c
--- /dev/null
+++ b/test/CodeGen/X86/rtm.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare i32 @llvm.x86.xbegin() nounwind
+declare void @llvm.x86.xend() nounwind
+declare void @llvm.x86.xabort(i8) noreturn nounwind
+
+define i32 @test_xbegin() nounwind uwtable {
+entry:
+ %0 = tail call i32 @llvm.x86.xbegin() nounwind
+ ret i32 %0
+; CHECK: test_xbegin
+; CHECK: xbegin [[LABEL:.*BB.*]]
+; CHECK: [[LABEL]]:
+}
+
+define void @test_xend() nounwind uwtable {
+entry:
+ tail call void @llvm.x86.xend() nounwind
+ ret void
+; CHECK: test_xend
+; CHECK: xend
+}
+
+define void @test_xabort() nounwind uwtable {
+entry:
+ tail call void @llvm.x86.xabort(i8 2)
+ unreachable
+; CHECK: test_xabort
+; CHECK: xabort $2
+}
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index 2e39473057b1..3bec3acdbf76 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -344,3 +344,16 @@ entry:
; ATOM: negw
; ATOM: sbbw
}
+
+define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
+ %cmp = icmp slt i32 %x, 15
+ %sel = select i1 %cmp, i8 %a, i8 %b
+ ret i8 %sel
+; CHECK: test18:
+; CHECK: cmpl $15, %edi
+; CHECK: cmovgel %edx
+
+; ATOM: test18:
+; ATOM: cmpl $15, %edi
+; ATOM: cmovgel %edx
+}
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
new file mode 100644
index 000000000000..5b2409d2396f
--- /dev/null
+++ b/test/CodeGen/X86/select_const.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mcpu=corei7 | FileCheck %s
+
+define i64 @test1(i64 %x) nounwind {
+entry:
+ %cmp = icmp eq i64 %x, 2
+ %add = add i64 %x, 1
+ %retval.0 = select i1 %cmp, i64 2, i64 %add
+ ret i64 %retval.0
+
+; CHECK: test1:
+; CHECK: leaq 1(%rdi), %rax
+; CHECK: cmpq $2, %rdi
+; CHECK: cmoveq %rdi, %rax
+; CHECK: ret
+
+}
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
new file mode 100644
index 000000000000..d1f321f17738
--- /dev/null
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -0,0 +1,178 @@
+; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s
+
+define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = shl i32 %x, %shamt
+; BMI2: shl32
+; BMI2: shlxl
+; BMI2: ret
+; BMI264: shl32
+; BMI264: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32i(i32 %x) nounwind uwtable readnone {
+entry:
+ %shl = shl i32 %x, 5
+; BMI2: shl32i
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32i
+; BMI264-NOT: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = shl i32 %x, %shamt
+; BMI2: shl32p
+; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: shl32p
+; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32pi(i32* %p) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = shl i32 %x, 5
+; BMI2: shl32pi
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32pi
+; BMI264-NOT: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = shl i64 %x, %shamt
+; BMI264: shl64
+; BMI264: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64i(i64 %x) nounwind uwtable readnone {
+entry:
+ %shl = shl i64 %x, 7
+; BMI264: shl64i
+; BMI264-NOT: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = shl i64 %x, %shamt
+; BMI264: shl64p
+; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64pi(i64* %p) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = shl i64 %x, 7
+; BMI264: shl64pi
+; BMI264-NOT: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = lshr i32 %x, %shamt
+; BMI2: lshr32
+; BMI2: shrxl
+; BMI2: ret
+; BMI264: lshr32
+; BMI264: shrxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = lshr i32 %x, %shamt
+; BMI2: lshr32p
+; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: lshr32p
+; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = lshr i64 %x, %shamt
+; BMI264: lshr64
+; BMI264: shrxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = lshr i64 %x, %shamt
+; BMI264: lshr64p
+; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = ashr i32 %x, %shamt
+; BMI2: ashr32
+; BMI2: sarxl
+; BMI2: ret
+; BMI264: ashr32
+; BMI264: sarxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i32* %p
+ %shl = ashr i32 %x, %shamt
+; BMI2: ashr32p
+; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: ashr32p
+; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = ashr i64 %x, %shamt
+; BMI264: ashr64
+; BMI264: sarxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %x = load i64* %p
+ %shl = ashr i64 %x, %shamt
+; BMI264: ashr64p
+; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 1479be1f56ba..734f48ae329f 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,6 +1,7 @@
; Make sure this testcase codegens to the sin and cos instructions, not calls
; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN
; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=SAFE
declare float @sinf(float) readonly
@@ -17,6 +18,9 @@ define float @test1(float %X) {
; SIN-NOT: fsin
+; SAFE: test1
+; SAFE-NOT: fsin
+
; SIN: test2:
define double @test2(double %X) {
%Y = call double @sin(double %X) readonly
@@ -26,6 +30,9 @@ define double @test2(double %X) {
; SIN-NOT: fsin
+; SAFE: test2
+; SAFE-NOT: fsin
+
; SIN: test3:
define x86_fp80 @test3(x86_fp80 %X) {
%Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
@@ -50,12 +57,18 @@ define float @test4(float %X) {
}
; COS: {{^[ \t]*fcos}}
+; SAFE: test4
+; SAFE-NOT: fcos
+
define double @test5(double %X) {
%Y = call double @cos(double %X) readonly
ret double %Y
}
; COS: {{^[ \t]*fcos}}
+; SAFE: test5
+; SAFE-NOT: fcos
+
define x86_fp80 @test6(x86_fp80 %X) {
%Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
ret x86_fp80 %Y
diff --git a/test/CodeGen/X86/sjlj.ll b/test/CodeGen/X86/sjlj.ll
new file mode 100644
index 000000000000..681db0094384
--- /dev/null
+++ b/test/CodeGen/X86/sjlj.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X86 %s
+; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC86 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=static | FileCheck --check-prefix=X64 %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=corei7 -relocation-model=pic | FileCheck --check-prefix=PIC64 %s
+
+@buf = internal global [5 x i8*] zeroinitializer
+
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+declare i8* @llvm.stacksave() nounwind
+
+declare i32 @llvm.eh.sjlj.setjmp(i8*) nounwind
+
+declare void @llvm.eh.sjlj.longjmp(i8*) nounwind
+
+define i32 @sj0() nounwind {
+ %fp = tail call i8* @llvm.frameaddress(i32 0)
+ store i8* %fp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 0), align 16
+ %sp = tail call i8* @llvm.stacksave()
+ store i8* %sp, i8** getelementptr inbounds ([5 x i8*]* @buf, i64 0, i64 2), align 16
+ %r = tail call i32 @llvm.eh.sjlj.setjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+ ret i32 %r
+; X86: sj0
+; x86: movl %ebp, buf
+; X86: movl %esp, buf+8
+; x86: movl ${{.*LBB.*}}, buf+4
+; X86: ret
+; PIC86: sj0
+; PIC86: movl %ebp, buf@GOTOFF(%[[GOT:.*]])
+; PIC86: movl %esp, buf@GOTOFF+8(%[[GOT]])
+; PIC86: leal {{.*LBB.*}}@GOTOFF(%[[GOT]]), %[[LREG:.*]]
+; PIC86: movl %[[LREG]], buf@GOTOFF+4
+; PIC86: ret
+; X64: sj0
+; x64: movq %rbp, buf(%rip)
+; x64: movq ${{.*LBB.*}}, buf+8(%rip)
+; X64: movq %rsp, buf+16(%rip)
+; X64: ret
+; PIC64: sj0
+; PIC64: movq %rbp, buf(%rip)
+; PIC64: movq %rsp, buf+16(%rip)
+; PIC64: leaq {{.*LBB.*}}(%rip), %[[LREG:.*]]
+; PIC64: movq %[[LREG]], buf+8(%rip)
+; PIC64: ret
+}
+
+define void @lj0() nounwind {
+ tail call void @llvm.eh.sjlj.longjmp(i8* bitcast ([5 x i8*]* @buf to i8*))
+ unreachable
+; X86: lj0
+; X86: movl buf, %ebp
+; X86: movl buf+4, %[[REG32:.*]]
+; X86: movl buf+8, %esp
+; X86: jmpl *%[[REG32]]
+; X64: lj0
+; X64: movq buf(%rip), %rbp
+; X64: movq buf+8(%rip), %[[REG64:.*]]
+; X64: movq buf+16(%rip), %rsp
+; X64: jmpq *%[[REG64]]
+}
diff --git a/test/CodeGen/X86/smul-with-overflow.ll b/test/CodeGen/X86/smul-with-overflow.ll
index 7ac3840482a2..2d0b2f7aa91d 100644
--- a/test/CodeGen/X86/smul-with-overflow.ll
+++ b/test/CodeGen/X86/smul-with-overflow.ll
@@ -67,3 +67,17 @@ entry:
; CHECK: mull
; CHECK-NEXT: ret
}
+
+declare { i63, i1 } @llvm.smul.with.overflow.i63(i63, i63) nounwind readnone
+
+define i1 @test5() nounwind {
+entry:
+ %res = call { i63, i1 } @llvm.smul.with.overflow.i63(i63 4, i63 4611686018427387903)
+ %sum = extractvalue { i63, i1 } %res, 0
+ %overflow = extractvalue { i63, i1 } %res, 1
+ ret i1 %overflow
+; This was returning false but should return true (it is not constant folded yet, though).
+; PR13991
+; CHECK: test5:
+; CHECK-NOT: xorb
+}
diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll
new file mode 100644
index 000000000000..188505072f05
--- /dev/null
+++ b/test/CodeGen/X86/sse-intel-ocl.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl %eax, (%esp)
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: leaq {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+; test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+; test calling conventions - preserved registers
+
+; preserved xmm6-xmm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: addps {{%xmm[6-9]}}, {{.*}}
+; WIN64: addps {{%xmm[6-9]}}, {{.*}}
+; WIN64: ret
+
+; preserved xmm8-xmm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: call
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 3839e875615f..0ba02155a657 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -47,8 +47,7 @@ define double @olt(double %x, double %y) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ogt_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -65,8 +64,7 @@ define double @ogt_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: olt_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -107,8 +105,7 @@ define double @ole(double %x, double %y) nounwind {
; CHECK: oge_inverse:
; CHECK-NEXT: ucomisd %xmm1, %xmm0
; UNSAFE: oge_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -123,8 +120,7 @@ define double @oge_inverse(double %x, double %y) nounwind {
; CHECK: ole_inverse:
; CHECK-NEXT: ucomisd %xmm0, %xmm1
; UNSAFE: ole_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -142,7 +138,8 @@ define double @ole_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: ret
; UNSAFE: ogt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -160,7 +157,8 @@ define double @ogt_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE: olt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -218,7 +216,8 @@ define double @olt_inverse_x(double %x) nounwind {
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: oge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -234,7 +233,8 @@ define double @oge_x(double %x) nounwind {
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ole_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -313,8 +313,7 @@ define double @ult(double %x, double %y) nounwind {
; CHECK: ugt_inverse:
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ugt_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -329,8 +328,7 @@ define double @ugt_inverse(double %x, double %y) nounwind {
; CHECK: ult_inverse:
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: ult_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -378,8 +376,7 @@ define double @ule(double %x, double %y) nounwind {
; CHECK-NEXT: minsd %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: uge_inverse:
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_inverse:
; FINITE-NEXT: minsd %xmm0, %xmm1
@@ -395,8 +392,7 @@ define double @uge_inverse(double %x, double %y) nounwind {
; CHECK-NEXT: maxsd %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ule_inverse:
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_inverse:
; FINITE-NEXT: maxsd %xmm0, %xmm1
@@ -412,7 +408,8 @@ define double @ule_inverse(double %x, double %y) nounwind {
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ugt_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -428,7 +425,8 @@ define double @ugt_x(double %x) nounwind {
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: ult_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -483,7 +481,8 @@ define double @ult_inverse_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE: uge_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -502,7 +501,8 @@ define double @uge_x(double %x) nounwind {
; CHECK-NEXT: ret
; UNSAFE: ule_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd %xmm0, %xmm1
+; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_x:
; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -590,9 +590,7 @@ define double @olt_y(double %x) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ogt_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -611,9 +609,7 @@ define double @ogt_inverse_y(double %x) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: olt_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -657,9 +653,7 @@ define double @ole_y(double %x) nounwind {
; CHECK: oge_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: oge_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -675,9 +669,7 @@ define double @oge_inverse_y(double %x) nounwind {
; CHECK: ole_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: ole_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -721,9 +713,7 @@ define double @ult_y(double %x) nounwind {
; CHECK: ugt_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: ugt_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -739,9 +729,7 @@ define double @ugt_inverse_y(double %x) nounwind {
; CHECK: ult_inverse_y:
; CHECK: ucomisd %xmm
; UNSAFE: ult_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -792,9 +780,7 @@ define double @ule_y(double %x) nounwind {
; CHECK-NEXT: minsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
; UNSAFE: uge_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: minsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: minsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
@@ -811,9 +797,7 @@ define double @uge_inverse_y(double %x) nounwind {
; CHECK-NEXT: maxsd {{[^,]*}}, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ule_inverse_y:
-; UNSAFE-NEXT: movsd {{[^,]*}}, %xmm1
-; UNSAFE-NEXT: maxsd %xmm0, %xmm1
-; UNSAFE-NEXT: movap{{[sd]}} %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd {{[^,]*}}, %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_inverse_y:
; FINITE-NEXT: movsd {{[^,]*}}, %xmm1
diff --git a/test/CodeGen/X86/sse_partial_update.ll b/test/CodeGen/X86/sse_partial_update.ll
new file mode 100644
index 000000000000..655f75800cff
--- /dev/null
+++ b/test/CodeGen/X86/sse_partial_update.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
+
+; rdar: 12558838
+; PR14221
+; There is a mismatch between the intrinsic and the actual instruction.
+; The actual instruction has a partial update of dest, while the intrinsic
+; passes through the upper FP values. Here, we make sure the source and
+; destination of rsqrtss are the same.
+define void @t1(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t1:
+; CHECK: rsqrtss %xmm0, %xmm0
+ %0 = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a) nounwind
+ %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+ %conv = fpext float %a.addr.0.extract to double
+ %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+ %conv3 = fpext float %a.addr.4.extract to double
+ tail call void @callee(double %conv, double %conv3) nounwind
+ ret void
+}
+declare void @callee(double, double)
+declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
+
+define void @t2(<4 x float> %a) nounwind uwtable ssp {
+entry:
+; CHECK: t2:
+; CHECK: rcpss %xmm0, %xmm0
+ %0 = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a) nounwind
+ %a.addr.0.extract = extractelement <4 x float> %0, i32 0
+ %conv = fpext float %a.addr.0.extract to double
+ %a.addr.4.extract = extractelement <4 x float> %0, i32 1
+ %conv3 = fpext float %a.addr.4.extract to double
+ tail call void @callee(double %conv, double %conv3) nounwind
+ ret void
+}
+declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index 70307534156e..ecc253ba587e 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -1,6 +1,4 @@
-; RUN: llc < %s | FileCheck %s
-target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-target triple = "x86_64-apple-darwin11.4.0"
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=core2 < %s | FileCheck %s
declare i64 @testi()
@@ -93,4 +91,67 @@ define { i64, i64 } @crash(i8* %this) {
ret { i64, i64 } %mrv7
}
+; Check that we can fold an indexed load into a tail call instruction.
+; CHECK: fold_indexed_load
+; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]]
+; CHECK: jmpq *16(%{{r..}},%[[RAX]],8) # TAILCALL
+%struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 }
+@func_table = external global [0 x %struct.funcs]
+define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp {
+entry:
+ %dsplen = getelementptr inbounds [0 x %struct.funcs]* @func_table, i64 0, i64 %idxprom, i32 2
+ %x1 = load i32 (i8*)** %dsplen, align 8
+ %call = tail call i32 %x1(i8* %mbstr) nounwind
+ ret void
+}
+
+; <rdar://problem/12282281> Fold an indexed load into the tail call instruction.
+; Calling a varargs function with 6 arguments requires 7 registers (%al is the
+; vector count for varargs functions). This leaves %r11 as the only available
+; scratch register.
+;
+; It is not possible to fold an indexed load into TCRETURNmi64 in that case.
+;
+; typedef int (*funcptr)(void*, ...);
+; extern const funcptr funcs[];
+; int f(int n) {
+; return funcs[n](0, 0, 0, 0, 0, 0);
+; }
+;
+; CHECK: rdar12282281
+; CHECK: jmpq *%r11 # TAILCALL
+@funcs = external constant [0 x i32 (i8*, ...)*]
+
+define i32 @rdar12282281(i32 %n) nounwind uwtable ssp {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom
+ %0 = load i32 (i8*, ...)** %arrayidx, align 8
+ %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
+ ret i32 %call
+}
+
+define x86_fp80 @fp80_call(x86_fp80 %x) nounwind {
+entry:
+; CHECK: fp80_call:
+; CHECK: jmp _fp80_callee
+ %call = tail call x86_fp80 @fp80_callee(x86_fp80 %x) nounwind
+ ret x86_fp80 %call
+}
+
+declare x86_fp80 @fp80_callee(x86_fp80)
+
+; rdar://12229511
+define x86_fp80 @trunc_fp80(x86_fp80 %x) nounwind {
+entry:
+; CHECK: trunc_fp80
+; CHECK: callq _trunc
+; CHECK-NOT: jmp _trunc
+; CHECK: ret
+ %conv = fptrunc x86_fp80 %x to double
+ %call = tail call double @trunc(double %conv) nounwind readnone
+ %conv1 = fpext double %call to x86_fp80
+ ret x86_fp80 %conv1
+}
+declare double @trunc(double) nounwind readnone
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
index ba5f8f83619f..a773e9daeff8 100644
--- a/test/CodeGen/X86/targetLoweringGeneric.ll
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=corei7 -fast-isel=false -O0 < %s | FileCheck %s
; Gather non-machine specific tests for the transformations in
; CodeGen/SelectionDAG/TargetLowering. Currently, these
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d2363f8b..b823f0af2cdf 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@ entry:
; X32: f5:
; X32: leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
; X64: f5:
; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq __tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq __tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll b/test/CodeGen/X86/trunc-ext-ld-st.ll
index 9877d7be169b..1d22a185def3 100644
--- a/test/CodeGen/X86/trunc-ext-ld-st.ll
+++ b/test/CodeGen/X86/trunc-ext-ld-st.ll
@@ -2,8 +2,7 @@
;CHECK: load_2_i8
; A single 16-bit load
-;CHECK: movzwl
-;CHECK: pshufb
+;CHECK: pmovzxbq
;CHECK: paddq
;CHECK: pshufb
; A single 16-bit store
@@ -19,8 +18,7 @@ define void @load_2_i8(<2 x i8>* %A) {
;CHECK: load_2_i16
; Read 32-bits
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxwq
;CHECK: paddq
;CHECK: pshufb
;CHECK: movd
@@ -33,7 +31,7 @@ define void @load_2_i16(<2 x i16>* %A) {
}
;CHECK: load_2_i32
-;CHECK: pshufd
+;CHECK: pmovzxdq
;CHECK: paddq
;CHECK: pshufd
;CHECK: ret
@@ -45,8 +43,7 @@ define void @load_2_i32(<2 x i32>* %A) {
}
;CHECK: load_4_i8
-;CHECK: movd
-;CHECK: pshufb
+;CHECK: pmovzxbd
;CHECK: paddd
;CHECK: pshufb
;CHECK: ret
@@ -58,7 +55,7 @@ define void @load_4_i8(<4 x i8>* %A) {
}
;CHECK: load_4_i16
-;CHECK: punpcklwd
+;CHECK: pmovzxwd
;CHECK: paddd
;CHECK: pshufb
;CHECK: ret
@@ -70,7 +67,7 @@ define void @load_4_i16(<4 x i16>* %A) {
}
;CHECK: load_8_i8
-;CHECK: punpcklbw
+;CHECK: pmovzxbw
;CHECK: paddw
;CHECK: pshufb
;CHECK: ret
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 46d6a23554f4..4da79538dbf6 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
entry:
; CHECK: cfi_def_cfa_offset
; CHECK-NOT: set
-; CHECK: punpcklwd
-; CHECK: pshufd
+; CHECK: pmovzxwq
; CHECK: pshufb
%shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
%cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
new file mode 100644
index 000000000000..82517cb9a5a0
--- /dev/null
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @fabs_v2f64(<2 x double> %p)
+{
+ ; CHECK: fabs_v2f64
+ ; CHECK: vandps
+ %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+
+define <4 x float> @fabs_v4f32(<4 x float> %p)
+{
+ ; CHECK: fabs_v4f32
+ ; CHECK: vandps
+ %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+
+define <4 x double> @fabs_v4f64(<4 x double> %p)
+{
+ ; CHECK: fabs_v4f64
+ ; CHECK: vandps
+ %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+
+define <8 x float> @fabs_v8f32(<8 x float> %p)
+{
+ ; CHECK: fabs_v8f32
+ ; CHECK: vandps
+ %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
new file mode 100644
index 000000000000..5e0160bd2856
--- /dev/null
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @floor_v2f64(<2 x double> %p)
+{
+ ; CHECK: floor_v2f64
+ ; CHECK: vroundpd
+ %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+
+define <4 x float> @floor_v4f32(<4 x float> %p)
+{
+ ; CHECK: floor_v4f32
+ ; CHECK: vroundps
+ %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+
+define <4 x double> @floor_v4f64(<4 x double> %p)
+{
+ ; CHECK: floor_v4f64
+ ; CHECK: vroundpd
+ %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+
+define <8 x float> @floor_v8f32(<8 x float> %p)
+{
+ ; CHECK: floor_v8f32
+ ; CHECK: vroundps
+ %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 05b263e2e0c4..dc0464ff9e0f 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,14 +1,38 @@
; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
entry:
-; TODO: We should be able to generate cvtps2pd for the load.
-; For now, just check that we generate something sane.
-; CHECK: cvtss2sd
-; CHECK: cvtss2sd
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
%0 = load <2 x float>* %in, align 8
%1 = fpext <2 x float> %0 to <2 x double>
store <2 x double> %1, <2 x double>* %out, align 1
ret void
}
+
+define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <4 x float>* %in
+ %1 = fpext <4 x float> %0 to <4 x double>
+ store <4 x double> %1, <4 x double>* %out, align 1
+ ret void
+}
+
+define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <8 x float>* %in
+ %1 = fpext <8 x float> %0 to <8 x double>
+ store <8 x double> %1, <8 x double>* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6bb114b..4c56f848dedb 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
; Transpose example using the more generic vector shuffle. Return float8
; instead of float16
@@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0"
define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@ entry:
; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >;
ret <8 x float> %r2
}
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to ensure it gets the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: movd %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+ %v.i = bitcast float* %y to <4 x float>*
+ %0 = load <4 x float>* %v.i, align 1
+ %1 = bitcast float* %x to <1 x i64>*
+ %.val = load <1 x i64>* %1, align 1
+ %2 = bitcast <1 x i64> %.val to <2 x float>
+ %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %cast.i = bitcast <4 x float> %0 to <2 x i64>
+ %extract.i = extractelement <2 x i64> %cast.i, i32 1
+ %3 = bitcast float* %x to i64*
+ store i64 %extract.i, i64* %3, align 4
+ %4 = bitcast <4 x float> %0 to <16 x i8>
+ %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+ %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %6 = bitcast <16 x i8> %palignr to <2 x i64>
+ ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/vec_shuffle-30.ll b/test/CodeGen/X86/vec_shuffle-30.ll
index 1651c4cdace2..f5f88426058c 100644
--- a/test/CodeGen/X86/vec_shuffle-30.ll
+++ b/test/CodeGen/X86/vec_shuffle-30.ll
@@ -1,21 +1,25 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep pshufhw %t | grep -- -95 | count 1
-; RUN: grep shufps %t | count 1
-; RUN: not grep pslldq %t
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck %s
+; CHECK: test
; Test case when creating pshufhw, we incorrectly set the higher order bit
; for an undef,
define void @test(<8 x i16>* %dest, <8 x i16> %in) nounwind {
entry:
+; CHECK-NOT: vmovaps
+; CHECK: vmovlpd
+; CHECK: vpshufhw $-95
%0 = load <8 x i16>* %dest
%1 = shufflevector <8 x i16> %0, <8 x i16> %in, <8 x i32> < i32 0, i32 1, i32 2, i32 3, i32 13, i32 undef, i32 14, i32 14>
store <8 x i16> %1, <8 x i16>* %dest
ret void
-}
+}
+; CHECK: test2
; A test case where we shouldn't generate a punpckldq but a pshufd and a pslldq
define void @test2(<4 x i32>* %dest, <4 x i32> %in) nounwind {
entry:
+; CHECK-NOT: pslldq
+; CHECK: shufps
%0 = shufflevector <4 x i32> %in, <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> < i32 undef, i32 5, i32 undef, i32 2>
store <4 x i32> %0, <4 x i32>* %dest
ret void
diff --git a/test/CodeGen/X86/widen_cast-1.ll b/test/CodeGen/X86/widen_cast-1.ll
index ebdfea9a37f7..56c63644e02e 100644
--- a/test/CodeGen/X86/widen_cast-1.ll
+++ b/test/CodeGen/X86/widen_cast-1.ll
@@ -1,5 +1,5 @@
; RUN: llc -march=x86 -mcpu=generic -mattr=+sse42 < %s | FileCheck %s
-; RUN: llc -march=x86 -mcpu=atom -mattr=+sse42 < %s | FileCheck -check-prefix=ATOM %s
+; RUN: llc -march=x86 -mcpu=atom < %s | FileCheck -check-prefix=ATOM %s
; CHECK: paddd
; CHECK: movl
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d149ddcc..dfaa3d6dc91a 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
; This load should be before the call, not after.
-; CHECK: movaps compl+128(%rip), %xmm0
-; CHECK: movaps %xmm0, (%rsp)
-; CHECK: callq killcommon
+; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movaps %xmm0, (%rsp)
+; SSE: callq killcommon
+
+; AVX: vmovapd compl+128(%rip), %xmm0
+; AVX: vmovapd %xmm0, (%rsp)
+; AVX: callq killcommon
@compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 79aa00050254..224898c1a3e5 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -170,7 +170,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
; CHECK: rot
%i8vec3pack = type { <3 x i8>, i8 }
define %i8vec3pack @rot() nounwind {
-; CHECK: movd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
+; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}}
entry:
%X = alloca %i8vec3pack, align 4
%rot = alloca %i8vec3pack, align 4
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
new file mode 100644
index 000000000000..486dafeb5a24
--- /dev/null
+++ b/test/CodeGen/X86/xmulo.ll
@@ -0,0 +1,50 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+declare i32 @printf(i8*, ...)
+
+@.str = private unnamed_addr constant [10 x i8] c"%llx, %d\0A\00", align 1
+
+define i32 @t1() nounwind {
+; CHECK: t1:
+; CHECK: movl $0, 12(%esp)
+; CHECK: movl $0, 8(%esp)
+; CHECK: movl $72, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
+
+define i32 @t2() nounwind {
+; CHECK: t2:
+; CHECK: movl $0, 12(%esp)
+; CHECK: movl $0, 8(%esp)
+; CHECK: movl $0, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
+
+define i32 @t3() nounwind {
+; CHECK: t3:
+; CHECK: movl $1, 12(%esp)
+; CHECK: movl $-1, 8(%esp)
+; CHECK: movl $-9, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
diff --git a/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
new file mode 100755
index 000000000000..9a1d5383caac
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-inl-test.elf-x86-64
Binary files differ
diff --git a/test/DebugInfo/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll
index db7bb0ad6030..559f032cb3a6 100644
--- a/test/DebugInfo/2010-04-13-PubType.ll
+++ b/test/DebugInfo/X86/2010-04-13-PubType.ll
@@ -1,6 +1,6 @@
-; RUN: llc -O0 -asm-verbose < %s > %t
-; RUN: grep "External Name" %t | grep -v X
-; RUN: grep "External Name" %t | grep Y | count 1
+; RUN: llc -O0 -asm-verbose -mtriple=x86_64-macosx < %s | FileCheck %s
+; CHECK-NOT: .asciz "X" ## External Name
+; CHECK: .asciz "Y" ## External Name
; Test to check type with no definition is listed in pubtypes section.
%struct.X = type opaque
%struct.Y = type { i32 }
diff --git a/test/DebugInfo/X86/DW_AT_byte_size.ll b/test/DebugInfo/X86/DW_AT_byte_size.ll
index 59280e027f35..25b5f00c6af6 100644
--- a/test/DebugInfo/X86/DW_AT_byte_size.ll
+++ b/test/DebugInfo/X86/DW_AT_byte_size.ll
@@ -4,7 +4,8 @@
; Checks that we don't emit a size for a pointer type.
; CHECK: DW_TAG_pointer_type
; CHECK-NEXT: DW_AT_type
-; CHECK-NOT-NEXT: DW_AT_byte_size
+; CHECK-NOT: DW_AT_byte_size
+; CHECK: .debug_info contents
%struct.A = type { i32 }
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
new file mode 100644
index 000000000000..163a1e7cec73
--- /dev/null
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x00bf => {0x000000bf})
+; CHECK: 0x000000bf: DW_TAG_formal_parameter [12]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000085] = "this")
+
+%class.A = type { i32 }
+
+define i32 @_Z3foov() nounwind uwtable ssp {
+entry:
+ %a = alloca %class.A, align 4
+ call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21), !dbg !23
+ call void @_ZN1AC1Ev(%class.A* %a), !dbg !24
+ %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25
+ %0 = load i32* %m_a, align 4, !dbg !25
+ ret i32 %0, !dbg !25
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.A*, align 8
+ store %class.A* %this, %class.A** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26), !dbg !28
+ %this1 = load %class.A** %this.addr
+ call void @_ZN1AC2Ev(%class.A* %this1), !dbg !29
+ ret void, !dbg !29
+}
+
+define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.A*, align 8
+ store %class.A* %this, %class.A** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30), !dbg !31
+ %this1 = load %class.A** %this.addr
+ %m_a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !32
+ store i32 0, i32* %m_a, align 4, !dbg !32
+ ret void, !dbg !34
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !10, metadata !20}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
+!6 = metadata !{i32 786473, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC1Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{null, metadata !13}
+!13 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!14 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [from ]
+!15 = metadata !{metadata !16, metadata !17}
+!16 = metadata !{i32 786445, metadata !14, metadata !"m_a", metadata !6, i32 4, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
+!17 = metadata !{i32 786478, i32 0, metadata !14, metadata !"A", metadata !"A", metadata !"", metadata !6, i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!21 = metadata !{i32 786688, metadata !22, metadata !"a", metadata !6, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!22 = metadata !{i32 786443, metadata !5, i32 7, i32 11, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!23 = metadata !{i32 8, i32 5, metadata !22, null}
+!24 = metadata !{i32 8, i32 6, metadata !22, null}
+!25 = metadata !{i32 9, i32 3, metadata !22, null}
+!26 = metadata !{i32 786689, metadata !10, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!27 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!28 = metadata !{i32 3, i32 3, metadata !10, null}
+!29 = metadata !{i32 3, i32 18, metadata !10, null}
+!30 = metadata !{i32 786689, metadata !20, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!31 = metadata !{i32 3, i32 3, metadata !20, null}
+!32 = metadata !{i32 3, i32 9, metadata !33, null}
+!33 = metadata !{i32 786443, metadata !20, i32 3, i32 7, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!34 = metadata !{i32 3, i32 18, metadata !33, null}
diff --git a/test/DebugInfo/X86/concrete_out_of_line.ll b/test/DebugInfo/X86/concrete_out_of_line.ll
index a22707189b08..58fb05573670 100644
--- a/test/DebugInfo/X86/concrete_out_of_line.ll
+++ b/test/DebugInfo/X86/concrete_out_of_line.ll
@@ -7,16 +7,15 @@
; first check that we have a TAG_subprogram at a given offset and it has
; AT_inline.
-; CHECK: 0x00000134: DW_TAG_subprogram [18]
-; CHECK-NEXT: DW_AT_MIPS_linkage_name
+; CHECK: 0x0000011e: DW_TAG_subprogram [18]
; CHECK-NEXT: DW_AT_specification
; CHECK-NEXT: DW_AT_inline
; and then that a TAG_subprogram refers to it with AT_abstract_origin.
-; CHECK: 0x00000184: DW_TAG_subprogram [20]
-; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] (cu + 0x0134 => {0x00000134})
+; CHECK: 0x0000015f: DW_TAG_subprogram [20]
+; CHECK-NEXT: DW_AT_abstract_origin [DW_FORM_ref4] (cu + 0x011e => {0x0000011e})
define i32 @_ZN17nsAutoRefCnt7ReleaseEv() {
entry:
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
new file mode 100644
index 000000000000..b908bcefe478
--- /dev/null
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -0,0 +1,109 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: 0x0000000b: DW_TAG_compile_unit
+; CHECK: 0x00000012: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp")
+; CHECK: 0x0000003c: DW_TAG_class_type
+; CHECK: 0x0000003d: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000006d] = "D")
+; CHECK: 0x00000044: DW_TAG_member
+; CHECK: 0x00000045: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000005d] = "c1")
+
+%class.D = type { i32, i32, i32, i32 }
+
+@_ZN1DC1Ev = alias void (%class.D*)* @_ZN1DC2Ev
+@_ZN1DC1ERKS_ = alias void (%class.D*, %class.D*)* @_ZN1DC2ERKS_
+
+define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr nounwind uwtable align 2 {
+entry:
+ tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29), !dbg !36
+ %c1 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !37
+ store i32 1, i32* %c1, align 4, !dbg !37, !tbaa !39
+ %c2 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !42
+ store i32 2, i32* %c2, align 4, !dbg !42, !tbaa !39
+ %c3 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !43
+ store i32 3, i32* %c3, align 4, !dbg !43, !tbaa !39
+ %c4 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !44
+ store i32 4, i32* %c4, align 4, !dbg !44, !tbaa !39
+ ret void, !dbg !45
+}
+
+define void @_ZN1DC2ERKS_(%class.D* nocapture %this, %class.D* nocapture %d) unnamed_addr nounwind uwtable align 2 {
+entry:
+ tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34), !dbg !46
+ tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35), !dbg !46
+ %c1 = getelementptr inbounds %class.D* %d, i64 0, i32 0, !dbg !47
+ %0 = load i32* %c1, align 4, !dbg !47, !tbaa !39
+ %c12 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !47
+ store i32 %0, i32* %c12, align 4, !dbg !47, !tbaa !39
+ %c2 = getelementptr inbounds %class.D* %d, i64 0, i32 1, !dbg !49
+ %1 = load i32* %c2, align 4, !dbg !49, !tbaa !39
+ %c23 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !49
+ store i32 %1, i32* %c23, align 4, !dbg !49, !tbaa !39
+ %c3 = getelementptr inbounds %class.D* %d, i64 0, i32 2, !dbg !50
+ %2 = load i32* %c3, align 4, !dbg !50, !tbaa !39
+ %c34 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !50
+ store i32 %2, i32* %c34, align 4, !dbg !50, !tbaa !39
+ %c4 = getelementptr inbounds %class.D* %d, i64 0, i32 3, !dbg !51
+ %3 = load i32* %c4, align 4, !dbg !51, !tbaa !39
+ %c45 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !51
+ store i32 %3, i32* %c45, align 4, !dbg !51, !tbaa !39
+ ret void, !dbg !52
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !31}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", metadata !6, i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!10 = metadata !{i32 786434, null, metadata !"D", metadata !6, i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [from ]
+!11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20}
+!12 = metadata !{i32 786445, metadata !10, metadata !"c1", metadata !6, i32 6, i64 32, i64 32, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
+!13 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{i32 786445, metadata !10, metadata !"c2", metadata !6, i32 7, i64 32, i64 32, i64 32, i32 1, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
+!15 = metadata !{i32 786445, metadata !10, metadata !"c3", metadata !6, i32 8, i64 32, i64 32, i64 64, i32 1, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
+!16 = metadata !{i32 786445, metadata !10, metadata !"c4", metadata !6, i32 9, i64 32, i64 32, i64 96, i32 1, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
+!17 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [D]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 4, metadata !21, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !25, i32 4} ; [ DW_TAG_subprogram ] [line 4] [D]
+!21 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{null, metadata !9, metadata !23}
+!23 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
+!25 = metadata !{metadata !26}
+!26 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!27 = metadata !{metadata !28}
+!28 = metadata !{metadata !29}
+!29 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777228, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 12]
+!30 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!31 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2ERKS_", metadata !6, i32 19, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
+!32 = metadata !{metadata !33}
+!33 = metadata !{metadata !34, metadata !35}
+!34 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777235, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 19]
+!35 = metadata !{i32 786689, metadata !31, metadata !"d", metadata !6, i32 33554451, metadata !23, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 19]
+!36 = metadata !{i32 12, i32 0, metadata !5, null}
+!37 = metadata !{i32 13, i32 0, metadata !38, null}
+!38 = metadata !{i32 786443, metadata !5, i32 12, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!39 = metadata !{metadata !"int", metadata !40}
+!40 = metadata !{metadata !"omnipotent char", metadata !41}
+!41 = metadata !{metadata !"Simple C/C++ TBAA"}
+!42 = metadata !{i32 14, i32 0, metadata !38, null}
+!43 = metadata !{i32 15, i32 0, metadata !38, null}
+!44 = metadata !{i32 16, i32 0, metadata !38, null}
+!45 = metadata !{i32 17, i32 0, metadata !38, null}
+!46 = metadata !{i32 19, i32 0, metadata !31, null}
+!47 = metadata !{i32 20, i32 0, metadata !48, null}
+!48 = metadata !{i32 786443, metadata !31, i32 19, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!49 = metadata !{i32 21, i32 0, metadata !48, null}
+!50 = metadata !{i32 22, i32 0, metadata !48, null}
+!51 = metadata !{i32 23, i32 0, metadata !48, null}
+!52 = metadata !{i32 24, i32 0, metadata !48, null}
diff --git a/test/DebugInfo/X86/enum-fwd-decl.ll b/test/DebugInfo/X86/enum-fwd-decl.ll
index c2dacea4839a..0902430008c1 100644
--- a/test/DebugInfo/X86/enum-fwd-decl.ll
+++ b/test/DebugInfo/X86/enum-fwd-decl.ll
@@ -5,16 +5,14 @@
!llvm.dbg.cu = !{!0}
-!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", metadata !"clang version 3.2 (trunk 157772) (llvm/trunk 157761)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !6, metadata !6, metadata !7} ; [ DW_TAG_compile_unit ]
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/tmp", metadata !"clang version 3.2 (trunk 165274) (llvm/trunk 165272)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !1, metadata !3} ; [ DW_TAG_compile_unit ] [/tmp/foo.cpp] [DW_LANG_C_plus_plus]
!1 = metadata !{metadata !2}
-!2 = metadata !{metadata !3}
-!3 = metadata !{i32 786436, null, metadata !"E", metadata !4, i32 1, i64 16, i64 16, i32 0, i32 4, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_enumeration_type ]
-!4 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo/tmp", null} ; [ DW_TAG_file_type ]
-!5 = metadata !{i32 0}
-!6 = metadata !{metadata !5}
-!7 = metadata !{metadata !8}
-!8 = metadata !{metadata !9}
-!9 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !4, i32 2, metadata !3, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ]
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786484, i32 0, null, metadata !"e", metadata !"e", metadata !"", metadata !6, i32 2, metadata !7, i32 0, i32 1, i16* @e} ; [ DW_TAG_variable ] [e] [line 2] [def]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786436, null, metadata !"E", metadata !6, i32 1, i64 16, i64 16, i32 0, i32 4, null, null, i32 0} ; [ DW_TAG_enumeration_type ] [E] [line 1, size 16, align 16, offset 0] [fwd] [from ]
; CHECK: DW_TAG_enumeration_type
; CHECK-NEXT: DW_AT_name
diff --git a/test/DebugInfo/X86/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll
new file mode 100644
index 000000000000..b98492383ac3
--- /dev/null
+++ b/test/DebugInfo/X86/linkage-name.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=x86_64-macosx -darwin-gdb-compat=Disable %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_TAG_subprogram [9] *
+; CHECK-NOT: DW_AT_MIPS_linkage_name
+; CHECK: DW_AT_specification
+
+%class.A = type { i8 }
+
+@a = global %class.A zeroinitializer, align 1
+
+define i32 @_ZN1A1aEi(%class.A* %this, i32 %b) nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.A*, align 8
+ %b.addr = alloca i32, align 4
+ store %class.A* %this, %class.A** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !21), !dbg !23
+ store i32 %b, i32* %b.addr, align 4
+ call void @llvm.dbg.declare(metadata !{i32* %b.addr}, metadata !24), !dbg !25
+ %this1 = load %class.A** %this.addr
+ %0 = load i32* %b.addr, align 4, !dbg !26
+ ret i32 %0, !dbg !26
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/Users/echristo", metadata !"clang version 3.1 (trunk 152691) (llvm/trunk 152692)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !18} ; [ DW_TAG_compile_unit ]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 5, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (%class.A*, i32)* @_ZN1A1aEi, null, metadata !13, metadata !16} ; [ DW_TAG_subprogram ]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/Users/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ]
+!8 = metadata !{metadata !9, metadata !10, metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
+!10 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 64, metadata !11} ; [ DW_TAG_pointer_type ]
+!11 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !12, i32 0, null, null} ; [ DW_TAG_class_type ]
+!12 = metadata !{metadata !13}
+!13 = metadata !{i32 786478, i32 0, metadata !11, metadata !"a", metadata !"a", metadata !"_ZN1A1aEi", metadata !6, i32 2, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 257, i1 false, null, null, i32 0, metadata !14} ; [ DW_TAG_subprogram ]
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!16 = metadata !{metadata !17}
+!17 = metadata !{i32 786468} ; [ DW_TAG_base_type ]
+!18 = metadata !{metadata !19}
+!19 = metadata !{metadata !20}
+!20 = metadata !{i32 786484, i32 0, null, metadata !"a", metadata !"a", metadata !"", metadata !6, i32 9, metadata !11, i32 0, i32 1, %class.A* @a} ; [ DW_TAG_variable ]
+!21 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777221, metadata !22, i32 64, i32 0} ; [ DW_TAG_arg_variable ]
+!22 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !11} ; [ DW_TAG_pointer_type ]
+!23 = metadata !{i32 5, i32 8, metadata !5, null}
+!24 = metadata !{i32 786689, metadata !5, metadata !"b", metadata !6, i32 33554437, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ]
+!25 = metadata !{i32 5, i32 14, metadata !5, null}
+!26 = metadata !{i32 6, i32 4, metadata !27, null}
+!27 = metadata !{i32 786443, metadata !5, i32 5, i32 17, metadata !6, i32 0} ; [ DW_TAG_lexical_block ]
diff --git a/test/DebugInfo/X86/pr13303.ll b/test/DebugInfo/X86/pr13303.ll
deleted file mode 100644
index e820cb564cf0..000000000000
--- a/test/DebugInfo/X86/pr13303.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc %s -o %t -filetype=obj -mtriple=x86_64-unknown-linux-gnu
-; RUN: llvm-dwarfdump %t | FileCheck %s
-; PR13303
-
-; Check that the prologue ends with is_stmt here.
-; CHECK: 0x0000000000000000 {{.*}} is_stmt
-
-define i32 @main() nounwind uwtable {
-entry:
- %retval = alloca i32, align 4
- store i32 0, i32* %retval
- ret i32 0, !dbg !10
-}
-
-!llvm.dbg.cu = !{!0}
-
-!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"PR13303.c", metadata !"/home/probinson", metadata !"clang version 3.2 (trunk 160143)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/probinson/PR13303.c] [DW_LANG_C99]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{metadata !4}
-!4 = metadata !{metadata !5}
-!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"main", metadata !"main", metadata !"", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @main, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [main]
-!6 = metadata !{i32 786473, metadata !"PR13303.c", metadata !"/home/probinson", null} ; [ DW_TAG_file_type ]
-!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!8 = metadata !{metadata !9}
-!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
-!10 = metadata !{i32 1, i32 14, metadata !11, null}
-!11 = metadata !{i32 786443, metadata !5, i32 1, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/probinson/PR13303.c]
diff --git a/test/DebugInfo/X86/prologue-stack.ll b/test/DebugInfo/X86/prologue-stack.ll
new file mode 100644
index 000000000000..929db5190267
--- /dev/null
+++ b/test/DebugInfo/X86/prologue-stack.ll
@@ -0,0 +1,35 @@
+; RUN: llc -disable-fp-elim -O0 %s -mtriple x86_64-unknown-linux-gnu -o - | FileCheck %s
+
+; int callme(int);
+; int isel_line_test2() {
+; callme(400);
+; return 0;
+; }
+
+define i32 @isel_line_test2() nounwind uwtable {
+ ; The stack adjustment should be part of the prologue.
+ ; CHECK: isel_line_test2:
+ ; CHECK: {{subq|leaq}} {{.*}}, %rsp
+ ; CHECK: .loc 1 5 3 prologue_end
+entry:
+ %call = call i32 @callme(i32 400), !dbg !10
+ ret i32 0, !dbg !12
+}
+
+declare i32 @callme(i32)
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 12, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", metadata !"clang version 3.2 (trunk 164980) (llvm/trunk 164979)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/tmp/bar.c] [DW_LANG_C99]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"isel_line_test2", metadata !"isel_line_test2", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 0, i1 false, i32 ()* @isel_line_test2, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 3] [def] [scope 4] [isel_line_test2]
+!6 = metadata !{i32 786473, metadata !"bar.c", metadata !"/usr/local/google/home/echristo/tmp", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 5, i32 3, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !5, i32 4, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/tmp/bar.c]
+!12 = metadata !{i32 6, i32 3, metadata !11, null}
diff --git a/test/DebugInfo/X86/stringpool.ll b/test/DebugInfo/X86/stringpool.ll
index 2cd100156aad..caf12c2756e0 100644
--- a/test/DebugInfo/X86/stringpool.ll
+++ b/test/DebugInfo/X86/stringpool.ll
@@ -16,8 +16,8 @@
; Verify that we refer to 'yyyy' with a relocation.
; LINUX: .long .Lstring3 # DW_AT_name
-; LINUX-NEXT: .long 39 # DW_AT_type
-; LINUX-NEXT: .byte 1 # DW_AT_external
+; LINUX-NEXT: .long 38 # DW_AT_type
+; LINUX-NEXT: # DW_AT_external
; LINUX-NEXT: .byte 1 # DW_AT_decl_file
; LINUX-NEXT: .byte 1 # DW_AT_decl_line
; LINUX-NEXT: .byte 9 # DW_AT_location
diff --git a/test/DebugInfo/bug_null_debuginfo.ll b/test/DebugInfo/bug_null_debuginfo.ll
index a7fdf70d71c7..b17affed893c 100644
--- a/test/DebugInfo/bug_null_debuginfo.ll
+++ b/test/DebugInfo/bug_null_debuginfo.ll
@@ -1,5 +1,4 @@
-; RUN: llc
-
+; RUN: llc < %s
!llvm.dbg.cu = !{!0}
diff --git a/test/DebugInfo/dwarfdump-inlining.test b/test/DebugInfo/dwarfdump-inlining.test
new file mode 100644
index 000000000000..d3a7e12a8703
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-inlining.test
@@ -0,0 +1,28 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x613 \
+RUN: --inlining --functions | FileCheck %s -check-prefix DEEP_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x6de \
+RUN: --inlining | FileCheck %s -check-prefix SHORTER_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x685 \
+RUN: --inlining | FileCheck %s -check-prefix SHORT_STACK
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-inl-test.elf-x86-64 --address=0x640 \
+RUN: --functions | FileCheck %s -check-prefix INL_FUNC_NAME
+
+DEEP_STACK: inlined_h
+DEEP_STACK-NEXT: header.h:2:21
+DEEP_STACK-NEXT: inlined_g
+DEEP_STACK-NEXT: header.h:7
+DEEP_STACK-NEXT: inlined_f
+DEEP_STACK-NEXT: main.cc:3
+DEEP_STACK-NEXT: main
+DEEP_STACK-NEXT: main.cc:8
+
+SHORTER_STACK: header.h:7:20
+SHORTER_STACK-NEXT: main.cc:3
+SHORTER_STACK-NEXT: main.cc:8
+
+SHORT_STACK: main.cc:3:20
+SHORT_STACK-NEXT: main.cc:8
+
+INL_FUNC_NAME: inlined_g
+INL_FUNC_NAME-NEXT: header.h:7:20
+
diff --git a/test/DebugInfo/dwarfdump-test.test b/test/DebugInfo/dwarfdump-test.test
index de23dcd9c278..973c3447e340 100644
--- a/test/DebugInfo/dwarfdump-test.test
+++ b/test/DebugInfo/dwarfdump-test.test
@@ -17,6 +17,8 @@ RUN: --address=0x56d --functions | FileCheck %s -check-prefix INCLUDE_TEST_2
RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
RUN: --address=0x55c --functions \
RUN: | FileCheck %s -check-prefix MANY_SEQ_IN_LINE_TABLE
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-test4.elf-x86-64 \
+RUN: | FileCheck %s -check-prefix DEBUG_RANGES
MAIN: main
MAIN-NEXT: /tmp/dbginfo{{[/\\]}}dwarfdump-test.cc:16:10
@@ -44,3 +46,11 @@ INCLUDE_TEST_2-NEXT: /tmp/include{{[/\\]}}decl.h:5:0
MANY_SEQ_IN_LINE_TABLE: _Z1cv
MANY_SEQ_IN_LINE_TABLE-NEXT: /tmp/dbginfo/sequences{{[/\\]}}c.cc:2:0
+
+DEBUG_RANGES: .debug_ranges contents:
+DEBUG_RANGES-NEXT: 00000000 000000000000055c 0000000000000567
+DEBUG_RANGES-NEXT: 00000000 0000000000000567 000000000000056d
+DEBUG_RANGES-NEXT: 00000000 <End of list>
+DEBUG_RANGES-NEXT: 00000030 0000000000000570 000000000000057b
+DEBUG_RANGES-NEXT: 00000030 0000000000000567 000000000000056d
+DEBUG_RANGES-NEXT: 00000030 <End of list>
diff --git a/test/ExecutionEngine/2002-12-16-ArgTest.ll b/test/ExecutionEngine/2002-12-16-ArgTest.ll
index eb2fe8c04832..4c03519a85af 100644
--- a/test/ExecutionEngine/2002-12-16-ArgTest.ll
+++ b/test/ExecutionEngine/2002-12-16-ArgTest.ll
@@ -1,4 +1,5 @@
; RUN: %lli %s > /dev/null
+; XFAIL: arm
@.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
index 46273d340095..28cc54a86806 100644
--- a/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
+++ b/test/ExecutionEngine/MCJIT/2002-12-16-ArgTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
@.LC0 = internal global [10 x i8] c"argc: %d\0A\00" ; <[10 x i8]*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
index 88bfbb3c09bb..9f895983fdb1 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-ArgumentBug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @foo(i32 %X, i32 %Y, double %A) {
%cond212 = fcmp une double %A, 1.000000e+00 ; <i1> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
index d5f860d17048..997b2a9037ee 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-LoopTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
call i32 @mylog( i32 4 ) ; <i32>:1 [#uses=0]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
index 721f2e8859dc..ba35b5bcc436 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-04-PhiTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
; <label>:0
diff --git a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
index d17df997c817..f3c88adf8435 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-09-SARTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; We were accidentally inverting the signedness of right shifts. Whoops.
diff --git a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
index e55cb06aa1e6..f925e79f2484 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-10-FUCOM.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
%X = fadd double 0.000000e+00, 1.000000e+00 ; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
index 663dc4001079..5b426f6c330b 100644
--- a/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-01-15-AlignmentTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @bar(i8* %X) {
; pointer should be 4 byte aligned!
diff --git a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
index e95294be74a4..c0a7393f8244 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-06-LivenessClobber.ll
@@ -1,6 +1,6 @@
; This testcase should return with an exit code of 1.
;
-; RUN: not %lli -use-mcjit %s
+; RUN: not %lli -mtriple=%mcjit_triple -use-mcjit %s
@test = global i64 0 ; <i64*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
index a237194ea48f..d3e6204a85be 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-07-ArgumentTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s test
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s test
declare i32 @puts(i8*)
diff --git a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
index 70464a3ffcb7..55a169754104 100644
--- a/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-05-11-PHIRegAllocBug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
target datalayout = "e-p:32:32"
diff --git a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
index 58d423f92441..79c6e7fe4cae 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-04-bzip2-bug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; Testcase distilled from 256.bzip2.
diff --git a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
index a22fe07b0859..ffd6df6e5e25 100644
--- a/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
+++ b/test/ExecutionEngine/MCJIT/2003-06-05-PHIBug.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; Testcase distilled from 256.bzip2.
diff --git a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
index b3c6d8abbc06..90839e96986f 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-15-AllocaAssertion.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; This testcase failed to work because two variable sized allocas confused the
; local register allocator.
diff --git a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
index bd32f3037ddc..29ef2c556cd0 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-21-EnvironmentTest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
;
; Regression Test: EnvironmentTest.ll
diff --git a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
index 1959534b877a..2adb608acbb1 100644
--- a/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
+++ b/test/ExecutionEngine/MCJIT/2003-08-23-RegisterAllocatePhysReg.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; This testcase exposes a bug in the local register allocator where it runs out
; of registers (due to too many overlapping live ranges), but then attempts to
diff --git a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
index 1f8343fc43f5..91bde4690361 100644
--- a/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
+++ b/test/ExecutionEngine/MCJIT/2003-10-18-PHINode-ConstantExpr-CondCode-Failure.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
@A = global i32 0 ; <i32*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
index 79a7d583ce61..a7462d9e698a 100644
--- a/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
+++ b/test/ExecutionEngine/MCJIT/2005-12-02-TailCallBug.ll
@@ -1,5 +1,5 @@
; PR672
-; RUN: %lli -use-mcjit %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s
; XFAIL: mcjit-ia32
define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
index 52cef4d35ca6..240659660252 100644
--- a/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
+++ b/test/ExecutionEngine/MCJIT/2007-12-10-APIntLoadStore.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter %s
; PR1836
define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
index a6e917f457b3..d429d519e04f 100644
--- a/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
+++ b/test/ExecutionEngine/MCJIT/2008-06-05-APInt-OverAShr.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 1
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 1
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i686-pc-linux-gnu"
diff --git a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
index 524a724c474b..a6d18e7919cc 100644
--- a/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
+++ b/test/ExecutionEngine/MCJIT/2010-01-15-UndefValue.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter=true %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s > /dev/null
define i32 @main() {
%a = add i32 0, undef
diff --git a/test/ExecutionEngine/MCJIT/fpbitcast.ll b/test/ExecutionEngine/MCJIT/fpbitcast.ll
index 9da908f8cff1..bb4957e9e66e 100644
--- a/test/ExecutionEngine/MCJIT/fpbitcast.ll
+++ b/test/ExecutionEngine/MCJIT/fpbitcast.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -force-interpreter=true %s | grep 40091eb8
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -force-interpreter=true %s | grep 40091eb8
;
define i32 @test(double %x) {
entry:
diff --git a/test/ExecutionEngine/MCJIT/hello.ll b/test/ExecutionEngine/MCJIT/hello.ll
index a52b6d48af29..ceb9c12ab4bd 100644
--- a/test/ExecutionEngine/MCJIT/hello.ll
+++ b/test/ExecutionEngine/MCJIT/hello.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
@.LC0 = internal global [12 x i8] c"Hello World\00" ; <[12 x i8]*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/hello2.ll b/test/ExecutionEngine/MCJIT/hello2.ll
index 670a6dd671ce..756fcadb1caf 100644
--- a/test/ExecutionEngine/MCJIT/hello2.ll
+++ b/test/ExecutionEngine/MCJIT/hello2.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
@X = global i32 7 ; <i32*> [#uses=0]
@msg = internal global [13 x i8] c"Hello World\0A\00" ; <[13 x i8]*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/lit.local.cfg b/test/ExecutionEngine/MCJIT/lit.local.cfg
index 2980ce70811c..fc29f651aa1f 100644
--- a/test/ExecutionEngine/MCJIT/lit.local.cfg
+++ b/test/ExecutionEngine/MCJIT/lit.local.cfg
@@ -8,13 +8,17 @@ def getRoot(config):
root = getRoot(config)
targets = set(root.targets_to_build.split())
-if ('X86' in targets) | ('ARM' in targets) | ('Mips' in targets):
+if ('X86' in targets) | ('ARM' in targets) | ('Mips' in targets) | \
+ ('PowerPC' in targets):
config.unsupported = False
else:
config.unsupported = True
-if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips']:
+if root.host_arch not in ['x86', 'x86_64', 'ARM', 'Mips', 'PowerPC']:
config.unsupported = True
-if root.host_os in ['Win32', 'Cygwin', 'MingW', 'Windows', 'Darwin']:
+if root.host_os in ['Darwin']:
+ config.unsupported = True
+
+if 'powerpc' in root.target_triple and not 'powerpc64' in root.target_triple:
config.unsupported = True
diff --git a/test/ExecutionEngine/MCJIT/pr13727.ll b/test/ExecutionEngine/MCJIT/pr13727.ll
new file mode 100644
index 000000000000..c33bf3281087
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/pr13727.ll
@@ -0,0 +1,88 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s
+
+; The intention of this test is to verify that symbols mapped to COMMON in ELF
+; work as expected.
+;
+; Compiled from this C code:
+;
+; int zero_int;
+; double zero_double;
+; int zero_arr[10];
+;
+; int main()
+; {
+; zero_arr[zero_int + 5] = 40;
+;
+; if (zero_double < 1.1)
+; zero_arr[zero_int + 2] = 70;
+;
+; for (int i = 1; i < 10; ++i) {
+; zero_arr[i] = zero_arr[i - 1] + zero_arr[i];
+; }
+; return zero_arr[9] == 110 ? 0 : -1;
+; }
+
+@zero_int = common global i32 0, align 4
+@zero_arr = common global [10 x i32] zeroinitializer, align 16
+@zero_double = common global double 0.000000e+00, align 8
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 0, i32* %retval
+ %0 = load i32* @zero_int, align 4
+ %add = add nsw i32 %0, 5
+ %idxprom = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom
+ store i32 40, i32* %arrayidx, align 4
+ %1 = load double* @zero_double, align 8
+ %cmp = fcmp olt double %1, 1.100000e+00
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then: ; preds = %entry
+ %2 = load i32* @zero_int, align 4
+ %add1 = add nsw i32 %2, 2
+ %idxprom2 = sext i32 %add1 to i64
+ %arrayidx3 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom2
+ store i32 70, i32* %arrayidx3, align 4
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ store i32 1, i32* %i, align 4
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %if.end
+ %3 = load i32* %i, align 4
+ %cmp4 = icmp slt i32 %3, 10
+ br i1 %cmp4, label %for.body, label %for.end
+
+for.body: ; preds = %for.cond
+ %4 = load i32* %i, align 4
+ %sub = sub nsw i32 %4, 1
+ %idxprom5 = sext i32 %sub to i64
+ %arrayidx6 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom5
+ %5 = load i32* %arrayidx6, align 4
+ %6 = load i32* %i, align 4
+ %idxprom7 = sext i32 %6 to i64
+ %arrayidx8 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom7
+ %7 = load i32* %arrayidx8, align 4
+ %add9 = add nsw i32 %5, %7
+ %8 = load i32* %i, align 4
+ %idxprom10 = sext i32 %8 to i64
+ %arrayidx11 = getelementptr inbounds [10 x i32]* @zero_arr, i32 0, i64 %idxprom10
+ store i32 %add9, i32* %arrayidx11, align 4
+ br label %for.inc
+
+for.inc: ; preds = %for.body
+ %9 = load i32* %i, align 4
+ %inc = add nsw i32 %9, 1
+ store i32 %inc, i32* %i, align 4
+ br label %for.cond
+
+for.end: ; preds = %for.cond
+ %10 = load i32* getelementptr inbounds ([10 x i32]* @zero_arr, i32 0, i64 9), align 4
+ %cmp12 = icmp eq i32 %10, 110
+ %cond = select i1 %cmp12, i32 0, i32 -1
+ ret i32 %cond
+}
diff --git a/test/ExecutionEngine/MCJIT/simplesttest.ll b/test/ExecutionEngine/MCJIT/simplesttest.ll
index a6688c237c0e..02ad0061fd13 100644
--- a/test/ExecutionEngine/MCJIT/simplesttest.ll
+++ b/test/ExecutionEngine/MCJIT/simplesttest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/simpletest.ll b/test/ExecutionEngine/MCJIT/simpletest.ll
index 4562aa6012ef..958b783067e4 100644
--- a/test/ExecutionEngine/MCJIT/simpletest.ll
+++ b/test/ExecutionEngine/MCJIT/simpletest.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @bar() {
ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/stubs.ll b/test/ExecutionEngine/MCJIT/stubs.ll
index b285b0eadb3f..9e5d5b2e4186 100644
--- a/test/ExecutionEngine/MCJIT/stubs.ll
+++ b/test/ExecutionEngine/MCJIT/stubs.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -disable-lazy-compilation=false %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -disable-lazy-compilation=false %s
define i32 @main() nounwind {
entry:
diff --git a/test/ExecutionEngine/MCJIT/test-arith.ll b/test/ExecutionEngine/MCJIT/test-arith.ll
index 31777604d577..b73227fe635e 100644
--- a/test/ExecutionEngine/MCJIT/test-arith.ll
+++ b/test/ExecutionEngine/MCJIT/test-arith.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
%A = add i8 0, 12 ; <i8> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-branch.ll b/test/ExecutionEngine/MCJIT/test-branch.ll
index 702c11022094..8f3c7279051e 100644
--- a/test/ExecutionEngine/MCJIT/test-branch.ll
+++ b/test/ExecutionEngine/MCJIT/test-branch.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; test unconditional branch
define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
index 6f284055fd92..20150b2de626 100644
--- a/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
+++ b/test/ExecutionEngine/MCJIT/test-call-no-external-funcs.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @_Z14func_exit_codev() nounwind uwtable {
entry:
diff --git a/test/ExecutionEngine/MCJIT/test-call.ll b/test/ExecutionEngine/MCJIT/test-call.ll
index 7a244ee50581..51d19fe99178 100644
--- a/test/ExecutionEngine/MCJIT/test-call.ll
+++ b/test/ExecutionEngine/MCJIT/test-call.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
declare void @exit(i32)
diff --git a/test/ExecutionEngine/MCJIT/test-cast.ll b/test/ExecutionEngine/MCJIT/test-cast.ll
index 75e7d1b423f3..dcc97f466568 100644
--- a/test/ExecutionEngine/MCJIT/test-cast.ll
+++ b/test/ExecutionEngine/MCJIT/test-cast.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @foo() {
ret i32 0
diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll
new file mode 100644
index 000000000000..d666a2aa4aa3
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols-alignment.ll
@@ -0,0 +1,32 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s
+
+; This test checks that common symbols have been allocated addresses honouring
+; the alignment requirement.
+
+@CS1 = common global i32 0, align 16
+@CS2 = common global i8 0, align 1
+@CS3 = common global i32 0, align 16
+
+define i32 @main() nounwind {
+entry:
+ %retval = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ store i32 0, i32* %retval
+ store i32 ptrtoint (i32* @CS3 to i32), i32* %ptr, align 4
+ %0 = load i32* %ptr, align 4
+ %and = and i32 %0, 15
+ %tobool = icmp ne i32 %and, 0
+ br i1 %tobool, label %if.then, label %if.else
+
+if.then: ; preds = %entry
+ store i32 1, i32* %retval
+ br label %return
+
+if.else: ; preds = %entry
+ store i32 0, i32* %retval
+ br label %return
+
+return: ; preds = %if.else, %if.then
+ %1 = load i32* %retval
+ ret i32 %1
+}
diff --git a/test/ExecutionEngine/MCJIT/test-common-symbols.ll b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
index ac1d9acd954e..8c8190291f18 100644
--- a/test/ExecutionEngine/MCJIT/test-common-symbols.ll
+++ b/test/ExecutionEngine/MCJIT/test-common-symbols.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit -O0 -disable-lazy-compilation=false %s
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 -disable-lazy-compilation=false %s
; The intention of this test is to verify that symbols mapped to COMMON in ELF
; work as expected.
diff --git a/test/ExecutionEngine/MCJIT/test-constantexpr.ll b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
index 6b46639c51f9..56c1290448ad 100644
--- a/test/ExecutionEngine/MCJIT/test-constantexpr.ll
+++ b/test/ExecutionEngine/MCJIT/test-constantexpr.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; This tests to make sure that we can evaluate weird constant expressions
diff --git a/test/ExecutionEngine/MCJIT/test-data-align.ll b/test/ExecutionEngine/MCJIT/test-data-align.ll
new file mode 100644
index 000000000000..0493cba87fdb
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-data-align.ll
@@ -0,0 +1,15 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s
+
+; Check that a variable is always aligned as specified.
+
+@var = global i32 0, align 32
+define i32 @main() {
+ %addr = ptrtoint i32* @var to i64
+ %mask = and i64 %addr, 31
+ %tst = icmp eq i64 %mask, 0
+ br i1 %tst, label %good, label %bad
+good:
+ ret i32 0
+bad:
+ ret i32 1
+}
diff --git a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
index 35491df79177..7af1d8b53910 100644
--- a/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
+++ b/test/ExecutionEngine/MCJIT/test-fp-no-external-funcs.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define double @test(double* %DP, double %Arg) {
%D = load double* %DP ; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-fp.ll b/test/ExecutionEngine/MCJIT/test-fp.ll
index 6fc5a501f6e6..f7e6fb9ba18e 100644
--- a/test/ExecutionEngine/MCJIT/test-fp.ll
+++ b/test/ExecutionEngine/MCJIT/test-fp.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define double @test(double* %DP, double %Arg) {
%D = load double* %DP ; <double> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
index 4a790c6ff174..ec6cbad2f14e 100644
--- a/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
+++ b/test/ExecutionEngine/MCJIT/test-global-init-nonzero.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
@count = global i32 1, align 4
diff --git a/test/ExecutionEngine/MCJIT/test-global.ll b/test/ExecutionEngine/MCJIT/test-global.ll
index 94e0250769ec..e7972f978e95 100644
--- a/test/ExecutionEngine/MCJIT/test-global.ll
+++ b/test/ExecutionEngine/MCJIT/test-global.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
@count = global i32 0, align 4
diff --git a/test/ExecutionEngine/MCJIT/test-loadstore.ll b/test/ExecutionEngine/MCJIT/test-loadstore.ll
index e9171490e352..f450d0ab528b 100644
--- a/test/ExecutionEngine/MCJIT/test-loadstore.ll
+++ b/test/ExecutionEngine/MCJIT/test-loadstore.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define void @test(i8* %P, i16* %P.upgrd.1, i32* %P.upgrd.2, i64* %P.upgrd.3) {
%V = load i8* %P ; <i8> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-local.ll b/test/ExecutionEngine/MCJIT/test-local.ll
index 4f5ae47dd048..d4e9f444e426 100644
--- a/test/ExecutionEngine/MCJIT/test-local.ll
+++ b/test/ExecutionEngine/MCJIT/test-local.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() nounwind uwtable {
entry:
diff --git a/test/ExecutionEngine/MCJIT/test-logical.ll b/test/ExecutionEngine/MCJIT/test-logical.ll
index 0540c22fc629..32f45ef119e6 100644
--- a/test/ExecutionEngine/MCJIT/test-logical.ll
+++ b/test/ExecutionEngine/MCJIT/test-logical.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
%A = and i8 4, 8 ; <i8> [#uses=2]
diff --git a/test/ExecutionEngine/MCJIT/test-loop.ll b/test/ExecutionEngine/MCJIT/test-loop.ll
index b1dbf408996b..ebc689664d65 100644
--- a/test/ExecutionEngine/MCJIT/test-loop.ll
+++ b/test/ExecutionEngine/MCJIT/test-loop.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
; <label>:0
diff --git a/test/ExecutionEngine/MCJIT/test-phi.ll b/test/ExecutionEngine/MCJIT/test-phi.ll
index fbc080862c83..1408533d7ae9 100644
--- a/test/ExecutionEngine/MCJIT/test-phi.ll
+++ b/test/ExecutionEngine/MCJIT/test-phi.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; test phi node
@Y = global i32 6 ; <i32*> [#uses=1]
diff --git a/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll
new file mode 100644
index 000000000000..93b6a6deffd1
--- /dev/null
+++ b/test/ExecutionEngine/MCJIT/test-ptr-reloc.ll
@@ -0,0 +1,16 @@
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit -O0 %s
+
+@.str = private unnamed_addr constant [6 x i8] c"data1\00", align 1
+@ptr = global i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), align 4
+@.str1 = private unnamed_addr constant [6 x i8] c"data2\00", align 1
+@ptr2 = global i8* getelementptr inbounds ([6 x i8]* @.str1, i32 0, i32 0), align 4
+
+define i32 @main(i32 %argc, i8** nocapture %argv) nounwind readonly {
+entry:
+ %0 = load i8** @ptr, align 4
+ %1 = load i8** @ptr2, align 4
+ %cmp = icmp eq i8* %0, %1
+ %. = zext i1 %cmp to i32
+ ret i32 %.
+}
+
diff --git a/test/ExecutionEngine/MCJIT/test-ret.ll b/test/ExecutionEngine/MCJIT/test-ret.ll
index 1b90ee075069..af282926907f 100644
--- a/test/ExecutionEngine/MCJIT/test-ret.ll
+++ b/test/ExecutionEngine/MCJIT/test-ret.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
; test return instructions
define void @test1() {
diff --git a/test/ExecutionEngine/MCJIT/test-return.ll b/test/ExecutionEngine/MCJIT/test-return.ll
index 9c399cab38d9..67f7107c3d7d 100644
--- a/test/ExecutionEngine/MCJIT/test-return.ll
+++ b/test/ExecutionEngine/MCJIT/test-return.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() nounwind uwtable {
entry:
diff --git a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
index 030ff317560b..a8f4bd8529f8 100644
--- a/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
+++ b/test/ExecutionEngine/MCJIT/test-setcond-fp.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
diff --git a/test/ExecutionEngine/MCJIT/test-setcond-int.ll b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
index 1113efee510f..ed52b5065c84 100644
--- a/test/ExecutionEngine/MCJIT/test-setcond-int.ll
+++ b/test/ExecutionEngine/MCJIT/test-setcond-int.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
%int1 = add i32 0, 0 ; <i32> [#uses=6]
diff --git a/test/ExecutionEngine/MCJIT/test-shift.ll b/test/ExecutionEngine/MCJIT/test-shift.ll
index 2da824fecce9..5a5c10d56050 100644
--- a/test/ExecutionEngine/MCJIT/test-shift.ll
+++ b/test/ExecutionEngine/MCJIT/test-shift.ll
@@ -1,4 +1,4 @@
-; RUN: %lli -use-mcjit %s > /dev/null
+; RUN: %lli -mtriple=%mcjit_triple -use-mcjit %s > /dev/null
define i32 @main() {
%shamt = add i8 0, 1 ; <i8> [#uses=8]
diff --git a/test/ExecutionEngine/lit.local.cfg b/test/ExecutionEngine/lit.local.cfg
index 19eebc0ac7ac..f0343263dba6 100644
--- a/test/ExecutionEngine/lit.local.cfg
+++ b/test/ExecutionEngine/lit.local.cfg
@@ -1 +1,12 @@
config.suffixes = ['.ll', '.c', '.cpp']
+
+def getRoot(config):
+ if not config.parent:
+ return config
+ return getRoot(config.parent)
+
+root = getRoot(config)
+
+if root.host_arch in ['PowerPC']:
+ config.unsupported = True
+
diff --git a/test/ExecutionEngine/test-fp-no-external-funcs.ll b/test/ExecutionEngine/test-fp-no-external-funcs.ll
index 61b12c2abeb7..139b2efea57f 100644
--- a/test/ExecutionEngine/test-fp-no-external-funcs.ll
+++ b/test/ExecutionEngine/test-fp-no-external-funcs.ll
@@ -1,4 +1,5 @@
; RUN: %lli %s > /dev/null
+; XFAIL: arm
define double @test(double* %DP, double %Arg) {
%D = load double* %DP ; <double> [#uses=1]
diff --git a/test/ExecutionEngine/test-fp.ll b/test/ExecutionEngine/test-fp.ll
index 2bf0210d8b00..c9064500d475 100644
--- a/test/ExecutionEngine/test-fp.ll
+++ b/test/ExecutionEngine/test-fp.ll
@@ -1,4 +1,5 @@
; RUN: %lli %s > /dev/null
+; XFAIL: arm
define double @test(double* %DP, double %Arg) {
%D = load double* %DP ; <double> [#uses=1]
diff --git a/test/Feature/linker_private_linkages.ll b/test/Feature/linker_private_linkages.ll
index f9f290875645..19bcbb40aa01 100644
--- a/test/Feature/linker_private_linkages.ll
+++ b/test/Feature/linker_private_linkages.ll
@@ -4,4 +4,3 @@
@foo = linker_private hidden global i32 0
@bar = linker_private_weak hidden global i32 0
-@qux = linker_private_weak_def_auto global i32 0
diff --git a/test/Feature/minsize_attr.ll b/test/Feature/minsize_attr.ll
new file mode 100644
index 000000000000..51b133c4bdb7
--- /dev/null
+++ b/test/Feature/minsize_attr.ll
@@ -0,0 +1,7 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+define void @test1() minsize {
+; CHECK: define void @test1() minsize
+ ret void
+}
+
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index d19000187060..655f69c16fdf 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -69,3 +69,23 @@ entry:
store i32 42, i32* %a
ret void
}
+
+; Check that asan leaves just one alloca.
+
+declare void @alloca_test_use([10 x i8]*)
+define void @alloca_test() address_safety {
+entry:
+ %x = alloca [10 x i8], align 1
+ %y = alloca [10 x i8], align 1
+ %z = alloca [10 x i8], align 1
+ call void @alloca_test_use([10 x i8]* %x)
+ call void @alloca_test_use([10 x i8]* %y)
+ call void @alloca_test_use([10 x i8]* %z)
+ ret void
+}
+
+; CHECK: define void @alloca_test()
+; CHECK: = alloca
+; CHECK-NOT: = alloca
+; CHECK: ret void
+
diff --git a/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll
new file mode 100644
index 000000000000..28d4ac0c0f58
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/do-not-instrument-internal-globals.ll
@@ -0,0 +1,19 @@
+; This test checks that we are not instrumenting globals
+; that we created ourselves.
+; RUN: opt < %s -asan -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @_Z3barv() uwtable address_safety {
+entry:
+ %a = alloca i32, align 4
+ call void @_Z3fooPi(i32* %a)
+ ret void
+}
+
+declare void @_Z3fooPi(i32*)
+; We create one global string constant for the stack frame above.
+; Make sure we don't create any other global constants.
+; CHECK: = private constant
+; CHECK-NOT: = private constant
diff --git a/test/Instrumentation/AddressSanitizer/instrument_global.ll b/test/Instrumentation/AddressSanitizer/instrument_global.ll
index ba8d65a4fa4b..3d92946087ec 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_global.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_global.ll
@@ -6,8 +6,8 @@ target triple = "x86_64-unknown-linux-gnu"
; If a global is present, __asan_[un]register_globals should be called from
; module ctor/dtor
-; CHECK: llvm.global_dtors
; CHECK: llvm.global_ctors
+; CHECK: llvm.global_dtors
; CHECK: define internal void @asan.module_ctor
; CHECK-NOT: ret
diff --git a/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
new file mode 100644
index 000000000000..472551654e53
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/instrument_initializer_metadata.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -asan -asan-initialization-order -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+@xxx = global i32 0, align 4
+; Clang will emit the following metadata identifying @xxx as dynamically
+; initialized.
+!0 = metadata !{i32* @xxx}
+!llvm.asan.dynamically_initialized_globals = !{!0}
+
+define i32 @initializer() uwtable {
+entry:
+ ret i32 42
+}
+
+define internal void @__cxx_global_var_init() section ".text.startup" {
+entry:
+ %call = call i32 @initializer()
+ store i32 %call, i32* @xxx, align 4
+ ret void
+}
+
+define internal void @_GLOBAL__I_a() address_safety section ".text.startup" {
+entry:
+ call void @__cxx_global_var_init()
+ ret void
+}
+
+; Clang indicated that @xxx was dynamically initialized.
+; __asan_{before,after}_dynamic_init should be called from _GLOBAL__I_a
+
+; CHECK: define internal void @_GLOBAL__I_a
+; CHECK-NOT: ret
+; CHECK: call void @__asan_before_dynamic_init
+; CHECK: call void @__cxx_global_var_init
+; CHECK: call void @__asan_after_dynamic_init
+; CHECK: ret
diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll
index 02bf215c6bff..107dbdc0f227 100644
--- a/test/Instrumentation/ThreadSanitizer/atomic.ll
+++ b/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -8,7 +8,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_unordered
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0)
define i8 @atomic8_load_monotonic(i8* %a) nounwind uwtable {
entry:
@@ -16,7 +16,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_monotonic
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 1)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0)
define i8 @atomic8_load_acquire(i8* %a) nounwind uwtable {
entry:
@@ -24,7 +24,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_acquire
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 4)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 2)
define i8 @atomic8_load_seq_cst(i8* %a) nounwind uwtable {
entry:
@@ -32,7 +32,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_seq_cst
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 32)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 5)
define void @atomic8_store_unordered(i8* %a) nounwind uwtable {
entry:
@@ -40,7 +40,7 @@ entry:
ret void
}
; CHECK: atomic8_store_unordered
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0)
define void @atomic8_store_monotonic(i8* %a) nounwind uwtable {
entry:
@@ -48,7 +48,7 @@ entry:
ret void
}
; CHECK: atomic8_store_monotonic
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 1)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0)
define void @atomic8_store_release(i8* %a) nounwind uwtable {
entry:
@@ -56,7 +56,7 @@ entry:
ret void
}
; CHECK: atomic8_store_release
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 8)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 3)
define void @atomic8_store_seq_cst(i8* %a) nounwind uwtable {
entry:
@@ -64,7 +64,287 @@ entry:
ret void
}
; CHECK: atomic8_store_seq_cst
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 32)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 5)
+
+define void @atomic8_xchg_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_xchg_monotonic
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 0)
+
+define void @atomic8_add_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_add_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 0)
+
+define void @atomic8_sub_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_sub_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 0)
+
+define void @atomic8_and_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_and_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 0)
+
+define void @atomic8_or_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_or_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 0)
+
+define void @atomic8_xor_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_xor_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 0)
+
+define void @atomic8_xchg_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_xchg_acquire
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 2)
+
+define void @atomic8_add_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_add_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 2)
+
+define void @atomic8_sub_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_sub_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 2)
+
+define void @atomic8_and_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_and_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 2)
+
+define void @atomic8_or_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_or_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 2)
+
+define void @atomic8_xor_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_xor_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 2)
+
+define void @atomic8_xchg_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_xchg_release
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 3)
+
+define void @atomic8_add_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_add_release
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 3)
+
+define void @atomic8_sub_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_sub_release
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 3)
+
+define void @atomic8_and_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_and_release
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 3)
+
+define void @atomic8_or_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_or_release
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 3)
+
+define void @atomic8_xor_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_xor_release
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 3)
+
+define void @atomic8_xchg_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_xchg_acq_rel
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 4)
+
+define void @atomic8_add_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_add_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 4)
+
+define void @atomic8_sub_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_sub_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 4)
+
+define void @atomic8_and_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_and_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 4)
+
+define void @atomic8_or_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_or_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 4)
+
+define void @atomic8_xor_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_xor_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 4)
+
+define void @atomic8_xchg_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_xchg_seq_cst
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 5)
+
+define void @atomic8_add_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_add_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 5)
+
+define void @atomic8_sub_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_sub_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 5)
+
+define void @atomic8_and_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_and_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 5)
+
+define void @atomic8_or_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_or_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 5)
+
+define void @atomic8_xor_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_xor_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 5)
+
+define void @atomic8_cas_monotonic(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 monotonic
+ ret void
+}
+; CHECK: atomic8_cas_monotonic
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 0)
+
+define void @atomic8_cas_acquire(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 acquire
+ ret void
+}
+; CHECK: atomic8_cas_acquire
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 2)
+
+define void @atomic8_cas_release(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 release
+ ret void
+}
+; CHECK: atomic8_cas_release
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 3)
+
+define void @atomic8_cas_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 acq_rel
+ ret void
+}
+; CHECK: atomic8_cas_acq_rel
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 4)
+
+define void @atomic8_cas_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 seq_cst
+ ret void
+}
+; CHECK: atomic8_cas_seq_cst
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 5)
define i16 @atomic16_load_unordered(i16* %a) nounwind uwtable {
entry:
@@ -72,7 +352,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_unordered
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0)
define i16 @atomic16_load_monotonic(i16* %a) nounwind uwtable {
entry:
@@ -80,7 +360,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_monotonic
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 1)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0)
define i16 @atomic16_load_acquire(i16* %a) nounwind uwtable {
entry:
@@ -88,7 +368,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_acquire
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 4)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 2)
define i16 @atomic16_load_seq_cst(i16* %a) nounwind uwtable {
entry:
@@ -96,7 +376,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_seq_cst
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 32)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 5)
define void @atomic16_store_unordered(i16* %a) nounwind uwtable {
entry:
@@ -104,7 +384,7 @@ entry:
ret void
}
; CHECK: atomic16_store_unordered
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0)
define void @atomic16_store_monotonic(i16* %a) nounwind uwtable {
entry:
@@ -112,7 +392,7 @@ entry:
ret void
}
; CHECK: atomic16_store_monotonic
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 1)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0)
define void @atomic16_store_release(i16* %a) nounwind uwtable {
entry:
@@ -120,7 +400,7 @@ entry:
ret void
}
; CHECK: atomic16_store_release
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 8)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 3)
define void @atomic16_store_seq_cst(i16* %a) nounwind uwtable {
entry:
@@ -128,7 +408,287 @@ entry:
ret void
}
; CHECK: atomic16_store_seq_cst
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 32)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 5)
+
+define void @atomic16_xchg_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_xchg_monotonic
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 0)
+
+define void @atomic16_add_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_add_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 0)
+
+define void @atomic16_sub_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_sub_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 0)
+
+define void @atomic16_and_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_and_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 0)
+
+define void @atomic16_or_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_or_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 0)
+
+define void @atomic16_xor_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_xor_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 0)
+
+define void @atomic16_xchg_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_xchg_acquire
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 2)
+
+define void @atomic16_add_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_add_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 2)
+
+define void @atomic16_sub_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_sub_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 2)
+
+define void @atomic16_and_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_and_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 2)
+
+define void @atomic16_or_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_or_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 2)
+
+define void @atomic16_xor_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_xor_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 2)
+
+define void @atomic16_xchg_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_xchg_release
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 3)
+
+define void @atomic16_add_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_add_release
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 3)
+
+define void @atomic16_sub_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_sub_release
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 3)
+
+define void @atomic16_and_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_and_release
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 3)
+
+define void @atomic16_or_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_or_release
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 3)
+
+define void @atomic16_xor_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_xor_release
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 3)
+
+define void @atomic16_xchg_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_xchg_acq_rel
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 4)
+
+define void @atomic16_add_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_add_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 4)
+
+define void @atomic16_sub_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_sub_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 4)
+
+define void @atomic16_and_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_and_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 4)
+
+define void @atomic16_or_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_or_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 4)
+
+define void @atomic16_xor_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_xor_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 4)
+
+define void @atomic16_xchg_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_xchg_seq_cst
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 5)
+
+define void @atomic16_add_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_add_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 5)
+
+define void @atomic16_sub_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_sub_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 5)
+
+define void @atomic16_and_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_and_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 5)
+
+define void @atomic16_or_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_or_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 5)
+
+define void @atomic16_xor_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_xor_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 5)
+
+define void @atomic16_cas_monotonic(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 monotonic
+ ret void
+}
+; CHECK: atomic16_cas_monotonic
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 0)
+
+define void @atomic16_cas_acquire(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 acquire
+ ret void
+}
+; CHECK: atomic16_cas_acquire
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 2)
+
+define void @atomic16_cas_release(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 release
+ ret void
+}
+; CHECK: atomic16_cas_release
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 3)
+
+define void @atomic16_cas_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 acq_rel
+ ret void
+}
+; CHECK: atomic16_cas_acq_rel
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 4)
+
+define void @atomic16_cas_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 seq_cst
+ ret void
+}
+; CHECK: atomic16_cas_seq_cst
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 5)
define i32 @atomic32_load_unordered(i32* %a) nounwind uwtable {
entry:
@@ -136,7 +696,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_unordered
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0)
define i32 @atomic32_load_monotonic(i32* %a) nounwind uwtable {
entry:
@@ -144,7 +704,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_monotonic
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 1)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0)
define i32 @atomic32_load_acquire(i32* %a) nounwind uwtable {
entry:
@@ -152,7 +712,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_acquire
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 4)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 2)
define i32 @atomic32_load_seq_cst(i32* %a) nounwind uwtable {
entry:
@@ -160,7 +720,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_seq_cst
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 32)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 5)
define void @atomic32_store_unordered(i32* %a) nounwind uwtable {
entry:
@@ -168,7 +728,7 @@ entry:
ret void
}
; CHECK: atomic32_store_unordered
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0)
define void @atomic32_store_monotonic(i32* %a) nounwind uwtable {
entry:
@@ -176,7 +736,7 @@ entry:
ret void
}
; CHECK: atomic32_store_monotonic
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 1)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0)
define void @atomic32_store_release(i32* %a) nounwind uwtable {
entry:
@@ -184,7 +744,7 @@ entry:
ret void
}
; CHECK: atomic32_store_release
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 8)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 3)
define void @atomic32_store_seq_cst(i32* %a) nounwind uwtable {
entry:
@@ -192,7 +752,287 @@ entry:
ret void
}
; CHECK: atomic32_store_seq_cst
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 32)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 5)
+
+define void @atomic32_xchg_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_xchg_monotonic
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 0)
+
+define void @atomic32_add_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_add_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 0)
+
+define void @atomic32_sub_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_sub_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 0)
+
+define void @atomic32_and_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_and_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 0)
+
+define void @atomic32_or_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_or_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 0)
+
+define void @atomic32_xor_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_xor_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 0)
+
+define void @atomic32_xchg_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_xchg_acquire
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 2)
+
+define void @atomic32_add_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_add_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 2)
+
+define void @atomic32_sub_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_sub_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 2)
+
+define void @atomic32_and_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_and_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 2)
+
+define void @atomic32_or_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_or_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 2)
+
+define void @atomic32_xor_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_xor_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 2)
+
+define void @atomic32_xchg_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_xchg_release
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 3)
+
+define void @atomic32_add_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_add_release
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 3)
+
+define void @atomic32_sub_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_sub_release
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 3)
+
+define void @atomic32_and_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_and_release
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 3)
+
+define void @atomic32_or_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_or_release
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 3)
+
+define void @atomic32_xor_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_xor_release
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 3)
+
+define void @atomic32_xchg_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_xchg_acq_rel
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 4)
+
+define void @atomic32_add_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_add_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 4)
+
+define void @atomic32_sub_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_sub_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 4)
+
+define void @atomic32_and_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_and_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 4)
+
+define void @atomic32_or_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_or_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 4)
+
+define void @atomic32_xor_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_xor_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 4)
+
+define void @atomic32_xchg_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_xchg_seq_cst
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 5)
+
+define void @atomic32_add_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_add_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 5)
+
+define void @atomic32_sub_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_sub_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 5)
+
+define void @atomic32_and_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_and_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 5)
+
+define void @atomic32_or_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_or_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 5)
+
+define void @atomic32_xor_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_xor_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 5)
+
+define void @atomic32_cas_monotonic(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 monotonic
+ ret void
+}
+; CHECK: atomic32_cas_monotonic
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 0)
+
+define void @atomic32_cas_acquire(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 acquire
+ ret void
+}
+; CHECK: atomic32_cas_acquire
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 2)
+
+define void @atomic32_cas_release(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 release
+ ret void
+}
+; CHECK: atomic32_cas_release
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 3)
+
+define void @atomic32_cas_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 acq_rel
+ ret void
+}
+; CHECK: atomic32_cas_acq_rel
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 4)
+
+define void @atomic32_cas_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 seq_cst
+ ret void
+}
+; CHECK: atomic32_cas_seq_cst
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 5)
define i64 @atomic64_load_unordered(i64* %a) nounwind uwtable {
entry:
@@ -200,7 +1040,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_unordered
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0)
define i64 @atomic64_load_monotonic(i64* %a) nounwind uwtable {
entry:
@@ -208,7 +1048,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_monotonic
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 1)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0)
define i64 @atomic64_load_acquire(i64* %a) nounwind uwtable {
entry:
@@ -216,7 +1056,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_acquire
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 4)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 2)
define i64 @atomic64_load_seq_cst(i64* %a) nounwind uwtable {
entry:
@@ -224,7 +1064,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_seq_cst
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 32)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 5)
define void @atomic64_store_unordered(i64* %a) nounwind uwtable {
entry:
@@ -232,7 +1072,7 @@ entry:
ret void
}
; CHECK: atomic64_store_unordered
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0)
define void @atomic64_store_monotonic(i64* %a) nounwind uwtable {
entry:
@@ -240,7 +1080,7 @@ entry:
ret void
}
; CHECK: atomic64_store_monotonic
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 1)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0)
define void @atomic64_store_release(i64* %a) nounwind uwtable {
entry:
@@ -248,7 +1088,7 @@ entry:
ret void
}
; CHECK: atomic64_store_release
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 8)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 3)
define void @atomic64_store_seq_cst(i64* %a) nounwind uwtable {
entry:
@@ -256,7 +1096,287 @@ entry:
ret void
}
; CHECK: atomic64_store_seq_cst
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 32)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 5)
+
+define void @atomic64_xchg_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_xchg_monotonic
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 0)
+
+define void @atomic64_add_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_add_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 0)
+
+define void @atomic64_sub_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_sub_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 0)
+
+define void @atomic64_and_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_and_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 0)
+
+define void @atomic64_or_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_or_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 0)
+
+define void @atomic64_xor_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_xor_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 0)
+
+define void @atomic64_xchg_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_xchg_acquire
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 2)
+
+define void @atomic64_add_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_add_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 2)
+
+define void @atomic64_sub_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_sub_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 2)
+
+define void @atomic64_and_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_and_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 2)
+
+define void @atomic64_or_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_or_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 2)
+
+define void @atomic64_xor_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_xor_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 2)
+
+define void @atomic64_xchg_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_xchg_release
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 3)
+
+define void @atomic64_add_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_add_release
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 3)
+
+define void @atomic64_sub_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_sub_release
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 3)
+
+define void @atomic64_and_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_and_release
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 3)
+
+define void @atomic64_or_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_or_release
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 3)
+
+define void @atomic64_xor_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_xor_release
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 3)
+
+define void @atomic64_xchg_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_xchg_acq_rel
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 4)
+
+define void @atomic64_add_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_add_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 4)
+
+define void @atomic64_sub_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_sub_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 4)
+
+define void @atomic64_and_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_and_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 4)
+
+define void @atomic64_or_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_or_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 4)
+
+define void @atomic64_xor_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_xor_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 4)
+
+define void @atomic64_xchg_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_xchg_seq_cst
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 5)
+
+define void @atomic64_add_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_add_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 5)
+
+define void @atomic64_sub_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_sub_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 5)
+
+define void @atomic64_and_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_and_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 5)
+
+define void @atomic64_or_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_or_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 5)
+
+define void @atomic64_xor_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_xor_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 5)
+
+define void @atomic64_cas_monotonic(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 monotonic
+ ret void
+}
+; CHECK: atomic64_cas_monotonic
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 0)
+
+define void @atomic64_cas_acquire(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 acquire
+ ret void
+}
+; CHECK: atomic64_cas_acquire
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 2)
+
+define void @atomic64_cas_release(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 release
+ ret void
+}
+; CHECK: atomic64_cas_release
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 3)
+
+define void @atomic64_cas_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 acq_rel
+ ret void
+}
+; CHECK: atomic64_cas_acq_rel
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 4)
+
+define void @atomic64_cas_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 seq_cst
+ ret void
+}
+; CHECK: atomic64_cas_seq_cst
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 5)
define i128 @atomic128_load_unordered(i128* %a) nounwind uwtable {
entry:
@@ -264,7 +1384,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_unordered
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0)
define i128 @atomic128_load_monotonic(i128* %a) nounwind uwtable {
entry:
@@ -272,7 +1392,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_monotonic
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 1)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0)
define i128 @atomic128_load_acquire(i128* %a) nounwind uwtable {
entry:
@@ -280,7 +1400,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_acquire
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 4)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 2)
define i128 @atomic128_load_seq_cst(i128* %a) nounwind uwtable {
entry:
@@ -288,7 +1408,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_seq_cst
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 32)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 5)
define void @atomic128_store_unordered(i128* %a) nounwind uwtable {
entry:
@@ -296,7 +1416,7 @@ entry:
ret void
}
; CHECK: atomic128_store_unordered
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0)
define void @atomic128_store_monotonic(i128* %a) nounwind uwtable {
entry:
@@ -304,7 +1424,7 @@ entry:
ret void
}
; CHECK: atomic128_store_monotonic
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 1)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0)
define void @atomic128_store_release(i128* %a) nounwind uwtable {
entry:
@@ -312,7 +1432,7 @@ entry:
ret void
}
; CHECK: atomic128_store_release
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 8)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 3)
define void @atomic128_store_seq_cst(i128* %a) nounwind uwtable {
entry:
@@ -320,4 +1440,348 @@ entry:
ret void
}
; CHECK: atomic128_store_seq_cst
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 32)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 5)
+
+define void @atomic128_xchg_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_xchg_monotonic
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 0)
+
+define void @atomic128_add_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_add_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 0)
+
+define void @atomic128_sub_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_sub_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 0)
+
+define void @atomic128_and_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_and_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 0)
+
+define void @atomic128_or_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_or_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 0)
+
+define void @atomic128_xor_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_xor_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 0)
+
+define void @atomic128_xchg_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_xchg_acquire
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 2)
+
+define void @atomic128_add_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_add_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 2)
+
+define void @atomic128_sub_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_sub_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 2)
+
+define void @atomic128_and_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_and_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 2)
+
+define void @atomic128_or_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_or_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 2)
+
+define void @atomic128_xor_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_xor_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 2)
+
+define void @atomic128_xchg_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_xchg_release
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 3)
+
+define void @atomic128_add_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_add_release
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 3)
+
+define void @atomic128_sub_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_sub_release
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 3)
+
+define void @atomic128_and_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_and_release
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 3)
+
+define void @atomic128_or_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_or_release
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 3)
+
+define void @atomic128_xor_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_xor_release
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 3)
+
+define void @atomic128_xchg_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_xchg_acq_rel
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 4)
+
+define void @atomic128_add_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_add_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 4)
+
+define void @atomic128_sub_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_sub_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 4)
+
+define void @atomic128_and_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_and_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 4)
+
+define void @atomic128_or_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_or_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 4)
+
+define void @atomic128_xor_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_xor_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 4)
+
+define void @atomic128_xchg_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_xchg_seq_cst
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 5)
+
+define void @atomic128_add_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_add_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 5)
+
+define void @atomic128_sub_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_sub_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 5)
+
+define void @atomic128_and_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_and_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 5)
+
+define void @atomic128_or_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_or_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 5)
+
+define void @atomic128_xor_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_xor_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 5)
+
+define void @atomic128_cas_monotonic(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 monotonic
+ ret void
+}
+; CHECK: atomic128_cas_monotonic
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 0)
+
+define void @atomic128_cas_acquire(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 acquire
+ ret void
+}
+; CHECK: atomic128_cas_acquire
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 2)
+
+define void @atomic128_cas_release(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 release
+ ret void
+}
+; CHECK: atomic128_cas_release
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 3)
+
+define void @atomic128_cas_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 acq_rel
+ ret void
+}
+; CHECK: atomic128_cas_acq_rel
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 4)
+
+define void @atomic128_cas_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 seq_cst
+ ret void
+}
+; CHECK: atomic128_cas_seq_cst
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 5)
+
+define void @atomic_signal_fence_acquire() nounwind uwtable {
+entry:
+ fence singlethread acquire
+ ret void
+}
+; CHECK: atomic_signal_fence_acquire
+; CHECK: call void @__tsan_atomic_signal_fence(i32 2)
+
+define void @atomic_thread_fence_acquire() nounwind uwtable {
+entry:
+ fence acquire
+ ret void
+}
+; CHECK: atomic_thread_fence_acquire
+; CHECK: call void @__tsan_atomic_thread_fence(i32 2)
+
+define void @atomic_signal_fence_release() nounwind uwtable {
+entry:
+ fence singlethread release
+ ret void
+}
+; CHECK: atomic_signal_fence_release
+; CHECK: call void @__tsan_atomic_signal_fence(i32 3)
+
+define void @atomic_thread_fence_release() nounwind uwtable {
+entry:
+ fence release
+ ret void
+}
+; CHECK: atomic_thread_fence_release
+; CHECK: call void @__tsan_atomic_thread_fence(i32 3)
+
+define void @atomic_signal_fence_acq_rel() nounwind uwtable {
+entry:
+ fence singlethread acq_rel
+ ret void
+}
+; CHECK: atomic_signal_fence_acq_rel
+; CHECK: call void @__tsan_atomic_signal_fence(i32 4)
+
+define void @atomic_thread_fence_acq_rel() nounwind uwtable {
+entry:
+ fence acq_rel
+ ret void
+}
+; CHECK: atomic_thread_fence_acq_rel
+; CHECK: call void @__tsan_atomic_thread_fence(i32 4)
+
+define void @atomic_signal_fence_seq_cst() nounwind uwtable {
+entry:
+ fence singlethread seq_cst
+ ret void
+}
+; CHECK: atomic_signal_fence_seq_cst
+; CHECK: call void @__tsan_atomic_signal_fence(i32 5)
+
+define void @atomic_thread_fence_seq_cst() nounwind uwtable {
+entry:
+ fence seq_cst
+ ret void
+}
+; CHECK: atomic_thread_fence_seq_cst
+; CHECK: call void @__tsan_atomic_thread_fence(i32 5)
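
The new memory-order arguments in the CHECK lines above appear to follow the C11/C++11 __ATOMIC_* encoding used by the updated __tsan_atomic* runtime interface (relaxed = 0, consume = 1, acquire = 2, release = 3, acq_rel = 4, seq_cst = 5), replacing the earlier bitmask-style values (1, 4, 8, 32). A minimal C sketch of that assumed mapping, for reference only; it is illustrative and not part of the patch:

/* Assumed IR ordering -> __ATOMIC_* mapping behind the rewritten CHECK values.
 * The __ATOMIC_* macros are predefined by GCC and Clang. */
#include <stdio.h>

int main(void) {
    struct { const char *ir; int mo; } map[] = {
        {"unordered", __ATOMIC_RELAXED},  /* 0 */
        {"monotonic", __ATOMIC_RELAXED},  /* 0 */
        {"acquire",   __ATOMIC_ACQUIRE},  /* 2 */
        {"release",   __ATOMIC_RELEASE},  /* 3 */
        {"acq_rel",   __ATOMIC_ACQ_REL},  /* 4 */
        {"seq_cst",   __ATOMIC_SEQ_CST},  /* 5 */
    };
    for (unsigned i = 0; i < sizeof map / sizeof map[0]; ++i)
        printf("%-9s -> i32 %d\n", map[i].ir, map[i].mo);
    return 0;
}
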
diff --git a/test/MC/ARM/arm-arithmetic-aliases.s b/test/MC/ARM/arm-arithmetic-aliases.s
index 9895cfc02b25..3ed444858146 100644
--- a/test/MC/ARM/arm-arithmetic-aliases.s
+++ b/test/MC/ARM/arm-arithmetic-aliases.s
@@ -124,3 +124,7 @@ bicseq r2, r3
@ CHECK: bicseq r2, r2, #6 @ encoding: [0x06,0x20,0xd2,0x03]
@ CHECK: bicseq r2, r2, r3 @ encoding: [0x03,0x20,0xd2,0x01]
@ CHECK: bicseq r2, r2, r3 @ encoding: [0x03,0x20,0xd2,0x01]
+
+add r0, pc, #123
+
+@ CHECK: adr r0, #123 @ encoding: [0x7b,0x00,0x8f,0xe2]
diff --git a/test/MC/ARM/arm-shift-encoding.s b/test/MC/ARM/arm-shift-encoding.s
new file mode 100644
index 000000000000..3c57b67f6e3b
--- /dev/null
+++ b/test/MC/ARM/arm-shift-encoding.s
@@ -0,0 +1,119 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7 -show-encoding < %s | FileCheck %s
+
+ ldr r0, [r0, r0]
+ ldr r0, [r0, r0, lsr #32]
+ ldr r0, [r0, r0, lsr #16]
+ ldr r0, [r0, r0, lsl #0]
+ ldr r0, [r0, r0, lsl #16]
+ ldr r0, [r0, r0, asr #32]
+ ldr r0, [r0, r0, asr #16]
+ ldr r0, [r0, r0, rrx]
+ ldr r0, [r0, r0, ror #16]
+
+@ CHECK: ldr r0, [r0, r0] @ encoding: [0x00,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0] @ encoding: [0x00,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, rrx] @ encoding: [0x60,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x90,0xe7]
+
+ pld [r0, r0]
+ pld [r0, r0, lsr #32]
+ pld [r0, r0, lsr #16]
+ pld [r0, r0, lsl #0]
+ pld [r0, r0, lsl #16]
+ pld [r0, r0, asr #32]
+ pld [r0, r0, asr #16]
+ pld [r0, r0, rrx]
+ pld [r0, r0, ror #16]
+
+@ CHECK: [r0, r0] @ encoding: [0x00,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsr #32] @ encoding: [0x20,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsr #16] @ encoding: [0x20,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0] @ encoding: [0x00,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsl #16] @ encoding: [0x00,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0, asr #32] @ encoding: [0x40,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, asr #16] @ encoding: [0x40,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0, rrx] @ encoding: [0x60,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, ror #16] @ encoding: [0x60,0xf8,0xd0,0xf7]
+
+ str r0, [r0, r0]
+ str r0, [r0, r0, lsr #32]
+ str r0, [r0, r0, lsr #16]
+ str r0, [r0, r0, lsl #0]
+ str r0, [r0, r0, lsl #16]
+ str r0, [r0, r0, asr #32]
+ str r0, [r0, r0, asr #16]
+ str r0, [r0, r0, rrx]
+ str r0, [r0, r0, ror #16]
+
+@ CHECK: str r0, [r0, r0] @ encoding: [0x00,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0] @ encoding: [0x00,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, rrx] @ encoding: [0x60,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x80,0xe7]
+
+@ Uses printAddrMode2OffsetOperand(), used by LDRBT_POST_IMM LDRBT_POST_REG
+@ LDRB_POST_IMM LDRB_POST_REG LDRT_POST_IMM LDRT_POST_REG LDR_POST_IMM
+@ LDR_POST_REG STRBT_POST_IMM STRBT_POST_REG STRB_POST_IMM STRB_POST_REG
+@ STRT_POST_IMM STRT_POST_REG STR_POST_IMM STR_POST_REG
+
+ ldr r0, [r1], r2, rrx
+ ldr r3, [r4], r5, ror #0
+ str r6, [r7], r8, lsl #0
+ str r9, [r10], r11
+
+@ CHECK: ldr r0, [r1], r2, rrx @ encoding: [0x62,0x00,0x91,0xe6]
+@ CHECK: ldr r3, [r4], r5 @ encoding: [0x05,0x30,0x94,0xe6]
+@ CHECK: str r6, [r7], r8 @ encoding: [0x08,0x60,0x87,0xe6]
+@ CHECK: str r9, [r10], r11 @ encoding: [0x0b,0x90,0x8a,0xe6]
+
+@ Uses printSORegImmOperand(), used by ADCrsi ADDrsi ANDrsi BICrsi EORrsi
+@ ORRrsi RSBrsi RSCrsi SBCrsi SUBrsi CMNzrsi CMPrsi MOVsi MVNsi TEQrsi TSTrsi
+
+ adc sp, lr, pc
+ adc r1, r8, r9, lsr #32
+ adc r2, r7, pc, lsr #16
+ adc r3, r6, r10, lsl #0
+ adc r4, r5, lr, lsl #16
+ adc r5, r4, r11, asr #32
+ adc r6, r3, sp, asr #16
+ adc r7, r2, r12, rrx
+ adc r8, r1, r0, ror #16
+
+@ CHECK: adc sp, lr, pc @ encoding: [0x0f,0xd0,0xae,0xe0]
+@ CHECK: adc r1, r8, r9, lsr #32 @ encoding: [0x29,0x10,0xa8,0xe0]
+@ CHECK: adc r2, r7, pc, lsr #16 @ encoding: [0x2f,0x28,0xa7,0xe0]
+@ CHECK: adc r3, r6, r10 @ encoding: [0x0a,0x30,0xa6,0xe0]
+@ CHECK: adc r4, r5, lr, lsl #16 @ encoding: [0x0e,0x48,0xa5,0xe0]
+@ CHECK: adc r5, r4, r11, asr #32 @ encoding: [0x4b,0x50,0xa4,0xe0]
+@ CHECK: adc r6, r3, sp, asr #16 @ encoding: [0x4d,0x68,0xa3,0xe0]
+@ CHECK: adc r7, r2, r12, rrx @ encoding: [0x6c,0x70,0xa2,0xe0]
+@ CHECK: adc r8, r1, r0, ror #16 @ encoding: [0x60,0x88,0xa1,0xe0]
+
+ cmp sp, lr
+ cmp r1, r8, lsr #32
+ cmp r2, r7, lsr #16
+ cmp r3, r6, lsl #0
+ cmp r4, r5, lsl #16
+ cmp r5, r4, asr #32
+ cmp r6, r3, asr #16
+ cmp r7, r2, rrx
+ cmp r8, r1, ror #16
+
+@ CHECK: cmp sp, lr @ encoding: [0x0e,0x00,0x5d,0xe1]
+@ CHECK: cmp r1, r8, lsr #32 @ encoding: [0x28,0x00,0x51,0xe1]
+@ CHECK: cmp r2, r7, lsr #16 @ encoding: [0x27,0x08,0x52,0xe1]
+@ CHECK: cmp r3, r6 @ encoding: [0x06,0x00,0x53,0xe1]
+@ CHECK: cmp r4, r5, lsl #16 @ encoding: [0x05,0x08,0x54,0xe1]
+@ CHECK: cmp r5, r4, asr #32 @ encoding: [0x44,0x00,0x55,0xe1]
+@ CHECK: cmp r6, r3, asr #16 @ encoding: [0x43,0x08,0x56,0xe1]
+@ CHECK: cmp r7, r2, rrx @ encoding: [0x62,0x00,0x57,0xe1]
+@ CHECK: cmp r8, r1, ror #16 @ encoding: [0x61,0x08,0x58,0xe1]
diff --git a/test/MC/ARM/basic-thumb-instructions.s b/test/MC/ARM/basic-thumb-instructions.s
index 4ee34ce6b4c8..22e21da88e40 100644
--- a/test/MC/ARM/basic-thumb-instructions.s
+++ b/test/MC/ARM/basic-thumb-instructions.s
@@ -259,8 +259,8 @@ _func:
@ CHECK: ldr r1, _foo @ encoding: [A,0x49]
@ fixup A - offset: 0, value: _foo, kind: fixup_arm_thumb_cp
-@ CHECK: ldr r3, #604 @ encoding: [0x97,0x4b]
-@ CHECK: ldr r3, #368 @ encoding: [0x5c,0x4b]
+@ CHECK: ldr r3, [pc, #604] @ encoding: [0x97,0x4b]
+@ CHECK: ldr r3, [pc, #368] @ encoding: [0x5c,0x4b]
@------------------------------------------------------------------------------
@ LDR (register)
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 499e0550135e..d65cfd7a67a5 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -47,7 +47,47 @@
@ CHECK-ERRORS: error: immediate shift value out of range
@ CHECK-ERRORS: adc r4, r5, r6, ror #32
+ @ Out of range shift immediate values for load/store.
+ str r1, [r2, r3, lsl #invalid]
+ ldr r4, [r5], r6, lsl #-1
+ pld r4, [r5, r6, lsl #32]
+ str r4, [r5], r6, lsr #-1
+ ldr r4, [r5, r6, lsr #33]
+ pld r4, [r5, r6, asr #-1]
+ str r4, [r5, r6, asr #33]
+ ldr r4, [r5, r6, ror #-1]
+ pld r4, [r5, r6, ror #32]
+ pld r4, [r5, r6, rrx #0]
+@ CHECK-ERRORS: error: shift amount must be an immediate
+@ CHECK-ERRORS: str r1, [r2, r3, lsl #invalid]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: ldr r4, [r5], r6, lsl #-1
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: pld r4, [r5, r6, lsl #32]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: str r4, [r5], r6, lsr #-1
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: ldr r4, [r5, r6, lsr #33]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: pld r4, [r5, r6, asr #-1]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: str r4, [r5, r6, asr #33]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: ldr r4, [r5, r6, ror #-1]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: pld r4, [r5, r6, ror #32]
+@ CHECK-ERRORS: error: ']' expected
+@ CHECK-ERRORS: pld r4, [r5, r6, rrx #0]
+
@ Out of range 16-bit immediate on BKPT
bkpt #65536
@@ -321,3 +361,13 @@
@ CHECK-ERRORS: error: invalid operand for instruction
@ CHECK-ERRORS: cps f,#1
@ CHECK-ERRORS: ^
+
+ @ Bad operands for msr
+ msr #0, #0
+ msr foo, #0
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: msr #0, #0
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: msr foo, #0
+@ CHECK-ERRORS: ^
diff --git a/test/MC/ARM/elf-jump24-fixup.s b/test/MC/ARM/elf-jump24-fixup.s
new file mode 100644
index 000000000000..75a4b869dc60
--- /dev/null
+++ b/test/MC/ARM/elf-jump24-fixup.s
@@ -0,0 +1,9 @@
+@ RUN: llvm-mc %s -triple=thumbv7-linux-gnueabi -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s
+ .syntax unified
+ .text
+ .code 16
+ .thumb_func
+foo:
+ b.w bar
+
+@ CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 bar
diff --git a/test/MC/ARM/thumb-shift-encoding.s b/test/MC/ARM/thumb-shift-encoding.s
new file mode 100644
index 000000000000..54284132b653
--- /dev/null
+++ b/test/MC/ARM/thumb-shift-encoding.s
@@ -0,0 +1,45 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7 -show-encoding < %s | FileCheck %s
+
+@ Uses printT2SOOperand(), used by t2ADCrs t2ADDrs t2ANDrs t2BICrs t2EORrs
+@ t2ORNrs t2ORRrs t2RSBrs t2SBCrs t2SUBrs t2CMNzrs t2CMPrs t2MOVSsi t2MOVsi
+@ t2MVNs t2TEQrs t2TSTrs
+
+ sbc.w r12, lr, r0
+ sbc.w r1, r8, r9, lsr #32
+ sbc.w r2, r7, pc, lsr #16
+ sbc.w r3, r6, r10, lsl #0
+ sbc.w r4, r5, lr, lsl #16
+ sbc.w r5, r4, r11, asr #32
+ sbc.w r6, r3, sp, asr #16
+ sbc.w r7, r2, r12, rrx
+ sbc.w r8, r1, r0, ror #16
+
+@ CHECK: sbc.w r12, lr, r0 @ encoding: [0x6e,0xeb,0x00,0x0c]
+@ CHECK: sbc.w r1, r8, r9, lsr #32 @ encoding: [0x68,0xeb,0x19,0x01]
+@ CHECK: sbc.w r2, r7, pc, lsr #16 @ encoding: [0x67,0xeb,0x1f,0x42]
+@ CHECK: sbc.w r3, r6, r10 @ encoding: [0x66,0xeb,0x0a,0x03]
+@ CHECK: sbc.w r4, r5, lr, lsl #16 @ encoding: [0x65,0xeb,0x0e,0x44]
+@ CHECK: sbc.w r5, r4, r11, asr #32 @ encoding: [0x64,0xeb,0x2b,0x05]
+@ CHECK: sbc.w r6, r3, sp, asr #16 @ encoding: [0x63,0xeb,0x2d,0x46]
+@ CHECK: sbc.w r7, r2, r12, rrx @ encoding: [0x62,0xeb,0x3c,0x07]
+@ CHECK: sbc.w r8, r1, r0, ror #16 @ encoding: [0x61,0xeb,0x30,0x48]
+
+ and.w r12, lr, r0
+ and.w r1, r8, r9, lsr #32
+ and.w r2, r7, pc, lsr #16
+ and.w r3, r6, r10, lsl #0
+ and.w r4, r5, lr, lsl #16
+ and.w r5, r4, r11, asr #32
+ and.w r6, r3, sp, asr #16
+ and.w r7, r2, r12, rrx
+ and.w r8, r1, r0, ror #16
+
+@ CHECK: and.w r12, lr, r0 @ encoding: [0x0e,0xea,0x00,0x0c]
+@ CHECK: and.w r1, r8, r9, lsr #32 @ encoding: [0x08,0xea,0x19,0x01]
+@ CHECK: and.w r2, r7, pc, lsr #16 @ encoding: [0x07,0xea,0x1f,0x42]
+@ CHECK: and.w r3, r6, r10 @ encoding: [0x06,0xea,0x0a,0x03]
+@ CHECK: and.w r4, r5, lr, lsl #16 @ encoding: [0x05,0xea,0x0e,0x44]
+@ CHECK: and.w r5, r4, r11, asr #32 @ encoding: [0x04,0xea,0x2b,0x05]
+@ CHECK: and.w r6, r3, sp, asr #16 @ encoding: [0x03,0xea,0x2d,0x46]
+@ CHECK: and.w r7, r2, r12, rrx @ encoding: [0x02,0xea,0x3c,0x07]
+@ CHECK: and.w r8, r1, r0, ror #16 @ encoding: [0x01,0xea,0x30,0x48]
diff --git a/test/MC/ARM/thumb2-b.w-encodingT4.s b/test/MC/ARM/thumb2-b.w-encodingT4.s
new file mode 100644
index 000000000000..be77b06267a2
--- /dev/null
+++ b/test/MC/ARM/thumb2-b.w-encodingT4.s
@@ -0,0 +1,12 @@
+@ RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -show-encoding < %s | FileCheck %s
+ .syntax unified
+ .globl _func
+.thumb_func _foo
+.space 0x37c6
+_foo:
+@------------------------------------------------------------------------------
+@ B (thumb2 b.w encoding T4) rdar://12585795
+@------------------------------------------------------------------------------
+ b.w 0x3680c
+
+@ CHECK: b.w #223244 @ encoding: [0x6d,0xf0,0x0c,0xb0]
diff --git a/test/MC/AsmParser/bad-macro.s b/test/MC/AsmParser/bad-macro.s
new file mode 100644
index 000000000000..313607b7782c
--- /dev/null
+++ b/test/MC/AsmParser/bad-macro.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2>&1 | FileCheck %s
+
+.macro 23
+
+// CHECK: expected identifier in '.macro' directive
+
+.macro abc 33
+
+// CHECK: expected identifier in '.macro' directive
diff --git a/test/MC/AsmParser/directive_lcomm.s b/test/MC/AsmParser/directive_lcomm.s
index 0a0add513fe9..37a350c82e81 100644
--- a/test/MC/AsmParser/directive_lcomm.s
+++ b/test/MC/AsmParser/directive_lcomm.s
@@ -1,9 +1,14 @@
# RUN: llvm-mc -triple i386-apple-darwin10 %s | FileCheck %s
+# RUN: llvm-mc -triple i386-pc-mingw32 %s | FileCheck %s
+# RUN: not llvm-mc -triple i386-linux-gnu %s 2>&1 | FileCheck %s -check-prefix=ERROR
# CHECK: TEST0:
-# CHECK: .zerofill __DATA,__bss,a,7,4
-# CHECK: .zerofill __DATA,__bss,b,8
-# CHECK: .zerofill __DATA,__bss,c,0
+# CHECK: .lcomm a,7,4
+# CHECK: .lcomm b,8
+# CHECK: .lcomm c,0
+
+# ELF doesn't like alignment on .lcomm.
+# ERROR: alignment not supported on this target
TEST0:
.lcomm a, 8-1, 4
.lcomm b,8
diff --git a/test/MC/AsmParser/labels.s b/test/MC/AsmParser/labels.s
index 56091755d966..6a9870b655f2 100644
--- a/test/MC/AsmParser/labels.s
+++ b/test/MC/AsmParser/labels.s
@@ -41,7 +41,7 @@ foo:
// CHECK: .comm "a 6",1
.comm "a 6", 1
-// CHECK: .zerofill __DATA,__bss,"a 7",1,0
+// CHECK: .lcomm "a 7",1
.lcomm "a 7", 1
// FIXME: We don't bother to support .lsym.
diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s
index 6d084213e40b..3269369be020 100644
--- a/test/MC/AsmParser/macro-args.s
+++ b/test/MC/AsmParser/macro-args.s
@@ -4,10 +4,18 @@
movl \var@GOTOFF(%ebx),\re2g
.endm
+.macro GET_DEFAULT var, re2g=%ebx, re3g=%ecx
+movl 2(\re2g, \re3g, 2), \var
+.endm
+
+GET is_sse, %eax
+// CHECK: movl is_sse@GOTOFF(%ebx), %eax
-GET is_sse, %eax
+GET_DEFAULT %ebx, , %edx
+// CHECK: movl 2(%ebx,%edx,2), %ebx
-// CHECK: movl is_sse@GOTOFF(%ebx), %eax
+GET_DEFAULT %ebx, %edx
+// CHECK: movl 2(%edx,%ecx,2), %ebx
.macro bar
.long $n
diff --git a/test/MC/AsmParser/macro-rept-err1.s b/test/MC/AsmParser/macro-rept-err1.s
index db92856a1d6d..cfa66878d979 100644
--- a/test/MC/AsmParser/macro-rept-err1.s
+++ b/test/MC/AsmParser/macro-rept-err1.s
@@ -3,4 +3,4 @@
.endr
-// CHECK: unexpected '.endr' directive, no current .rept
+// CHECK: unmatched '.endr' directive
diff --git a/test/MC/AsmParser/macros-darwin.s b/test/MC/AsmParser/macros-darwin.s
new file mode 100644
index 000000000000..31b9edb37818
--- /dev/null
+++ b/test/MC/AsmParser/macros-darwin.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple i386-apple-darwin10 %s 2> %t.err | FileCheck %s
+
+.macro test1
+.globl "$0 $1 $2 $$3 $n"
+.endmacro
+
+// CHECK: .globl "1 23 $3 2"
+test1 1, 2 3
+
diff --git a/test/MC/AsmParser/macros.s b/test/MC/AsmParser/macros.s
index 295759299238..b1cb851fcd6b 100644
--- a/test/MC/AsmParser/macros.s
+++ b/test/MC/AsmParser/macros.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s
+// RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t.err | FileCheck %s
// RUN: FileCheck --check-prefix=CHECK-ERRORS %s < %t.err
.macro .test0
@@ -28,12 +28,66 @@ test2 10
.globl "$0 $1 $2 $$3 $n"
.endmacro
-// CHECK: .globl "1 23 $3 2"
-test3 1,2 3
+// CHECK: .globl "1 (23) $3 2"
+test3 1, (2 3)
+
+// CHECK: .globl "1 2 $3 2"
+test3 1 2
.macro test4
.globl "$0 -- $1"
.endmacro
-// CHECK: .globl "ab)(,) -- (cd)"
-test4 a b)(,),(cd)
+// CHECK: .globl "(ab)(,)) -- (cd)"
+test4 (a b)(,)),(cd)
+
+// CHECK: .globl "(ab)(,)) -- (cd)"
+test4 (a b)(,)),(cd)
+
+.macro test5 _a
+.globl "\_a"
+.endm
+
+// CHECK: .globl zed1
+test5 zed1
+
+.macro test6 $a
+.globl "\$a"
+.endm
+
+// CHECK: .globl zed2
+test6 zed2
+
+.macro test7 .a
+.globl "\.a"
+.endm
+
+// CHECK: .globl zed3
+test7 zed3
+
+.macro test8 _a, _b, _c
+.globl "\_a,\_b,\_c"
+.endmacro
+
+.macro test9 _a _b _c
+.globl "\_a \_b \_c"
+.endmacro
+
+// CHECK: .globl "a,b,c"
+test8 a, b, c
+// CHECK: .globl "%1,%2,%3"
+test8 %1 %2 %3 #a comment
+// CHECK: .globl "x-y,z,1"
+test8 x - y z 1
+// CHECK: .globl "1 2 3"
+test9 1, 2,3
+
+test8 1,2 3
+// CHECK-ERRORS: error: macro argument '_c' is missing
+// CHECK-ERRORS-NEXT: test8 1,2 3
+// CHECK-ERRORS-NEXT: ^
+
+test8 1 2, 3
+// CHECK-ERRORS: error: expected ' ' for macro argument separator
+// CHECK-ERRORS-NEXT:test8 1 2, 3
+// CHECK-ERRORS-NEXT: ^
diff --git a/test/MC/COFF/comm.ll b/test/MC/COFF/comm.ll
new file mode 100644
index 000000000000..74da557fb5cc
--- /dev/null
+++ b/test/MC/COFF/comm.ll
@@ -0,0 +1,13 @@
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s
+
+@a = internal global i8 0, align 1
+@b = internal global double 0.000000e+00, align 8
+@c = common global i8 0, align 1
+@d = common global double 0.000000e+00, align 8
+
+; .lcomm uses byte alignment
+; CHECK: .lcomm _a,1
+; CHECK: .lcomm _b,8,8
+; .comm uses log2 alignment
+; CHECK: .comm _c,1,0
+; CHECK: .comm _d,8,3
diff --git a/test/MC/COFF/global_ctors.ll b/test/MC/COFF/global_ctors_dtors.ll
index 4d6b1c7d9913..2a25219a778c 100644
--- a/test/MC/COFF/global_ctors.ll
+++ b/test/MC/COFF/global_ctors_dtors.ll
@@ -1,14 +1,16 @@
; Test that global ctors are emitted into the proper COFF section for the
; target. Mingw uses .ctors, whereas MSVC uses .CRT$XC*.
-; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32
-; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32
-; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32
-; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32
+; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32
+; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32
+; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32
+; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32
@.str = private unnamed_addr constant [13 x i8] c"constructing\00", align 1
-@.str2 = private unnamed_addr constant [5 x i8] c"main\00", align 1
+@.str2 = private unnamed_addr constant [12 x i8] c"destructing\00", align 1
+@.str3 = private unnamed_addr constant [5 x i8] c"main\00", align 1
@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_ctor }]
+@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_dtor }]
declare i32 @puts(i8*)
@@ -17,12 +19,21 @@ define void @a_global_ctor() nounwind {
ret void
}
+define void @a_global_dtor() nounwind {
+ %1 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str2, i32 0, i32 0))
+ ret void
+}
+
define i32 @main() nounwind {
- %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str2, i32 0, i32 0))
+ %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str3, i32 0, i32 0))
ret i32 0
}
; WIN32: .section .CRT$XCU,"r"
; WIN32: a_global_ctor
+; WIN32: .section .CRT$XTX,"r"
+; WIN32: a_global_dtor
; MINGW32: .section .ctors,"w"
; MINGW32: a_global_ctor
+; MINGW32: .section .dtors,"w"
+; MINGW32: a_global_dtor
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
index 5ba7d618bfd7..00b85264686d 100644
--- a/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
+++ b/test/MC/Disassembler/ARM/invalid-VLD1DUPq8_UPD-arm.txt
@@ -1,5 +1,4 @@
-# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | grep "invalid instruction encoding"
-# XFAIL: *
+# RUN: llvm-mc --disassemble %s -triple=armv7-unknown-unknwon -mcpu=cortex-a8 2>&1 | FileCheck %s
# Opcode=737 Name=VLD1DUPq8_UPD Format=ARM_FORMAT_NLdSt(30)
# 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
@@ -9,3 +8,4 @@
#
# 'a' == 1 and data_size == 8 is invalid
0x3d 0x3c 0xa0 0xf4
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt
new file mode 100644
index 000000000000..9bb0995ecef8
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD1LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0x10 0x08
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt
new file mode 100644
index 000000000000..84c98bfbcaf4
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD4DUPd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0xc0 0x0f
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt
new file mode 100644
index 000000000000..9024b09531cf
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VLD4LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0xa0 0xf9 0x30 0x0b
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt
new file mode 100644
index 000000000000..9462812f26d1
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VST1LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0x80 0xf9 0x10 0x08
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt
new file mode 100644
index 000000000000..f6e71bcfd65b
--- /dev/null
+++ b/test/MC/Disassembler/ARM/invalid-VST4LNd32_UPD-thumb.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s 2>&1 | FileCheck %s
+
+0x80 0xf9 0x30 0x0b
+# CHECK: invalid instruction encoding
diff --git a/test/MC/Disassembler/ARM/marked-up-thumb.txt b/test/MC/Disassembler/ARM/marked-up-thumb.txt
new file mode 100644
index 000000000000..65be28618bac
--- /dev/null
+++ b/test/MC/Disassembler/ARM/marked-up-thumb.txt
@@ -0,0 +1,7 @@
+# RUN: llvm-mc -triple=thumbv7-apple-darwin -mcpu=cortex-a8 -mdis < %s | FileCheck %s
+# CHECK: ldr <reg:r4>, <mem:[pc, <imm:#32>]>
+0x08 0x4c
+# CHECK: push {<reg:r1>, <reg:r2>, <reg:r7>}
+0x86 0xb4
+# CHECK: sub <reg:sp>, <imm:#132>
+0xa1 0xb0
diff --git a/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt
new file mode 100644
index 000000000000..e53739e73975
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neont-VLD-reencoding.txt
@@ -0,0 +1,77 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s
+
+0xa0 0xf9 0x00 0x00
+0xa0 0xf9 0x20 0x00
+0xa0 0xf9 0x40 0x00
+0xa0 0xf9 0x60 0x00
+0xa0 0xf9 0x80 0x00
+0xa0 0xf9 0xa0 0x00
+0xa0 0xf9 0xc0 0x00
+0xa0 0xf9 0xe0 0x00
+
+# CHECK: vld1.8 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x00]
+# CHECK: vld1.8 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x00]
+# CHECK: vld1.8 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x00]
+# CHECK: vld1.8 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0x60,0x00]
+# CHECK: vld1.8 {d0[4]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x00]
+# CHECK: vld1.8 {d0[5]}, [r0], r0 @ encoding: [0xa0,0xf9,0xa0,0x00]
+# CHECK: vld1.8 {d0[6]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x00]
+# CHECK: vld1.8 {d0[7]}, [r0], r0 @ encoding: [0xa0,0xf9,0xe0,0x00]
+
+0xa0 0xf9 0x00 0x04
+0xa0 0xf9 0x10 0x04
+0xa0 0xf9 0x40 0x04
+0xa0 0xf9 0x50 0x04
+0xa0 0xf9 0x80 0x04
+0xa0 0xf9 0x90 0x04
+0xa0 0xf9 0xc0 0x04
+0xa0 0xf9 0xd0 0x04
+
+# CHECK: vld1.16 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x04]
+# CHECK: vld1.16 {d0[0]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x10,0x04]
+# CHECK: vld1.16 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x40,0x04]
+# CHECK: vld1.16 {d0[1]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x50,0x04]
+# CHECK: vld1.16 {d0[2]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x04]
+# CHECK: vld1.16 {d0[2]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0x90,0x04]
+# CHECK: vld1.16 {d0[3]}, [r0], r0 @ encoding: [0xa0,0xf9,0xc0,0x04]
+# CHECK: vld1.16 {d0[3]}, [r0, :16], r0 @ encoding: [0xa0,0xf9,0xd0,0x04]
+
+0xa0 0xf9 0x00 0x08
+0xa0 0xf9 0x30 0x08
+0xa0 0xf9 0x80 0x08
+0xa0 0xf9 0xb0 0x08
+
+# CHECK: vld1.32 {d0[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x00,0x08]
+# CHECK: vld1.32 {d0[0]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0x30,0x08]
+# CHECK: vld1.32 {d0[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x80,0x08]
+# CHECK: vld1.32 {d0[1]}, [r0, :32], r0 @ encoding: [0xa0,0xf9,0xb0,0x08]
+
+0xa0 0xf9 0x1f 0x04
+0xa0 0xf9 0x8f 0x00
+
+# CHECK: vld1.16 {d0[0]}, [r0, :16] @ encoding: [0xa0,0xf9,0x1f,0x04]
+# CHECK: vld1.8 {d0[4]}, [r0] @ encoding: [0xa0,0xf9,0x8f,0x00]
+
+0xa0 0xf9 0x1d 0x04
+0xa0 0xf9 0x8d 0x00
+
+# CHECK: vld1.16 {d0[0]}, [r0, :16]! @ encoding: [0xa0,0xf9,0x1d,0x04]
+# CHECK: vld1.8 {d0[4]}, [r0]! @ encoding: [0xa0,0xf9,0x8d,0x00]
+
+0xa5 0xf9 0x10 0x04
+0xa5 0xf9 0x1a 0x04
+0xae 0xf9 0x1a 0x04
+0xa5 0xf9 0x1a 0x94
+
+# CHECK: vld1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0xa5,0xf9,0x10,0x04]
+# CHECK: vld1.16 {d0[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x04]
+# CHECK: vld1.16 {d0[0]}, [lr, :16], r10 @ encoding: [0xae,0xf9,0x1a,0x04]
+# CHECK: vld1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0xa5,0xf9,0x1a,0x94]
+
+0xa0 0xf9 0x20 0x0b
+0xa0 0xf9 0x20 0x07
+0xa0 0xf9 0x20 0x03
+
+# CHECK: vld4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0, :128], r0 @ encoding: [0xa0,0xf9,0x20,0x0b]
+# CHECK: vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x07]
+# CHECK: vld4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r0 @ encoding: [0xa0,0xf9,0x20,0x03]
diff --git a/test/MC/Disassembler/ARM/neont-VST-reencoding.txt b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt
new file mode 100644
index 000000000000..eb3722c08531
--- /dev/null
+++ b/test/MC/Disassembler/ARM/neont-VST-reencoding.txt
@@ -0,0 +1,77 @@
+# RUN: llvm-mc -triple thumbv7 -show-encoding -disassemble < %s | FileCheck %s
+
+0x80 0xf9 0x00 0x00
+0x81 0xf9 0x21 0x10
+0x81 0xf9 0x42 0x10
+0x81 0xf9 0x61 0x20
+0x82 0xf9 0x82 0x20
+0x82 0xf9 0xa1 0x10
+0x82 0xf9 0xc2 0x20
+0x83 0xf9 0xe3 0x30
+
+# CHECK: vst1.8 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x00]
+# CHECK: vst1.8 {d1[1]}, [r1], r1 @ encoding: [0x81,0xf9,0x21,0x10]
+# CHECK: vst1.8 {d1[2]}, [r1], r2 @ encoding: [0x81,0xf9,0x42,0x10]
+# CHECK: vst1.8 {d2[3]}, [r1], r1 @ encoding: [0x81,0xf9,0x61,0x20]
+# CHECK: vst1.8 {d2[4]}, [r2], r2 @ encoding: [0x82,0xf9,0x82,0x20]
+# CHECK: vst1.8 {d1[5]}, [r2], r1 @ encoding: [0x82,0xf9,0xa1,0x10]
+# CHECK: vst1.8 {d2[6]}, [r2], r2 @ encoding: [0x82,0xf9,0xc2,0x20]
+# CHECK: vst1.8 {d3[7]}, [r3], r3 @ encoding: [0x83,0xf9,0xe3,0x30]
+
+0x80 0xf9 0x00 0x04
+0xc3 0xf9 0x13 0x04
+0xc4 0xf9 0x43 0x04
+0xc5 0xf9 0x55 0x04
+0xc6 0xf9 0x85 0x04
+0xc7 0xf9 0x95 0x74
+0xc8 0xf9 0xc7 0x84
+0xc9 0xf9 0xd9 0x94
+
+# CHECK: vst1.16 {d0[0]}, [r0], r0 @ encoding: [0x80,0xf9,0x00,0x04]
+# CHECK: vst1.16 {d16[0]}, [r3, :16], r3 @ encoding: [0xc3,0xf9,0x13,0x04]
+# CHECK: vst1.16 {d16[1]}, [r4], r3 @ encoding: [0xc4,0xf9,0x43,0x04]
+# CHECK: vst1.16 {d16[1]}, [r5, :16], r5 @ encoding: [0xc5,0xf9,0x55,0x04]
+# CHECK: vst1.16 {d16[2]}, [r6], r5 @ encoding: [0xc6,0xf9,0x85,0x04]
+# CHECK: vst1.16 {d23[2]}, [r7, :16], r5 @ encoding: [0xc7,0xf9,0x95,0x74]
+# CHECK: vst1.16 {d24[3]}, [r8], r7 @ encoding: [0xc8,0xf9,0xc7,0x84]
+# CHECK: vst1.16 {d25[3]}, [r9, :16], r9 @ encoding: [0xc9,0xf9,0xd9,0x94]
+
+0x8a 0xf9 0x01 0xa8
+0xcb 0xf9 0x32 0x18
+0x8c 0xf9 0x83 0xb8
+0xcd 0xf9 0xb4 0x28
+
+# CHECK: vst1.32 {d10[0]}, [r10], r1 @ encoding: [0x8a,0xf9,0x01,0xa8]
+# CHECK: vst1.32 {d17[0]}, [r11, :32], r2 @ encoding: [0xcb,0xf9,0x32,0x18]
+# CHECK: vst1.32 {d11[1]}, [r12], r3 @ encoding: [0x8c,0xf9,0x83,0xb8]
+# CHECK: vst1.32 {d18[1]}, [sp, :32], r4 @ encoding: [0xcd,0xf9,0xb4,0x28]
+
+0x81 0xf9 0x1f 0x44
+0x82 0xf9 0x8f 0x30
+
+# CHECK: vst1.16 {d4[0]}, [r1, :16] @ encoding: [0x81,0xf9,0x1f,0x44]
+# CHECK: vst1.8 {d3[4]}, [r2] @ encoding: [0x82,0xf9,0x8f,0x30]
+
+0x83 0xf9 0x1d 0x24
+0x84 0xf9 0x8d 0x10
+
+# CHECK: vst1.16 {d2[0]}, [r3, :16]! @ encoding: [0x83,0xf9,0x1d,0x24]
+# CHECK: vst1.8 {d1[4]}, [r4]! @ encoding: [0x84,0xf9,0x8d,0x10]
+
+0x85 0xf9 0x10 0x04
+0x85 0xf9 0x1a 0x74
+0x8e 0xf9 0x1a 0x84
+0x85 0xf9 0x1a 0x94
+
+# CHECK: vst1.16 {d0[0]}, [r5, :16], r0 @ encoding: [0x85,0xf9,0x10,0x04]
+# CHECK: vst1.16 {d7[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x74]
+# CHECK: vst1.16 {d8[0]}, [lr, :16], r10 @ encoding: [0x8e,0xf9,0x1a,0x84]
+# CHECK: vst1.16 {d9[0]}, [r5, :16], r10 @ encoding: [0x85,0xf9,0x1a,0x94]
+
+0x81 0xf9 0x24 0x0b
+0x82 0xf9 0x25 0x07
+0x83 0xf9 0x26 0x03
+
+# CHECK: vst4.32 {d0[0], d1[0], d2[0], d3[0]}, [r1, :128], r4 @ encoding: [0x81,0xf9,0x24,0x0b]
+# CHECK: vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r2], r5 @ encoding: [0x82,0xf9,0x25,0x07]
+# CHECK: vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r3], r6 @ encoding: [0x83,0xf9,0x26,0x03]
diff --git a/test/MC/Disassembler/ARM/thumb-printf.txt b/test/MC/Disassembler/ARM/thumb-printf.txt
index 8158a73edcb5..ca820444adc7 100644
--- a/test/MC/Disassembler/ARM/thumb-printf.txt
+++ b/test/MC/Disassembler/ARM/thumb-printf.txt
@@ -7,17 +7,17 @@
# CHECK-NEXT: add r3, sp, #20
# CHECK-NEXT: ldr r5, [r3], #4
# CHECK-NEXT: str r3, [sp]
-# CHECK-NEXT: ldr r3, #52
+# CHECK-NEXT: ldr r3, [pc, #52]
# CHECK-NEXT: add r3, pc
# CHECK-NEXT: ldr r0, [r3]
# CHECK-NEXT: ldr r4, [r0]
-# CHECK-NEXT: ldr r0, #48
+# CHECK-NEXT: ldr r0, [pc, #48]
# CHECK-NEXT: add r0, pc
# CHECK-NEXT: ldr r0, [r0]
# CHECK-NEXT: ldr r0, [r0]
# CHECK-NEXT: blx #191548
# CHECK-NEXT: cbnz r0, #6
-# CHECK-NEXT: ldr r1, #40
+# CHECK-NEXT: ldr r1, [pc, #40]
# CHECK-NEXT: add r1, pc
# CHECK-NEXT: ldr r1, [r1]
# CHECK-NEXT: b #0
diff --git a/test/MC/Disassembler/ARM/thumb-tests.txt b/test/MC/Disassembler/ARM/thumb-tests.txt
index c08585a37197..757ce6e3977b 100644
--- a/test/MC/Disassembler/ARM/thumb-tests.txt
+++ b/test/MC/Disassembler/ARM/thumb-tests.txt
@@ -30,7 +30,7 @@
# CHECK: ldm r0!, {r1}
0x02 0xc8
-# CHECK: ldr r5, #432
+# CHECK: ldr r5, [pc, #432]
0x6c 0x4d
# CHECK: str r0, [r3]
diff --git a/test/MC/Disassembler/ARM/thumb1.txt b/test/MC/Disassembler/ARM/thumb1.txt
index 5b7026231096..de9596aab732 100644
--- a/test/MC/Disassembler/ARM/thumb1.txt
+++ b/test/MC/Disassembler/ARM/thumb1.txt
@@ -160,6 +160,7 @@
# CHECK: ldr r1, [sp]
# CHECK: ldr r2, [sp, #24]
# CHECK: ldr r3, [sp, #1020]
+# CHECK: ldr r1, [pc, #12]
0x29 0x68
@@ -168,6 +169,7 @@
0x00 0x99
0x06 0x9a
0xff 0x9b
+0x03 0x49
#------------------------------------------------------------------------------
# LDR (register)
diff --git a/test/MC/Disassembler/ARM/thumb2.txt b/test/MC/Disassembler/ARM/thumb2.txt
index 42ebe58207b3..45dace3b09c5 100644
--- a/test/MC/Disassembler/ARM/thumb2.txt
+++ b/test/MC/Disassembler/ARM/thumb2.txt
@@ -169,6 +169,9 @@
0x13 0xf5 0xce 0xa9
+# CHECK: b.w #208962
+
+0x33 0xf0 0x21 0xb8 # rdar://12585795
#------------------------------------------------------------------------------
# BFC
diff --git a/test/MC/Disassembler/Mips/mips64.txt b/test/MC/Disassembler/Mips/mips64.txt
index 095ed181ba81..0a88c40839fa 100644
--- a/test/MC/Disassembler/Mips/mips64.txt
+++ b/test/MC/Disassembler/Mips/mips64.txt
@@ -3,7 +3,7 @@
# CHECK: daddiu $11, $26, 31949
0x67 0x4b 0x7c 0xcd
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x00 0x2b 0xd0 0x2d
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
# CHECK: dsllv $gp, $27, $24
0x03 0x1b 0xe0 0x14
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0x00 0x01 0x0f 0xbb
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x03 0xc1 0x08 0x17
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
# CHECK: dsubu $gp, $27, $24
0x03 0x78 0xe0 0x2f
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0x8c 0x3b 0xc4 0xcd
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x3c 0x01 0x00 0x01
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
# CHECK: lui $ra, 1
0x3c 0x1f 0x00 0x01
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xac 0x3a 0xc4 0xc9
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64_le.txt b/test/MC/Disassembler/Mips/mips64_le.txt
index c4e5591da4f9..fe8faffa8335 100644
--- a/test/MC/Disassembler/Mips/mips64_le.txt
+++ b/test/MC/Disassembler/Mips/mips64_le.txt
@@ -3,7 +3,7 @@
# CHECK: daddiu $11, $26, 31949
0xcd 0x7c 0x4b 0x67
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x2d 0xd0 0x2b 0x00
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
# CHECK: dsllv $gp, $27, $24
0x14 0xe0 0x1b 0x03
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0xbb 0x0f 0x01 0x00
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x17 0x08 0xc1 0x03
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
# CHECK: dsubu $gp, $27, $24
0x2f 0xe0 0x78 0x03
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0xcd 0xc4 0x3b 0x8c
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x01 0x00 0x01 0x3c
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
# CHECK: lui $ra, 1
0x01 0x00 0x1f 0x3c
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xc9 0xc4 0x3a 0xac
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64r2.txt b/test/MC/Disassembler/Mips/mips64r2.txt
index 41808c724e24..2dfde0d231c6 100644
--- a/test/MC/Disassembler/Mips/mips64r2.txt
+++ b/test/MC/Disassembler/Mips/mips64r2.txt
@@ -3,7 +3,7 @@
# CHECK: daddiu $11, $26, 31949
0x67 0x4b 0x7c 0xcd
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x00 0x2b 0xd0 0x2d
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
# CHECK: dsllv $gp, $27, $24
0x03 0x1b 0xe0 0x14
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0x00 0x01 0x0f 0xbb
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x03 0xc1 0x08 0x17
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
# CHECK: dsubu $gp, $27, $24
0x03 0x78 0xe0 0x2f
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0x8c 0x3b 0xc4 0xcd
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x3c 0x01 0x00 0x01
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
# CHECK: lui $ra, 1
0x3c 0x1f 0x00 0x01
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xac 0x3a 0xc4 0xc9
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/Mips/mips64r2_le.txt b/test/MC/Disassembler/Mips/mips64r2_le.txt
index 4987f80af9d8..620d9ebe8da3 100644
--- a/test/MC/Disassembler/Mips/mips64r2_le.txt
+++ b/test/MC/Disassembler/Mips/mips64r2_le.txt
@@ -3,7 +3,7 @@
# CHECK: daddiu $11, $26, 31949
0xcd 0x7c 0x4b 0x67
-# CHECK: daddu $26, $at, $11
+# CHECK: daddu $26, $1, $11
0x2d 0xd0 0x2b 0x00
# CHECK: ddiv $zero, $26, $22
@@ -30,10 +30,10 @@
# CHECK: dsllv $gp, $27, $24
0x14 0xe0 0x1b 0x03
-# CHECK: dsra $at, $at, 30
+# CHECK: dsra $1, $1, 30
0xbb 0x0f 0x01 0x00
-# CHECK: dsrav $at, $at, $fp
+# CHECK: dsrav $1, $1, $fp
0x17 0x08 0xc1 0x03
# CHECK: dsrl $10, $gp, 24
@@ -45,10 +45,10 @@
# CHECK: dsubu $gp, $27, $24
0x2f 0xe0 0x78 0x03
-# CHECK: lw $27, -15155($at)
+# CHECK: lw $27, -15155($1)
0xcd 0xc4 0x3b 0x8c
-# CHECK: lui $at, 1
+# CHECK: lui $1, 1
0x01 0x00 0x01 0x3c
# CHECK: lwu $3, -1746($3)
@@ -57,7 +57,7 @@
# CHECK: lui $ra, 1
0x01 0x00 0x1f 0x3c
-# CHECK: sw $26, -15159($at)
+# CHECK: sw $26, -15159($1)
0xc9 0xc4 0x3a 0xac
# CHECK: ld $26, 3958($zero)
diff --git a/test/MC/Disassembler/X86/marked-up.txt b/test/MC/Disassembler/X86/marked-up.txt
new file mode 100644
index 000000000000..f0e51252f8d8
--- /dev/null
+++ b/test/MC/Disassembler/X86/marked-up.txt
@@ -0,0 +1,6 @@
+# RUN: llvm-mc --mdis %s -triple=x86_64-apple-darwin9 2>&1 | FileCheck %s
+
+# CHECK: movq <mem:<reg:%gs>:8>, <reg:%rcx>
+0x65 0x48 0x8b 0x0c 0x25 0x08 0x00 0x00 0x00
+# CHECK: xorps <reg:%xmm1>, <reg:%xmm2>
+0x0f 0x57 0xd1
diff --git a/test/MC/ELF/cfi-reg.s b/test/MC/ELF/cfi-reg.s
new file mode 100644
index 000000000000..fd68d6d5ad07
--- /dev/null
+++ b/test/MC/ELF/cfi-reg.s
@@ -0,0 +1,18 @@
+// RUN: llvm-mc -triple x86_64-pc-linux-gnu %s -o - | FileCheck %s
+// PR13754
+
+f:
+ .cfi_startproc
+ nop
+ .cfi_offset 6, -16
+ nop
+ .cfi_offset %rsi, -16
+ nop
+ .cfi_offset rbx, -16
+ nop
+ .cfi_endproc
+
+// CHECK: f:
+// CHECK: .cfi_offset %rbp, -16
+// CHECK: .cfi_offset %rsi, -16
+// CHECK: .cfi_offset %rbx, -16
diff --git a/test/MC/ELF/lcomm.s b/test/MC/ELF/lcomm.s
new file mode 100644
index 000000000000..ae8d0baa3323
--- /dev/null
+++ b/test/MC/ELF/lcomm.s
@@ -0,0 +1,21 @@
+// RUN: llvm-mc -triple i386-pc-linux-gnu %s -filetype=obj -o - | elf-dump | FileCheck %s
+
+.lcomm A, 5
+.lcomm B, 32 << 20
+
+// CHECK: (('st_name', 0x00000001) # 'A'
+// CHECK: ('st_value', 0x00000000)
+// CHECK: ('st_size', 0x00000005)
+// CHECK: ('st_bind', 0x0)
+// CHECK: ('st_type', 0x1)
+// CHECK: ('st_other', 0x00)
+// CHECK: ('st_shndx', 0x0003)
+// CHECK: ),
+// CHECK: (('st_name', 0x00000003) # 'B'
+// CHECK: ('st_value', 0x00000005)
+// CHECK: ('st_size', 0x02000000)
+// CHECK: ('st_bind', 0x0)
+// CHECK: ('st_type', 0x1)
+// CHECK: ('st_other', 0x00)
+// CHECK: ('st_shndx', 0x0003)
+// CHECK: ),
diff --git a/test/MC/MachO/ARM/long-call-branch-island-relocation.s b/test/MC/MachO/ARM/long-call-branch-island-relocation.s
new file mode 100644
index 000000000000..8ee7da54b541
--- /dev/null
+++ b/test/MC/MachO/ARM/long-call-branch-island-relocation.s
@@ -0,0 +1,43 @@
+@ RUN: llvm-mc -n -triple armv7-apple-darwin10 %s -filetype=obj -o %t.o
+@ RUN: macho-dump --dump-section-data < %t.o | FileCheck %s
+
+@ rdar://12359919
+
+ .syntax unified
+ .text
+
+ .globl _bar
+ .align 2
+ .code 16
+ .thumb_func _bar
+_bar:
+ push {r7, lr}
+ mov r7, sp
+ bl _foo
+ pop {r7, pc}
+
+
+_junk:
+@ Make the _foo symbol sufficiently far away to force the 'bl' relocation
+@ above to be out of range. On Darwin, the assembler deals with this by
+@ generating an external relocation so the linker can create a branch
+@ island.
+
+ .space 20000000
+
+ .section __TEXT,initcode,regular,pure_instructions
+
+ .globl _foo
+ .align 2
+ .code 16
+_foo:
+ push {r7, lr}
+ mov r7, sp
+ pop {r7, pc}
+
+
+@ CHECK: ('_relocations', [
+@ CHECK: # Relocation 0
+@ CHECK: (('word-0', 0x4),
+@ CHECK: ('word-1', 0x6d000002)),
+@ CHECK: ])
diff --git a/test/MC/MachO/absolute.s b/test/MC/MachO/absolute.s
new file mode 100644
index 000000000000..784e32a7e41d
--- /dev/null
+++ b/test/MC/MachO/absolute.s
@@ -0,0 +1,158 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+_bar:
+ nop
+_foo:
+ nop
+
+ .set foo_set1, (_foo + 0xffff0000)
+ .set foo_set2, (_foo - _bar + 0xffff0000)
+
+foo_equals = (_foo + 0xffff0000)
+foo_equals2 = (_foo - _bar + 0xffff0000)
+
+ .globl foo_set1_global;
+ .set foo_set1_global, (_foo + 0xffff0000)
+
+ .globl foo_set2_global;
+ .set foo_set2_global, (_foo - _bar + 0xffff0000)
+
+// CHECK: ('cputype', 16777223)
+// CHECK: ('cpusubtype', 3)
+// CHECK: ('filetype', 1)
+// CHECK: ('num_load_commands', 3)
+// CHECK: ('load_commands_size', 256)
+// CHECK: ('flag', 0)
+// CHECK: ('reserved', 0)
+// CHECK: ('load_commands', [
+// CHECK: # Load Command 0
+// CHECK: (('command', 25)
+// CHECK: ('size', 152)
+// CHECK: ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK: ('vm_addr', 0)
+// CHECK: ('vm_size', 2)
+// CHECK: ('file_offset', 288)
+// CHECK: ('file_size', 2)
+// CHECK: ('maxprot', 7)
+// CHECK: ('initprot', 7)
+// CHECK: ('num_sections', 1)
+// CHECK: ('flags', 0)
+// CHECK: ('sections', [
+// CHECK: # Section 0
+// CHECK: (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK: ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK: ('address', 0)
+// CHECK: ('size', 2)
+// CHECK: ('offset', 288)
+// CHECK: ('alignment', 0)
+// CHECK: ('reloc_offset', 0)
+// CHECK: ('num_reloc', 0)
+// CHECK: ('flags', 0x80000400)
+// CHECK: ('reserved1', 0)
+// CHECK: ('reserved2', 0)
+// CHECK: ('reserved3', 0)
+// CHECK: ),
+// CHECK: ('_relocations', [
+// CHECK: ])
+// CHECK: ])
+// CHECK: ),
+// CHECK: # Load Command 1
+// CHECK: (('command', 2)
+// CHECK: ('size', 24)
+// CHECK: ('symoff', 292)
+// CHECK: ('nsyms', 8)
+// CHECK: ('stroff', 420)
+// CHECK: ('strsize', 84)
+// CHECK: ('_string_data', '\x00foo_set1_global\x00foo_set2_global\x00_bar\x00_foo\x00foo_set1\x00foo_set2\x00foo_equals\x00foo_equals2\x00')
+// CHECK: ('_symbols', [
+// CHECK: # Symbol 0
+// CHECK: (('n_strx', 33)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 0)
+// CHECK: ('_string', '_bar')
+// CHECK: ),
+// CHECK: # Symbol 1
+// CHECK: (('n_strx', 38)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 1)
+// CHECK: ('_string', '_foo')
+// CHECK: ),
+// CHECK: # Symbol 2
+// CHECK: (('n_strx', 43)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set1')
+// CHECK: ),
+// CHECK: # Symbol 3
+// CHECK: (('n_strx', 52)
+// CHECK: ('n_type', 0x2)
+// CHECK: ('n_sect', 0)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set2')
+// CHECK: ),
+// CHECK: # Symbol 4
+// CHECK: (('n_strx', 61)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_equals')
+// CHECK: ),
+// CHECK: # Symbol 5
+// CHECK: (('n_strx', 72)
+// CHECK: ('n_type', 0x2)
+// CHECK: ('n_sect', 0)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_equals2')
+// CHECK: ),
+// CHECK: # Symbol 6
+// CHECK: (('n_strx', 1)
+// CHECK: ('n_type', 0xf)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set1_global')
+// CHECK: ),
+// CHECK: # Symbol 7
+// CHECK: (('n_strx', 17)
+// CHECK: ('n_type', 0x3)
+// CHECK: ('n_sect', 0)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set2_global')
+// CHECK: ),
+// CHECK: ])
+// CHECK: ),
+// CHECK: # Load Command 2
+// CHECK: (('command', 11)
+// CHECK: ('size', 80)
+// CHECK: ('ilocalsym', 0)
+// CHECK: ('nlocalsym', 6)
+// CHECK: ('iextdefsym', 6)
+// CHECK: ('nextdefsym', 2)
+// CHECK: ('iundefsym', 8)
+// CHECK: ('nundefsym', 0)
+// CHECK: ('tocoff', 0)
+// CHECK: ('ntoc', 0)
+// CHECK: ('modtaboff', 0)
+// CHECK: ('nmodtab', 0)
+// CHECK: ('extrefsymoff', 0)
+// CHECK: ('nextrefsyms', 0)
+// CHECK: ('indirectsymoff', 0)
+// CHECK: ('nindirectsyms', 0)
+// CHECK: ('extreloff', 0)
+// CHECK: ('nextrel', 0)
+// CHECK: ('locreloff', 0)
+// CHECK: ('nlocrel', 0)
+// CHECK: ('_indirect_symbols', [
+// CHECK: ])
+// CHECK: ),
+// CHECK: ])
diff --git a/test/MC/MachO/gen-dwarf-cpp.s b/test/MC/MachO/gen-dwarf-cpp.s
new file mode 100644
index 000000000000..cb749f48eef6
--- /dev/null
+++ b/test/MC/MachO/gen-dwarf-cpp.s
@@ -0,0 +1,22 @@
+// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t
+// RUN: llvm-dwarfdump %t | FileCheck %s
+
+# 100 "t.s" 1
+.globl _bar
+_bar:
+ movl $0, %eax
+L1: leave
+ ret
+
+// rdar://9275556
+
+// We check that the source name "t.s" is picked up
+// CHECK: Dir Mod Time File Len File Name
+// CHECK: ---- ---------- ---------- ---------------------------
+// CHECK: file_names[ 1] 1 0x00000000 0x00000000 gen-dwarf-cpp.s
+// CHECK: file_names[ 2] 0 0x00000000 0x00000000 t.s
+
+// We check that the source line number 100 is picked up before the "movl"
+// CHECK: Address Line Column File ISA Flags
+// CHECK: ------------------ ------ ------ ------ --- -------------
+// CHECK: 0x0000000000000000 102 0 2 0 is_stmt
diff --git a/test/MC/MachO/gen-dwarf-macro-cpp.s b/test/MC/MachO/gen-dwarf-macro-cpp.s
new file mode 100644
index 000000000000..05a449b4027c
--- /dev/null
+++ b/test/MC/MachO/gen-dwarf-macro-cpp.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t
+// RUN: llvm-dwarfdump %t | FileCheck %s
+
+# 1 "foo.S" 2
+.macro switcher
+ ljmp *0x38(%ecx)
+.endmacro
+ switcher NaClSwitchNoSSE, 0
+
+// PR14264 was a crash in the code caused by the .macro not being handled correctly
+// rdar://12637628
+
+// We check that the source name "foo.S" is picked up
+// CHECK: Dir Mod Time File Len File Name
+// CHECK: ---- ---------- ---------- ---------------------------
+// CHECK: file_names[ 1] 1 0x00000000 0x00000000 gen-dwarf-macro-cpp.s
+// CHECK: file_names[ 2] 0 0x00000000 0x00000000 foo.S
diff --git a/test/MC/MachO/i386-large-relocations.s b/test/MC/MachO/i386-large-relocations.s
new file mode 100644
index 000000000000..e5a1cfb2c5ef
--- /dev/null
+++ b/test/MC/MachO/i386-large-relocations.s
@@ -0,0 +1,36 @@
+// RUN: llvm-mc -triple i386-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+.space 0x1ed280
+ .section __DATA,__const
+ .align 4
+.space 0x5181020
+_foo:
+ .long _bar
+ .long 0
+ .long _bar+8
+ .long _bar+24
+ .long 0
+ .long _bar+16
+
+.zerofill __DATA,__bss,__dummy,0x5d780
+.zerofill __DATA,__bss,_bar,48,4
+
+// Normally scattered relocations are used for sym+offset expressions. When
+// the value exceeds 24 bits, however, it's outside what MachO can encode,
+// so the assembler falls back to non-scattered relocations.
+// rdar://12358909
+
+// CHECK: ('_relocations', [
+// CHECK: # Relocation 0
+// CHECK: (('word-0', 0x5181034),
+// CHECK: ('word-1', 0x4000003)),
+// CHECK: # Relocation 1
+// CHECK: (('word-0', 0x518102c),
+// CHECK: ('word-1', 0x4000003)),
+// CHECK: # Relocation 2
+// CHECK: (('word-0', 0x5181028),
+// CHECK: ('word-1', 0x4000003)),
+// CHECK: # Relocation 3
+// CHECK: (('word-0', 0x5181020),
+// CHECK: ('word-1', 0x4000003)),
+// CHECK: ])
diff --git a/test/MC/MachO/lit.local.cfg b/test/MC/MachO/lit.local.cfg
index 6c49f08b7496..41a8434f9993 100644
--- a/test/MC/MachO/lit.local.cfg
+++ b/test/MC/MachO/lit.local.cfg
@@ -1,4 +1,4 @@
-config.suffixes = ['.s']
+config.suffixes = ['.s', '.ll']
targets = set(config.root.targets_to_build.split())
if not 'X86' in targets:
diff --git a/test/MC/MachO/x86-data-in-code.ll b/test/MC/MachO/x86-data-in-code.ll
new file mode 100644
index 000000000000..2410974c5ca3
--- /dev/null
+++ b/test/MC/MachO/x86-data-in-code.ll
@@ -0,0 +1,108 @@
+; RUN: llc -O0 -mtriple=x86_64-apple-darwin -filetype=obj -o - %s | macho-dump | FileCheck %s
+
+; There should not be a data-in-code load command (type 0x29) for x86_64
+; jump tables, even though they are in the text section.
+; CHECK: 'num_load_commands'
+; CHECK-NOT: (('command', 41)
+
+define void @foo(i32* %ptr) nounwind ssp {
+ %tmp = load i32* %ptr, align 4
+ switch i32 %tmp, label %default [
+ i32 11, label %bb0
+ i32 10, label %bb1
+ i32 8, label %bb2
+ i32 4, label %bb3
+ i32 2, label %bb4
+ i32 6, label %bb5
+ i32 9, label %bb6
+ i32 15, label %bb7
+ i32 1, label %bb8
+ i32 3, label %bb9
+ i32 5, label %bb10
+ i32 30, label %bb11
+ i32 31, label %bb12
+ i32 13, label %bb13
+ i32 14, label %bb14
+ i32 20, label %bb15
+ i32 19, label %bb16
+ i32 17, label %bb17
+ i32 18, label %bb18
+ i32 21, label %bb19
+ i32 22, label %bb20
+ i32 16, label %bb21
+ i32 24, label %bb22
+ i32 25, label %bb23
+ i32 26, label %bb24
+ i32 27, label %bb25
+ i32 28, label %bb26
+ i32 23, label %bb27
+ i32 12, label %bb28
+ ]
+
+default:
+ br label %exit
+bb0:
+ br label %exit
+bb1:
+ br label %exit
+bb2:
+ br label %exit
+bb3:
+ br label %exit
+bb4:
+ br label %exit
+bb5:
+ br label %exit
+bb6:
+ br label %exit
+bb7:
+ br label %exit
+bb8:
+ br label %exit
+bb9:
+ br label %exit
+bb10:
+ br label %exit
+bb11:
+ br label %exit
+bb12:
+ br label %exit
+bb13:
+ br label %exit
+bb14:
+ br label %exit
+bb15:
+ br label %exit
+bb16:
+ br label %exit
+bb17:
+ br label %exit
+bb18:
+ br label %exit
+bb19:
+ br label %exit
+bb20:
+ br label %exit
+bb21:
+ br label %exit
+bb22:
+ br label %exit
+bb23:
+ br label %exit
+bb24:
+ br label %exit
+bb25:
+ br label %exit
+bb26:
+ br label %exit
+bb27:
+ br label %exit
+bb28:
+ br label %exit
+
+
+exit:
+
+ ret void
+}
+
diff --git a/test/MC/Markup/basic-markup.mc b/test/MC/Markup/basic-markup.mc
new file mode 100644
index 000000000000..2fa5ebb28fa4
--- /dev/null
+++ b/test/MC/Markup/basic-markup.mc
@@ -0,0 +1,16 @@
+// RUN: llvm-mcmarkup %s | FileCheck %s
+
+ push {<reg:r1>, <reg:r2>, <reg:r7>}
+ sub <reg:sp>, <imm:#132>
+ ldr <reg:r0>, <mem:[<reg:r0>, <imm:#4>]>
+
+
+// CHECK: reg
+// CHECK: reg
+// CHECK: reg
+// CHECK: reg
+// CHECK: imm
+// CHECK: reg
+// CHECK: mem
+// CHECK: reg
+// CHECK: imm
diff --git a/test/MC/Markup/lit.local.cfg b/test/MC/Markup/lit.local.cfg
new file mode 100644
index 000000000000..ab28eedae212
--- /dev/null
+++ b/test/MC/Markup/lit.local.cfg
@@ -0,0 +1,2 @@
+config.suffixes = ['.mc']
+
diff --git a/test/MC/Mips/do_switch.ll b/test/MC/Mips/do_switch.ll
new file mode 100644
index 000000000000..7eda1b41d18c
--- /dev/null
+++ b/test/MC/Mips/do_switch.ll
@@ -0,0 +1,39 @@
+; This test case will cause an internal EK_GPRel64BlockAddress to be
+; produced. This was not handled for direct object emission, causing an
+; assertion to occur. This is a variation on test case test/CodeGen/Mips/do_switch.ll
+
+; RUN: llc < %s -filetype=obj -march=mips -relocation-model=static
+
+; RUN: llc < %s -filetype=obj -march=mips -relocation-model=pic
+
+; RUN: llc < %s -filetype=obj -march=mips64 -relocation-model=pic -mcpu=mips64 -mattr=n64
+
+define i32 @main() nounwind readnone {
+entry:
+ %x = alloca i32, align 4 ; <i32*> [#uses=2]
+ store volatile i32 2, i32* %x, align 4
+ %0 = load volatile i32* %x, align 4 ; <i32> [#uses=1]
+
+ switch i32 %0, label %bb4 [
+ i32 0, label %bb5
+ i32 1, label %bb1
+ i32 2, label %bb2
+ i32 3, label %bb3
+ ]
+
+bb1: ; preds = %entry
+ ret i32 2
+
+bb2: ; preds = %entry
+ ret i32 0
+
+bb3: ; preds = %entry
+ ret i32 3
+
+bb4: ; preds = %entry
+ ret i32 4
+
+bb5: ; preds = %entry
+ ret i32 1
+}
+
diff --git a/test/MC/Mips/elf-N64.ll b/test/MC/Mips/elf-N64.ll
index 23ec53a2e26d..ae6de78d6552 100644
--- a/test/MC/Mips/elf-N64.ll
+++ b/test/MC/Mips/elf-N64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: llc -filetype=obj -march=mips64el -mcpu=mips64 -disable-mips-delay-filler %s -o - | elf-dump --dump-section-data | FileCheck %s
; Check for N64 relocation production.
;
diff --git a/test/MC/Mips/higher_highest.ll b/test/MC/Mips/higher_highest.ll
index 81a89e3040e3..0c665220335b 100644
--- a/test/MC/Mips/higher_highest.ll
+++ b/test/MC/Mips/higher_highest.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
-
+; DISABLE: llc -march=mips64el -mcpu=mips64 -mattr=n64 -force-mips-long-branch -filetype=obj < %s -o - | elf-dump --dump-section-data | FileCheck %s
+; RUN: false
+; XFAIL: *
+; Disabled because currently we don't have a way to generate these relocations.
+;
; Check that the R_MIPS_HIGHER and R_MIPS_HIGHEST relocations were created.
; CHECK: ('r_type', 0x1d)
diff --git a/test/MC/Mips/mips-alu-instructions.s b/test/MC/Mips/mips-alu-instructions.s
new file mode 100644
index 000000000000..2997782cd01b
--- /dev/null
+++ b/test/MC/Mips/mips-alu-instructions.s
@@ -0,0 +1,100 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for arithmetic and logical instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Logical instructions
+#------------------------------------------------------------------------------
+# CHECK: and $9, $6, $7 # encoding: [0x24,0x48,0xc7,0x00]
+# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK: andi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x30]
+# CHECK: clo $6, $7 # encoding: [0x21,0x30,0xe6,0x70]
+# CHECK: clz $6, $7 # encoding: [0x20,0x30,0xe6,0x70]
+# CHECK: ins $19, $9, 6, 7 # encoding: [0x84,0x61,0x33,0x7d]
+# CHECK: nor $9, $6, $7 # encoding: [0x27,0x48,0xc7,0x00]
+# CHECK: or $3, $3, $5 # encoding: [0x25,0x18,0x65,0x00]
+# CHECK: ori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x34]
+# CHECK: rotr $9, $6, 7 # encoding: [0xc2,0x49,0x26,0x00]
+# CHECK: rotrv $9, $6, $7 # encoding: [0x46,0x48,0xe6,0x00]
+# CHECK: sll $4, $3, 7 # encoding: [0xc0,0x21,0x03,0x00]
+# CHECK: sllv $2, $3, $5 # encoding: [0x04,0x10,0xa3,0x00]
+# CHECK: slt $3, $3, $5 # encoding: [0x2a,0x18,0x65,0x00]
+# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28]
+# CHECK: slti $3, $3, 103 # encoding: [0x67,0x00,0x63,0x28]
+# CHECK: sltiu $3, $3, 103 # encoding: [0x67,0x00,0x63,0x2c]
+# CHECK: sltu $3, $3, $5 # encoding: [0x2b,0x18,0x65,0x00]
+# CHECK: sra $4, $3, 7 # encoding: [0xc3,0x21,0x03,0x00]
+# CHECK: srav $2, $3, $5 # encoding: [0x07,0x10,0xa3,0x00]
+# CHECK: srl $4, $3, 7 # encoding: [0xc2,0x21,0x03,0x00]
+# CHECK: srlv $2, $3, $5 # encoding: [0x06,0x10,0xa3,0x00]
+# CHECK: xor $3, $3, $5 # encoding: [0x26,0x18,0x65,0x00]
+# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK: xori $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x38]
+# CHECK: wsbh $6, $7 # encoding: [0xa0,0x30,0x07,0x7c]
+# CHECK: nor $7, $8, $zero # encoding: [0x27,0x38,0x00,0x01]
+ and $9, $6, $7
+ and $9, $6, 17767
+ andi $9, $6, 17767
+ clo $6, $7
+ clz $6, $7
+ ins $19, $9, 6,7
+ nor $9, $6, $7
+ or $3, $3, $5
+ ori $9, $6, 17767
+ rotr $9, $6, 7
+ rotrv $9, $6, $7
+ sll $4, $3, 7
+ sllv $2, $3, $5
+ slt $3, $3, $5
+ slt $3, $3, 103
+ slti $3, $3, 103
+ sltiu $3, $3, 103
+ sltu $3, $3, $5
+ sra $4, $3, 7
+ srav $2, $3, $5
+ srl $4, $3, 7
+ srlv $2, $3, $5
+ xor $3, $3, $5
+ xor $9, $6, 17767
+ xori $9, $6, 17767
+ wsbh $6, $7
+ not $7 ,$8
+
+#------------------------------------------------------------------------------
+# Arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK: add $9, $6, $7 # encoding: [0x20,0x48,0xc7,0x00]
+# CHECK: addi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x20]
+# CHECK: addiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x24]
+# CHECK: addi $9, $6, 17767 # encoding: [0x67,0x45,0xc9,0x20]
+# CHECK: addiu $9, $6, -15001 # encoding: [0x67,0xc5,0xc9,0x24]
+# CHECK: addu $9, $6, $7 # encoding: [0x21,0x48,0xc7,0x00]
+# CHECK: madd $6, $7 # encoding: [0x00,0x00,0xc7,0x70]
+# CHECK: maddu $6, $7 # encoding: [0x01,0x00,0xc7,0x70]
+# CHECK: msub $6, $7 # encoding: [0x04,0x00,0xc7,0x70]
+# CHECK: msubu $6, $7 # encoding: [0x05,0x00,0xc7,0x70]
+# CHECK: mult $3, $5 # encoding: [0x18,0x00,0x65,0x00]
+# CHECK: multu $3, $5 # encoding: [0x19,0x00,0x65,0x00]
+# CHECK: sub $9, $6, $7 # encoding: [0x22,0x48,0xc7,0x00]
+# CHECK: subu $4, $3, $5 # encoding: [0x23,0x20,0x65,0x00]
+# CHECK: sub $6, $zero, $7 # encoding: [0x22,0x30,0x07,0x00]
+# CHECK: subu $6, $zero, $7 # encoding: [0x23,0x30,0x07,0x00]
+# CHECK: add $7, $8, $zero # encoding: [0x20,0x38,0x00,0x01]
+ add $9,$6,$7
+ add $9,$6,17767
+ addu $9,$6,-15001
+ addi $9,$6,17767
+ addiu $9,$6,-15001
+ addu $9,$6,$7
+ madd $6,$7
+ maddu $6,$7
+ msub $6,$7
+ msubu $6,$7
+ mult $3,$5
+ multu $3,$5
+ sub $9,$6,$7
+ subu $4,$3,$5
+ neg $6,$7
+ negu $6,$7
+ move $7,$8
diff --git a/test/MC/Mips/mips-coprocessor-encodings.s b/test/MC/Mips/mips-coprocessor-encodings.s
new file mode 100644
index 000000000000..bad9163ba9fa
--- /dev/null
+++ b/test/MC/Mips/mips-coprocessor-encodings.s
@@ -0,0 +1,37 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck --check-prefix=MIPS64 %s
+
+# MIPS64: dmtc0 $12, $16, 2 # encoding: [0x40,0xac,0x80,0x02]
+# MIPS64: dmtc0 $12, $16, 0 # encoding: [0x40,0xac,0x80,0x00]
+# MIPS64: mtc0 $12, $16, 2 # encoding: [0x40,0x8c,0x80,0x02]
+# MIPS64: mtc0 $12, $16, 0 # encoding: [0x40,0x8c,0x80,0x00]
+# MIPS64: dmfc0 $12, $16, 2 # encoding: [0x40,0x2c,0x80,0x02]
+# MIPS64: dmfc0 $12, $16, 0 # encoding: [0x40,0x2c,0x80,0x00]
+# MIPS64: mfc0 $12, $16, 2 # encoding: [0x40,0x0c,0x80,0x02]
+# MIPS64: mfc0 $12, $16, 0 # encoding: [0x40,0x0c,0x80,0x00]
+
+ dmtc0 $12, $16, 2
+ dmtc0 $12, $16
+ mtc0 $12, $16, 2
+ mtc0 $12, $16
+ dmfc0 $12, $16, 2
+ dmfc0 $12, $16
+ mfc0 $12, $16, 2
+ mfc0 $12, $16
+
+# MIPS64: dmtc2 $12, $16, 2 # encoding: [0x48,0xac,0x80,0x02]
+# MIPS64: dmtc2 $12, $16, 0 # encoding: [0x48,0xac,0x80,0x00]
+# MIPS64: mtc2 $12, $16, 2 # encoding: [0x48,0x8c,0x80,0x02]
+# MIPS64: mtc2 $12, $16, 0 # encoding: [0x48,0x8c,0x80,0x00]
+# MIPS64: dmfc2 $12, $16, 2 # encoding: [0x48,0x2c,0x80,0x02]
+# MIPS64: dmfc2 $12, $16, 0 # encoding: [0x48,0x2c,0x80,0x00]
+# MIPS64: mfc2 $12, $16, 2 # encoding: [0x48,0x0c,0x80,0x02]
+# MIPS64: mfc2 $12, $16, 0 # encoding: [0x48,0x0c,0x80,0x00]
+
+ dmtc2 $12, $16, 2
+ dmtc2 $12, $16
+ mtc2 $12, $16, 2
+ mtc2 $12, $16
+ dmfc2 $12, $16, 2
+ dmfc2 $12, $16
+ mfc2 $12, $16, 2
+ mfc2 $12, $16
diff --git a/test/MC/Mips/mips-expansions.s b/test/MC/Mips/mips-expansions.s
new file mode 100644
index 000000000000..cfc15e883a95
--- /dev/null
+++ b/test/MC/Mips/mips-expansions.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for macro instructions
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Load immediate instructions
+#------------------------------------------------------------------------------
+# CHECK: ori $5, $zero, 123 # encoding: [0x7b,0x00,0x05,0x34]
+# CHECK: addiu $6, $zero, -2345 # encoding: [0xd7,0xf6,0x06,0x24]
+# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
+# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addiu $4, $zero, 20 # encoding: [0x14,0x00,0x04,0x24]
+# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
+# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addiu $4, $5, 20 # encoding: [0x14,0x00,0xa4,0x24]
+# CHECK: lui $7, 1 # encoding: [0x01,0x00,0x07,0x3c]
+# CHECK: ori $7, $7, 2 # encoding: [0x02,0x00,0xe7,0x34]
+# CHECK: addu $7, $7, $8 # encoding: [0x21,0x38,0xe8,0x00]
+
+ li $5,123
+ li $6,-2345
+ li $7,65538
+
+ la $a0, 20
+ la $7,65538
+ la $a0, 20($a1)
+ la $7,65538($8)
diff --git a/test/MC/Mips/mips-fpu-instructions.s b/test/MC/Mips/mips-fpu-instructions.s
new file mode 100644
index 000000000000..a126c6f7188c
--- /dev/null
+++ b/test/MC/Mips/mips-fpu-instructions.s
@@ -0,0 +1,178 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for FPU instructions.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# FP arithmetic instructions
+#------------------------------------------------------------------------------
+
+# CHECK: abs.d $f12, $f14 # encoding: [0x05,0x73,0x20,0x46]
+# CHECK: abs.s $f6, $f7 # encoding: [0x85,0x39,0x00,0x46]
+# CHECK: add.d $f8, $f12, $f14 # encoding: [0x00,0x62,0x2e,0x46]
+# CHECK: add.s $f9, $f6, $f7 # encoding: [0x40,0x32,0x07,0x46]
+# CHECK: floor.w.d $f12, $f14 # encoding: [0x0f,0x73,0x20,0x46]
+# CHECK: floor.w.s $f6, $f7 # encoding: [0x8f,0x39,0x00,0x46]
+# CHECK: ceil.w.d $f12, $f14 # encoding: [0x0e,0x73,0x20,0x46]
+# CHECK: ceil.w.s $f6, $f7 # encoding: [0x8e,0x39,0x00,0x46]
+# CHECK: mul.d $f8, $f12, $f14 # encoding: [0x02,0x62,0x2e,0x46]
+# CHECK: mul.s $f9, $f6, $f7 # encoding: [0x42,0x32,0x07,0x46]
+# CHECK: neg.d $f12, $f14 # encoding: [0x07,0x73,0x20,0x46]
+# CHECK: neg.s $f6, $f7 # encoding: [0x87,0x39,0x00,0x46]
+# CHECK: round.w.d $f12, $f14 # encoding: [0x0c,0x73,0x20,0x46]
+# CHECK: round.w.s $f6, $f7 # encoding: [0x8c,0x39,0x00,0x46]
+# CHECK: sqrt.d $f12, $f14 # encoding: [0x04,0x73,0x20,0x46]
+# CHECK: sqrt.s $f6, $f7 # encoding: [0x84,0x39,0x00,0x46]
+# CHECK: sub.d $f8, $f12, $f14 # encoding: [0x01,0x62,0x2e,0x46]
+# CHECK: sub.s $f9, $f6, $f7 # encoding: [0x41,0x32,0x07,0x46]
+# CHECK: trunc.w.d $f12, $f14 # encoding: [0x0d,0x73,0x20,0x46]
+# CHECK: trunc.w.s $f6, $f7 # encoding: [0x8d,0x39,0x00,0x46]
+
+ abs.d $f12,$f14
+ abs.s $f6,$f7
+ add.d $f8,$f12,$f14
+ add.s $f9,$f6,$f7
+ floor.w.d $f12,$f14
+ floor.w.s $f6,$f7
+ ceil.w.d $f12,$f14
+ ceil.w.s $f6,$f7
+ mul.d $f8,$f12,$f14
+ mul.s $f9,$f6, $f7
+ neg.d $f12,$f14
+ neg.s $f6,$f7
+ round.w.d $f12,$f14
+ round.w.s $f6,$f7
+ sqrt.d $f12,$f14
+ sqrt.s $f6,$f7
+ sub.d $f8,$f12,$f14
+ sub.s $f9,$f6,$f7
+ trunc.w.d $f12,$f14
+ trunc.w.s $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP compare instructions
+#------------------------------------------------------------------------------
+
+# CHECK: c.eq.d $f12, $f14 # encoding: [0x32,0x60,0x2e,0x46]
+# CHECK: c.eq.s $f6, $f7 # encoding: [0x32,0x30,0x07,0x46]
+# CHECK: c.f.d $f12, $f14 # encoding: [0x30,0x60,0x2e,0x46]
+# CHECK: c.f.s $f6, $f7 # encoding: [0x30,0x30,0x07,0x46]
+# CHECK: c.le.d $f12, $f14 # encoding: [0x3e,0x60,0x2e,0x46]
+# CHECK: c.le.s $f6, $f7 # encoding: [0x3e,0x30,0x07,0x46]
+# CHECK: c.lt.d $f12, $f14 # encoding: [0x3c,0x60,0x2e,0x46]
+# CHECK: c.lt.s $f6, $f7 # encoding: [0x3c,0x30,0x07,0x46]
+# CHECK: c.nge.d $f12, $f14 # encoding: [0x3d,0x60,0x2e,0x46]
+# CHECK: c.nge.s $f6, $f7 # encoding: [0x3d,0x30,0x07,0x46]
+# CHECK: c.ngl.d $f12, $f14 # encoding: [0x3b,0x60,0x2e,0x46]
+# CHECK: c.ngl.s $f6, $f7 # encoding: [0x3b,0x30,0x07,0x46]
+# CHECK: c.ngle.d $f12, $f14 # encoding: [0x39,0x60,0x2e,0x46]
+# CHECK: c.ngle.s $f6, $f7 # encoding: [0x39,0x30,0x07,0x46]
+# CHECK: c.ngt.d $f12, $f14 # encoding: [0x3f,0x60,0x2e,0x46]
+# CHECK: c.ngt.s $f6, $f7 # encoding: [0x3f,0x30,0x07,0x46]
+# CHECK: c.ole.d $f12, $f14 # encoding: [0x36,0x60,0x2e,0x46]
+# CHECK: c.ole.s $f6, $f7 # encoding: [0x36,0x30,0x07,0x46]
+# CHECK: c.olt.d $f12, $f14 # encoding: [0x34,0x60,0x2e,0x46]
+# CHECK: c.olt.s $f6, $f7 # encoding: [0x34,0x30,0x07,0x46]
+# CHECK: c.seq.d $f12, $f14 # encoding: [0x3a,0x60,0x2e,0x46]
+# CHECK: c.seq.s $f6, $f7 # encoding: [0x3a,0x30,0x07,0x46]
+# CHECK: c.sf.d $f12, $f14 # encoding: [0x38,0x60,0x2e,0x46]
+# CHECK: c.sf.s $f6, $f7 # encoding: [0x38,0x30,0x07,0x46]
+# CHECK: c.ueq.d $f12, $f14 # encoding: [0x33,0x60,0x2e,0x46]
+# CHECK: c.ueq.s $f28, $f18 # encoding: [0x33,0xe0,0x12,0x46]
+# CHECK: c.ule.d $f12, $f14 # encoding: [0x37,0x60,0x2e,0x46]
+# CHECK: c.ule.s $f6, $f7 # encoding: [0x37,0x30,0x07,0x46]
+# CHECK: c.ult.d $f12, $f14 # encoding: [0x35,0x60,0x2e,0x46]
+# CHECK: c.ult.s $f6, $f7 # encoding: [0x35,0x30,0x07,0x46]
+# CHECK: c.un.d $f12, $f14 # encoding: [0x31,0x60,0x2e,0x46]
+# CHECK: c.un.s $f6, $f7 # encoding: [0x31,0x30,0x07,0x46]
+
+ c.eq.d $f12,$f14
+ c.eq.s $f6,$f7
+ c.f.d $f12,$f14
+ c.f.s $f6,$f7
+ c.le.d $f12,$f14
+ c.le.s $f6,$f7
+ c.lt.d $f12,$f14
+ c.lt.s $f6,$f7
+ c.nge.d $f12,$f14
+ c.nge.s $f6,$f7
+ c.ngl.d $f12,$f14
+ c.ngl.s $f6,$f7
+ c.ngle.d $f12,$f14
+ c.ngle.s $f6,$f7
+ c.ngt.d $f12,$f14
+ c.ngt.s $f6,$f7
+ c.ole.d $f12,$f14
+ c.ole.s $f6,$f7
+ c.olt.d $f12,$f14
+ c.olt.s $f6,$f7
+ c.seq.d $f12,$f14
+ c.seq.s $f6,$f7
+ c.sf.d $f12,$f14
+ c.sf.s $f6,$f7
+ c.ueq.d $f12,$f14
+ c.ueq.s $f28,$f18
+ c.ule.d $f12,$f14
+ c.ule.s $f6,$f7
+ c.ult.d $f12,$f14
+ c.ult.s $f6,$f7
+ c.un.d $f12,$f14
+ c.un.s $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP convert instructions
+#------------------------------------------------------------------------------
+# CHECK: cvt.d.s $f6, $f7 # encoding: [0xa1,0x39,0x00,0x46]
+# CHECK: cvt.d.w $f12, $f14 # encoding: [0x21,0x73,0x80,0x46]
+# CHECK: cvt.s.d $f12, $f14 # encoding: [0x20,0x73,0x20,0x46]
+# CHECK: cvt.s.w $f6, $f7 # encoding: [0xa0,0x39,0x80,0x46]
+# CHECK: cvt.w.d $f12, $f14 # encoding: [0x24,0x73,0x20,0x46]
+# CHECK: cvt.w.s $f6, $f7 # encoding: [0xa4,0x39,0x00,0x46]
+
+ cvt.d.s $f6,$f7
+ cvt.d.w $f12,$f14
+ cvt.s.d $f12,$f14
+ cvt.s.w $f6,$f7
+ cvt.w.d $f12,$f14
+ cvt.w.s $f6,$f7
+
+#------------------------------------------------------------------------------
+# FP move instructions
+#------------------------------------------------------------------------------
+
+# CHECK: cfc1 $6, $fcc0 # encoding: [0x00,0x00,0x46,0x44]
+# CHECK: mfc1 $6, $f7 # encoding: [0x00,0x38,0x06,0x44]
+# CHECK: mfhi $5 # encoding: [0x10,0x28,0x00,0x00]
+# CHECK: mflo $5 # encoding: [0x12,0x28,0x00,0x00]
+# CHECK: mov.d $f6, $f8 # encoding: [0x86,0x41,0x20,0x46]
+# CHECK: mov.s $f6, $f7 # encoding: [0x86,0x39,0x00,0x46]
+# CHECK: mtc1 $6, $f7 # encoding: [0x00,0x38,0x86,0x44]
+# CHECK: mthi $7 # encoding: [0x11,0x00,0xe0,0x00]
+# CHECK: mtlo $7 # encoding: [0x13,0x00,0xe0,0x00]
+# CHECK: swc1 $f9, 9158($7) # encoding: [0xc6,0x23,0xe9,0xe4]
+# CHECK: mfc0 $6, $7, 0 # encoding: [0x00,0x38,0x06,0x40]
+# CHECK: mtc0 $9, $8, 0 # encoding: [0x00,0x40,0x89,0x40]
+# CHECK: mfc2 $5, $7, 0 # encoding: [0x00,0x38,0x05,0x48]
+# CHECK: mtc2 $9, $4, 0 # encoding: [0x00,0x20,0x89,0x48]
+# CHECK: mfc0 $6, $7, 2 # encoding: [0x02,0x38,0x06,0x40]
+# CHECK: mtc0 $9, $8, 3 # encoding: [0x03,0x40,0x89,0x40]
+# CHECK: mfc2 $5, $7, 4 # encoding: [0x04,0x38,0x05,0x48]
+# CHECK: mtc2 $9, $4, 5 # encoding: [0x05,0x20,0x89,0x48]
+
+ cfc1 $a2,$0
+ mfc1 $a2,$f7
+ mfhi $a1
+ mflo $a1
+ mov.d $f6,$f8
+ mov.s $f6,$f7
+ mtc1 $a2,$f7
+ mthi $a3
+ mtlo $a3
+ swc1 $f9,9158($a3)
+ mfc0 $6, $7
+ mtc0 $9, $8
+ mfc2 $5, $7
+ mtc2 $9, $4
+ mfc0 $6, $7, 2
+ mtc0 $9, $8, 3
+ mfc2 $5, $7, 4
+ mtc2 $9, $4, 5
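
As an illustrative aside (not part of the patch), the little-endian encoding bytes in the CHECK lines above can be unpacked into the COP0/COP2 move fields with a minimal Python sketch; the helper name decode_cop_move is made up for this example.

    # Minimal sketch: decode the 4 little-endian encoding bytes from the
    # CHECK lines above into the COP0/COP2 move fields.
    def decode_cop_move(enc_bytes):
        word = int.from_bytes(bytes(enc_bytes), "little")
        return {
            "opcode": word >> 26,          # 0x10 = COP0, 0x12 = COP2
            "mt_mf": (word >> 21) & 0x1f,  # 0 = move-from, 4 = move-to
            "rt": (word >> 16) & 0x1f,     # general-purpose register
            "rd": (word >> 11) & 0x1f,     # coprocessor register
            "sel": word & 0x7,             # select field, 0 when omitted
        }

    print(decode_cop_move([0x02, 0x38, 0x06, 0x40]))
    # {'opcode': 16, 'mt_mf': 0, 'rt': 6, 'rd': 7, 'sel': 2} -> mfc0 $6, $7, 2
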
diff --git a/test/MC/Mips/mips-jump-instructions.s b/test/MC/Mips/mips-jump-instructions.s
new file mode 100644
index 000000000000..998be418d204
--- /dev/null
+++ b/test/MC/Mips/mips-jump-instructions.s
@@ -0,0 +1,72 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for jumps and branches.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Branch instructions
+#------------------------------------------------------------------------------
+# CHECK: b 1332 # encoding: [0x34,0x05,0x00,0x10]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bc1f 1332 # encoding: [0x34,0x05,0x00,0x45]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bc1t 1332 # encoding: [0x34,0x05,0x01,0x45]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: beq $9, $6, 1332 # encoding: [0x34,0x05,0x26,0x11]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bgez $6, 1332 # encoding: [0x34,0x05,0xc1,0x04]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bgezal $6, 1332 # encoding: [0x34,0x05,0xd1,0x04]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bgtz $6, 1332 # encoding: [0x34,0x05,0xc0,0x1c]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: blez $6, 1332 # encoding: [0x34,0x05,0xc0,0x18]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bne $9, $6, 1332 # encoding: [0x34,0x05,0x26,0x15]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: bal 1332 # encoding: [0x34,0x05,0x00,0x04]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+ b 1332
+ nop
+ bc1f 1332
+ nop
+ bc1t 1332
+ nop
+ beq $9,$6,1332
+ nop
+ bgez $6,1332
+ nop
+ bgezal $6,1332
+ nop
+ bgtz $6,1332
+ nop
+ blez $6,1332
+ nop
+ bne $9,$6,1332
+ nop
+ bal 1332
+ nop
+
+end_of_code:
+#------------------------------------------------------------------------------
+# Jump instructions
+#------------------------------------------------------------------------------
+# CHECK: j 1328 # encoding: [0x30,0x05,0x00,0x08]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jal 1328 # encoding: [0x30,0x05,0x00,0x0c]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jalr $6 # encoding: [0x09,0xf8,0xc0,0x00]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00]
+# CHECK: nop # encoding: [0x00,0x00,0x00,0x00]
+# CHECK: jr $7 # encoding: [0x08,0x00,0xe0,0x00]
+
+
+ j 1328
+ nop
+ jal 1328
+ nop
+ jalr $6
+ nop
+ jr $7
+ nop
+ j $7
diff --git a/test/MC/Mips/mips-memory-instructions.s b/test/MC/Mips/mips-memory-instructions.s
new file mode 100644
index 000000000000..b5f1267ef386
--- /dev/null
+++ b/test/MC/Mips/mips-memory-instructions.s
@@ -0,0 +1,45 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for loads and stores.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+#------------------------------------------------------------------------------
+# Memory store instructions
+#------------------------------------------------------------------------------
+# CHECK: sb $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa0]
+# CHECK: sc $4, 16($5) # encoding: [0x10,0x00,0xa4,0xe0]
+# CHECK: sh $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa4]
+# CHECK: sw $4, 16($5) # encoding: [0x10,0x00,0xa4,0xac]
+# CHECK: sw $7, 0($5) # encoding: [0x00,0x00,0xa7,0xac]
+# CHECK: swc1 $f2, 16($5) # encoding: [0x10,0x00,0xa2,0xe4]
+# CHECK: swl $4, 16($5) # encoding: [0x10,0x00,0xa4,0xa8]
+ sb $4, 16($5)
+ sc $4, 16($5)
+ sh $4, 16($5)
+ sw $4, 16($5)
+ sw $7, ($5)
+ swc1 $f2, 16($5)
+ swl $4, 16($5)
+
+#------------------------------------------------------------------------------
+# Memory load instructions
+#------------------------------------------------------------------------------
+
+# CHECK: lb $4, 4($5) # encoding: [0x04,0x00,0xa4,0x80]
+# CHECK: lw $4, 4($5) # encoding: [0x04,0x00,0xa4,0x8c]
+# CHECK: lbu $4, 4($5) # encoding: [0x04,0x00,0xa4,0x90]
+# CHECK: lh $4, 4($5) # encoding: [0x04,0x00,0xa4,0x84]
+# CHECK: lhu $4, 4($5) # encoding: [0x04,0x00,0xa4,0x94]
+# CHECK: ll $4, 4($5) # encoding: [0x04,0x00,0xa4,0xc0]
+# CHECK: lw $4, 4($5) # encoding: [0x04,0x00,0xa4,0x8c]
+# CHECK: lw $7, 0($7) # encoding: [0x00,0x00,0xe7,0x8c]
+# CHECK: lw $2, 16($sp) # encoding: [0x10,0x00,0xa2,0x8f]
+
+ lb $4, 4($5)
+ lw $4, 4($5)
+ lbu $4, 4($5)
+ lh $4, 4($5)
+ lhu $4, 4($5)
+ ll $4, 4($5)
+ lw $4, 4($5)
+ lw $7, ($7)
+ lw $2, 16($sp)
diff --git a/test/MC/Mips/mips-register-names.s b/test/MC/Mips/mips-register-names.s
new file mode 100644
index 000000000000..26187ce58875
--- /dev/null
+++ b/test/MC/Mips/mips-register-names.s
@@ -0,0 +1,71 @@
+# RUN: llvm-mc %s -triple=mips-unknown-freebsd -show-encoding | FileCheck %s
+
+# Check that the register names are mapped to their correct numbers for o32.
+# The second byte of each addiu (with $zero as the source register) contains
+# the number of the destination register under test.
+
+# CHECK: encoding: [0x24,0x00,0x00,0x00]
+# CHECK: encoding: [0x24,0x01,0x00,0x00]
+# CHECK: encoding: [0x24,0x02,0x00,0x00]
+# CHECK: encoding: [0x24,0x03,0x00,0x00]
+# CHECK: encoding: [0x24,0x04,0x00,0x00]
+# CHECK: encoding: [0x24,0x05,0x00,0x00]
+# CHECK: encoding: [0x24,0x06,0x00,0x00]
+# CHECK: encoding: [0x24,0x07,0x00,0x00]
+# CHECK: encoding: [0x24,0x08,0x00,0x00]
+# CHECK: encoding: [0x24,0x09,0x00,0x00]
+# CHECK: encoding: [0x24,0x0a,0x00,0x00]
+# CHECK: encoding: [0x24,0x0b,0x00,0x00]
+# CHECK: encoding: [0x24,0x0c,0x00,0x00]
+# CHECK: encoding: [0x24,0x0d,0x00,0x00]
+# CHECK: encoding: [0x24,0x0e,0x00,0x00]
+# CHECK: encoding: [0x24,0x0f,0x00,0x00]
+# CHECK: encoding: [0x24,0x10,0x00,0x00]
+# CHECK: encoding: [0x24,0x11,0x00,0x00]
+# CHECK: encoding: [0x24,0x12,0x00,0x00]
+# CHECK: encoding: [0x24,0x13,0x00,0x00]
+# CHECK: encoding: [0x24,0x14,0x00,0x00]
+# CHECK: encoding: [0x24,0x15,0x00,0x00]
+# CHECK: encoding: [0x24,0x16,0x00,0x00]
+# CHECK: encoding: [0x24,0x17,0x00,0x00]
+# CHECK: encoding: [0x24,0x18,0x00,0x00]
+# CHECK: encoding: [0x24,0x19,0x00,0x00]
+# CHECK: encoding: [0x24,0x1a,0x00,0x00]
+# CHECK: encoding: [0x24,0x1b,0x00,0x00]
+# CHECK: encoding: [0x24,0x1c,0x00,0x00]
+# CHECK: encoding: [0x24,0x1d,0x00,0x00]
+# CHECK: encoding: [0x24,0x1e,0x00,0x00]
+# CHECK: encoding: [0x24,0x1f,0x00,0x00]
+addiu $zero, $zero, 0
+addiu $at, $zero, 0
+addiu $v0, $zero, 0
+addiu $v1, $zero, 0
+addiu $a0, $zero, 0
+addiu $a1, $zero, 0
+addiu $a2, $zero, 0
+addiu $a3, $zero, 0
+addiu $t0, $zero, 0
+addiu $t1, $zero, 0
+addiu $t2, $zero, 0
+addiu $t3, $zero, 0
+addiu $t4, $zero, 0
+addiu $t5, $zero, 0
+addiu $t6, $zero, 0
+addiu $t7, $zero, 0
+addiu $s0, $zero, 0
+addiu $s1, $zero, 0
+addiu $s2, $zero, 0
+addiu $s3, $zero, 0
+addiu $s4, $zero, 0
+addiu $s5, $zero, 0
+addiu $s6, $zero, 0
+addiu $s7, $zero, 0
+addiu $t8, $zero, 0
+addiu $t9, $zero, 0
+addiu $k0, $zero, 0
+addiu $k1, $zero, 0
+addiu $gp, $zero, 0
+addiu $sp, $zero, 0
+addiu $fp, $zero, 0
+addiu $sp, $zero, 0
+addiu $ra, $zero, 0
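
As an aside (not part of the patch), the o32 mapping exercised above can be spot-checked with a short Python sketch; the table and helper below are hypothetical illustrations, and the encoding follows the addiu layout opcode(6) | rs(5) | rt(5) | imm(16).

    # Hypothetical o32 name -> number table for the registers tested above.
    O32 = {
        "zero": 0, "at": 1, "v0": 2, "v1": 3, "a0": 4, "a1": 5, "a2": 6,
        "a3": 7, "t0": 8, "t1": 9, "t2": 10, "t3": 11, "t4": 12, "t5": 13,
        "t6": 14, "t7": 15, "s0": 16, "s1": 17, "s2": 18, "s3": 19, "s4": 20,
        "s5": 21, "s6": 22, "s7": 23, "t8": 24, "t9": 25, "k0": 26, "k1": 27,
        "gp": 28, "sp": 29, "fp": 30, "ra": 31,
    }

    def encode_addiu_zero(rt):
        # addiu rt, $zero, 0: opcode 0b001001, rs = $zero, imm = 0, so the
        # register number lands in bits 20-16 (the second big-endian byte).
        word = (0b001001 << 26) | (O32[rt] << 16)
        return list(word.to_bytes(4, "big"))

    assert encode_addiu_zero("a1") == [0x24, 0x05, 0x00, 0x00]
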
diff --git a/test/MC/Mips/mips-relocations.s b/test/MC/Mips/mips-relocations.s
new file mode 100644
index 000000000000..ff71c7559cd0
--- /dev/null
+++ b/test/MC/Mips/mips-relocations.s
@@ -0,0 +1,41 @@
+# RUN: llvm-mc %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 | FileCheck %s
+# Check that the assembler can handle the documented syntax
+# for relocations.
+# CHECK: .section __TEXT,__text,regular,pure_instructions
+# CHECK: lui $2, %hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@ABS_HI, kind: fixup_Mips_HI16
+# CHECK: addiu $2, $2, %lo(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@ABS_LO, kind: fixup_Mips_LO16
+# CHECK: lw $25, %call16(strchr)($gp) # encoding: [A,A,0x99,0x8f]
+# CHECK: # fixup A - offset: 0, value: strchr@GOT_CALL, kind: fixup_Mips_CALL16
+# CHECK: lw $3, %got(loop_1)($2) # encoding: [A,A,0x43,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local
+# CHECK: lui $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI
+# CHECK: addiu $2, $2, %dtprel_hi(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@DTPREL_HI, kind: fixup_Mips_DTPREL_HI
+# CHECK: lw $3, %got(loop_1)($2) # encoding: [A,A,0x43,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_1@GOT, kind: fixup_Mips_GOT_Local
+# CHECK: lw $4, %got_disp(loop_2)($3) # encoding: [A,A,0x64,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_2@GOT_DISP, kind: fixup_Mips_GOT_DISP
+# CHECK: lw $5, %got_page(loop_3)($4) # encoding: [A,A,0x85,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_3@GOT_PAGE, kind: fixup_Mips_GOT_PAGE
+# CHECK: lw $6, %got_ofst(loop_4)($5) # encoding: [A,A,0xa6,0x8c]
+# CHECK: # fixup A - offset: 0, value: loop_4@GOT_OFST, kind: fixup_Mips_GOT_OFST
+# CHECK: lui $2, %tprel_hi(_gp_disp) # encoding: [A,A,0x02,0x3c]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@TPREL_HI, kind: fixup_Mips_TPREL_HI
+# CHECK: addiu $2, $2, %tprel_lo(_gp_disp) # encoding: [A,A,0x42,0x24]
+# CHECK: # fixup A - offset: 0, value: _gp_disp@TPREL_LO, kind: fixup_Mips_TPREL_LO
+
+ lui $2, %hi(_gp_disp)
+ addiu $2, $2, %lo(_gp_disp)
+ lw $25, %call16(strchr)($gp)
+ lw $3, %got(loop_1)($2)
+ lui $2, %dtprel_hi(_gp_disp)
+ addiu $2, $2, %dtprel_hi(_gp_disp)
+ lw $3, %got(loop_1)($2)
+ lw $4, %got_disp(loop_2)($3)
+ lw $5, %got_page(loop_3)($4)
+ lw $6, %got_ofst(loop_4)($5)
+ lui $2, %tprel_hi(_gp_disp)
+ addiu $2, $2, %tprel_lo(_gp_disp)
diff --git a/test/MC/Mips/mips64-register-names.s b/test/MC/Mips/mips64-register-names.s
new file mode 100644
index 000000000000..16783ee1a68c
--- /dev/null
+++ b/test/MC/Mips/mips64-register-names.s
@@ -0,0 +1,70 @@
+# RUN: llvm-mc %s -triple=mips64-unknown-freebsd -show-encoding | FileCheck %s
+
+# Check that the register names are mapped to their correct numbers for n64.
+# The second byte of each daddiu (with $zero as the source register) contains
+# the number of the destination register under test.
+
+# CHECK: encoding: [0x64,0x00,0x00,0x00]
+# CHECK: encoding: [0x64,0x01,0x00,0x00]
+# CHECK: encoding: [0x64,0x02,0x00,0x00]
+# CHECK: encoding: [0x64,0x03,0x00,0x00]
+# CHECK: encoding: [0x64,0x04,0x00,0x00]
+# CHECK: encoding: [0x64,0x05,0x00,0x00]
+# CHECK: encoding: [0x64,0x06,0x00,0x00]
+# CHECK: encoding: [0x64,0x07,0x00,0x00]
+# CHECK: encoding: [0x64,0x08,0x00,0x00]
+# CHECK: encoding: [0x64,0x09,0x00,0x00]
+# CHECK: encoding: [0x64,0x0a,0x00,0x00]
+# CHECK: encoding: [0x64,0x0b,0x00,0x00]
+# CHECK: encoding: [0x64,0x0c,0x00,0x00]
+# CHECK: encoding: [0x64,0x0d,0x00,0x00]
+# CHECK: encoding: [0x64,0x0e,0x00,0x00]
+# CHECK: encoding: [0x64,0x0f,0x00,0x00]
+# CHECK: encoding: [0x64,0x10,0x00,0x00]
+# CHECK: encoding: [0x64,0x11,0x00,0x00]
+# CHECK: encoding: [0x64,0x12,0x00,0x00]
+# CHECK: encoding: [0x64,0x13,0x00,0x00]
+# CHECK: encoding: [0x64,0x14,0x00,0x00]
+# CHECK: encoding: [0x64,0x15,0x00,0x00]
+# CHECK: encoding: [0x64,0x16,0x00,0x00]
+# CHECK: encoding: [0x64,0x17,0x00,0x00]
+# CHECK: encoding: [0x64,0x18,0x00,0x00]
+# CHECK: encoding: [0x64,0x19,0x00,0x00]
+# CHECK: encoding: [0x64,0x1a,0x00,0x00]
+# CHECK: encoding: [0x64,0x1b,0x00,0x00]
+# CHECK: encoding: [0x64,0x1c,0x00,0x00]
+# CHECK: encoding: [0x64,0x1d,0x00,0x00]
+# CHECK: encoding: [0x64,0x1e,0x00,0x00]
+# CHECK: encoding: [0x64,0x1f,0x00,0x00]
+daddiu $zero, $zero, 0
+daddiu $at, $zero, 0
+daddiu $v0, $zero, 0
+daddiu $v1, $zero, 0
+daddiu $a0, $zero, 0
+daddiu $a1, $zero, 0
+daddiu $a2, $zero, 0
+daddiu $a3, $zero, 0
+daddiu $a4, $zero, 0
+daddiu $a5, $zero, 0
+daddiu $a6, $zero, 0
+daddiu $a7, $zero, 0
+daddiu $t4, $zero, 0
+daddiu $t5, $zero, 0
+daddiu $t6, $zero, 0
+daddiu $t7, $zero, 0
+daddiu $s0, $zero, 0
+daddiu $s1, $zero, 0
+daddiu $s2, $zero, 0
+daddiu $s3, $zero, 0
+daddiu $s4, $zero, 0
+daddiu $s5, $zero, 0
+daddiu $s6, $zero, 0
+daddiu $s7, $zero, 0
+daddiu $t8, $zero, 0
+daddiu $t9, $zero, 0
+daddiu $kt0, $zero, 0
+daddiu $kt1, $zero, 0
+daddiu $gp, $zero, 0
+daddiu $sp, $zero, 0
+daddiu $s8, $zero, 0
+daddiu $ra, $zero, 0
diff --git a/test/MC/Mips/mips64extins.ll b/test/MC/Mips/mips64extins.ll
new file mode 100644
index 000000000000..ebe8f86513fd
--- /dev/null
+++ b/test/MC/Mips/mips64extins.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -mattr=n64 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 - \
+; RUN: | FileCheck %s
+
+define i64 @dext(i64 %i) nounwind readnone {
+entry:
+; CHECK: dext ${{[0-9]+}}, ${{[0-9]+}}, 5, 10
+ %shr = lshr i64 %i, 5
+ %and = and i64 %shr, 1023
+ ret i64 %and
+}
+
+define i64 @dextu(i64 %i) nounwind readnone {
+entry:
+; CHECK: dextu ${{[0-9]+}}, ${{[0-9]+}}, 2, 6
+ %shr = lshr i64 %i, 34
+ %and = and i64 %shr, 63
+ ret i64 %and
+}
+
+define i64 @dextm(i64 %i) nounwind readnone {
+entry:
+; CHECK: dextm ${{[0-9]+}}, ${{[0-9]+}}, 5, 2
+ %shr = lshr i64 %i, 5
+ %and = and i64 %shr, 17179869183
+ ret i64 %and
+}
+
+define i64 @dins(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 8, 10
+ %shl2 = shl i64 %j, 8
+ %and = and i64 %shl2, 261888
+ %and3 = and i64 %i, -261889
+ %or = or i64 %and3, %and
+ ret i64 %or
+}
+
+define i64 @dinsm(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dinsm ${{[0-9]+}}, ${{[0-9]+}}, 10, 1
+ %shl4 = shl i64 %j, 10
+ %and = and i64 %shl4, 8796093021184
+ %and5 = and i64 %i, -8796093021185
+ %or = or i64 %and5, %and
+ ret i64 %or
+}
+
+define i64 @dinsu(i64 %i, i64 %j) nounwind readnone {
+entry:
+; CHECK: dinsu ${{[0-9]+}}, ${{[0-9]+}}, 8, 13
+ %shl4 = shl i64 %j, 40
+ %and = and i64 %shl4, 9006099743113216
+ %and5 = and i64 %i, -9006099743113217
+ %or = or i64 %and5, %and
+ ret i64 %or
+}
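
The shift/mask pairs above determine the position and size operands the backend is expected to select; the small sketch below (hypothetical, not part of the patch) recovers them, including the pos-32 and size-32 adjustments used by dextu and dextm.

    # Sketch: recover the (pos, size) operands expected from each lshr/and
    # pair in the extract tests above (function name is illustrative only).
    def dext_fields(shift_amount, mask):
        size = mask.bit_length()
        assert mask == (1 << size) - 1       # mask must be a contiguous low run
        pos = shift_amount
        if pos >= 32:                        # dextu encodes pos - 32
            return "dextu", pos - 32, size
        if size > 32:                        # dextm encodes size - 32
            return "dextm", pos, size - 32
        return "dext", pos, size

    print(dext_fields(5, 1023))              # ('dext', 5, 10)
    print(dext_fields(34, 63))               # ('dextu', 2, 6)
    print(dext_fields(5, 17179869183))       # ('dextm', 5, 2)
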
diff --git a/test/MC/Mips/mips64shift.ll b/test/MC/Mips/mips64shift.ll
index 7817b96fa594..99cac7b591fa 100644
--- a/test/MC/Mips/mips64shift.ll
+++ b/test/MC/Mips/mips64shift.ll
@@ -1,5 +1,8 @@
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 %s -o - \
+; RUN: | llvm-objdump -disassemble -triple mips64el - | FileCheck %s
define i64 @f3(i64 %a0) nounwind readnone {
entry:
diff --git a/test/MC/Mips/mips_directives.s b/test/MC/Mips/mips_directives.s
new file mode 100644
index 000000000000..e2f75a827d0a
--- /dev/null
+++ b/test/MC/Mips/mips_directives.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -triple mips-unknown-unknown %s
+# This test produces no output, so there is no FileCheck call.
+$BB0_2:
+ .ent directives_test
+ .frame $sp,0,$ra
+ .mask 0x00000000,0
+ .fmask 0x00000000,0
+ .set noreorder
+ .set nomacro
+ .set noat
+$JTI0_0:
+ .gpword ($BB0_2)
+ .set at=$12
+ .set macro
+ .set reorder
+ .end directives_test
diff --git a/test/MC/Mips/multi-64bit-func.ll b/test/MC/Mips/multi-64bit-func.ll
index 6e0d784e07f6..83577aa1628b 100644
--- a/test/MC/Mips/multi-64bit-func.ll
+++ b/test/MC/Mips/multi-64bit-func.ll
@@ -1,8 +1,8 @@
; There is no real check here. If the test doesn't
; assert it passes.
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -disable-mips-delay-filler < %s
; Run it again without extra nop in delay slot
-; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 -enable-mips-delay-filler < %s
+; RUN: llc -march=mips64el -filetype=obj -mcpu=mips64r2 < %s
define i32 @bosco1(i32 %x) nounwind readnone {
entry:
diff --git a/test/MC/Mips/sext_64_32.ll b/test/MC/Mips/sext_64_32.ll
index e5c57b8c41d8..9e0cfa01fdfc 100644
--- a/test/MC/Mips/sext_64_32.ll
+++ b/test/MC/Mips/sext_64_32.ll
@@ -2,7 +2,7 @@
; Sign extend from 32 to 64 was creating nonsense opcodes
-; CHECK: sll ${{[0-9]+}}, ${{[0-9]+}}, 0
+; CHECK: sll ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0
define i64 @foo(i32 %ival) nounwind readnone {
entry:
@@ -10,7 +10,7 @@ entry:
ret i64 %conv
}
-; CHECK: dsll32 ${{[0-9]+}}, ${{[0-9]+}}, 0
+; CHECK: dsll32 ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0
define i64 @foo_2(i32 %ival_2) nounwind readnone {
entry:
diff --git a/test/MC/PowerPC/lit.local.cfg b/test/MC/PowerPC/lit.local.cfg
new file mode 100644
index 000000000000..88488cdd048e
--- /dev/null
+++ b/test/MC/PowerPC/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.ll', '.c', '.cpp', '.s']
+
+targets = set(config.root.targets_to_build.split())
+if 'PowerPC' not in targets:
+ config.unsupported = True
diff --git a/test/MC/PowerPC/ppc64-initial-cfa.ll b/test/MC/PowerPC/ppc64-initial-cfa.ll
new file mode 100644
index 000000000000..3936cf2e81e5
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-initial-cfa.ll
@@ -0,0 +1,41 @@
+;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \
+;; RUN: elf-dump --dump-section-data | FileCheck %s
+
+;; FIXME: this file should be in .s form, change when asm parser is available.
+
+define void @f() {
+entry:
+ ret void
+}
+
+;; CHECK: ('sh_name', 0x{{.*}}) # '.eh_frame'
+;; CHECK-NEXT: ('sh_type', 0x00000001)
+;; CHECK-NEXT: ('sh_flags', 0x0000000000000002)
+;; CHECK-NEXT: ('sh_addr', 0x{{.*}})
+;; CHECK-NEXT: ('sh_offset', 0x{{.*}})
+;; CHECK-NEXT: ('sh_size', 0x0000000000000030)
+;; CHECK-NEXT: ('sh_link', 0x00000000)
+;; CHECK-NEXT: ('sh_info', 0x00000000)
+;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008)
+;; CHECK-NEXT: ('sh_entsize', 0x0000000000000000)
+;; CHECK-NEXT: ('_section_data', '00000010 00000000 017a5200 01784101 000c0100 00000018 00000018 00000000 00000000 00000000 00000010 00000000')
+
+;; CHECK: ('sh_name', 0x{{.*}}) # '.rela.eh_frame'
+;; CHECK-NEXT: ('sh_type', 0x00000004)
+;; CHECK-NEXT: ('sh_flags', 0x0000000000000000)
+;; CHECK-NEXT: ('sh_addr', 0x{{.*}})
+;; CHECK-NEXT: ('sh_offset', 0x{{.*}})
+;; CHECK-NEXT: ('sh_size', 0x0000000000000018)
+;; CHECK-NEXT: ('sh_link', 0x{{.*}})
+;; CHECK-NEXT: ('sh_info', 0x{{.*}})
+;; CHECK-NEXT: ('sh_addralign', 0x0000000000000008)
+;; CHECK-NEXT: ('sh_entsize', 0x0000000000000018)
+;; CHECK-NEXT: ('_relocations', [
+;; CHECK-NEXT: # Relocation 0
+;; CHECK-NEXT: (('r_offset', 0x000000000000001c)
+;; CHECK-NEXT: ('r_sym', 0x{{.*}})
+;; CHECK-NEXT: ('r_type', 0x00000026)
+;; CHECK-NEXT: ('r_addend', 0x0000000000000000)
+;; CHECK-NEXT: ),
+;; CHECK-NEXT: ])
+
diff --git a/test/MC/PowerPC/ppc64-relocs-01.ll b/test/MC/PowerPC/ppc64-relocs-01.ll
new file mode 100644
index 000000000000..5996af84f448
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-relocs-01.ll
@@ -0,0 +1,66 @@
+;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -O3 \
+;; RUN: -filetype=obj %s -o - | \
+;; RUN: elf-dump --dump-section-data | FileCheck %s
+
+;; FIXME: this file should be in .s form; change when the asm parser is available.
+
+@number64 = global i64 10, align 8
+
+define i64 @access_int64(i64 %a) nounwind readonly {
+entry:
+ %0 = load i64* @number64, align 8
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+declare double @sin(double) nounwind
+
+define double @test_branch24 (double %x) nounwind readonly {
+entry:
+ %add = call double @sin(double %x) nounwind
+ ret double %add
+}
+
+;; The relocations in .rela.text are the 'number64' load, which uses a
+;; R_PPC64_TOC16_DS against the .toc, and the call to the external function
+;; 'sin', which uses a R_PPC64_REL24.
+;; CHECK: '.rela.text'
+;; CHECK: Relocation 0
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x00000006
+;; CHECK-NEXT: 'r_type', 0x0000003f
+;; CHECK: Relocation 1
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x0000000a
+;; CHECK-NEXT: 'r_type', 0x0000000a
+
+;; The .opd entry for the 'access_int64' function creates 2 relocations:
+;; 1. A R_PPC64_ADDR64 against the .text segment plus addend (the function
+;;    address itself);
+;; 2. A R_PPC64_TOC against no symbol (the linker will resolve it to the
+;;    module's TOC base).
+;; CHECK: '.rela.opd'
+;; CHECK: Relocation 0
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x00000002
+;; CHECK-NEXT: 'r_type', 0x00000026
+;; CHECK: Relocation 1
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x00000000
+;; CHECK-NEXT: 'r_type', 0x00000033
+
+;; Finally the TOC creates the relocation for the 'number64'.
+;; CHECK: '.rela.toc'
+;; CHECK: Relocation 0
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x00000008
+;; CHECK-NEXT: 'r_type', 0x00000026
+
+;; Check that the relocation references point at the correct symbols.
+;; CHECK: Symbol 7
+;; CHECK-NEXT: 'access_int64'
+;; CHECK: Symbol 8
+;; CHECK-NEXT: 'number64'
+;; CHECK: Symbol 10
+;; CHECK-NEXT: 'sin'
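
For readers cross-checking the r_type values used in the CHECK lines above, the following mapping (an illustrative aside, not part of the patch) spells out the ppc64 ELF relocation names they correspond to.

    # r_type values checked above and the ppc64 ELF relocation names they
    # correspond to (dictionary is illustrative, not part of the test).
    PPC64_RELOC_NAMES = {
        0x0a: "R_PPC64_REL24",      # call to the external 'sin'
        0x26: "R_PPC64_ADDR64",     # .opd entry: function address
        0x33: "R_PPC64_TOC",        # .opd entry: module TOC base
        0x3f: "R_PPC64_TOC16_DS",   # 'number64' load through the TOC
    }
    for value, name in sorted(PPC64_RELOC_NAMES.items()):
        print(hex(value), name)
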
diff --git a/test/MC/PowerPC/ppc64-tls-relocs-01.ll b/test/MC/PowerPC/ppc64-tls-relocs-01.ll
new file mode 100644
index 000000000000..5e3731107522
--- /dev/null
+++ b/test/MC/PowerPC/ppc64-tls-relocs-01.ll
@@ -0,0 +1,28 @@
+;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -filetype=obj %s -o - | \
+;; RUN: elf-dump --dump-section-data | FileCheck %s
+
+;; FIXME: this file should be in .s form, change when asm parser is available.
+
+@t = thread_local global i32 0, align 4
+
+define i32* @f() nounwind {
+entry:
+ ret i32* @t
+}
+
+;; Check for a pair of R_PPC64_TPREL16_HA / R_PPC64_TPREL16_LO relocs
+;; against the thread-local symbol 't'.
+;; CHECK: '.rela.text'
+;; CHECK: Relocation 0
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x00000008
+;; CHECK-NEXT: 'r_type', 0x00000048
+;; CHECK: Relocation 1
+;; CHECK-NEXT: 'r_offset',
+;; CHECK-NEXT: 'r_sym', 0x00000008
+;; CHECK-NEXT: 'r_type', 0x00000046
+
+;; Check that we got the correct symbol.
+;; CHECK: Symbol 8
+;; CHECK-NEXT: 't'
+
diff --git a/test/MC/X86/intel-syntax-2.s b/test/MC/X86/intel-syntax-2.s
index ca4afc317398..d6dbe152cd58 100644
--- a/test/MC/X86/intel-syntax-2.s
+++ b/test/MC/X86/intel-syntax-2.s
@@ -1,7 +1,9 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=att %s | FileCheck %s
.intel_syntax
_test:
// CHECK: movl $257, -4(%rsp)
mov DWORD PTR [RSP - 4], 257
-
+ .att_syntax
+// CHECK: movl $257, -4(%rsp)
+ movl $257, -4(%rsp)
diff --git a/test/MC/X86/x86-32-ms-inline-asm.s b/test/MC/X86/x86-32-ms-inline-asm.s
new file mode 100644
index 000000000000..73d5878b41bc
--- /dev/null
+++ b/test/MC/X86/x86-32-ms-inline-asm.s
@@ -0,0 +1,60 @@
+// RUN: llvm-mc -x86-asm-syntax=intel -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+
+mov eax, [ebx].0
+mov [ebx].4, ecx
+
+// CHECK: movl (%ebx), %eax
+// CHECK: encoding: [0x8b,0x03]
+// CHECK: movl %ecx, 4(%ebx)
+// CHECK: encoding: [0x89,0x4b,0x04]
+
+_t21: ## @t21
+// CHECK: t21
+ mov eax, [4*eax + 4]
+// CHECK: movl 4(,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x85,0x04,0x00,0x00,0x00]
+ mov eax, [4*eax][4]
+// CHECK: movl 4(,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x85,0x04,0x00,0x00,0x00]
+
+ mov eax, [esi + eax]
+// CHECK: movl (%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x04,0x06]
+ mov eax, [esi][eax]
+// CHECK: movl (%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x04,0x06]
+
+ mov eax, [esi + 4*eax]
+// CHECK: movl (%esi,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x86]
+ mov eax, [esi][4*eax]
+// CHECK: movl (%esi,%eax,4), %eax
+// CHECK: # encoding: [0x8b,0x04,0x86]
+
+ mov eax, [esi + eax + 4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+ mov eax, [esi][eax + 4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+ mov eax, [esi + eax][4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+ mov eax, [esi][eax][4]
+// CHECK: movl 4(%esi,%eax), %eax
+// CHECK: # encoding: [0x8b,0x44,0x06,0x04]
+
+ mov eax, [esi + 2*eax + 4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+ mov eax, [esi][2*eax + 4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+ mov eax, [esi + 2*eax][4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+ mov eax, [esi][2*eax][4]
+// CHECK: movl 4(%esi,%eax,2), %eax
+// CHECK: # encoding: [0x8b,0x44,0x46,0x04]
+
+ ret
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 6a2d5bba6b70..03cb62e7cba3 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -1164,6 +1164,10 @@ xsetbv // CHECK: xsetbv # encoding: [0x0f,0x01,0xd1]
// CHECK: encoding: [0x66,0x48,0x0f,0x6e,0xc7]
movd %rdi,%xmm0
+// CHECK: movd %xmm0, %rax
+// CHECK: encoding: [0x66,0x48,0x0f,0x7e,0xc0]
+ movd %xmm0, %rax
+
// CHECK: movntil %eax, (%rdi)
// CHECK: encoding: [0x0f,0xc3,0x07]
// CHECK: movntil
diff --git a/test/MC/X86/x86_64-rtm-encoding.s b/test/MC/X86/x86_64-rtm-encoding.s
new file mode 100644
index 000000000000..44d6bacb7f32
--- /dev/null
+++ b/test/MC/X86/x86_64-rtm-encoding.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: xbegin .L0
+// CHECK: encoding: [0xc7,0xf8,A,A,A,A]
+ xbegin .L0
+
+// CHECK: xend
+// CHECK: encoding: [0x0f,0x01,0xd5]
+ xend
+
+// CHECK: xabort
+// CHECK: encoding: [0xc6,0xf8,0x0d]
+ xabort $13
diff --git a/test/MC/X86/x86_nop.s b/test/MC/X86/x86_nop.s
new file mode 100644
index 000000000000..396e3022ebec
--- /dev/null
+++ b/test/MC/X86/x86_nop.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=generic %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i386 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i486 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i586 %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=pentium-mmx %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=geode %s | llvm-objdump -d - | FileCheck %s
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=i686 %s | llvm-objdump -d - | not FileCheck %s
+
+# CHECK-NOT: nop{{[lw]}}
+inc %eax
+.align 8
+inc %eax
diff --git a/test/Makefile b/test/Makefile
index 9ddfabfb9a8e..810fdded465a 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -29,11 +29,6 @@ else
LIT_ARGS := -s -v
endif
-# -jN causes crash on Cygwin's python.
-ifneq (,$(filter $(HOST_OS),Cygwin))
- LIT_ARGS += -j1
-endif
-
ifdef TESTSUITE
LIT_TESTSUITE := $(TESTSUITE)
CLEANED_TESTSUITE := $(patsubst %/,%,$(TESTSUITE))
@@ -122,6 +117,16 @@ else
ENABLE_ASSERTIONS=1
endif
+# Derive whether or not LTO is enabled by checking the extra options.
+LTO_IS_ENABLED := 0
+ifneq ($(findstring -flto,$(CompileCommonOpts)),)
+LTO_IS_ENABLED := 1
+else
+ifneq ($(findstring -O4,$(CompileCommonOpts)),)
+LTO_IS_ENABLED := 1
+endif
+endif
+
lit.site.cfg: FORCE
@echo "Making LLVM 'lit.site.cfg' file..."
@$(ECHOPATH) s=@TARGET_TRIPLE@=$(TARGET_TRIPLE)=g > lit.tmp
@@ -131,9 +136,10 @@ lit.site.cfg: FORCE
@$(ECHOPATH) s=@SHLIBDIR@=$(SharedLibDir)=g >> lit.tmp
@$(ECHOPATH) s=@SHLIBEXT@=$(SHLIBEXT)=g >> lit.tmp
@$(ECHOPATH) s=@PYTHON_EXECUTABLE@=python=g >> lit.tmp
- @$(ECHOPATH) s,@OCAMLOPT@,$(OCAMLOPT) -cc \\\\\"$(CXX_FOR_OCAMLOPT)\\\\\" -I $(LibDir)/ocaml,g >> lit.tmp
+ @$(ECHOPATH) s=@OCAMLOPT@=$(OCAMLOPT) -cc $(subst *,'\\\"',*$(subst =,"\\=",$(CXX_FOR_OCAMLOPT))*) -I $(LibDir)/ocaml=g >> lit.tmp
@$(ECHOPATH) s=@ENABLE_SHARED@=$(ENABLE_SHARED)=g >> lit.tmp
@$(ECHOPATH) s=@ENABLE_ASSERTIONS@=$(ENABLE_ASSERTIONS)=g >> lit.tmp
+ @$(ECHOPATH) s=@LTO_IS_ENABLED@=$(LTO_IS_ENABLED)=g >> lit.tmp
@$(ECHOPATH) s=@TARGETS_TO_BUILD@=$(TARGETS_TO_BUILD)=g >> lit.tmp
@$(ECHOPATH) s=@LLVM_BINDINGS@=$(BINDINGS_TO_BUILD)=g >> lit.tmp
@$(ECHOPATH) s=@HOST_OS@=$(HOST_OS)=g >> lit.tmp
diff --git a/test/Object/Inputs/dext-test.elf-mips64r2 b/test/Object/Inputs/dext-test.elf-mips64r2
new file mode 100644
index 000000000000..59dbaef69a2d
--- /dev/null
+++ b/test/Object/Inputs/dext-test.elf-mips64r2
Binary files differ
diff --git a/test/Object/Inputs/relocations.elf-x86-64 b/test/Object/Inputs/relocations.elf-x86-64
new file mode 100644
index 000000000000..6e340c752543
--- /dev/null
+++ b/test/Object/Inputs/relocations.elf-x86-64
Binary files differ
diff --git a/test/Object/Mips/feature.test b/test/Object/Mips/feature.test
new file mode 100644
index 000000000000..e8da60974603
--- /dev/null
+++ b/test/Object/Mips/feature.test
@@ -0,0 +1,11 @@
+RUN: llvm-objdump -disassemble -triple mips64el -mattr +mips64r2 %p/../Inputs/dext-test.elf-mips64r2 \
+RUN: | FileCheck %s
+
+CHECK: Disassembly of section .text:
+CHECK: .text:
+CHECK: 0: 08 00 e0 03 jr $ra
+CHECK: 4: 43 49 82 7c dext $2, $4, 5, 10
+CHECK: 8: 08 00 e0 03 jr $ra
+CHECK: c: 83 28 82 7c dext $2, $4, 2, 6
+CHECK: 10: 08 00 e0 03 jr $ra
+CHECK: 14: 43 09 82 7c dext $2, $4, 5, 2
diff --git a/test/Object/Mips/lit.local.cfg b/test/Object/Mips/lit.local.cfg
new file mode 100644
index 000000000000..149931749822
--- /dev/null
+++ b/test/Object/Mips/lit.local.cfg
@@ -0,0 +1,5 @@
+config.suffixes = ['.test']
+
+targets = set(config.root.targets_to_build.split())
+if 'Mips' not in targets:
+ config.unsupported = True
diff --git a/test/Object/nm-shared-object.test b/test/Object/nm-shared-object.test
index b361df535553..a57b9401ad42 100644
--- a/test/Object/nm-shared-object.test
+++ b/test/Object/nm-shared-object.test
@@ -1,15 +1,23 @@
RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-i386 \
-RUN: | FileCheck %s -check-prefix ELF
+RUN: | FileCheck %s -check-prefix ELF-32
RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-x86-64 \
-RUN: | FileCheck %s -check-prefix ELF
+RUN: | FileCheck %s -check-prefix ELF-64
; Note: tls_sym should be 'D' (not '?'), but TLS is not
; yet recognized by ObjectFile.
-ELF: {{[0-9a-f]+}} A __bss_start
-ELF: {{[0-9a-f]+}} A _edata
-ELF: {{[0-9a-f]+}} A _end
-ELF: {{[0-9a-f]+}} B common_sym
-ELF: {{[0-9a-f]+}} D defined_sym
-ELF: {{[0-9a-f]+}} T global_func
-ELF: ? tls_sym
+ELF-32: 0012c8 A __bss_start
+ELF-32: 0012c8 A _edata
+ELF-32: 0012cc A _end
+ELF-32: 0012c8 B common_sym
+ELF-32: 0012c4 D defined_sym
+ELF-32: 0001f0 T global_func
+ELF-32: ? tls_sym
+
+ELF-64: 200454 A __bss_start
+ELF-64: 200454 A _edata
+ELF-64: 200458 A _end
+ELF-64: 200454 B common_sym
+ELF-64: 200450 D defined_sym
+ELF-64: 0002f0 T global_func
+ELF-64: ? tls_sym
diff --git a/test/Object/objdump-relocations.test b/test/Object/objdump-relocations.test
index a394a23a7e05..6d35a2651d7a 100644
--- a/test/Object/objdump-relocations.test
+++ b/test/Object/objdump-relocations.test
@@ -9,6 +9,9 @@ RUN: | FileCheck %s -check-prefix ELF-x86-64
RUN: llvm-objdump -r %p/Inputs/trivial-object-test.elf-hexagon \
RUN: | FileCheck %s -check-prefix ELF-hexagon
+RUN: llvm-objdump -r %p/Inputs/relocations.elf-x86-64 \
+RUN: | FileCheck %s -check-prefix ELF-complex-x86-64
+
COFF-i386: .text
COFF-i386: IMAGE_REL_I386_DIR32 L_.str
COFF-i386: IMAGE_REL_I386_REL32 _puts
@@ -36,3 +39,13 @@ ELF-hexagon: R_HEX_HI16 puts
ELF-hexagon: R_HEX_LO16 puts
ELF-hexagon: R_HEX_B15_PCREL testf
ELF-hexagon: R_HEX_B22_PCREL puts
+
+ELF-complex-x86-64: .text
+ELF-complex-x86-64-NEXT: R_X86_64_8 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_16 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_32S .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_64 .data-4
+ELF-complex-x86-64-NEXT: R_X86_64_PC32 .data-4-P
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data+0
+ELF-complex-x86-64-NEXT: R_X86_64_32 .data+4
diff --git a/test/Object/objdump-symbol-table.test b/test/Object/objdump-symbol-table.test
index 989ec04a8ddc..c94b07773550 100644
--- a/test/Object/objdump-symbol-table.test
+++ b/test/Object/objdump-symbol-table.test
@@ -4,6 +4,8 @@ RUN: llvm-objdump -t %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF-i386
RUN: llvm-objdump -t %p/Inputs/trivial-object-test.macho-i386 \
RUN: | FileCheck %s -check-prefix macho-i386
+RUN: llvm-objdump -t %p/Inputs/shared-object-test.elf-i386 \
+RUN: | FileCheck %s -check-prefix ELF-shared
COFF-i386: file format
COFF-i386: SYMBOL TABLE:
@@ -31,3 +33,9 @@ macho-i386: SYMBOL TABLE:
macho-i386: 00000000 g F __TEXT,__text 00000024 _main
macho-i386: 00000000 *UND* 00000000 _SomeOtherFunction
macho-i386: 00000000 *UND* 00000000 _puts
+
+ELF-shared: shared-object-test.elf-i386: file format
+ELF-shared: SYMBOL TABLE:
+ELF-shared: 00000200 l F .text 00000003 local_func
+ELF-shared: 000012c4 g .data 00000004 defined_sym
+ELF-shared: 000001f0 g F .text 00000003 global_func
diff --git a/test/Other/FileCheck-space.txt b/test/Other/FileCheck-space.txt
new file mode 100644
index 000000000000..6bbe5bc05ba7
--- /dev/null
+++ b/test/Other/FileCheck-space.txt
@@ -0,0 +1,9 @@
+RUN: printf "a\nb" | FileCheck %s -check-prefix=TEST1
+RUN: echo oo | FileCheck %s -check-prefix=TEST2
+
+Check that CHECK-NEXT without a space after the colon works.
+TEST1:a
+TEST1-NEXT:b
+
+Check that CHECK-NOT without a space after the colon works.
+TEST2-NOT:foo
diff --git a/test/Other/Inputs/llvm-cov.gcda b/test/Other/Inputs/llvm-cov.gcda
new file mode 100644
index 000000000000..9ae2286ea2f4
--- /dev/null
+++ b/test/Other/Inputs/llvm-cov.gcda
Binary files differ
diff --git a/test/Other/Inputs/llvm-cov.gcno b/test/Other/Inputs/llvm-cov.gcno
new file mode 100644
index 000000000000..25e202386a89
--- /dev/null
+++ b/test/Other/Inputs/llvm-cov.gcno
Binary files differ
diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll
new file mode 100644
index 000000000000..b8b3d0a90233
--- /dev/null
+++ b/test/Other/ResponseFile.ll
@@ -0,0 +1,9 @@
+; RUN: echo %s > %t.list
+; RUN: llvm-as @%t.list -o %t.bc
+; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s
+
+; CHECK: T foobar
+
+define void @foobar() {
+ ret void
+}
diff --git a/test/Other/extract-alias.ll b/test/Other/extract-alias.ll
new file mode 100644
index 000000000000..d5bab4b3f36b
--- /dev/null
+++ b/test/Other/extract-alias.ll
@@ -0,0 +1,49 @@
+; RUN: llvm-extract -func foo -S < %s | FileCheck %s
+; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s
+; RUN: llvm-extract -alias zeda0 -S < %s | FileCheck --check-prefix=ALIAS %s
+; RUN: llvm-extract -ralias .*bar -S < %s | FileCheck --check-prefix=ALIASRE %s
+
+; Both aliases should be converted to declarations
+; CHECK: @zeda0 = external global i32
+; CHECK: define i32* @foo() {
+; CHECK-NEXT: call void @a0bar()
+; CHECK-NEXT: ret i32* @zeda0
+; CHECK-NEXT: }
+; CHECK: declare void @a0bar()
+
+; DELETE: @zed = global i32 0
+; DELETE: @zeda0 = alias i32* @zed
+; DELETE-NEXT: @a0foo = alias i32* ()* @foo
+; DELETE-NEXT: @a0a0bar = alias void ()* @a0bar
+; DELETE-NEXT: @a0bar = alias void ()* @bar
+; DELETE: declare i32* @foo()
+; DELETE: define void @bar() {
+; DELETE-NEXT: %c = call i32* @foo()
+; DELETE-NEXT: ret void
+; DELETE-NEXT: }
+
+; ALIAS: @zed = external global i32
+; ALIAS: @zeda0 = alias i32* @zed
+
+; ALIASRE: @a0a0bar = alias void ()* @a0bar
+; ALIASRE: @a0bar = alias void ()* @bar
+; ALIASRE: declare void @bar()
+
+@zed = global i32 0
+@zeda0 = alias i32* @zed
+
+@a0foo = alias i32* ()* @foo
+
+define i32* @foo() {
+ call void @a0bar()
+ ret i32* @zeda0
+}
+
+@a0a0bar = alias void ()* @a0bar
+
+@a0bar = alias void ()* @bar
+
+define void @bar() {
+ %c = call i32* @foo()
+ ret void
+}
diff --git a/test/Other/extract-weak-odr.ll b/test/Other/extract-weak-odr.ll
new file mode 100644
index 000000000000..6618f5843645
--- /dev/null
+++ b/test/Other/extract-weak-odr.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-extract -func foo -S < %s | FileCheck %s
+; RUN: llvm-extract -delete -func foo -S < %s | FileCheck --check-prefix=DELETE %s
+
+; Test that we don't convert weak_odr to external definitions.
+
+; CHECK: @bar = external global i32
+; CHECK: define weak_odr i32* @foo() {
+; CHECK-NEXT: ret i32* @bar
+; CHECK-NEXT: }
+
+; DELETE: @bar = weak_odr global i32 42
+; DELETE: declare i32* @foo()
+
+@bar = weak_odr global i32 42
+
+define weak_odr i32* @foo() {
+ ret i32* @bar
+}
+
+define void @g() {
+ %c = call i32* @foo()
+ ret void
+}
diff --git a/test/Other/extract.ll b/test/Other/extract.ll
index 57573ed76f9a..8b0c835d5746 100644
--- a/test/Other/extract.ll
+++ b/test/Other/extract.ll
@@ -7,18 +7,19 @@
; llvm-extract uses lazy bitcode loading, so make sure it correctly reads
; from bitcode files in addition to assembly files.
-; CHECK: define void @foo() {
+; CHECK: define hidden void @foo() {
; CHECK: ret void
; CHECK: }
-; The linkonce_odr linkage for foo() should be changed to external linkage.
-; DELETE: declare void @foo()
+; The private linkage for foo() should be changed to external linkage and
+; hidden visibility added.
+; DELETE: declare hidden void @foo()
; DELETE: define void @bar() {
; DELETE: call void @foo()
; DELETE: ret void
; DELETE: }
-define linkonce_odr void @foo() {
+define private void @foo() {
ret void
}
define void @bar() {
diff --git a/test/Other/link-opts.ll b/test/Other/link-opts.ll
new file mode 100644
index 000000000000..8e58ac8a5683
--- /dev/null
+++ b/test/Other/link-opts.ll
@@ -0,0 +1,13 @@
+;RUN: opt -S -std-link-opts < %s | FileCheck %s
+; Simple test to check that -std-link-opts keeps only the main function.
+
+; CHECK-NOT: define
+; CHECK: define void @main
+; CHECK-NOT: define
+define void @main() {
+ ret void
+}
+
+define void @foo() {
+ ret void
+}
diff --git a/test/Other/lint.ll b/test/Other/lint.ll
index c84f56f8f694..78bbbe9e6fa6 100644
--- a/test/Other/lint.ll
+++ b/test/Other/lint.ll
@@ -9,8 +9,11 @@ declare void @has_noaliases(i32* noalias %p, i32* %q)
declare void @one_arg(i32)
@CG = constant i32 7
+@E = external global i8
define i32 @foo() noreturn {
+ %buf = alloca i8
+ %buf2 = alloca {i8, i8}, align 2
; CHECK: Caller and callee calling convention differ
call void @bar()
; CHECK: Null pointer dereference
@@ -26,8 +29,10 @@ define i32 @foo() noreturn {
; CHECK: Address one pointer dereference
store i32 0, i32* inttoptr (i64 1 to i32*)
; CHECK: Memory reference address is misaligned
- %x = inttoptr i32 1 to i32*
- load i32* %x, align 4
+ store i8 0, i8* %buf, align 2
+; CHECK: Memory reference address is misaligned
+ %gep = getelementptr {i8, i8}* %buf2, i32 0, i32 1
+ store i8 0, i8* %gep, align 2
; CHECK: Division by zero
%sd = sdiv i32 2, 0
; CHECK: Division by zero
@@ -75,6 +80,18 @@ define i32 @foo() noreturn {
; CHECK: Write to read-only memory
call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i32 1, i1 0)
+; CHECK: Undefined behavior: Buffer overflow
+ %wider = bitcast i8* %buf to i16*
+ store i16 0, i16* %wider
+; CHECK: Undefined behavior: Buffer overflow
+ %inner = getelementptr {i8, i8}* %buf2, i32 0, i32 1
+ %wider2 = bitcast i8* %inner to i16*
+ store i16 0, i16* %wider2
+; CHECK: Undefined behavior: Buffer overflow
+ %before = getelementptr i8* %buf, i32 -1
+ %wider3 = bitcast i8* %before to i16*
+ store i16 0, i16* %wider3
+
br label %next
next:
@@ -84,6 +101,10 @@ next:
ret i32 0
foo:
+; CHECK-NOT: Undefined behavior: Buffer overflow
+; CHECK-NOT: Memory reference address is misaligned
+ %e = bitcast i8* @E to i64*
+ store i64 0, i64* %e
%z = add i32 0, 0
; CHECK: unreachable immediately preceded by instruction without side effects
unreachable
diff --git a/test/Other/lit.local.cfg b/test/Other/lit.local.cfg
index 19eebc0ac7ac..269307724232 100644
--- a/test/Other/lit.local.cfg
+++ b/test/Other/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.ll', '.c', '.cpp']
+config.suffixes = ['.ll', '.c', '.cpp', '.txt']
diff --git a/test/Other/llvm-cov.test b/test/Other/llvm-cov.test
new file mode 100644
index 000000000000..c0aa203e2c17
--- /dev/null
+++ b/test/Other/llvm-cov.test
@@ -0,0 +1,3 @@
+PR11760
+RUN: llvm-cov -gcda=%S/Inputs/llvm-cov.gcda -gcno=%S/Inputs/llvm-cov.gcno
+
diff --git a/test/Other/llvm-nm-without-aliases.ll b/test/Other/llvm-nm-without-aliases.ll
new file mode 100644
index 000000000000..9d9408c13b6d
--- /dev/null
+++ b/test/Other/llvm-nm-without-aliases.ll
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s > %t
+; RUN: llvm-nm -without-aliases < %t | FileCheck %s
+; RUN: llvm-nm < %t | FileCheck --check-prefix=WITH %s
+
+; CHECK-NOT: T a0bar
+; CHECK-NOT: T a0foo
+; CHECK: T bar
+; CHECK: T foo
+
+; WITH: T a0bar
+; WITH: T a0foo
+; WITH: T bar
+; WITH: T foo
+
+@a0foo = alias void ()* @foo
+
+define void @foo() {
+ ret void
+}
+
+@a0bar = alias void ()* @bar
+
+define void @bar() {
+ ret void
+}
diff --git a/test/Other/spir_cc.ll b/test/Other/spir_cc.ll
new file mode 100644
index 000000000000..ffc02945de4d
--- /dev/null
+++ b/test/Other/spir_cc.ll
@@ -0,0 +1,13 @@
+; RUN: llvm-as < %s | llvm-dis > %t1.ll
+; RUN: llvm-as %t1.ll -o - | llvm-dis > %t2.ll
+; RUN: diff %t1.ll %t2.ll
+
+define spir_func void @foo() {
+ ret void
+}
+
+define spir_kernel void @bar() {
+ call spir_func void @foo( )
+ call spir_kernel void @bar( )
+ ret void
+}
diff --git a/test/TableGen/if.td b/test/TableGen/if.td
index 18de368af9f1..1d8d62329ae3 100644
--- a/test/TableGen/if.td
+++ b/test/TableGen/if.td
@@ -3,15 +3,59 @@
// Support for an `!if' operator as part of a `let' statement.
// CHECK: class C
-// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, !if({ C:x{2} }, 0, 1), !if({ C:x{2} }, 1, 1), !if({ C:x{2} }, 0, 0), !if({ C:x{1} }, C:y{3}, 0), !if({ C:x{1} }, C:y{2}, 1), !if({ C:x{0} }, C:y{3}, C:z), !if({ C:x{0} }, C:y{2}, C:y{2}), !if({ C:x{0} }, C:y{1}, C:y{1}), !if({ C:x{0} }, C:y{0}, C:y{0}) };
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, !if({ C:y{3} }, 1, !if({ C:y{2} }, { C:x{0} }, !if({ C:y{1} }, { C:x{1} }, !if({ C:y{0} }, { C:x{2} }, ?)))){0}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){1}, !if({ C:x{2} }, { C:y{3}, C:y{2} }, !if({ C:x{1} }, { C:y{2}, C:y{1} }, !if({ C:x{0} }, { C:y{1}, C:y{0} }, ?))){0}, !if({ C:x{2} }, 2, 6){2}, !if({ C:x{2} }, 2, 6){1}, !if({ C:x{2} }, 2, 6){0}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){1}, !if({ C:x{1} }, { C:y{3}, C:y{2} }, { 0, 1 }){0}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){3}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){2}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){1}, !if({ C:x{0} }, { C:y{3}, C:y{2}, C:y{1}, C:y{0} }, { C:z, C:y{2}, C:y{1}, C:y{0} }){0} };
class C<bits<3> x, bits<4> y, bit z> {
bits<16> n;
+ let n{11} = !if(y{3}, 1,
+ !if(y{2}, x{0},
+ !if(y{1}, x{1},
+ !if(y{0}, x{2}, ?))));
+ let n{10-9}= !if(x{2}, y{3-2},
+ !if(x{1}, y{2-1},
+ !if(x{0}, y{1-0}, ?)));
let n{8-6} = !if(x{2}, 0b010, 0b110);
let n{5-4} = !if(x{1}, y{3-2}, {0, 1});
let n{3-0} = !if(x{0}, y{3-0}, {z, y{2}, y{1}, y{0}});
}
+def C1 : C<{1, 0, 1}, {0, 1, 0, 1}, 0>;
+def C2 : C<{0, 1, 0}, {1, 0, 1, 0}, 1>;
+def C3 : C<{0, 0, 0}, {1, 0, 1, 0}, 0>;
+def C4 : C<{0, 0, 0}, {0, 0, 0, 0}, 0>;
+
+// CHECK: def C1
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1 };
+// CHECK: def C2
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0 };
+// CHECK: def C3
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, 1, ?, ?, 1, 1, 0, 0, 1, 0, 0, 1, 0 };
+// CHECK: def C4
+// CHECK-NEXT: bits<16> n = { ?, ?, ?, ?, ?, ?, ?, 1, 1, 0, 0, 1, 0, 0, 0, 0 };
+
+class S<int s> {
+ bits<2> val = !if(!eq(s, 8), {0, 0},
+ !if(!eq(s, 16), 0b01,
+ !if(!eq(s, 32), 2,
+ !if(!eq(s, 64), {1, 1}, ?))));
+}
+
+def D8 : S<8>;
+def D16 : S<16>;
+def D32 : S<32>;
+def D64 : S<64>;
+def D128: S<128>;
+// CHECK: def D128
+// CHECK-NEXT: bits<2> val = { ?, ? };
+// CHECK: def D16
+// CHECK-NEXT: bits<2> val = { 0, 1 };
+// CHECK: def D32
+// CHECK-NEXT: bits<2> val = { 1, 0 };
+// CHECK: def D64
+// CHECK-NEXT: bits<2> val = { 1, 1 };
+// CHECK: def D8
+// CHECK-NEXT: bits<2> val = { 0, 0 };
+
// CHECK: def One
// CHECK-NEXT: list<int> first = [1, 2, 3];
// CHECK-NEXT: list<int> rest = [1, 2, 3];
diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td
new file mode 100644
index 000000000000..5f3e3dabf4d4
--- /dev/null
+++ b/test/TableGen/list-element-bitref.td
@@ -0,0 +1,15 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class C<list<bits<8>> L> {
+ bits<2> V0 = L[0]{1-0};
+ bits<2> V1 = L[1]{3-2};
+ string V2 = !if(L[0]{0}, "Odd", "Even");
+}
+
+def c0 : C<[0b0101, 0b1010]>;
+
+// CHECK: def c0
+// CHECK-NEXT: bits<2> V0 = { 0, 1 };
+// CHECK-NEXT: bits<2> V1 = { 1, 0 };
+// CHECK-NEXT: string V2 = "Odd";
diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td
new file mode 100644
index 000000000000..7779b635e33c
--- /dev/null
+++ b/test/TableGen/pr8330.td
@@ -0,0 +1,29 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// XFAIL: vg_leak
+
+class Or4<bits<8> Val> {
+ bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} };
+}
+
+class Whatev<bits<8> x>;
+
+class Whatever<bits<8> x> {
+ bits<8> W = {x{0}, x{1}, x{2}, x{3}, x{4}, x{5}, x{6}, x{7} };
+}
+
+multiclass X<bits<8> BaseOpc> {
+ def bar : Whatev<Or4<BaseOpc>.V >;
+}
+
+multiclass Y<bits<8> BaseOpc> {
+ def foo : Whatever<Or4<BaseOpc>.V >;
+}
+
+defm a : X<4>;
+
+// CHECK: def abar
+
+defm b : Y<8>;
+
+// CHECK: def bfoo
+// CHECK-NEXT: bits<8> W = { 0, 0, 1, 1, 0, 0, 0, 0 };
diff --git a/test/Transforms/BBVectorize/X86/cmp-types.ll b/test/Transforms/BBVectorize/X86/cmp-types.ll
new file mode 100644
index 000000000000..a4fcbb6048f5
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/cmp-types.ll
@@ -0,0 +1,16 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%"struct.btSoftBody" = type { float, float, float*, i8 }
+
+define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 {
+entry:
+ %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null
+ %cond16 = zext i1 %tobool15 to i32
+ %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null
+ %cond22 = zext i1 %tobool21 to i32
+ ret void
+; CHECK: @test1
+}
+
diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll
new file mode 100644
index 000000000000..493f23b09853
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/loop1.ll
@@ -0,0 +1,53 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; The second check covers the use of alias analysis (with loop unrolling).
+
+define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
+entry:
+ br label %for.body
+; CHECK: @test1
+; CHECK-UNRL: @test1
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+ %0 = load double* %arrayidx, align 8
+ %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+ %1 = load double* %arrayidx2, align 8
+ %mul = fmul double %0, %0
+ %mul3 = fmul double %0, %1
+ %add = fadd double %mul, %mul3
+ %add4 = fadd double %1, %1
+ %add5 = fadd double %add4, %0
+ %mul6 = fmul double %0, %add5
+ %add7 = fadd double %add, %mul6
+ %mul8 = fmul double %1, %1
+ %add9 = fadd double %0, %0
+ %add10 = fadd double %add9, %0
+ %mul11 = fmul double %mul8, %add10
+ %add12 = fadd double %add7, %mul11
+ %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+ store double %add12, double* %arrayidx14, align 8
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %for.end, label %for.body
+; CHECK-NOT: <2 x double>
+; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
+; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
+; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
+; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
+; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
+; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
+; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
+; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
+; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
+; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
+; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
+; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Transforms/BBVectorize/X86/sh-rec.ll b/test/Transforms/BBVectorize/X86/sh-rec.ll
new file mode 100644
index 000000000000..1e0492c2a8c2
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec.ll
@@ -0,0 +1,54 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define void @ptoa() nounwind uwtable {
+entry:
+ %call = call i8* @malloc() nounwind
+ br i1 undef, label %return, label %if.end10
+
+if.end10: ; preds = %entry
+ %incdec.ptr = getelementptr inbounds i8* %call, i64 undef
+ %call17 = call i32 @ptou() nounwind
+ %incdec.ptr26.1 = getelementptr inbounds i8* %incdec.ptr, i64 -2
+ store i8 undef, i8* %incdec.ptr26.1, align 1
+ %div27.1 = udiv i32 %call17, 100
+ %rem.2 = urem i32 %div27.1, 10
+ %add2230.2 = or i32 %rem.2, 48
+ %conv25.2 = trunc i32 %add2230.2 to i8
+ %incdec.ptr26.2 = getelementptr inbounds i8* %incdec.ptr, i64 -3
+ store i8 %conv25.2, i8* %incdec.ptr26.2, align 1
+ %incdec.ptr26.3 = getelementptr inbounds i8* %incdec.ptr, i64 -4
+ store i8 undef, i8* %incdec.ptr26.3, align 1
+ %div27.3 = udiv i32 %call17, 10000
+ %rem.4 = urem i32 %div27.3, 10
+ %add2230.4 = or i32 %rem.4, 48
+ %conv25.4 = trunc i32 %add2230.4 to i8
+ %incdec.ptr26.4 = getelementptr inbounds i8* %incdec.ptr, i64 -5
+ store i8 %conv25.4, i8* %incdec.ptr26.4, align 1
+ %div27.4 = udiv i32 %call17, 100000
+ %rem.5 = urem i32 %div27.4, 10
+ %add2230.5 = or i32 %rem.5, 48
+ %conv25.5 = trunc i32 %add2230.5 to i8
+ %incdec.ptr26.5 = getelementptr inbounds i8* %incdec.ptr, i64 -6
+ store i8 %conv25.5, i8* %incdec.ptr26.5, align 1
+ %incdec.ptr26.6 = getelementptr inbounds i8* %incdec.ptr, i64 -7
+ store i8 0, i8* %incdec.ptr26.6, align 1
+ %incdec.ptr26.7 = getelementptr inbounds i8* %incdec.ptr, i64 -8
+ store i8 undef, i8* %incdec.ptr26.7, align 1
+ %div27.7 = udiv i32 %call17, 100000000
+ %rem.8 = urem i32 %div27.7, 10
+ %add2230.8 = or i32 %rem.8, 48
+ %conv25.8 = trunc i32 %add2230.8 to i8
+ %incdec.ptr26.8 = getelementptr inbounds i8* %incdec.ptr, i64 -9
+ store i8 %conv25.8, i8* %incdec.ptr26.8, align 1
+ unreachable
+
+return: ; preds = %entry
+ ret void
+; CHECK: @ptoa
+}
+
+declare noalias i8* @malloc() nounwind
+
+declare i32 @ptou()
diff --git a/test/Transforms/BBVectorize/X86/sh-rec2.ll b/test/Transforms/BBVectorize/X86/sh-rec2.ll
new file mode 100644
index 000000000000..ef2239932fa1
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec2.ll
@@ -0,0 +1,85 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
+
+define void @gsm_encode(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i8* %c) nounwind uwtable {
+entry:
+ %xmc = alloca [52 x i16], align 16
+ %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0
+ call void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i16* undef, i16* null, i16* undef, i16* undef, i16* undef, i16* %arraydecay5) nounwind
+ %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10
+ %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11
+ store i8 0, i8* %incdec.ptr136, align 1
+ %arrayidx162 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 11
+ %0 = load i16* %arrayidx162, align 2
+ %conv1631 = trunc i16 %0 to i8
+ %and164 = shl i8 %conv1631, 3
+ %shl165 = and i8 %and164, 56
+ %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12
+ store i8 %shl165, i8* %incdec.ptr157, align 1
+ %1 = load i16* inttoptr (i64 2 to i16*), align 2
+ %conv1742 = trunc i16 %1 to i8
+ %and175 = shl i8 %conv1742, 1
+ %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13
+ store i8 %and175, i8* %incdec.ptr172, align 1
+ %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14
+ store i8 0, i8* %incdec.ptr183, align 1
+ %arrayidx214 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 15
+ %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15
+ store i8 0, i8* %incdec.ptr199, align 1
+ %2 = load i16* %arrayidx214, align 2
+ %conv2223 = trunc i16 %2 to i8
+ %and223 = shl i8 %conv2223, 6
+ %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16
+ store i8 %and223, i8* %incdec.ptr220, align 1
+ %arrayidx240 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 19
+ %3 = load i16* %arrayidx240, align 2
+ %conv2414 = trunc i16 %3 to i8
+ %and242 = shl i8 %conv2414, 2
+ %shl243 = and i8 %and242, 28
+ %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17
+ store i8 %shl243, i8* %incdec.ptr235, align 1
+ %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18
+ store i8 0, i8* %incdec.ptr251, align 1
+ %arrayidx282 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 25
+ %4 = load i16* %arrayidx282, align 2
+ %conv2835 = trunc i16 %4 to i8
+ %and284 = and i8 %conv2835, 7
+ %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19
+ store i8 %and284, i8* %incdec.ptr272, align 1
+ %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20
+ store i8 0, i8* %incdec.ptr287, align 1
+ %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21
+ store i8 0, i8* %incdec.ptr298, align 1
+ %arrayidx319 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 26
+ %5 = load i16* %arrayidx319, align 4
+ %conv3206 = trunc i16 %5 to i8
+ %and321 = shl i8 %conv3206, 4
+ %shl322 = and i8 %and321, 112
+ %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22
+ store i8 %shl322, i8* %incdec.ptr314, align 1
+ %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29
+ %6 = load i16* %arrayidx340, align 2
+ %conv3417 = trunc i16 %6 to i8
+ %and342 = shl i8 %conv3417, 3
+ %shl343 = and i8 %and342, 56
+ %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23
+ store i8 %shl343, i8* %incdec.ptr335, align 1
+ %incdec.ptr366 = getelementptr inbounds i8* %c, i64 24
+ store i8 0, i8* %incdec.ptr350, align 1
+ %arrayidx381 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 36
+ %incdec.ptr387 = getelementptr inbounds i8* %c, i64 25
+ store i8 0, i8* %incdec.ptr366, align 1
+ %7 = load i16* %arrayidx381, align 8
+ %conv3898 = trunc i16 %7 to i8
+ %and390 = shl i8 %conv3898, 6
+ store i8 %and390, i8* %incdec.ptr387, align 1
+ unreachable
+; CHECK: @gsm_encode
+}
+
+declare void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
+
+declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-rec3.ll b/test/Transforms/BBVectorize/X86/sh-rec3.ll
new file mode 100644
index 000000000000..fd2cc8bdd91c
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec3.ll
@@ -0,0 +1,170 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
+
+define void @gsm_encode(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i8* %c) nounwind uwtable {
+entry:
+ %LARc28 = alloca [2 x i64], align 16
+ %LARc28.sub = getelementptr inbounds [2 x i64]* %LARc28, i64 0, i64 0
+ %tmpcast = bitcast [2 x i64]* %LARc28 to [8 x i16]*
+ %Nc = alloca [4 x i16], align 2
+ %Mc = alloca [4 x i16], align 2
+ %bc = alloca [4 x i16], align 2
+ %xmc = alloca [52 x i16], align 16
+ %arraydecay = bitcast [2 x i64]* %LARc28 to i16*
+ %arraydecay1 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 0
+ %arraydecay2 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 0
+ %arraydecay3 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 0
+ %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0
+ call void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i16* %arraydecay, i16* %arraydecay1, i16* %arraydecay2, i16* %arraydecay3, i16* undef, i16* %arraydecay5) nounwind
+ %0 = load i64* %LARc28.sub, align 16
+ %1 = trunc i64 %0 to i32
+ %conv1 = lshr i32 %1, 2
+ %and = and i32 %conv1, 15
+ %or = or i32 %and, 208
+ %conv6 = trunc i32 %or to i8
+ %incdec.ptr = getelementptr inbounds i8* %c, i64 1
+ store i8 %conv6, i8* %c, align 1
+ %conv84 = trunc i64 %0 to i8
+ %and9 = shl i8 %conv84, 6
+ %incdec.ptr15 = getelementptr inbounds i8* %c, i64 2
+ store i8 %and9, i8* %incdec.ptr, align 1
+ %2 = lshr i64 %0, 50
+ %shr226.tr = trunc i64 %2 to i8
+ %conv25 = and i8 %shr226.tr, 7
+ %incdec.ptr26 = getelementptr inbounds i8* %c, i64 3
+ store i8 %conv25, i8* %incdec.ptr15, align 1
+ %incdec.ptr42 = getelementptr inbounds i8* %c, i64 4
+ store i8 0, i8* %incdec.ptr26, align 1
+ %arrayidx52 = getelementptr inbounds [8 x i16]* %tmpcast, i64 0, i64 7
+ %3 = load i16* %arrayidx52, align 2
+ %conv537 = trunc i16 %3 to i8
+ %and54 = and i8 %conv537, 7
+ %incdec.ptr57 = getelementptr inbounds i8* %c, i64 5
+ store i8 %and54, i8* %incdec.ptr42, align 1
+ %incdec.ptr68 = getelementptr inbounds i8* %c, i64 6
+ store i8 0, i8* %incdec.ptr57, align 1
+ %4 = load i16* %arraydecay3, align 2
+ %conv748 = trunc i16 %4 to i8
+ %and75 = shl i8 %conv748, 5
+ %shl76 = and i8 %and75, 96
+ %incdec.ptr84 = getelementptr inbounds i8* %c, i64 7
+ store i8 %shl76, i8* %incdec.ptr68, align 1
+ %arrayidx94 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 1
+ %5 = load i16* %arrayidx94, align 2
+ %conv959 = trunc i16 %5 to i8
+ %and96 = shl i8 %conv959, 1
+ %shl97 = and i8 %and96, 14
+ %or103 = or i8 %shl97, 1
+ %incdec.ptr105 = getelementptr inbounds i8* %c, i64 8
+ store i8 %or103, i8* %incdec.ptr84, align 1
+ %arrayidx115 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 4
+ %6 = bitcast i16* %arrayidx115 to i32*
+ %7 = load i32* %6, align 8
+ %conv11610 = trunc i32 %7 to i8
+ %and117 = and i8 %conv11610, 7
+ %incdec.ptr120 = getelementptr inbounds i8* %c, i64 9
+ store i8 %and117, i8* %incdec.ptr105, align 1
+ %8 = lshr i32 %7, 16
+ %and12330 = shl nuw nsw i32 %8, 5
+ %and123 = trunc i32 %and12330 to i8
+ %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10
+ store i8 %and123, i8* %incdec.ptr120, align 1
+ %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11
+ store i8 0, i8* %incdec.ptr136, align 1
+ %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12
+ store i8 0, i8* %incdec.ptr157, align 1
+ %arrayidx173 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 1
+ %9 = load i16* %arrayidx173, align 2
+ %conv17412 = zext i16 %9 to i32
+ %and175 = shl nuw nsw i32 %conv17412, 1
+ %arrayidx177 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 1
+ %10 = load i16* %arrayidx177, align 2
+ %conv17826 = zext i16 %10 to i32
+ %shr17913 = lshr i32 %conv17826, 1
+ %and180 = and i32 %shr17913, 1
+ %or181 = or i32 %and175, %and180
+ %conv182 = trunc i32 %or181 to i8
+ %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13
+ store i8 %conv182, i8* %incdec.ptr172, align 1
+ %arrayidx188 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 1
+ %11 = load i16* %arrayidx188, align 2
+ %conv18914 = trunc i16 %11 to i8
+ %and190 = shl i8 %conv18914, 5
+ %shl191 = and i8 %and190, 96
+ %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14
+ store i8 %shl191, i8* %incdec.ptr183, align 1
+ %arrayidx209 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 14
+ %12 = load i16* %arrayidx209, align 4
+ %conv21015 = trunc i16 %12 to i8
+ %and211 = shl i8 %conv21015, 1
+ %shl212 = and i8 %and211, 14
+ %or218 = or i8 %shl212, 1
+ %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15
+ store i8 %or218, i8* %incdec.ptr199, align 1
+ %arrayidx225 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 16
+ %13 = bitcast i16* %arrayidx225 to i64*
+ %14 = load i64* %13, align 16
+ %conv22616 = trunc i64 %14 to i8
+ %and227 = shl i8 %conv22616, 3
+ %shl228 = and i8 %and227, 56
+ %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16
+ store i8 %shl228, i8* %incdec.ptr220, align 1
+ %15 = lshr i64 %14, 32
+ %and23832 = shl nuw nsw i64 %15, 5
+ %and238 = trunc i64 %and23832 to i8
+ %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17
+ store i8 %and238, i8* %incdec.ptr235, align 1
+ %arrayidx266 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 23
+ %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18
+ store i8 0, i8* %incdec.ptr251, align 1
+ %16 = load i16* %arrayidx266, align 2
+ %conv27418 = trunc i16 %16 to i8
+ %and275 = shl i8 %conv27418, 6
+ %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19
+ store i8 %and275, i8* %incdec.ptr272, align 1
+ %arrayidx288 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 2
+ %17 = load i16* %arrayidx288, align 2
+ %conv28919 = zext i16 %17 to i32
+ %and290 = shl nuw nsw i32 %conv28919, 1
+ %arrayidx292 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 2
+ %18 = load i16* %arrayidx292, align 2
+ %conv29327 = zext i16 %18 to i32
+ %shr29420 = lshr i32 %conv29327, 1
+ %and295 = and i32 %shr29420, 1
+ %or296 = or i32 %and290, %and295
+ %conv297 = trunc i32 %or296 to i8
+ %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20
+ store i8 %conv297, i8* %incdec.ptr287, align 1
+ %conv30021 = trunc i16 %18 to i8
+ %and301 = shl i8 %conv30021, 7
+ %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21
+ store i8 %and301, i8* %incdec.ptr298, align 1
+ %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22
+ store i8 0, i8* %incdec.ptr314, align 1
+ %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29
+ %19 = load i16* %arrayidx340, align 2
+ %conv34122 = trunc i16 %19 to i8
+ %and342 = shl i8 %conv34122, 3
+ %shl343 = and i8 %and342, 56
+ %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23
+ store i8 %shl343, i8* %incdec.ptr335, align 1
+ %arrayidx355 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 32
+ %20 = bitcast i16* %arrayidx355 to i32*
+ %21 = load i32* %20, align 16
+ %conv35623 = shl i32 %21, 2
+ %shl358 = and i32 %conv35623, 28
+ %22 = lshr i32 %21, 17
+ %and363 = and i32 %22, 3
+ %or364 = or i32 %shl358, %and363
+ %conv365 = trunc i32 %or364 to i8
+ store i8 %conv365, i8* %incdec.ptr350, align 1
+ unreachable
+; CHECK: @gsm_encode
+}
+
+declare void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
+
+declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-types.ll b/test/Transforms/BBVectorize/X86/sh-types.ll
new file mode 100644
index 000000000000..0bcb714d5e65
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-types.ll
@@ -0,0 +1,25 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define <4 x float> @test7(<4 x float> %A1, <4 x float> %B1, double %C1, double %C2, double %D1, double %D2) {
+ %A2 = shufflevector <4 x float> %A1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ %B2 = shufflevector <4 x float> %B1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ %X1 = shufflevector <4 x float> %A2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %X2 = shufflevector <4 x float> %B2, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ %Y1 = shufflevector <2 x float> %X1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %Y2 = shufflevector <2 x float> %X2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+
+ %M1 = fsub double %C1, %D1
+ %M2 = fsub double %C2, %D2
+ %N1 = fmul double %M1, %C1
+ %N2 = fmul double %M2, %C2
+ %Z1 = fadd double %N1, %D1
+ %Z2 = fadd double %N2, %D2
+
+ %R = fmul <4 x float> %Y1, %Y2
+ ret <4 x float> %R
+; CHECK: @test7
+; CHECK-NOT: <8 x float>
+; CHECK: ret <4 x float>
+}
+
diff --git a/test/Transforms/BBVectorize/X86/simple-ldstr.ll b/test/Transforms/BBVectorize/X86/simple-ldstr.ll
new file mode 100644
index 000000000000..0124399bad9d
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/simple-ldstr.ll
@@ -0,0 +1,29 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ ret void
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+}
+
diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll
new file mode 100644
index 000000000000..0113e38bb1c9
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/simple.ll
@@ -0,0 +1,103 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y1, %B1
+ %Z2 = fadd double %Y2, %B2
+ %R = fmul double %Z1, %Z2
+ ret double %R
+; CHECK: @test1
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic chain
+define double @test1a(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y1, %B1
+ %Z2 = fadd double %Y2, %B2
+ %W1 = fadd double %Y1, %Z1
+ %W2 = fadd double %Y2, %Z2
+ %V1 = fadd double %W1, %Z1
+ %V2 = fadd double %W2, %Z2
+ %Q1 = fadd double %W1, %V1
+ %Q2 = fadd double %W2, %V2
+ %S1 = fadd double %W1, %Q1
+ %S2 = fadd double %W2, %Q2
+ %R = fmul double %S1, %S2
+ ret double %R
+; CHECK: @test1a
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %W1 = fadd <2 x double> %Y1, %Z1
+; CHECK: %V1 = fadd <2 x double> %W1, %Z1
+; CHECK: %Q1 = fadd <2 x double> %W1, %V1
+; CHECK: %S1 = fadd <2 x double> %W1, %Q1
+; CHECK: %S1.v.r1 = extractelement <2 x double> %S1, i32 0
+; CHECK: %S1.v.r2 = extractelement <2 x double> %S1, i32 1
+; CHECK: %R = fmul double %S1.v.r1, %S1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y2, %B1
+ %Z2 = fadd double %Y1, %B2
+ %R = fmul double %Z1, %Z2
+ ret double %R
+; CHECK: @test2
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic depth-4 chain (internal permutation)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+ %Z1 = fadd double %Y2, %B1
+ %Z2 = fadd double %Y1, %B2
+ %W1 = fadd double %Y2, %Z1
+ %W2 = fadd double %Y1, %Z2
+ %R = fmul double %Z1, %Z2
+ ret double %R
+; CHECK: @test4
+; CHECK-NOT: fmul <2 x double>
+; CHECK: ret double %R
+}
+
+; Basic chain with shuffles
+define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
+ %X1 = sub <8 x i8> %A1, %B1
+ %X2 = sub <8 x i8> %A2, %B2
+ %Y1 = mul <8 x i8> %X1, %A1
+ %Y2 = mul <8 x i8> %X2, %A2
+ %Z1 = add <8 x i8> %Y1, %B1
+ %Z2 = add <8 x i8> %Y2, %B2
+ %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
+ %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
+ %R = mul <8 x i8> %Q1, %Q2
+ ret <8 x i8> %R
+; CHECK: @test6
+; CHECK-NOT: sub <16 x i8>
+; CHECK: ret <8 x i8>
+}
+
diff --git a/test/Transforms/BBVectorize/X86/vs-cast.ll b/test/Transforms/BBVectorize/X86/vs-cast.ll
new file mode 100644
index 000000000000..be3efca925b8
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/vs-cast.ll
@@ -0,0 +1,12 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define void @main() nounwind uwtable {
+entry:
+ %0 = bitcast <2 x i64> undef to i128
+ %1 = bitcast <2 x i64> undef to i128
+ ret void
+; CHECK: @main
+}
+
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
index 32a91ceee007..e8e82ce02479 100644
--- a/test/Transforms/BBVectorize/cycle.ll
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -107,6 +107,6 @@ done:
ret void
; CHECK: @test1
; CHECK: go:
-; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
; FIXME: When tree pruning is deterministic, include the entire output.
}
diff --git a/test/Transforms/BBVectorize/lit.local.cfg b/test/Transforms/BBVectorize/lit.local.cfg
index 19eebc0ac7ac..a8ad0f1a28b2 100644
--- a/test/Transforms/BBVectorize/lit.local.cfg
+++ b/test/Transforms/BBVectorize/lit.local.cfg
@@ -1 +1,6 @@
config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
index bebc91ad91a0..c22ea5852a1b 100644
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -42,8 +42,8 @@ for.body: ; preds = %for.body, %entry
; CHECK: %mul = fmul double %0, %0
; CHECK: %mul3 = fmul double %0, %1
; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
index d9945b563077..aeaf98865bc9 100644
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -7,8 +7,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
; CHECK-SL4: @test1
; CHECK-SL4-NOT: <2 x double>
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
index 68449771436e..ae1d63bfd852 100644
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -17,8 +17,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1,
ret double %R
; CHECK: @test1
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
@@ -43,8 +43,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
ret double %R
; CHECK: @test2
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
@@ -68,8 +68,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
ret double %R
; CHECK: @test3
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
; CHECK: %Y1 = call <2 x double> @llvm.powi.v2f64(<2 x double> %X1, i32 %P)
diff --git a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
index f992d4154779..d46f7692b6d3 100644
--- a/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr-ptrs.ll
@@ -2,6 +2,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
+; FIXME: re-enable this once pointer vectors work properly
+; XFAIL: *
+
; Simple 3-pair chain also with loads and stores (using ptrs and gep)
define double @test1(i64* %a, i64* %b, i64* %c) nounwind uwtable readonly {
entry:
@@ -79,3 +82,53 @@ entry:
; CHECK-AO-NOT: <2 x
}
+; Simple 3-pair chain with loads and stores (using ptrs and gep)
+; using pointer vectors.
+define void @test3(<2 x i64*>* %a, <2 x i64*>* %b, <2 x i64*>* %c) nounwind uwtable readonly {
+entry:
+ %i0 = load <2 x i64*>* %a, align 8
+ %i1 = load <2 x i64*>* %b, align 8
+ %arrayidx3 = getelementptr inbounds <2 x i64*>* %a, i64 1
+ %i3 = load <2 x i64*>* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1
+ %i4 = load <2 x i64*>* %arrayidx4, align 8
+ %j1 = extractelement <2 x i64*> %i1, i32 0
+ %j4 = extractelement <2 x i64*> %i4, i32 0
+ %o1 = load i64* %j1, align 8
+ %o4 = load i64* %j4, align 8
+ %j0 = extractelement <2 x i64*> %i0, i32 0
+ %j3 = extractelement <2 x i64*> %i3, i32 0
+ %ptr0 = getelementptr inbounds i64* %j0, i64 %o1
+ %ptr3 = getelementptr inbounds i64* %j3, i64 %o4
+ %qtr0 = insertelement <2 x i64*> undef, i64* %ptr0, i32 0
+ %rtr0 = insertelement <2 x i64*> %qtr0, i64* %ptr0, i32 1
+ %qtr3 = insertelement <2 x i64*> undef, i64* %ptr3, i32 0
+ %rtr3 = insertelement <2 x i64*> %qtr3, i64* %ptr3, i32 1
+ store <2 x i64*> %rtr0, <2 x i64*>* %c, align 8
+ %arrayidx5 = getelementptr inbounds <2 x i64*>* %c, i64 1
+ store <2 x i64*> %rtr3, <2 x i64*>* %arrayidx5, align 8
+ ret void
+; CHECK: @test3
+; CHECK: %i0.v.i0 = bitcast <2 x i64*>* %a to <4 x i64*>*
+; CHECK: %i1 = load <2 x i64*>* %b, align 8
+; CHECK: %i0 = load <4 x i64*>* %i0.v.i0, align 8
+; CHECK: %arrayidx4 = getelementptr inbounds <2 x i64*>* %b, i64 1
+; CHECK: %i4 = load <2 x i64*>* %arrayidx4, align 8
+; CHECK: %j1 = extractelement <2 x i64*> %i1, i32 0
+; CHECK: %j4 = extractelement <2 x i64*> %i4, i32 0
+; CHECK: %o1 = load i64* %j1, align 8
+; CHECK: %o4 = load i64* %j4, align 8
+; CHECK: %ptr0.v.i1.1 = insertelement <2 x i64> undef, i64 %o1, i32 0
+; CHECK: %ptr0.v.i1.2 = insertelement <2 x i64> %ptr0.v.i1.1, i64 %o4, i32 1
+; CHECK: %ptr0.v.i0 = shufflevector <4 x i64*> %i0, <4 x i64*> undef, <2 x i32> <i32 0, i32 2>
+; CHECK: %ptr0 = getelementptr inbounds <2 x i64*> %ptr0.v.i0, <2 x i64> %ptr0.v.i1.2
+; CHECK: %rtr0 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> zeroinitializer
+; CHECK: %rtr3 = shufflevector <2 x i64*> %ptr0, <2 x i64*> undef, <2 x i32> <i32 1, i32 1>
+; CHECK: %0 = bitcast <2 x i64*>* %c to <4 x i64*>*
+; CHECK: %1 = shufflevector <2 x i64*> %rtr0, <2 x i64*> %rtr3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: store <4 x i64*> %1, <4 x i64*>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test3
+; CHECK-AO-NOT: <4 x
+}
+
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
index a5397eeb1f96..7dd77c933f6d 100644
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -94,13 +94,13 @@ entry:
; CHECK-AO: @test3
; CHECK-AO: %i0 = load double* %a, align 8
; CHECK-AO: %i1 = load double* %b, align 8
-; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
-; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
; CHECK-AO: %i3 = load double* %arrayidx3, align 8
; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
@@ -108,3 +108,63 @@ entry:
; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
; CHECK-AO: ret void
}
+
+; Simple 3-pair chain with loads and stores (unreachable)
+define void @test4(i1 %bool, double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+ br i1 %bool, label %if.then1, label %if.end
+
+if.then1:
+ unreachable
+ br label %if.then
+
+if.then:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ store double %mul, double* %c, align 8
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ br label %if.end
+
+if.end:
+ ret void
+; CHECK: @test4
+; CHECK-NOT: <2 x double>
+; CHECK-AO: @test4
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple 3-pair chain with loads and stores
+define void @test5(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+ %i0 = load double* %a, align 8
+ %i1 = load double* %b, align 8
+ %mul = fmul double %i0, %i1
+ %arrayidx3 = getelementptr inbounds double* %a, i64 1
+ %i3 = load double* %arrayidx3, align 8
+ %arrayidx4 = getelementptr inbounds double* %b, i64 1
+ %i4 = load double* %arrayidx4, align 8
+ %mul5 = fmul double %i3, %i4
+ %arrayidx5 = getelementptr inbounds double* %c, i64 1
+ store double %mul5, double* %arrayidx5, align 8
+ store double %mul, double* %c, align 4
+ ret void
+; CHECK: @test5
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 4
+; CHECK: ret void
+; CHECK-AO: @test5
+; CHECK-AO-NOT: <2 x double>
+}
+
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
index 325792a5dca1..15ecb597025a 100644
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
; CHECK: @test1
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
@@ -33,8 +33,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test2
; CHECK-NB: @test2
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
index 88eb9c90f7ee..3527ae75b457 100644
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
define double @test1(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test1
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
@@ -29,8 +29,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
define double @test2(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test2
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
@@ -40,12 +40,13 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
%Z1 = fadd double %Y2, %B1
%Z2 = fadd double %Y1, %B2
-; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+; CHECK: %Z1.v.i1.1 = insertelement <2 x double> undef, double %B2, i32 0
+; CHECK: %Z1.v.i1.2 = insertelement <2 x double> %Z1.v.i1.1, double %B1, i32 1
+; CHECK: %Z2 = fadd <2 x double> %Y1, %Z1.v.i1.2
%R = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: %Z2.v.r1 = extractelement <2 x double> %Z2, i32 0
+; CHECK: %Z2.v.r2 = extractelement <2 x double> %Z2, i32 1
+; CHECK: %R = fmul double %Z2.v.r2, %Z2.v.r1
ret double %R
; CHECK: ret double %R
}
@@ -54,8 +55,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
define double @test3(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test3
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
@@ -79,8 +80,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2) {
define double @test4(double %A1, double %A2, double %B1, double %B2) {
; CHECK: @test4
; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
%X1 = fsub double %A1, %B1
%X2 = fsub double %A2, %B2
@@ -148,4 +149,51 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
; CHECK: ret <8 x i8> %R
}
+; Basic depth-3 chain (flipped order)
+define double @test7(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test7
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+ %X1 = fsub double %A1, %B1
+ %X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+ %Y1 = fmul double %X1, %A1
+ %Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+ %Z2 = fadd double %Y2, %B2
+ %Z1 = fadd double %Y1, %B1
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+ %R = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+ ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (subclass data)
+define i64 @test8(i64 %A1, i64 %A2, i64 %B1, i64 %B2) {
+; CHECK: @test8
+; CHECK: %X1.v.i1.1 = insertelement <2 x i64> undef, i64 %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x i64> %X1.v.i1.1, i64 %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x i64> undef, i64 %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x i64> %X1.v.i0.1, i64 %A2, i32 1
+ %X1 = sub nsw i64 %A1, %B1
+ %X2 = sub i64 %A2, %B2
+; CHECK: %X1 = sub <2 x i64> %X1.v.i0.2, %X1.v.i1.2
+ %Y1 = mul i64 %X1, %A1
+ %Y2 = mul i64 %X2, %A2
+; CHECK: %Y1 = mul <2 x i64> %X1, %X1.v.i0.2
+ %Z1 = add i64 %Y1, %B1
+ %Z2 = add i64 %Y2, %B2
+; CHECK: %Z1 = add <2 x i64> %Y1, %X1.v.i1.2
+ %R = mul i64 %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x i64> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x i64> %Z1, i32 1
+; CHECK: %R = mul i64 %Z1.v.r1, %Z1.v.r2
+ ret i64 %R
+; CHECK: ret i64 %R
+}
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 74d80aa18729..6794288a0ef2 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -1,17 +1,24 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
+; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE}
@g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 }
@g2 = constant double 1.0
+; { 0x7B, 0x06B1BFF8 }
@g3 = constant {i64, i64} { i64 123, i64 112312312 }
; Simple load
define i32 @test1() {
%r = load i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0)
ret i32 %r
-; CHECK: @test1
-; CHECK: ret i32 -559038737
+
+; 0xDEADBEEF
+; LE: @test1
+; LE: ret i32 -559038737
+
+; 0xDEADBEEF
+; BE: @test1
+; BE: ret i32 -559038737
}
; PR3152
@@ -20,8 +27,13 @@ define i16 @test2() {
%r = load i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*)
ret i16 %r
-; CHECK: @test2
-; CHECK: ret i16 -16657
+; 0xBEEF
+; LE: @test2
+; LE: ret i16 -16657
+
+; 0xDEAD
+; BE: @test2
+; BE: ret i16 -8531
}
; Load of second 16 bits of 32-bit value.
@@ -29,16 +41,27 @@ define i16 @test3() {
%r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 1)
ret i16 %r
-; CHECK: @test3
-; CHECK: ret i16 -8531
+; 0xDEAD
+; LE: @test3
+; LE: ret i16 -8531
+
+; 0xBEEF
+; BE: @test3
+; BE: ret i16 -16657
}
; Load of 8 bit field + tail padding.
define i16 @test4() {
%r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 2)
ret i16 %r
-; CHECK: @test4
-; CHECK: ret i16 186
+
+; 0x00BA
+; LE: @test4
+; LE: ret i16 186
+
+; 0xBA00
+; BE: @test4
+; BE: ret i16 -17920
}
; Load of double bits.
@@ -46,8 +69,13 @@ define i64 @test6() {
%r = load i64* bitcast(double* @g2 to i64*)
ret i64 %r
-; CHECK: @test6
-; CHECK: ret i64 4607182418800017408
+; 0x3FF_0000000000000
+; LE: @test6
+; LE: ret i64 4607182418800017408
+
+; 0x3FF_0000000000000
+; BE: @test6
+; BE: ret i64 4607182418800017408
}
; Load of double bits.
@@ -55,8 +83,13 @@ define i16 @test7() {
%r = load i16* bitcast(double* @g2 to i16*)
ret i16 %r
-; CHECK: @test7
-; CHECK: ret i16 0
+; 0x0000
+; LE: @test7
+; LE: ret i16 0
+
+; 0x3FF0
+; BE: @test7
+; BE: ret i16 16368
}
; Double load.
@@ -64,8 +97,11 @@ define double @test8() {
%r = load double* bitcast({{i32,i8},i32}* @g1 to double*)
ret double %r
-; CHECK: @test8
-; CHECK: ret double 0xBADEADBEEF
+; LE: @test8
+; LE: ret double 0xBADEADBEEF
+
+; BE: @test8
+; BE: ret double 0xDEADBEEFBA000000
}
@@ -74,8 +110,13 @@ define i128 @test9() {
%r = load i128* bitcast({i64, i64}* @g3 to i128*)
ret i128 %r
-; CHECK: @test9
-; CHECK: ret i128 2071796475790618158476296315
+; 0x00000000_06B1BFF8_00000000_0000007B
+; LE: @test9
+; LE: ret i128 2071796475790618158476296315
+
+; 0x00000000_0000007B_00000000_06B1BFF8
+; BE: @test9
+; BE: ret i128 2268949521066387161080
}
; vector load.
@@ -83,21 +124,30 @@ define <2 x i64> @test10() {
%r = load <2 x i64>* bitcast({i64, i64}* @g3 to <2 x i64>*)
ret <2 x i64> %r
-; CHECK: @test10
-; CHECK: ret <2 x i64> <i64 123, i64 112312312>
+; LE: @test10
+; LE: ret <2 x i64> <i64 123, i64 112312312>
+
+; BE: @test10
+; BE: ret <2 x i64> <i64 123, i64 112312312>
}
; PR5287
+; { 0xA1, 0x08 }
@g4 = internal constant { i8, i8 } { i8 -95, i8 8 }
define i16 @test11() nounwind {
entry:
%a = load i16* bitcast ({ i8, i8 }* @g4 to i16*)
ret i16 %a
-
-; CHECK: @test11
-; CHECK: ret i16 2209
+
+; 0x08A1
+; LE: @test11
+; LE: ret i16 2209
+
+; 0xA108
+; BE: @test11
+; BE: ret i16 -24312
}
@@ -107,8 +157,14 @@ entry:
define i16 @test12() {
%a = load i16* getelementptr inbounds ([3 x i16]* bitcast ([6 x i8]* @test12g to [3 x i16]*), i32 0, i64 1)
ret i16 %a
-; CHECK: @test12
-; CHECK: ret i16 98
+
+; 0x0062
+; LE: @test12
+; LE: ret i16 98
+
+; 0x6200
+; BE: @test12
+; BE: ret i16 25088
}
@@ -117,8 +173,12 @@ define i16 @test12() {
define i1 @test13() {
%A = load i1* bitcast (i8* @g5 to i1*)
ret i1 %A
-; CHECK: @test13
-; CHECK: ret i1 false
+
+; LE: @test13
+; LE: ret i1 false
+
+; BE: @test13
+; BE: ret i1 false
}
@g6 = constant [2 x i8*] [i8* inttoptr (i64 1 to i8*), i8* inttoptr (i64 2 to i8*)]
@@ -126,14 +186,22 @@ define i64 @test14() nounwind {
entry:
%tmp = load i64* bitcast ([2 x i8*]* @g6 to i64*)
ret i64 %tmp
-; CHECK: @test14
-; CHECK: ret i64 1
+
+; LE: @test14
+; LE: ret i64 1
+
+; BE: @test14
+; BE: ret i64 1
}
define i64 @test15() nounwind {
entry:
%tmp = load i64* bitcast (i8** getelementptr inbounds ([2 x i8*]* @g6, i32 0, i64 1) to i64*)
ret i64 %tmp
-; CHECK: @test15
-; CHECK: ret i64 2
+
+; LE: @test15
+; LE: ret i64 2
+
+; BE: @test15
+; BE: ret i64 2
}
diff --git a/test/Transforms/CorrelatedValuePropagation/crash.ll b/test/Transforms/CorrelatedValuePropagation/crash.ll
index 80c43d0f1da5..9723d18252a7 100644
--- a/test/Transforms/CorrelatedValuePropagation/crash.ll
+++ b/test/Transforms/CorrelatedValuePropagation/crash.ll
@@ -35,3 +35,28 @@ srf.exit.i:
func_29.exit:
ret void
}
+
+; PR13972
+define void @test3() nounwind {
+for.body:
+ br label %return
+
+for.cond.i: ; preds = %if.else.i, %for.body.i
+ %e.2.i = phi i32 [ %e.2.i, %if.else.i ], [ -8, %for.body.i ]
+ br i1 undef, label %return, label %for.body.i
+
+for.body.i: ; preds = %for.cond.i
+ switch i32 %e.2.i, label %for.cond3.i [
+ i32 -3, label %if.else.i
+ i32 0, label %for.cond.i
+ ]
+
+for.cond3.i: ; preds = %for.cond3.i, %for.body.i
+ br label %for.cond3.i
+
+if.else.i: ; preds = %for.body.i
+ br label %for.cond.i
+
+return: ; preds = %for.cond.i, %for.body
+ ret void
+}
diff --git a/test/Transforms/DeadArgElim/dbginfo.ll b/test/Transforms/DeadArgElim/dbginfo.ll
new file mode 100644
index 000000000000..dcbfaaa3d77b
--- /dev/null
+++ b/test/Transforms/DeadArgElim/dbginfo.ll
@@ -0,0 +1,64 @@
+; RUN: opt %s -deadargelim -S | FileCheck %s
+; PR14016
+
+; Check that debug info metadata for subprograms stores pointers to
+; updated LLVM functions.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@x = global i32 0, align 4
+
+define void @_Z3runv() uwtable {
+entry:
+ call void @_ZN12_GLOBAL__N_18dead_argEPv(i8* null), !dbg !10
+ call void (...)* @_ZN12_GLOBAL__N_111dead_varargEz(), !dbg !12
+ ret void, !dbg !13
+}
+
+; Argument will be deleted
+define internal void @_ZN12_GLOBAL__N_18dead_argEPv(i8* %foo) nounwind uwtable {
+entry:
+ %0 = load i32* @x, align 4, !dbg !14
+ %inc = add nsw i32 %0, 1, !dbg !14
+ store i32 %inc, i32* @x, align 4, !dbg !14
+ ret void, !dbg !16
+}
+
+; Vararg will be deleted
+define internal void @_ZN12_GLOBAL__N_111dead_varargEz(...) nounwind uwtable {
+entry:
+ %0 = load i32* @x, align 4, !dbg !17
+ %inc = add nsw i32 %0, 1, !dbg !17
+ store i32 %inc, i32* @x, align 4, !dbg !17
+ ret void, !dbg !19
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", metadata !"clang version 3.2 (trunk 165305)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/home/samsonov/tmp/clang-di/test.cc] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !8, metadata !9}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"run", metadata !"run", metadata !"", metadata !6, i32 8, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @_Z3runv, null, null, metadata !1, i32 8} ; [ DW_TAG_subprogram ] [line 8] [def] [run]
+!6 = metadata !{i32 786473, metadata !"test.cc", metadata !"/home/samsonov/tmp/clang-di", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !2, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_vararg", metadata !"dead_vararg", metadata !"", metadata !6, i32 5, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (...)* @_ZN12_GLOBAL__N_111dead_varargEz, null, null, metadata !1, i32 5} ; [ DW_TAG_subprogram ] [line 5] [local] [def] [dead_vararg]
+
+; CHECK: metadata !"dead_vararg"{{.*}}void ()* @_ZN12_GLOBAL__N_111dead_varargEz
+
+!9 = metadata !{i32 786478, i32 0, metadata !6, metadata !"dead_arg", metadata !"dead_arg", metadata !"", metadata !6, i32 4, metadata !7, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (i8*)* @_ZN12_GLOBAL__N_18dead_argEPv, null, null, metadata !1, i32 4} ; [ DW_TAG_subprogram ] [line 4] [local] [def] [dead_arg]
+
+; CHECK: metadata !"dead_arg"{{.*}}void ()* @_ZN12_GLOBAL__N_18dead_argEPv
+
+!10 = metadata !{i32 8, i32 14, metadata !11, null}
+!11 = metadata !{i32 786443, metadata !5, i32 8, i32 12, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
+!12 = metadata !{i32 8, i32 27, metadata !11, null}
+!13 = metadata !{i32 8, i32 42, metadata !11, null}
+!14 = metadata !{i32 4, i32 28, metadata !15, null}
+!15 = metadata !{i32 786443, metadata !9, i32 4, i32 26, metadata !6, i32 2} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
+!16 = metadata !{i32 4, i32 33, metadata !15, null}
+!17 = metadata !{i32 5, i32 25, metadata !18, null}
+!18 = metadata !{i32 786443, metadata !8, i32 5, i32 23, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/home/samsonov/tmp/clang-di/test.cc]
+!19 = metadata !{i32 5, i32 30, metadata !18, null}
diff --git a/test/Transforms/DeadStoreElimination/libcalls.ll b/test/Transforms/DeadStoreElimination/libcalls.ll
new file mode 100644
index 000000000000..4639c0bc9628
--- /dev/null
+++ b/test/Transforms/DeadStoreElimination/libcalls.ll
@@ -0,0 +1,70 @@
+; RUN: opt -S -basicaa -dse < %s | FileCheck %s
+
+declare i8* @strcpy(i8* %dest, i8* %src) nounwind
+define void @test1(i8* %src) {
+; CHECK: @test1
+ %B = alloca [16 x i8]
+ %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strcpy
+ %call = call i8* @strcpy(i8* %dest, i8* %src)
+; CHECK: ret void
+ ret void
+}
+
+declare i8* @strncpy(i8* %dest, i8* %src, i32 %n) nounwind
+define void @test2(i8* %src) {
+; CHECK: @test2
+ %B = alloca [16 x i8]
+ %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strncpy
+ %call = call i8* @strncpy(i8* %dest, i8* %src, i32 12)
+; CHECK: ret void
+ ret void
+}
+
+declare i8* @strcat(i8* %dest, i8* %src) nounwind
+define void @test3(i8* %src) {
+; CHECK: @test3
+ %B = alloca [16 x i8]
+ %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strcat
+ %call = call i8* @strcat(i8* %dest, i8* %src)
+; CHECK: ret void
+ ret void
+}
+
+declare i8* @strncat(i8* %dest, i8* %src, i32 %n) nounwind
+define void @test4(i8* %src) {
+; CHECK: @test4
+ %B = alloca [16 x i8]
+ %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK-NOT: @strncat
+ %call = call i8* @strncat(i8* %dest, i8* %src, i32 12)
+; CHECK: ret void
+ ret void
+}
+
+define void @test5(i8* nocapture %src) {
+; CHECK: @test5
+ %dest = alloca [100 x i8], align 16
+ %arraydecay = getelementptr inbounds [100 x i8]* %dest, i64 0, i64 0
+ %call = call i8* @strcpy(i8* %arraydecay, i8* %src)
+; CHECK: %call = call i8* @strcpy
+ %arrayidx = getelementptr inbounds i8* %call, i64 10
+ store i8 97, i8* %arrayidx, align 1
+ ret void
+}
+
+declare void @user(i8* %p)
+define void @test6(i8* %src) {
+; CHECK: @test6
+ %B = alloca [16 x i8]
+ %dest = getelementptr inbounds [16 x i8]* %B, i64 0, i64 0
+; CHECK: @strcpy
+ %call = call i8* @strcpy(i8* %dest, i8* %src)
+; CHECK: @user
+ call void @user(i8* %dest)
+; CHECK: ret void
+ ret void
+}
+
diff --git a/test/Transforms/DeadStoreElimination/simple.ll b/test/Transforms/DeadStoreElimination/simple.ll
index 7a8cdd531b55..e0eb90af9437 100644
--- a/test/Transforms/DeadStoreElimination/simple.ll
+++ b/test/Transforms/DeadStoreElimination/simple.ll
@@ -310,3 +310,17 @@ define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind {
store i32 %c, i32* %4, align 4
ret void
}
+
+; Check another case like PR13547 where strdup is not like malloc.
+; CHECK: @test25
+; CHECK: load i8
+; CHECK: store i8 0
+; CHECK: store i8 %tmp
+define i8* @test25(i8* %p) nounwind {
+ %p.4 = getelementptr i8* %p, i64 4
+ %tmp = load i8* %p.4, align 1
+ store i8 0, i8* %p.4, align 1
+ %q = call i8* @strdup(i8* %p) nounwind optsize
+ store i8 %tmp, i8* %p.4, align 1
+ ret i8* %q
+}
diff --git a/test/Transforms/EarlyCSE/commute.ll b/test/Transforms/EarlyCSE/commute.ll
new file mode 100644
index 000000000000..f84a7dd1aae9
--- /dev/null
+++ b/test/Transforms/EarlyCSE/commute.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -S -early-cse | FileCheck %s
+
+; CHECK: @test1
+define void @test1(float %A, float %B, float* %PA, float* %PB) {
+ ; CHECK-NEXT: fadd
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: ret
+ %C = fadd float %A, %B
+ store float %C, float* %PA
+ %D = fadd float %B, %A
+ store float %D, float* %PB
+ ret void
+}
+
+; CHECK: @test2
+define void @test2(float %A, float %B, i1* %PA, i1* %PB) {
+ ; CHECK-NEXT: fcmp
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: ret
+ %C = fcmp eq float %A, %B
+ store i1 %C, i1* %PA
+ %D = fcmp eq float %B, %A
+ store i1 %D, i1* %PB
+ ret void
+}
+
+; CHECK: @test3
+define void @test3(float %A, float %B, i1* %PA, i1* %PB) {
+ ; CHECK-NEXT: fcmp
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: ret
+ %C = fcmp uge float %A, %B
+ store i1 %C, i1* %PA
+ %D = fcmp ule float %B, %A
+ store i1 %D, i1* %PB
+ ret void
+}
+
+; CHECK: @test4
+define void @test4(i32 %A, i32 %B, i1* %PA, i1* %PB) {
+ ; CHECK-NEXT: icmp
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: ret
+ %C = icmp eq i32 %A, %B
+ store i1 %C, i1* %PA
+ %D = icmp eq i32 %B, %A
+ store i1 %D, i1* %PB
+ ret void
+}
+
+; CHECK: @test5
+define void @test5(i32 %A, i32 %B, i1* %PA, i1* %PB) {
+ ; CHECK-NEXT: icmp
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: store
+ ; CHECK-NEXT: ret
+ %C = icmp sgt i32 %A, %B
+ store i1 %C, i1* %PA
+ %D = icmp slt i32 %B, %A
+ store i1 %D, i1* %PB
+ ret void
+}
diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll
index 31eae256c6ef..4a8c8e4589c8 100644
--- a/test/Transforms/GVN/crash.ll
+++ b/test/Transforms/GVN/crash.ll
@@ -163,3 +163,39 @@ entry:
ret i8 %1
}
+
+; Test that a GEP in an unreachable block with the following form doesn't crash
+; GVN:
+;
+; %x = gep %some.type %x, ...
+
+%struct.type = type { i64, i32, i32 }
+
+define fastcc void @func() nounwind uwtable ssp align 2 {
+entry:
+ br label %reachable.bb
+
+;; Unreachable code.
+
+unreachable.bb:
+ %gep.val = getelementptr inbounds %struct.type* %gep.val, i64 1
+ br i1 undef, label %u2.bb, label %u1.bb
+
+u1.bb:
+ %tmp1 = getelementptr inbounds %struct.type* %gep.val, i64 0, i32 0
+ store i64 -1, i64* %tmp1, align 8
+ br label %unreachable.bb
+
+u2.bb:
+ %0 = load i32* undef, align 4
+ %conv.i.i.i.i.i = zext i32 %0 to i64
+ br label %u2.bb
+
+;; Reachable code.
+
+reachable.bb:
+ br label %r1.bb
+
+r1.bb:
+ br label %u2.bb
+}
diff --git a/test/Transforms/GVN/malloc-load-removal.ll b/test/Transforms/GVN/malloc-load-removal.ll
new file mode 100644
index 000000000000..66b6929d3038
--- /dev/null
+++ b/test/Transforms/GVN/malloc-load-removal.ll
@@ -0,0 +1,31 @@
+; RUN: opt -S -basicaa -gvn < %s | FileCheck %s
+; RUN: opt -S -basicaa -gvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS
+; PR13694
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+declare i8* @malloc(i64) nounwind
+
+define noalias i8* @test() nounwind uwtable ssp {
+entry:
+ %call = tail call i8* @malloc(i64 100) nounwind
+ %0 = load i8* %call, align 1
+ %tobool = icmp eq i8 %0, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ store i8 0, i8* %call, align 1
+ br label %if.end
+
+if.end: ; preds = %if.then, %entry
+ ret i8* %call
+
+; CHECK: @test
+; CHECK-NOT: load
+; CHECK-NOT: icmp
+
+; CHECK_NO_LIBCALLS: @test
+; CHECK_NO_LIBCALLS: load
+; CHECK_NO_LIBCALLS: icmp
+}
diff --git a/test/Transforms/GVN/pr14166.ll b/test/Transforms/GVN/pr14166.ll
new file mode 100644
index 000000000000..9f47e464265b
--- /dev/null
+++ b/test/Transforms/GVN/pr14166.ll
@@ -0,0 +1,27 @@
+; RUN: opt -gvn -S < %s | FileCheck %s
+target datalayout = "e-p:32:32:32"
+target triple = "i386-pc-linux-gnu"
+define <2 x i32> @test1() {
+ %v1 = alloca <2 x i32>
+ call void @anything(<2 x i32>* %v1)
+ %v2 = load <2 x i32>* %v1
+ %v3 = inttoptr <2 x i32> %v2 to <2 x i8*>
+ %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>*
+ store <2 x i8*> %v3, <2 x i8*>* %v4
+ %v5 = load <2 x i32>* %v1
+ ret <2 x i32> %v5
+; CHECK: @test1
+; CHECK: %v1 = alloca <2 x i32>
+; CHECK: call void @anything(<2 x i32>* %v1)
+; CHECK: %v2 = load <2 x i32>* %v1
+; CHECK: %v3 = inttoptr <2 x i32> %v2 to <2 x i8*>
+; CHECK: %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>*
+; CHECK: store <2 x i8*> %v3, <2 x i8*>* %v4
+; CHECK: %1 = ptrtoint <2 x i8*> %v3 to <2 x i32>
+; CHECK: %2 = bitcast <2 x i32> %1 to i64
+; CHECK: %3 = bitcast i64 %2 to <2 x i32>
+; CHECK: ret <2 x i32> %3
+}
+
+declare void @anything(<2 x i32>*)
+
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index e7641691264c..72fa819d1c73 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -basicaa -gvn -S -die | FileCheck %s
-
-; 32-bit little endian target.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+; RUN: opt < %s -default-data-layout="e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basicaa -gvn -S -die | FileCheck %s
;; Trivial RLE test.
define i32 @test0(i32 %V, i32* %P) {
@@ -318,7 +316,7 @@ define i8 @coerce_offset_nonlocal0(i32* %P, i1 %cond) {
%P4 = getelementptr i8* %P3, i32 2
br i1 %cond, label %T, label %F
T:
- store i32 42, i32* %P
+ store i32 57005, i32* %P
br label %Cont
F:
diff --git a/test/Transforms/GlobalOpt/blockaddress.ll b/test/Transforms/GlobalOpt/blockaddress.ll
new file mode 100644
index 000000000000..13da76299d5d
--- /dev/null
+++ b/test/Transforms/GlobalOpt/blockaddress.ll
@@ -0,0 +1,20 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+@x = internal global i8* zeroinitializer
+
+define void @f() {
+; CHECK: @f
+
+; Check that we don't hit an assert in Constant::IsThreadDependent()
+; when storing this blockaddress into a global.
+
+ store i8* blockaddress(@g, %here), i8** @x, align 8
+ ret void
+}
+
+define void @g() {
+; CHECK: @g
+
+here:
+ ret void
+}
diff --git a/test/Transforms/GlobalOpt/load-store-global.ll b/test/Transforms/GlobalOpt/load-store-global.ll
index f824b2c11cbf..25a53370fa09 100644
--- a/test/Transforms/GlobalOpt/load-store-global.ll
+++ b/test/Transforms/GlobalOpt/load-store-global.ll
@@ -1,15 +1,38 @@
-; RUN: opt < %s -globalopt -S | not grep G
+; RUN: opt < %s -globalopt -S | FileCheck %s
@G = internal global i32 17 ; <i32*> [#uses=3]
+; CHECK-NOT: @G
define void @foo() {
%V = load i32* @G ; <i32> [#uses=1]
store i32 %V, i32* @G
ret void
+; CHECK: @foo
+; CHECK-NEXT: ret void
}
define i32 @bar() {
%X = load i32* @G ; <i32> [#uses=1]
ret i32 %X
+; CHECK: @bar
+; CHECK-NEXT: ret i32 17
+}
+
+@a = internal global i64* null, align 8
+; CHECK-NOT: @a
+
+; PR13968
+define void @qux() nounwind {
+ %b = bitcast i64** @a to i8*
+ %g = getelementptr i64** @a, i32 1
+ %cmp = icmp ne i8* null, %b
+ %cmp2 = icmp eq i8* null, %b
+ %cmp3 = icmp eq i64** null, %g
+ store i64* inttoptr (i64 1 to i64*), i64** @a, align 8
+ %l = load i64** @a, align 8
+ ret void
+; CHECK: @qux
+; CHECK-NOT: store
+; CHECK-NOT: load
}
diff --git a/test/Transforms/GlobalOpt/tls.ll b/test/Transforms/GlobalOpt/tls.ll
new file mode 100644
index 000000000000..7a410e5ed20b
--- /dev/null
+++ b/test/Transforms/GlobalOpt/tls.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -globalopt -S | FileCheck %s
+
+declare void @wait()
+declare void @signal()
+declare void @start_thread(void ()*)
+
+@x = internal thread_local global [100 x i32] zeroinitializer, align 16
+@ip = internal global i32* null, align 8
+
+; PR14309: GlobalOpt would think that the value of @ip is always the address of
+; x[1]. However, that address is different for different threads so @ip cannot
+; be replaced with a constant.
+
+define i32 @f() {
+entry:
+ ; Set @ip to point to x[1] for thread 1.
+ store i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), i32** @ip, align 8
+
+ ; Run g on a new thread.
+ tail call void @start_thread(void ()* @g) nounwind
+ tail call void @wait() nounwind
+
+ ; Reset x[1] for thread 1.
+ store i32 0, i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), align 4
+
+ ; Read the value of @ip, which now points at x[1] for thread 2.
+ %0 = load i32** @ip, align 8
+
+ %1 = load i32* %0, align 4
+ ret i32 %1
+
+; CHECK: @f
+; Make sure that the load from @ip hasn't been removed.
+; CHECK: load i32** @ip
+; CHECK: ret
+}
+
+define internal void @g() nounwind uwtable {
+entry:
+ ; Set @ip to point to x[1] for thread 2.
+ store i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), i32** @ip, align 8
+
+ ; Store 50 in x[1] for thread 2.
+ store i32 50, i32* getelementptr inbounds ([100 x i32]* @x, i64 0, i64 1), align 4
+
+ tail call void @signal() nounwind
+ ret void
+
+; CHECK: @g
+; Make sure that the store to @ip hasn't been removed.
+; CHECK: store {{.*}} @ip
+; CHECK: ret
+}
diff --git a/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll b/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll
index 708a961272b5..0c88e83975c1 100644
--- a/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll
+++ b/test/Transforms/IndVarSimplify/2004-04-05-InvokeCastCrash.ll
@@ -39,11 +39,11 @@
%"struct.llvm::SymbolTable" = type opaque
%"struct.llvm::SymbolTableListTraits<llvm::Argument,llvm::Function,llvm::Function,llvm::ilist_traits<llvm::Argument> >" = type { %"struct.llvm::Function"*, %"struct.llvm::Function"* }
%"struct.llvm::SymbolTableListTraits<llvm::Instruction,llvm::BasicBlock,llvm::Function,llvm::ilist_traits<llvm::Instruction> >" = type { %"struct.llvm::Function"*, %"struct.llvm::BasicBlock"* }
- %"struct.llvm::TargetData" = type { %"struct.llvm::FunctionPass", i1, i8, i8, i8, i8, i8, i8, i8, i8 }
+ %"struct.llvm::DataLayout" = type { %"struct.llvm::FunctionPass", i1, i8, i8, i8, i8, i8, i8, i8, i8 }
%"struct.llvm::TargetFrameInfo" = type { i32 (...)**, i32, i32, i32 }
%"struct.llvm::TargetInstrDescriptor" = type { i8*, i32, i32, i32, i1, i32, i32, i32, i32, i32, i32*, i32* }
%"struct.llvm::TargetInstrInfo" = type { i32 (...)**, %"struct.llvm::TargetInstrDescriptor"*, i32, i32 }
- %"struct.llvm::TargetMachine" = type { i32 (...)**, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >", %"struct.llvm::TargetData", %"struct.llvm::IntrinsicLowering"* }
+ %"struct.llvm::TargetMachine" = type { i32 (...)**, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >", %"struct.llvm::DataLayout", %"struct.llvm::IntrinsicLowering"* }
%"struct.llvm::TargetRegClassInfo" = type { i32 (...)**, i32, i32, i32 }
%"struct.llvm::TargetRegInfo" = type { i32 (...)**, %"struct.std::vector<const llvm::TargetRegClassInfo*,std::allocator<const llvm::TargetRegClassInfo*> >", %"struct.llvm::TargetMachine"* }
%"struct.llvm::Type" = type { %"struct.llvm::Value", i32, i32, i1, i32, %"struct.llvm::Type"*, %"struct.std::vector<llvm::PATypeHandle,std::allocator<llvm::PATypeHandle> >" }
diff --git a/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll
new file mode 100644
index 000000000000..5c478669d298
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/2012-10-19-congruent-constant.ll
@@ -0,0 +1,27 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; PR12627
+define void @test1(i32 %x) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %phi1 = phi i1 [ false, %entry ], [ %cmpa, %for.body ]
+ %phi2 = phi i1 [ false, %entry ], [ %cmpb, %for.body ]
+ %i.07 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ tail call void @aux(i1 %phi1, i1 %phi2) nounwind
+ %cmpa = icmp sgt i32 %i.07, 200
+ %cmpb = icmp sgt i32 %i.07, 100
+ %inc = add nsw i32 %i.07, 1
+ %exitcond = icmp eq i32 %inc, 100
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+
+; CHECK: @test1
+; CHECK-NOT: phi i1
+; CHECK: call void @aux(i1 false, i1 false)
+}
+
+declare void @aux(i1, i1)
diff --git a/test/Transforms/IndVarSimplify/crash.ll b/test/Transforms/IndVarSimplify/crash.ll
index 3335be781dfc..1b702a3b1a3c 100644
--- a/test/Transforms/IndVarSimplify/crash.ll
+++ b/test/Transforms/IndVarSimplify/crash.ll
@@ -87,3 +87,47 @@ entry:
main.f.exit: ; preds = %"3.i"
unreachable
}
+
+
+; PR13967
+
+define void @f() nounwind ssp {
+bb:
+ br label %bb4
+
+bb4:
+ %tmp = phi i64 [ %tmp5, %bb7 ], [ undef, %bb ]
+ %tmp5 = add nsw i64 %tmp, 1
+ %extract.t1 = trunc i64 %tmp5 to i32
+ br i1 false, label %bb6, label %bb7
+
+bb6:
+ br label %bb7
+
+bb7:
+ %.off0 = phi i32 [ undef, %bb6 ], [ %extract.t1, %bb4 ]
+ %tmp8 = icmp eq i32 %.off0, 0
+ br i1 %tmp8, label %bb9, label %bb4
+
+bb9:
+ ret void
+}
+
+; PR12536
+define void @fn1() noreturn nounwind {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.end, %entry
+ %b.0 = phi i32 [ undef, %entry ], [ %conv, %for.end ]
+ br label %for.cond1
+
+for.cond1: ; preds = %for.cond1, %for.cond
+ %c.0 = phi i32 [ %b.0, %for.cond1 ], [ 0, %for.cond ]
+ br i1 undef, label %for.cond1, label %for.end
+
+for.end: ; preds = %for.cond1
+ %cmp2 = icmp slt i32 %c.0, 1
+ %conv = zext i1 %cmp2 to i32
+ br label %for.cond
+}
diff --git a/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
index 953bbdff5c62..5dca71264665 100644
--- a/test/Transforms/IndVarSimplify/eliminate-comparison.ll
+++ b/test/Transforms/IndVarSimplify/eliminate-comparison.ll
@@ -106,3 +106,106 @@ loop:
return:
ret void
}
+
+; PR14432
+; Indvars should not turn the second loop into an infinite one.
+
+; CHECK: @func_11
+; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK-NOT: br i1 true, label %noassert68, label %unrolledend
+
+define i32 @func_11() nounwind uwtable {
+entry:
+ br label %forcond
+
+forcond: ; preds = %noassert, %entry
+ %__key6.0 = phi i32 [ 2, %entry ], [ %tmp37, %noassert ]
+ %tmp5 = icmp slt i32 %__key6.0, 10
+ br i1 %tmp5, label %noassert, label %forcond38.preheader
+
+forcond38.preheader: ; preds = %forcond
+ br label %forcond38
+
+noassert: ; preds = %forbody
+ %tmp13 = sdiv i32 -32768, %__key6.0
+ %tmp2936 = shl i32 %tmp13, 24
+ %sext23 = shl i32 %tmp13, 24
+ %tmp32 = icmp eq i32 %tmp2936, %sext23
+ %tmp37 = add i32 %__key6.0, 1
+ br i1 %tmp32, label %forcond, label %assert33
+
+assert33: ; preds = %noassert
+ tail call void @llvm.trap()
+ unreachable
+
+forcond38: ; preds = %noassert68, %forcond38.preheader
+ %__key8.0 = phi i32 [ %tmp81, %noassert68 ], [ 2, %forcond38.preheader ]
+ %tmp46 = icmp slt i32 %__key8.0, 10
+ br i1 %tmp46, label %noassert68, label %unrolledend
+
+noassert68: ; preds = %forbody39
+ %tmp57 = sdiv i32 -32768, %__key8.0
+ %sext34 = shl i32 %tmp57, 16
+ %sext21 = shl i32 %tmp57, 16
+ %tmp76 = icmp eq i32 %sext34, %sext21
+ %tmp81 = add i32 %__key8.0, 1
+ br i1 %tmp76, label %forcond38, label %assert77
+
+assert77: ; preds = %noassert68
+ tail call void @llvm.trap()
+ unreachable
+
+unrolledend: ; preds = %forcond38
+ ret i32 0
+}
+
+declare void @llvm.trap() noreturn nounwind
+
+; In this case the second loop has only a single iteration, so fold the header away.
+; CHECK: @func_12
+; CHECK: %tmp5 = icmp slt i32 %__key6.0, 10
+; CHECK: br i1 true, label %noassert68, label %unrolledend
+define i32 @func_12() nounwind uwtable {
+entry:
+ br label %forcond
+
+forcond: ; preds = %noassert, %entry
+ %__key6.0 = phi i32 [ 2, %entry ], [ %tmp37, %noassert ]
+ %tmp5 = icmp slt i32 %__key6.0, 10
+ br i1 %tmp5, label %noassert, label %forcond38.preheader
+
+forcond38.preheader: ; preds = %forcond
+ br label %forcond38
+
+noassert: ; preds = %forbody
+ %tmp13 = sdiv i32 -32768, %__key6.0
+ %tmp2936 = shl i32 %tmp13, 24
+ %sext23 = shl i32 %tmp13, 24
+ %tmp32 = icmp eq i32 %tmp2936, %sext23
+ %tmp37 = add i32 %__key6.0, 1
+ br i1 %tmp32, label %forcond, label %assert33
+
+assert33: ; preds = %noassert
+ tail call void @llvm.trap()
+ unreachable
+
+forcond38: ; preds = %noassert68, %forcond38.preheader
+ %__key8.0 = phi i32 [ %tmp81, %noassert68 ], [ 2, %forcond38.preheader ]
+ %tmp46 = icmp slt i32 %__key8.0, 10
+ br i1 %tmp46, label %noassert68, label %unrolledend
+
+noassert68: ; preds = %forbody39
+ %tmp57 = sdiv i32 -32768, %__key8.0
+ %sext34 = shl i32 %tmp57, 16
+ %sext21 = shl i32 %tmp57, 16
+ %tmp76 = icmp ne i32 %sext34, %sext21
+ %tmp81 = add i32 %__key8.0, 1
+ br i1 %tmp76, label %forcond38, label %assert77
+
+assert77: ; preds = %noassert68
+ tail call void @llvm.trap()
+ unreachable
+
+unrolledend: ; preds = %forcond38
+ ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
index bfdd000e38eb..507f695e67c5 100644
--- a/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
+++ b/test/Transforms/IndVarSimplify/no-iv-rewrite.ll
@@ -199,7 +199,6 @@ entry:
; back to the loop iv.
;
; CHECK: loop:
-; CHECK: phi i32
; CHECK-NOT: phi
; CHECK: exit:
loop:
diff --git a/test/Transforms/IndVarSimplify/verify-scev.ll b/test/Transforms/IndVarSimplify/verify-scev.ll
new file mode 100644
index 000000000000..019f5830d520
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/verify-scev.ll
@@ -0,0 +1,421 @@
+; RUN: opt < %s -S -indvars -verify-scev
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define void @test1() nounwind uwtable ssp {
+entry:
+ br i1 undef, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ br i1 false, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ br i1 undef, label %for.end11, label %for.body3
+
+for.body3: ; preds = %for.end
+ unreachable
+
+for.end11: ; preds = %for.end
+ br i1 undef, label %while.body, label %while.end
+
+while.body: ; preds = %for.end11
+ unreachable
+
+while.end: ; preds = %for.end11
+ br i1 undef, label %if.end115, label %for.cond109
+
+for.cond109: ; preds = %while.end
+ unreachable
+
+if.end115: ; preds = %while.end
+ br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612
+
+while.body119.lr.ph.lr.ph: ; preds = %if.end115
+ br i1 undef, label %for.cond612, label %if.end123.us
+
+if.end123.us: ; preds = %while.body119.lr.ph.lr.ph
+ br label %for.cond132.us
+
+for.cond132.us: ; preds = %for.cond132.us, %if.end123.us
+ br i1 undef, label %if.then136.us, label %for.cond132.us
+
+if.then136.us: ; preds = %for.cond132.us
+ br i1 undef, label %while.end220, label %while.body211
+
+while.body211: ; preds = %while.body211, %if.then136.us
+ br i1 undef, label %while.end220, label %while.body211
+
+while.end220: ; preds = %while.body211, %if.then136.us
+ br label %for.cond246.outer
+
+for.cond246.outer: ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220
+ br label %for.cond246
+
+for.cond246: ; preds = %for.cond372.loopexit, %for.cond246.outer
+ br i1 undef, label %for.end562, label %if.end250
+
+if.end250: ; preds = %for.cond246
+ br i1 undef, label %if.end256, label %for.end562
+
+if.end256: ; preds = %if.end250
+ %cmp272 = icmp eq i32 undef, undef
+ br i1 %cmp272, label %if.then274, label %for.cond404.preheader
+
+for.cond404.preheader: ; preds = %if.end256
+ br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph
+
+for.body409.lr.ph: ; preds = %for.cond404.preheader
+ br label %for.body409
+
+if.then274: ; preds = %if.end256
+ br i1 undef, label %for.cond246.outer, label %if.end309
+
+if.end309: ; preds = %if.then274
+ br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.body361: ; preds = %for.body361, %if.end309
+ br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.cond372.loopexit: ; preds = %for.body361, %if.end309
+ br i1 undef, label %for.cond394.preheader, label %for.cond246
+
+for.cond394.preheader: ; preds = %for.cond372.loopexit
+ br i1 undef, label %for.cond246.outer, label %for.body397
+
+for.body397: ; preds = %for.cond394.preheader
+ unreachable
+
+for.body409: ; preds = %for.inc558, %for.body409.lr.ph
+ %k.029 = phi i32 [ 1, %for.body409.lr.ph ], [ %inc559, %for.inc558 ]
+ br i1 undef, label %if.then412, label %if.else433
+
+if.then412: ; preds = %for.body409
+ br label %if.end440
+
+if.else433: ; preds = %for.body409
+ br label %if.end440
+
+if.end440: ; preds = %if.else433, %if.then412
+ br i1 undef, label %for.inc558, label %if.end461
+
+if.end461: ; preds = %if.end440
+ br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.body517: ; preds = %for.body517, %if.end461
+ br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.cond528.loopexit: ; preds = %for.body517, %if.end461
+ br label %for.inc558
+
+for.inc558: ; preds = %for.cond528.loopexit, %if.end440
+ %inc559 = add nsw i32 %k.029, 1
+ %cmp407 = icmp sgt i32 %inc559, undef
+ br i1 %cmp407, label %for.cond246.outer, label %for.body409
+
+for.end562: ; preds = %if.end250, %for.cond246
+ unreachable
+
+for.cond612: ; preds = %while.body119.lr.ph.lr.ph, %if.end115
+ unreachable
+}
+
+define void @test2() nounwind uwtable ssp {
+entry:
+ br i1 undef, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ br i1 undef, label %for.end11, label %for.body3
+
+for.body3: ; preds = %for.end
+ unreachable
+
+for.end11: ; preds = %for.end
+ br i1 undef, label %while.body, label %while.end
+
+while.body: ; preds = %for.end11
+ unreachable
+
+while.end: ; preds = %for.end11
+ br i1 undef, label %if.end115, label %for.cond109
+
+for.cond109: ; preds = %while.end
+ unreachable
+
+if.end115: ; preds = %while.end
+ br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612
+
+while.body119.lr.ph.lr.ph: ; preds = %if.end115
+ br i1 undef, label %for.cond612, label %if.end123.us
+
+if.end123.us: ; preds = %while.body119.lr.ph.lr.ph
+ br label %for.cond132.us
+
+for.cond132.us: ; preds = %for.cond132.us, %if.end123.us
+ br i1 undef, label %if.then136.us, label %for.cond132.us
+
+if.then136.us: ; preds = %for.cond132.us
+ br i1 undef, label %while.end220, label %while.body211
+
+while.body211: ; preds = %while.body211, %if.then136.us
+ br i1 undef, label %while.end220, label %while.body211
+
+while.end220: ; preds = %while.body211, %if.then136.us
+ br label %for.cond246.outer
+
+for.cond246.outer: ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220
+ br label %for.cond246
+
+for.cond246: ; preds = %for.cond372.loopexit, %for.cond246.outer
+ br i1 undef, label %for.end562, label %if.end250
+
+if.end250: ; preds = %for.cond246
+ br i1 undef, label %if.end256, label %for.end562
+
+if.end256: ; preds = %if.end250
+ %0 = load i32* undef, align 4
+ br i1 undef, label %if.then274, label %for.cond404.preheader
+
+for.cond404.preheader: ; preds = %if.end256
+ %add406 = add i32 0, %0
+ br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph
+
+for.body409.lr.ph: ; preds = %for.cond404.preheader
+ br label %for.body409
+
+if.then274: ; preds = %if.end256
+ br i1 undef, label %for.cond246.outer, label %if.end309
+
+if.end309: ; preds = %if.then274
+ br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.body361: ; preds = %for.body361, %if.end309
+ br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.cond372.loopexit: ; preds = %for.body361, %if.end309
+ br i1 undef, label %for.cond394.preheader, label %for.cond246
+
+for.cond394.preheader: ; preds = %for.cond372.loopexit
+ br i1 undef, label %for.cond246.outer, label %for.body397
+
+for.body397: ; preds = %for.cond394.preheader
+ unreachable
+
+for.body409: ; preds = %for.inc558, %for.body409.lr.ph
+ %k.029 = phi i32 [ 1, %for.body409.lr.ph ], [ %inc559, %for.inc558 ]
+ br i1 undef, label %if.then412, label %if.else433
+
+if.then412: ; preds = %for.body409
+ br label %if.end440
+
+if.else433: ; preds = %for.body409
+ br label %if.end440
+
+if.end440: ; preds = %if.else433, %if.then412
+ br i1 undef, label %for.inc558, label %if.end461
+
+if.end461: ; preds = %if.end440
+ br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.body517: ; preds = %for.body517, %if.end461
+ br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.cond528.loopexit: ; preds = %for.body517, %if.end461
+ br label %for.inc558
+
+for.inc558: ; preds = %for.cond528.loopexit, %if.end440
+ %inc559 = add nsw i32 %k.029, 1
+ %cmp407 = icmp sgt i32 %inc559, %add406
+ br i1 %cmp407, label %for.cond246.outer, label %for.body409
+
+for.end562: ; preds = %if.end250, %for.cond246
+ unreachable
+
+for.cond612: ; preds = %while.body119.lr.ph.lr.ph, %if.end115
+ unreachable
+}
+
+define void @test3() nounwind uwtable ssp {
+entry:
+ br i1 undef, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ br i1 undef, label %for.end11, label %for.body3
+
+for.body3: ; preds = %for.end
+ unreachable
+
+for.end11: ; preds = %for.end
+ br i1 undef, label %while.body, label %while.end
+
+while.body: ; preds = %for.end11
+ unreachable
+
+while.end: ; preds = %for.end11
+ br i1 undef, label %if.end115, label %for.cond109
+
+for.cond109: ; preds = %while.end
+ unreachable
+
+if.end115: ; preds = %while.end
+ br i1 undef, label %while.body119.lr.ph.lr.ph, label %for.cond612
+
+while.body119.lr.ph.lr.ph: ; preds = %if.end115
+ br i1 undef, label %for.cond612, label %if.end123.us
+
+if.end123.us: ; preds = %while.body119.lr.ph.lr.ph
+ br label %for.cond132.us
+
+for.cond132.us: ; preds = %for.cond132.us, %if.end123.us
+ br i1 undef, label %if.then136.us, label %for.cond132.us
+
+if.then136.us: ; preds = %for.cond132.us
+ br i1 undef, label %while.end220, label %while.body211
+
+while.body211: ; preds = %while.body211, %if.then136.us
+ br i1 undef, label %while.end220, label %while.body211
+
+while.end220: ; preds = %while.body211, %if.then136.us
+ br label %for.cond246.outer
+
+for.cond246.outer: ; preds = %for.inc558, %for.cond394.preheader, %if.then274, %for.cond404.preheader, %while.end220
+ br label %for.cond246
+
+for.cond246: ; preds = %for.cond372.loopexit, %for.cond246.outer
+ br i1 undef, label %for.end562, label %if.end250
+
+if.end250: ; preds = %for.cond246
+ br i1 undef, label %if.end256, label %for.end562
+
+if.end256: ; preds = %if.end250
+ br i1 undef, label %if.then274, label %for.cond404.preheader
+
+for.cond404.preheader: ; preds = %if.end256
+ br i1 undef, label %for.cond246.outer, label %for.body409.lr.ph
+
+for.body409.lr.ph: ; preds = %for.cond404.preheader
+ br label %for.body409
+
+if.then274: ; preds = %if.end256
+ br i1 undef, label %for.cond246.outer, label %if.end309
+
+if.end309: ; preds = %if.then274
+ br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.body361: ; preds = %for.body361, %if.end309
+ br i1 undef, label %for.cond372.loopexit, label %for.body361
+
+for.cond372.loopexit: ; preds = %for.body361, %if.end309
+ br i1 undef, label %for.cond394.preheader, label %for.cond246
+
+for.cond394.preheader: ; preds = %for.cond372.loopexit
+ br i1 undef, label %for.cond246.outer, label %for.body397
+
+for.body397: ; preds = %for.cond394.preheader
+ unreachable
+
+for.body409: ; preds = %for.inc558, %for.body409.lr.ph
+ br i1 undef, label %if.then412, label %if.else433
+
+if.then412: ; preds = %for.body409
+ br label %if.end440
+
+if.else433: ; preds = %for.body409
+ br label %if.end440
+
+if.end440: ; preds = %if.else433, %if.then412
+ br i1 undef, label %for.inc558, label %if.end461
+
+if.end461: ; preds = %if.end440
+ br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.body517: ; preds = %for.body517, %if.end461
+ br i1 undef, label %for.cond528.loopexit, label %for.body517
+
+for.cond528.loopexit: ; preds = %for.body517, %if.end461
+ br label %for.inc558
+
+for.inc558: ; preds = %for.cond528.loopexit, %if.end440
+ br i1 undef, label %for.cond246.outer, label %for.body409
+
+for.end562: ; preds = %if.end250, %for.cond246
+ unreachable
+
+for.cond612: ; preds = %while.body119.lr.ph.lr.ph, %if.end115
+ unreachable
+}
+
+define void @test4() nounwind uwtable ssp {
+entry:
+ br i1 undef, label %if.end8, label %if.else
+
+if.else: ; preds = %entry
+ br label %if.end8
+
+if.end8: ; preds = %if.else, %entry
+ br i1 undef, label %if.end26, label %if.else22
+
+if.else22: ; preds = %if.end8
+ br label %if.end26
+
+if.end26: ; preds = %if.else22, %if.end8
+ br i1 undef, label %if.end35, label %if.else31
+
+if.else31: ; preds = %if.end26
+ br label %if.end35
+
+if.end35: ; preds = %if.else31, %if.end26
+ br i1 undef, label %for.end226, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %if.end35
+ br label %for.body48
+
+for.body48: ; preds = %for.inc221, %for.body.lr.ph
+ br i1 undef, label %for.inc221, label %for.body65.lr.ph
+
+for.body65.lr.ph: ; preds = %for.body48
+ %0 = load i32* undef, align 4
+ br label %for.body65.us
+
+for.body65.us: ; preds = %for.inc219.us, %for.body65.lr.ph
+ %k.09.us = phi i32 [ %inc.us, %for.inc219.us ], [ 1, %for.body65.lr.ph ]
+ %idxprom66.us = sext i32 %k.09.us to i64
+ br i1 undef, label %for.inc219.us, label %if.end72.us
+
+if.end72.us: ; preds = %for.body65.us
+ br i1 undef, label %if.end93.us, label %if.then76.us
+
+if.then76.us: ; preds = %if.end72.us
+ br label %if.end93.us
+
+if.end93.us: ; preds = %if.then76.us, %if.end72.us
+ br i1 undef, label %if.end110.us, label %for.inc219.us
+
+if.end110.us: ; preds = %if.end93.us
+ br i1 undef, label %for.inc219.us, label %for.body142.us
+
+for.body142.us: ; preds = %for.cond139.loopexit.us, %if.end110.us
+ br label %for.cond152.us
+
+for.cond152.us: ; preds = %for.cond152.us, %for.body142.us
+ br i1 undef, label %for.cond139.loopexit.us, label %for.cond152.us
+
+for.inc219.us: ; preds = %for.cond139.loopexit.us, %if.end110.us, %if.end93.us, %for.body65.us
+ %inc.us = add nsw i32 %k.09.us, 1
+ %cmp64.us = icmp sgt i32 %inc.us, %0
+ br i1 %cmp64.us, label %for.inc221, label %for.body65.us
+
+for.cond139.loopexit.us: ; preds = %for.cond152.us
+ br i1 undef, label %for.inc219.us, label %for.body142.us
+
+for.inc221: ; preds = %for.inc219.us, %for.body48
+ br label %for.body48
+
+for.end226: ; preds = %if.end35
+ ret void
+}
diff --git a/test/Transforms/Inline/recursive.ll b/test/Transforms/Inline/recursive.ll
new file mode 100644
index 000000000000..5fe8d1639ca3
--- /dev/null
+++ b/test/Transforms/Inline/recursive.ll
@@ -0,0 +1,38 @@
+; RUN: opt %s -inline -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin10.0"
+
+; rdar://10853263
+
+; Make sure that the callee is still here.
+; CHECK: define i32 @callee
+define i32 @callee(i32 %param) {
+ %yyy = alloca [100000 x i8]
+ %r = bitcast [100000 x i8]* %yyy to i8*
+ call void @foo2(i8* %r)
+ ret i32 4
+}
+
+; CHECK: define i32 @caller
+; CHECK-NEXT: entry:
+; CHECK-NOT: alloca
+; CHECK: ret
+define i32 @caller(i32 %param) {
+entry:
+ %t = call i32 @foo(i32 %param)
+ %cmp = icmp eq i32 %t, -1
+ br i1 %cmp, label %exit, label %cont
+
+cont:
+ %r = call i32 @caller(i32 %t)
+ %f = call i32 @callee(i32 %r)
+ br label %cont
+exit:
+ ret i32 4
+}
+
+declare void @foo2(i8* %in)
+
+declare i32 @foo(i32 %param)
+
diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
index 73e5a6653e80..18aab7f27efd 100644
--- a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
+++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
@@ -1,12 +1,14 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE
; PR13442
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-
@test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
define i64 @foo() {
%ret = load i64* bitcast (i8* getelementptr (i8* bitcast ([4 x i32]* @test to i8*), i64 2) to i64*), align 1
ret i64 %ret
- ; CHECK: ret i64 844424930263040
+ ; 0x00030000_00020000 in [01 00/00 00 02 00 00 00 03 00/00 00 04 00 00 00]
+ ; LE: ret i64 844424930263040
+  ; 0x00010000_00020000 in [00 00/00 01 00 00 00 02 00 00/00 03 00 00 00 04]
+ ; BE: ret i64 281474976841728
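+  ; As a sanity check, the expected constants follow from the hex values above:
+  ;   LE: 0x0003000000020000 = 3*2^48 + 2*2^16 = 844424930263040
+  ;   BE: 0x0001000000020000 = 2^48 + 2*2^16 = 281474976841728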
}
diff --git a/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
new file mode 100644
index 000000000000..4efaf8c17255
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-08-28-udiv_ashl.ll
@@ -0,0 +1,57 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; rdar://12182093
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: @udiv400
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @udiv400(i32 %x) {
+entry:
+ %div = lshr i32 %x, 2
+ %div1 = udiv i32 %div, 100
+ ret i32 %div1
+}
+
+
+; CHECK: @udiv400_no
+; CHECK: ashr
+; CHECK: div
+; CHECK: ret
+define i32 @udiv400_no(i32 %x) {
+entry:
+ %div = ashr i32 %x, 2
+ %div1 = udiv i32 %div, 100
+ ret i32 %div1
+}
+
+; CHECK: @sdiv400_yes
+; CHECK: udiv i32 %x, 400
+; CHECK: ret
+define i32 @sdiv400_yes(i32 %x) {
+entry:
+ %div = lshr i32 %x, 2
+  ; The sign bits of both operands are zero (i.e. we can prove they are
+  ; unsigned inputs), so this sdiv can be turned into a udiv and then
+  ; optimized just like the udiv case above.
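+  ; For example, with %x = 4000: (4000 >> 2) = 1000 and 1000 / 100 = 10,
+  ; which matches 4000 udiv 400 = 10.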
+ %div1 = sdiv i32 %div, 100
+ ret i32 %div1
+}
+
+
+; CHECK: @udiv_i80
+; CHECK: udiv i80 %x, 400
+; CHECK: ret
+define i80 @udiv_i80(i80 %x) {
+ %div = lshr i80 %x, 2
+ %div1 = udiv i80 %div, 100
+ ret i80 %div1
+}
+
+define i32 @no_crash_notconst_udiv(i32 %x, i32 %notconst) {
+ %div = lshr i32 %x, %notconst
+ %div1 = udiv i32 %div, 100
+ ret i32 %div1
+}
diff --git a/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
new file mode 100644
index 000000000000..ba025e92b010
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; When merging zero-sized allocas, check that the requested alignments of the
+; allocas are obeyed.
+
+@x = global i8* null, align 8
+@y = global i8* null, align 8
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: @f
+; CHECK-NEXT: alloca [0 x i8], align 1024
+; CHECK-NOT: alloca
+; CHECK: ret void
+define void @f() {
+ %1 = alloca [0 x i8], align 1
+ %2 = alloca [0 x i8], align 1024
+ %3 = getelementptr inbounds [0 x i8]* %1, i64 0, i64 0
+ %4 = getelementptr inbounds [0 x i8]* %2, i64 0, i64 0
+ store i8* %3, i8** @x, align 8
+ store i8* %4, i8** @y, align 8
+ ret void
+}
diff --git a/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll b/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll
new file mode 100644
index 000000000000..4cd60b42fbe1
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-09-24-MemcpyFromGlobalCrash.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; Check we don't crash due to lack of target data.
+
+@G = constant [100 x i8] zeroinitializer
+
+declare void @bar(i8*)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define void @test() {
+; CHECK: @test
+; CHECK: llvm.memcpy
+; CHECK: ret void
+ %A = alloca [100 x i8]
+ %a = getelementptr inbounds [100 x i8]* %A, i64 0, i64 0
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* getelementptr inbounds ([100 x i8]* @G, i64 0, i32 0), i64 100, i32 4, i1 false)
+ call void @bar(i8* %a) readonly
+ ret void
+}
diff --git a/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll b/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll
new file mode 100644
index 000000000000..20ea28268742
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-10-25-vector-of-pointers.ll
@@ -0,0 +1,51 @@
+; RUN: opt < %s -instcombine -S
+
+; Make sure that we don't crash when optimizing vectors of pointers.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.hoge = type { double*, double*, double*, double** }
+
+define void @widget(%struct.hoge* nocapture %arg) nounwind uwtable ssp {
+bb:
+ %tmp = getelementptr inbounds %struct.hoge* %arg, i64 0, i32 0
+ br i1 undef, label %bb1, label %bb17
+
+bb1: ; preds = %bb
+ br i1 undef, label %bb2, label %bb3
+
+bb2: ; preds = %bb1
+ br label %bb17
+
+bb3: ; preds = %bb1
+ %tmp4 = bitcast double** %tmp to <2 x double*>*
+ %tmp5 = load <2 x double*>* %tmp4, align 8
+ %tmp6 = ptrtoint <2 x double*> %tmp5 to <2 x i64>
+ %tmp7 = sub <2 x i64> zeroinitializer, %tmp6
+ %tmp8 = ashr exact <2 x i64> %tmp7, <i64 3, i64 3>
+ %tmp9 = extractelement <2 x i64> %tmp8, i32 0
+ %tmp10 = add nsw i64 undef, %tmp9
+ br i1 undef, label %bb11, label %bb12
+
+bb11: ; preds = %bb3
+ br label %bb13
+
+bb12: ; preds = %bb3
+ br label %bb13
+
+bb13: ; preds = %bb12, %bb11
+ br i1 undef, label %bb16, label %bb14
+
+bb14: ; preds = %bb13
+ br i1 undef, label %bb16, label %bb15
+
+bb15: ; preds = %bb14
+ br label %bb16
+
+bb16: ; preds = %bb15, %bb14, %bb13
+ unreachable
+
+bb17: ; preds = %bb2, %bb
+ ret void
+}
diff --git a/test/Transforms/InstCombine/align-addr.ll b/test/Transforms/InstCombine/align-addr.ll
index 27916b986030..4ea1bd9beb3b 100644
--- a/test/Transforms/InstCombine/align-addr.ll
+++ b/test/Transforms/InstCombine/align-addr.ll
@@ -58,3 +58,19 @@ define double @test2(double* %p, double %n) nounwind {
store double %n, double* %p
ret double %t
}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+declare void @use(i8*)
+
+%struct.s = type { i32, i32, i32, i32 }
+
+define void @test3(%struct.s* sret %a4) {
+; Check that the alignment is bumped up to the alignment of the sret type.
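+; (%struct.s is four i32s, so its ABI alignment is presumably 4 under this
+; test's data layout, which is why the memset alignment goes from 1 to 4.)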
+; CHECK: @test3
+ %a4.cast = bitcast %struct.s* %a4 to i8*
+ call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 16, i32 1, i1 false)
+; CHECK: call void @llvm.memset.p0i8.i64(i8* %a4.cast, i8 0, i64 16, i32 4, i1 false)
+ call void @use(i8* %a4.cast)
+ ret void
+}
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index 50e03479f650..68a671cec88a 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -94,3 +94,19 @@ entry:
tail call void @f(i32* %b)
ret void
}
+
+; PR14371
+%opaque_type = type opaque
+%real_type = type { { i32, i32* } }
+
+@opaque_global = external constant %opaque_type, align 4
+
+define void @test7() {
+entry:
+ %0 = alloca %real_type, align 4
+ %1 = bitcast %real_type* %0 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* bitcast (%opaque_type* @opaque_global to i8*), i32 8, i32 1, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/Transforms/InstCombine/and-fcmp.ll b/test/Transforms/InstCombine/and-fcmp.ll
index 838c2f73fb91..40c44c09a8c0 100644
--- a/test/Transforms/InstCombine/and-fcmp.ll
+++ b/test/Transforms/InstCombine/and-fcmp.ll
@@ -10,7 +10,7 @@ define zeroext i8 @t1(float %x, float %y) nounwind {
; CHECK: fcmp oeq float %x, %y
; CHECK-NOT: fcmp ueq float %x, %y
; CHECK-NOT: fcmp ord float %x, %y
-; CHECK-NOW: and
+; CHECK-NOT: and
}
define zeroext i8 @t2(float %x, float %y) nounwind {
diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll
index 56e5ca3ff720..b4eb69d4363d 100644
--- a/test/Transforms/InstCombine/cast.ll
+++ b/test/Transforms/InstCombine/cast.ll
@@ -694,3 +694,209 @@ define i1 @test67(i1 %a, i32 %b) {
; CHECK: @test67
; CHECK: ret i1 false
}
+
+%s = type { i32, i32, i32 }
+
+define %s @test68(%s *%p, i64 %i) {
+; CHECK: @test68
+ %o = mul i64 %i, 12
+ %q = bitcast %s* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr %s*
+ %r = bitcast i8* %pp to %s*
+ %l = load %s* %r
+; CHECK-NEXT: load %s*
+ ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define double @test69(double *%p, i64 %i) {
+; CHECK: @test69
+ %o = shl nsw i64 %i, 3
+ %q = bitcast double* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr inbounds double*
+ %r = bitcast i8* %pp to double*
+ %l = load double* %r
+; CHECK-NEXT: load double*
+ ret double %l
+; CHECK-NEXT: ret double
+}
+
+define %s @test70(%s *%p, i64 %i) {
+; CHECK: @test70
+ %o = mul nsw i64 %i, 36
+; CHECK-NEXT: mul nsw i64 %i, 3
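+; (%s is 12 bytes, so a byte offset of %i * 36 is the same as an offset of
+; %i * 3 in units of %s, which is what the transformed getelementptr uses.)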
+ %q = bitcast %s* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr inbounds %s*
+ %r = bitcast i8* %pp to %s*
+ %l = load %s* %r
+; CHECK-NEXT: load %s*
+ ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define double @test71(double *%p, i64 %i) {
+; CHECK: @test71
+ %o = shl i64 %i, 5
+; CHECK-NEXT: shl i64 %i, 2
+ %q = bitcast double* %p to i8*
+ %pp = getelementptr i8* %q, i64 %o
+; CHECK-NEXT: getelementptr double*
+ %r = bitcast i8* %pp to double*
+ %l = load double* %r
+; CHECK-NEXT: load double*
+ ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test72(double *%p, i32 %i) {
+; CHECK: @test72
+ %so = mul nsw i32 %i, 8
+ %o = sext i32 %so to i64
+; CHECK-NEXT: sext i32 %i to i64
+ %q = bitcast double* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr inbounds double*
+ %r = bitcast i8* %pp to double*
+ %l = load double* %r
+; CHECK-NEXT: load double*
+ ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test73(double *%p, i128 %i) {
+; CHECK: @test73
+ %lo = mul nsw i128 %i, 8
+ %o = trunc i128 %lo to i64
+; CHECK-NEXT: trunc i128 %i to i64
+ %q = bitcast double* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o
+; CHECK-NEXT: getelementptr double*
+ %r = bitcast i8* %pp to double*
+ %l = load double* %r
+; CHECK-NEXT: load double*
+ ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test74(double *%p, i64 %i) {
+; CHECK: @test74
+ %q = bitcast double* %p to i64*
+ %pp = getelementptr inbounds i64* %q, i64 %i
+; CHECK-NEXT: getelementptr inbounds double*
+ %r = bitcast i64* %pp to double*
+ %l = load double* %r
+; CHECK-NEXT: load double*
+ ret double %l
+; CHECK-NEXT: ret double
+}
+
+define i32* @test75(i32* %p, i32 %x) {
+; CHECK: @test75
+ %y = shl i32 %x, 3
+; CHECK-NEXT: shl i32 %x, 3
+ %z = sext i32 %y to i64
+; CHECK-NEXT: sext i32 %y to i64
+ %q = bitcast i32* %p to i8*
+ %r = getelementptr i8* %q, i64 %z
+ %s = bitcast i8* %r to i32*
+ ret i32* %s
+}
+
+define %s @test76(%s *%p, i64 %i, i64 %j) {
+; CHECK: @test76
+ %o = mul i64 %i, 12
+ %o2 = mul nsw i64 %o, %j
+; CHECK-NEXT: %o2 = mul i64 %i, %j
+ %q = bitcast %s* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o2
+; CHECK-NEXT: getelementptr %s* %p, i64 %o2
+ %r = bitcast i8* %pp to %s*
+ %l = load %s* %r
+; CHECK-NEXT: load %s*
+ ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define %s @test77(%s *%p, i64 %i, i64 %j) {
+; CHECK: @test77
+ %o = mul nsw i64 %i, 36
+ %o2 = mul nsw i64 %o, %j
+; CHECK-NEXT: %o = mul nsw i64 %i, 3
+; CHECK-NEXT: %o2 = mul nsw i64 %o, %j
+ %q = bitcast %s* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %o2
+; CHECK-NEXT: getelementptr inbounds %s* %p, i64 %o2
+ %r = bitcast i8* %pp to %s*
+ %l = load %s* %r
+; CHECK-NEXT: load %s*
+ ret %s %l
+; CHECK-NEXT: ret %s
+}
+
+define %s @test78(%s *%p, i64 %i, i64 %j, i32 %k, i32 %l, i128 %m, i128 %n) {
+; CHECK: @test78
+ %a = mul nsw i32 %k, 36
+; CHECK-NEXT: mul nsw i32 %k, 3
+ %b = mul nsw i32 %a, %l
+; CHECK-NEXT: mul nsw i32 %a, %l
+ %c = sext i32 %b to i128
+; CHECK-NEXT: sext i32 %b to i128
+ %d = mul nsw i128 %c, %m
+; CHECK-NEXT: mul nsw i128 %c, %m
+ %e = mul i128 %d, %n
+; CHECK-NEXT: mul i128 %d, %n
+ %f = trunc i128 %e to i64
+; CHECK-NEXT: trunc i128 %e to i64
+ %g = mul nsw i64 %f, %i
+; CHECK-NEXT: mul i64 %f, %i
+ %h = mul nsw i64 %g, %j
+; CHECK-NEXT: mul i64 %g, %j
+ %q = bitcast %s* %p to i8*
+ %pp = getelementptr inbounds i8* %q, i64 %h
+; CHECK-NEXT: getelementptr %s* %p, i64 %h
+ %r = bitcast i8* %pp to %s*
+ %load = load %s* %r
+; CHECK-NEXT: load %s*
+ ret %s %load
+; CHECK-NEXT: ret %s
+}
+
+define %s @test79(%s *%p, i64 %i, i32 %j) {
+; CHECK: @test79
+ %a = mul nsw i64 %i, 36
+; CHECK: mul nsw i64 %i, 36
+ %b = trunc i64 %a to i32
+ %c = mul i32 %b, %j
+ %q = bitcast %s* %p to i8*
+; CHECK: bitcast
+ %pp = getelementptr inbounds i8* %q, i32 %c
+ %r = bitcast i8* %pp to %s*
+ %l = load %s* %r
+ ret %s %l
+}
+
+define double @test80([100 x double]* %p, i32 %i) {
+; CHECK: @test80
+ %tmp = mul nsw i32 %i, 8
+; CHECK-NEXT: sext i32 %i to i64
+ %q = bitcast [100 x double]* %p to i8*
+ %pp = getelementptr i8* %q, i32 %tmp
+; CHECK-NEXT: getelementptr [100 x double]*
+ %r = bitcast i8* %pp to double*
+ %l = load double* %r
+; CHECK-NEXT: load double*
+ ret double %l
+; CHECK-NEXT: ret double
+}
+
+define double @test81(double *%p, float %f) {
+ %i = fptosi float %f to i64
+ %q = bitcast double* %p to i8*
+ %pp = getelementptr i8* %q, i64 %i
+ %r = bitcast i8* %pp to double*
+ %l = load double* %r
+ ret double %l
+}
diff --git a/test/Transforms/InstCombine/disable-simplify-libcalls.ll b/test/Transforms/InstCombine/disable-simplify-libcalls.ll
new file mode 100644
index 000000000000..d81e9ae5bd73
--- /dev/null
+++ b/test/Transforms/InstCombine/disable-simplify-libcalls.ll
@@ -0,0 +1,236 @@
+; Test that -disable-simplify-libcalls is wired up correctly.
+;
+; RUN: opt < %s -instcombine -disable-simplify-libcalls -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@.str = constant [1 x i8] zeroinitializer, align 1
+@.str1 = constant [13 x i8] c"hello, world\00", align 1
+@.str2 = constant [4 x i8] c"foo\00", align 1
+@.str3 = constant [4 x i8] c"bar\00", align 1
+@.str4 = constant [6 x i8] c"123.4\00", align 1
+@.str5 = constant [5 x i8] c"1234\00", align 1
+@empty = constant [1 x i8] c"\00", align 1
+
+declare double @ceil(double)
+declare double @copysign(double, double)
+declare double @cos(double)
+declare double @fabs(double)
+declare double @floor(double)
+declare i8* @strcat(i8*, i8*)
+declare i8* @strncat(i8*, i8*, i32)
+declare i8* @strchr(i8*, i32)
+declare i8* @strrchr(i8*, i32)
+declare i32 @strcmp(i8*, i8*)
+declare i32 @strncmp(i8*, i8*, i64)
+declare i8* @strcpy(i8*, i8*)
+declare i8* @stpcpy(i8*, i8*)
+declare i8* @strncpy(i8*, i8*, i64)
+declare i64 @strlen(i8*)
+declare i8* @strpbrk(i8*, i8*)
+declare i64 @strspn(i8*, i8*)
+declare double @strtod(i8*, i8**)
+declare float @strtof(i8*, i8**)
+declare x86_fp80 @strtold(i8*, i8**)
+declare i64 @strtol(i8*, i8**, i32)
+declare i64 @strtoll(i8*, i8**, i32)
+declare i64 @strtoul(i8*, i8**, i32)
+declare i64 @strtoull(i8*, i8**, i32)
+declare i64 @strcspn(i8*, i8*)
+
+define double @t1(double %x) {
+; CHECK: @t1
+ %ret = call double @ceil(double %x)
+ ret double %ret
+; CHECK: call double @ceil
+}
+
+define double @t2(double %x, double %y) {
+; CHECK: @t2
+ %ret = call double @copysign(double %x, double %y)
+ ret double %ret
+; CHECK: call double @copysign
+}
+
+define double @t3(double %x) {
+; CHECK: @t3
+ %call = call double @cos(double %x)
+ ret double %call
+; CHECK: call double @cos
+}
+
+define double @t4(double %x) {
+; CHECK: @t4
+ %ret = call double @fabs(double %x)
+ ret double %ret
+; CHECK: call double @fabs
+}
+
+define double @t5(double %x) {
+; CHECK: @t5
+ %ret = call double @floor(double %x)
+ ret double %ret
+; CHECK: call double @floor
+}
+
+define i8* @t6(i8* %x) {
+; CHECK: @t6
+ %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ %ret = call i8* @strcat(i8* %x, i8* %empty)
+ ret i8* %ret
+; CHECK: call i8* @strcat
+}
+
+define i8* @t7(i8* %x) {
+; CHECK: @t7
+ %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ %ret = call i8* @strncat(i8* %x, i8* %empty, i32 1)
+ ret i8* %ret
+; CHECK: call i8* @strncat
+}
+
+define i8* @t8() {
+; CHECK: @t8
+ %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0
+ %ret = call i8* @strchr(i8* %x, i32 119)
+ ret i8* %ret
+; CHECK: call i8* @strchr
+}
+
+define i8* @t9() {
+; CHECK: @t9
+ %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0
+ %ret = call i8* @strrchr(i8* %x, i32 119)
+ ret i8* %ret
+; CHECK: call i8* @strrchr
+}
+
+define i32 @t10() {
+; CHECK: @t10
+ %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+ %ret = call i32 @strcmp(i8* %x, i8* %y)
+ ret i32 %ret
+; CHECK: call i32 @strcmp
+}
+
+define i32 @t11() {
+; CHECK: @t11
+ %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+ %ret = call i32 @strncmp(i8* %x, i8* %y, i64 3)
+ ret i32 %ret
+; CHECK: call i32 @strncmp
+}
+
+define i8* @t12(i8* %x) {
+; CHECK: @t12
+ %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i8* @strcpy(i8* %x, i8* %y)
+ ret i8* %ret
+; CHECK: call i8* @strcpy
+}
+
+define i8* @t13(i8* %x) {
+; CHECK: @t13
+ %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i8* @stpcpy(i8* %x, i8* %y)
+ ret i8* %ret
+; CHECK: call i8* @stpcpy
+}
+
+define i8* @t14(i8* %x) {
+; CHECK: @t14
+ %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i8* @strncpy(i8* %x, i8* %y, i64 3)
+ ret i8* %ret
+; CHECK: call i8* @strncpy
+}
+
+define i64 @t15() {
+; CHECK: @t15
+ %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i64 @strlen(i8* %x)
+ ret i64 %ret
+; CHECK: call i64 @strlen
+}
+
+define i8* @t16(i8* %x) {
+; CHECK: @t16
+ %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+ %ret = call i8* @strpbrk(i8* %x, i8* %y)
+ ret i8* %ret
+; CHECK: call i8* @strpbrk
+}
+
+define i64 @t17(i8* %x) {
+; CHECK: @t17
+ %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+ %ret = call i64 @strspn(i8* %x, i8* %y)
+ ret i64 %ret
+; CHECK: call i64 @strspn
+}
+
+define double @t18(i8** %y) {
+; CHECK: @t18
+ %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+ %ret = call double @strtod(i8* %x, i8** %y)
+ ret double %ret
+; CHECK: call double @strtod
+}
+
+define float @t19(i8** %y) {
+; CHECK: @t19
+ %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+ %ret = call float @strtof(i8* %x, i8** %y)
+ ret float %ret
+; CHECK: call float @strtof
+}
+
+define x86_fp80 @t20(i8** %y) {
+; CHECK: @t20
+ %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+ %ret = call x86_fp80 @strtold(i8* %x, i8** %y)
+ ret x86_fp80 %ret
+; CHECK: call x86_fp80 @strtold
+}
+
+define i64 @t21(i8** %y) {
+; CHECK: @t21
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtol(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtol
+}
+
+define i64 @t22(i8** %y) {
+; CHECK: @t22
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtoll(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtoll
+}
+
+define i64 @t23(i8** %y) {
+; CHECK: @t23
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtoul(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtoul
+}
+
+define i64 @t24(i8** %y) {
+; CHECK: @t24
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtoull(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtoull
+}
+
+define i64 @t25(i8* %y) {
+; CHECK: @t25
+ %x = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ %ret = call i64 @strcspn(i8* %x, i8* %y)
+ ret i64 %ret
+; CHECK: call i64 @strcspn
+}
diff --git a/test/Transforms/InstCombine/div-shift.ll b/test/Transforms/InstCombine/div-shift.ll
index a07f3ea94914..e0372ebac184 100644
--- a/test/Transforms/InstCombine/div-shift.ll
+++ b/test/Transforms/InstCombine/div-shift.ll
@@ -21,3 +21,17 @@ define i64 @t2(i64 %x, i32 %y) nounwind {
%3 = udiv i64 %x, %2
ret i64 %3
}
+
+; PR13250
+define i64 @t3(i64 %x, i32 %y) nounwind {
+; CHECK: t3
+; CHECK-NOT: udiv
+; CHECK-NEXT: %1 = add i32 %y, 2
+; CHECK-NEXT: %2 = zext i32 %1 to i64
+; CHECK-NEXT: %3 = lshr i64 %x, %2
+; CHECK-NEXT: ret i64 %3
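+; 4 << %y equals 2^(%y + 2), so the udiv below can be replaced by a shift;
+; e.g. for %y = 3: 4 << 3 = 32, and %x / 32 is the same as %x >> 5.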
+ %1 = shl i32 4, %y
+ %2 = zext i32 %1 to i64
+ %3 = udiv i64 %x, %2
+ ret i64 %3
+}
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index d08cbf574a23..376fa079d24c 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -54,9 +54,8 @@ define i1 @test7(float %x) nounwind readnone ssp noredzone {
%ext = fpext float %x to ppc_fp128
%cmp = fcmp ogt ppc_fp128 %ext, 0xM00000000000000000000000000000000
ret i1 %cmp
-; Can't convert ppc_fp128
; CHECK: @test7
-; CHECK-NEXT: fpext float %x to ppc_fp128
+; CHECK-NEXT: fcmp ogt float %x, 0.000000e+00
}
define float @test8(float %x) nounwind readnone optsize ssp {
@@ -69,3 +68,93 @@ define float @test8(float %x) nounwind readnone optsize ssp {
; CHECK: @test8
; CHECK-NEXT: fcmp olt float %x, 0.000000e+00
}
+
+declare double @fabs(double) nounwind readnone
+
+define i32 @test9(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp olt double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test9
+; CHECK-NOT: fabs
+; CHECK: ret i32 0
+}
+
+define i32 @test10(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp ole double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test10
+; CHECK-NOT: fabs
+; CHECK: fcmp oeq double %a, 0.000000e+00
+}
+
+define i32 @test11(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp ogt double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test11
+; CHECK-NOT: fabs
+; CHECK: fcmp one double %a, 0.000000e+00
+}
+
+define i32 @test12(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp oge double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test12
+; CHECK-NOT: fabs
+; CHECK: fcmp ord double %a, 0.000000e+00
+}
+
+define i32 @test13(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp une double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test13
+; CHECK-NOT: fabs
+; CHECK: fcmp une double %a, 0.000000e+00
+}
+
+define i32 @test14(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp oeq double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test14
+; CHECK-NOT: fabs
+; CHECK: fcmp oeq double %a, 0.000000e+00
+}
+
+define i32 @test15(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp one double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test15
+; CHECK-NOT: fabs
+; CHECK: fcmp one double %a, 0.000000e+00
+}
+
+define i32 @test16(double %a) nounwind {
+ %call = tail call double @fabs(double %a) nounwind
+ %cmp = fcmp ueq double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+; CHECK: @test16
+; CHECK-NOT: fabs
+; CHECK: fcmp ueq double %a, 0.000000e+00
+}
+
+; Don't crash.
+define i32 @test17(double %a, double (double)* %p) nounwind {
+ %call = tail call double %p(double %a) nounwind
+ %cmp = fcmp ueq double %call, 0.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
diff --git a/test/Transforms/InstCombine/fold-vector-select.ll b/test/Transforms/InstCombine/fold-vector-select.ll
index 3f22522a6ce4..2cb970bf4177 100644
--- a/test/Transforms/InstCombine/fold-vector-select.ll
+++ b/test/Transforms/InstCombine/fold-vector-select.ll
@@ -1,13 +1,148 @@
; RUN: opt < %s -instcombine -S | not grep select
-define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D) {
- %r = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> zeroinitializer
- %g = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 6, i32 9, i32 1>
- %b = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 7, i32 1, i32 4, i32 9>
- %a = select <4 x i1> zeroinitializer, <4 x i32> zeroinitializer, <4 x i32> <i32 3, i32 2, i32 8, i32 5>
- store <4 x i32> %r, <4 x i32>* %A
- store <4 x i32> %g, <4 x i32>* %B
- store <4 x i32> %b, <4 x i32>* %C
- store <4 x i32> %a, <4 x i32>* %D
+define void @foo(<4 x i32> *%A, <4 x i32> *%B, <4 x i32> *%C, <4 x i32> *%D,
+ <4 x i32> *%E, <4 x i32> *%F, <4 x i32> *%G, <4 x i32> *%H,
+ <4 x i32> *%I, <4 x i32> *%J, <4 x i32> *%K, <4 x i32> *%L,
+ <4 x i32> *%M, <4 x i32> *%N, <4 x i32> *%O, <4 x i32> *%P,
+ <4 x i32> *%Q, <4 x i32> *%R, <4 x i32> *%S, <4 x i32> *%T,
+ <4 x i32> *%U, <4 x i32> *%V, <4 x i32> *%W, <4 x i32> *%X,
+ <4 x i32> *%Y, <4 x i32> *%Z, <4 x i32> *%BA, <4 x i32> *%BB,
+ <4 x i32> *%BC, <4 x i32> *%BD, <4 x i32> *%BE, <4 x i32> *%BF,
+ <4 x i32> *%BG, <4 x i32> *%BH, <4 x i32> *%BI, <4 x i32> *%BJ,
+ <4 x i32> *%BK, <4 x i32> *%BL, <4 x i32> *%BM, <4 x i32> *%BN,
+ <4 x i32> *%BO, <4 x i32> *%BP, <4 x i32> *%BQ, <4 x i32> *%BR,
+ <4 x i32> *%BS, <4 x i32> *%BT, <4 x i32> *%BU, <4 x i32> *%BV,
+ <4 x i32> *%BW, <4 x i32> *%BX, <4 x i32> *%BY, <4 x i32> *%BZ,
+ <4 x i32> *%CA, <4 x i32> *%CB, <4 x i32> *%CC, <4 x i32> *%CD,
+ <4 x i32> *%CE, <4 x i32> *%CF, <4 x i32> *%CG, <4 x i32> *%CH,
+ <4 x i32> *%CI, <4 x i32> *%CJ, <4 x i32> *%CK, <4 x i32> *%CL) {
+ %a = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 9, i32 87, i32 57, i32 8>
+ %b = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 99, i32 49, i32 29>
+ %c = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 18, i32 53, i32 84>
+ %d = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 29, i32 82, i32 45, i32 16>
+ %e = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 11, i32 15, i32 32, i32 99>
+ %f = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 19, i32 86, i32 29, i32 33>
+ %g = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 44, i32 10, i32 26, i32 45>
+ %h = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> <i32 88, i32 70, i32 90, i32 48>
+ %i = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 30, i32 53, i32 42, i32 12>
+ %j = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 46, i32 24, i32 93, i32 26>
+ %k = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 33, i32 99, i32 15, i32 57>
+ %l = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 51, i32 60, i32 60, i32 50>
+ %m = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 50, i32 12, i32 7, i32 45>
+ %n = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 15, i32 65, i32 36, i32 36>
+ %o = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 54, i32 0, i32 17, i32 78>
+ %p = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> <i32 56, i32 13, i32 64, i32 48>
+ %q = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 52, i32 69, i32 88, i32 11>, <4 x i32> zeroinitializer
+ %r = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 5, i32 87, i32 68, i32 14>, <4 x i32> zeroinitializer
+ %s = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 47, i32 17, i32 66, i32 63>, <4 x i32> zeroinitializer
+ %t = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 64, i32 25, i32 73, i32 81>, <4 x i32> zeroinitializer
+ %u = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 51, i32 41, i32 61, i32 63>, <4 x i32> zeroinitializer
+ %v = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 39, i32 59, i32 17, i32 0>, <4 x i32> zeroinitializer
+ %w = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 91, i32 99, i32 97, i32 29>, <4 x i32> zeroinitializer
+ %x = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 89, i32 45, i32 89, i32 10>, <4 x i32> zeroinitializer
+ %y = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 25, i32 70, i32 21, i32 27>, <4 x i32> zeroinitializer
+ %z = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 40, i32 12, i32 27, i32 88>, <4 x i32> zeroinitializer
+ %ba = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 36, i32 35, i32 90, i32 23>, <4 x i32> zeroinitializer
+ %bb = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 83, i32 3, i32 64, i32 82>, <4 x i32> zeroinitializer
+ %bc = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 15, i32 72, i32 2, i32 54>, <4 x i32> zeroinitializer
+ %bd = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 32, i32 47, i32 100, i32 84>, <4 x i32> zeroinitializer
+ %be = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 92, i32 57, i32 82, i32 1>, <4 x i32> zeroinitializer
+ %bf = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 42, i32 14, i32 22, i32 89>, <4 x i32> zeroinitializer
+ %bg = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> <i32 33, i32 10, i32 67, i32 66>, <4 x i32> <i32 42, i32 91, i32 47, i32 40>
+ %bh = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> <i32 8, i32 13, i32 48, i32 0>, <4 x i32> <i32 84, i32 66, i32 87, i32 84>
+ %bi = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> <i32 85, i32 96, i32 1, i32 94>, <4 x i32> <i32 54, i32 57, i32 7, i32 92>
+ %bj = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> <i32 55, i32 21, i32 92, i32 68>, <4 x i32> <i32 51, i32 61, i32 62, i32 39>
+ %bk = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> <i32 42, i32 18, i32 77, i32 74>, <4 x i32> <i32 82, i32 33, i32 30, i32 7>
+ %bl = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> <i32 80, i32 92, i32 61, i32 84>, <4 x i32> <i32 43, i32 89, i32 92, i32 6>
+ %bm = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> <i32 49, i32 14, i32 62, i32 62>, <4 x i32> <i32 35, i32 33, i32 92, i32 59>
+ %bn = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> <i32 3, i32 97, i32 49, i32 18>, <4 x i32> <i32 56, i32 64, i32 19, i32 75>
+ %bo = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> <i32 91, i32 57, i32 0, i32 1>, <4 x i32> <i32 43, i32 63, i32 64, i32 11>
+ %bp = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> <i32 41, i32 65, i32 18, i32 11>, <4 x i32> <i32 86, i32 26, i32 31, i32 3>
+ %bq = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> <i32 31, i32 46, i32 32, i32 68>, <4 x i32> <i32 100, i32 59, i32 62, i32 6>
+ %br = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> <i32 76, i32 67, i32 87, i32 7>, <4 x i32> <i32 63, i32 48, i32 97, i32 24>
+ %bs = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> <i32 83, i32 89, i32 19, i32 4>, <4 x i32> <i32 21, i32 2, i32 40, i32 21>
+ %bt = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> <i32 45, i32 76, i32 81, i32 100>, <4 x i32> <i32 65, i32 26, i32 100, i32 46>
+ %bu = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> <i32 16, i32 75, i32 31, i32 17>, <4 x i32> <i32 37, i32 66, i32 86, i32 65>
+ %bv = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> <i32 13, i32 25, i32 43, i32 59>, <4 x i32> <i32 82, i32 78, i32 60, i32 52>
+ %bw = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %bx = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %by = select <4 x i1> <i1 false, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %bz = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ca = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cb = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cc = select <4 x i1> <i1 false, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cd = select <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ce = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cf = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cg = select <4 x i1> <i1 false, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ch = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ci = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cj = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %ck = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ %cl = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
+ store <4 x i32> %a, <4 x i32>* %A
+ store <4 x i32> %b, <4 x i32>* %B
+ store <4 x i32> %c, <4 x i32>* %C
+ store <4 x i32> %d, <4 x i32>* %D
+ store <4 x i32> %e, <4 x i32>* %E
+ store <4 x i32> %f, <4 x i32>* %F
+ store <4 x i32> %g, <4 x i32>* %G
+ store <4 x i32> %h, <4 x i32>* %H
+ store <4 x i32> %i, <4 x i32>* %I
+ store <4 x i32> %j, <4 x i32>* %J
+ store <4 x i32> %k, <4 x i32>* %K
+ store <4 x i32> %l, <4 x i32>* %L
+ store <4 x i32> %m, <4 x i32>* %M
+ store <4 x i32> %n, <4 x i32>* %N
+ store <4 x i32> %o, <4 x i32>* %O
+ store <4 x i32> %p, <4 x i32>* %P
+ store <4 x i32> %q, <4 x i32>* %Q
+ store <4 x i32> %r, <4 x i32>* %R
+ store <4 x i32> %s, <4 x i32>* %S
+ store <4 x i32> %t, <4 x i32>* %T
+ store <4 x i32> %u, <4 x i32>* %U
+ store <4 x i32> %v, <4 x i32>* %V
+ store <4 x i32> %w, <4 x i32>* %W
+ store <4 x i32> %x, <4 x i32>* %X
+ store <4 x i32> %y, <4 x i32>* %Y
+ store <4 x i32> %z, <4 x i32>* %Z
+ store <4 x i32> %ba, <4 x i32>* %BA
+ store <4 x i32> %bb, <4 x i32>* %BB
+ store <4 x i32> %bc, <4 x i32>* %BC
+ store <4 x i32> %bd, <4 x i32>* %BD
+ store <4 x i32> %be, <4 x i32>* %BE
+ store <4 x i32> %bf, <4 x i32>* %BF
+ store <4 x i32> %bg, <4 x i32>* %BG
+ store <4 x i32> %bh, <4 x i32>* %BH
+ store <4 x i32> %bi, <4 x i32>* %BI
+ store <4 x i32> %bj, <4 x i32>* %BJ
+ store <4 x i32> %bk, <4 x i32>* %BK
+ store <4 x i32> %bl, <4 x i32>* %BL
+ store <4 x i32> %bm, <4 x i32>* %BM
+ store <4 x i32> %bn, <4 x i32>* %BN
+ store <4 x i32> %bo, <4 x i32>* %BO
+ store <4 x i32> %bp, <4 x i32>* %BP
+ store <4 x i32> %bq, <4 x i32>* %BQ
+ store <4 x i32> %br, <4 x i32>* %BR
+ store <4 x i32> %bs, <4 x i32>* %BS
+ store <4 x i32> %bt, <4 x i32>* %BT
+ store <4 x i32> %bu, <4 x i32>* %BU
+ store <4 x i32> %bv, <4 x i32>* %BV
+ store <4 x i32> %bw, <4 x i32>* %BW
+ store <4 x i32> %bx, <4 x i32>* %BX
+ store <4 x i32> %by, <4 x i32>* %BY
+ store <4 x i32> %bz, <4 x i32>* %BZ
+ store <4 x i32> %ca, <4 x i32>* %CA
+ store <4 x i32> %cb, <4 x i32>* %CB
+ store <4 x i32> %cc, <4 x i32>* %CC
+ store <4 x i32> %cd, <4 x i32>* %CD
+ store <4 x i32> %ce, <4 x i32>* %CE
+ store <4 x i32> %cf, <4 x i32>* %CF
+ store <4 x i32> %cg, <4 x i32>* %CG
+ store <4 x i32> %ch, <4 x i32>* %CH
+ store <4 x i32> %ci, <4 x i32>* %CI
+ store <4 x i32> %cj, <4 x i32>* %CJ
+ store <4 x i32> %ck, <4 x i32>* %CK
+ store <4 x i32> %cl, <4 x i32>* %CL
ret void
}
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index eaff87d695ed..8e064a4f2fc9 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -659,3 +659,21 @@ define i1 @test64(i8 %a, i32 %b) nounwind {
; CHECK-NEXT: %c = icmp eq i8 %1, %a
; CHECK-NEXT: ret i1 %c
}
+
+define i1 @test65(i64 %A, i64 %B) {
+ %s1 = add i64 %A, %B
+ %s2 = add i64 %A, %B
+ %cmp = icmp eq i64 %s1, %s2
+; CHECK: @test65
+; CHECK-NEXT: ret i1 true
+ ret i1 %cmp
+}
+
+define i1 @test66(i64 %A, i64 %B) {
+ %s1 = add i64 %A, %B
+ %s2 = add i64 %B, %A
+ %cmp = icmp eq i64 %s1, %s2
+; CHECK: @test66
+; CHECK-NEXT: ret i1 true
+ ret i1 %cmp
+}
diff --git a/test/Transforms/InstCombine/memcmp-1.ll b/test/Transforms/InstCombine/memcmp-1.ll
new file mode 100644
index 000000000000..4238c5f8fb15
--- /dev/null
+++ b/test/Transforms/InstCombine/memcmp-1.ll
@@ -0,0 +1,72 @@
+; Test that the memcmp library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@foo = constant [4 x i8] c"foo\00"
+@hel = constant [4 x i8] c"hel\00"
+@hello_u = constant [8 x i8] c"hello_u\00"
+
+declare i32 @memcmp(i8*, i8*, i32)
+
+; Check memcmp(mem, mem, size) -> 0.
+
+define i32 @test_simplify1(i8* %mem, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i32 @memcmp(i8* %mem, i8* %mem, i32 %size)
+ ret i32 %ret
+; CHECK: ret i32 0
+}
+
+; Check memcmp(mem1, mem2, 0) -> 0.
+
+define i32 @test_simplify2(i8* %mem1, i8* %mem2) {
+; CHECK: @test_simplify2
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 0)
+ ret i32 %ret
+; CHECK: ret i32 0
+}
+
+; Check memcmp(mem1, mem2, 1) -> *(unsigned char*)mem1 - *(unsigned char*)mem2.
+
+define i32 @test_simplify3(i8* %mem1, i8* %mem2) {
+; CHECK: @test_simplify3
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 1)
+; CHECK: [[LOAD1:%[a-z]+]] = load i8* %mem1, align 1
+; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
+; CHECK: [[LOAD2:%[a-z]+]] = load i8* %mem2, align 1
+; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
+; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+ ret i32 %ret
+; CHECK: ret i32 [[RET]]
+}
+
+; Check memcmp(mem1, mem2, size) -> cnst, where all arguments are constants.
+
+define i32 @test_simplify4() {
+; CHECK: @test_simplify4
+ %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+ %mem2 = getelementptr [8 x i8]* @hello_u, i32 0, i32 0
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+ ret i32 %ret
+; CHECK: ret i32 0
+}
+
+define i32 @test_simplify5() {
+; CHECK: @test_simplify5
+ %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+ %mem2 = getelementptr [4 x i8]* @foo, i32 0, i32 0
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+ ret i32 %ret
+; CHECK: ret i32 {{[0-9]+}}
+}
+
+define i32 @test_simplify6() {
+; CHECK: @test_simplify6
+ %mem1 = getelementptr [4 x i8]* @foo, i32 0, i32 0
+ %mem2 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+ ret i32 %ret
+; CHECK: ret i32 {{-[0-9]+}}
+}
diff --git a/test/Transforms/InstCombine/memcmp-2.ll b/test/Transforms/InstCombine/memcmp-2.ll
new file mode 100644
index 000000000000..3796117bc24c
--- /dev/null
+++ b/test/Transforms/InstCombine/memcmp-2.ll
@@ -0,0 +1,17 @@
+; Test that the memcmp library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i32* @memcmp(i8*, i8*, i32)
+
+; Check that memcmp functions with the wrong prototype aren't simplified.
+
+define i32* @test_no_simplify1(i8* %mem, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i32* @memcmp(i8* %mem, i8* %mem, i32 %size)
+; CHECK-NEXT: call i32* @memcmp
+ ret i32* %ret
+; CHECK-NEXT: ret i32* %ret
+}
diff --git a/test/Transforms/InstCombine/memcpy-1.ll b/test/Transforms/InstCombine/memcpy-1.ll
new file mode 100644
index 000000000000..65b79ad03df4
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-1.ll
@@ -0,0 +1,17 @@
+; Test that the memcpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memcpy(i8*, i8*, i32)
+
+; Check memcpy(mem1, mem2, size) -> llvm.memcpy(mem1, mem2, size, 1).
+
+define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i8* @memcpy(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call void @llvm.memcpy
+ ret i8* %ret
+; CHECK: ret i8* %mem1
+}
diff --git a/test/Transforms/InstCombine/memcpy-2.ll b/test/Transforms/InstCombine/memcpy-2.ll
new file mode 100644
index 000000000000..4a8a02018f5e
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-2.ll
@@ -0,0 +1,17 @@
+; Test that the memcpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memcpy(i8*, i8*, i32)
+
+; Check that memcpy functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i8 @memcpy(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call i8 @memcpy
+ ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/ScalarRepl/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll
index 5557a8fd8754..83c893e17dd6 100644
--- a/test/Transforms/ScalarRepl/memcpy-from-global.ll
+++ b/test/Transforms/InstCombine/memcpy-from-global.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -scalarrepl -S | FileCheck %s
+; RUN: opt < %s -instcombine -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
@C.0.1248 = internal constant [128 x float] [ float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float -1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float -1.000000e+00, float 1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float -1.000000e+00, float 0.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 ], align 32 ; <[128 x float]*> [#uses=1]
@@ -6,13 +6,11 @@ define float @test1(i32 %hash, float %x, float %y, float %z, float %w) {
entry:
%lookupTable = alloca [128 x float], align 16 ; <[128 x float]*> [#uses=5]
%lookupTable1 = bitcast [128 x float]* %lookupTable to i8* ; <i8*> [#uses=1]
- call void @llvm.memcpy.i32( i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i32 512, i32 16 )
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %lookupTable1, i8* bitcast ([128 x float]* @C.0.1248 to i8*), i64 512, i32 16, i1 false)
; CHECK: @test1
; CHECK-NOT: alloca
; CHECK-NOT: call{{.*}}@llvm.memcpy
-; CHECK: %lookupTable1 = bitcast [128 x float]* @C.0.1248 to i8*
-; CHECK-NOT: call{{.*}}@llvm.memcpy
%tmp3 = shl i32 %hash, 2 ; <i32> [#uses=1]
%tmp5 = and i32 %tmp3, 124 ; <i32> [#uses=4]
@@ -38,10 +36,6 @@ entry:
ret float %tmp43
}
-declare void @llvm.memcpy.i32(i8*, i8*, i32, i32)
-
-
-
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
%T = type { i8, [123 x i8] }
@@ -59,10 +53,11 @@ define void @test2() {
; CHECK: @test2
; %A alloca is deleted
-; CHECK-NEXT: %B = alloca %T
+; CHECK-NEXT: alloca [124 x i8]
+; CHECK-NEXT: getelementptr inbounds [124 x i8]*
; use @G instead of %A
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %{{.*}}, i8* getelementptr inbounds (%T* @G, i64 0, i32 0)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %b, i8* %a, i64 124, i32 4, i1 false)
call void @bar(i8* %b)
@@ -79,8 +74,7 @@ define void @test3() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
call void @bar(i8* %a) readonly
; CHECK: @test3
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
-; CHECK-NEXT: call void @bar(i8* %a)
+; CHECK-NEXT: call void @bar(i8* getelementptr inbounds (%T* @G, i64 0, i32 0))
ret void
}
@@ -90,8 +84,7 @@ define void @test4() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
call void @baz(i8* byval %a)
; CHECK: @test4
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
-; CHECK-NEXT: call void @baz(i8* byval %a)
+; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0))
ret void
}
@@ -103,8 +96,7 @@ define void @test5() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
call void @baz(i8* byval %a)
; CHECK: @test5
-; CHECK-NEXT: %a = bitcast %T* @G to i8*
-; CHECK-NEXT: call void @baz(i8* byval %a)
+; CHECK-NEXT: call void @baz(i8* byval getelementptr inbounds (%T* @G, i64 0, i32 0))
ret void
}
@@ -118,8 +110,7 @@ define void @test6() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([2 x %U]* @H to i8*), i64 20, i32 16, i1 false)
call void @bar(i8* %a) readonly
; CHECK: @test6
-; CHECK-NEXT: %a = bitcast
-; CHECK-NEXT: call void @bar(i8* %a)
+; CHECK-NEXT: call void @bar(i8* bitcast ([2 x %U]* @H to i8*))
ret void
}
@@ -129,8 +120,7 @@ define void @test7() {
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%U* getelementptr ([2 x %U]* @H, i64 0, i32 0) to i8*), i64 20, i32 4, i1 false)
call void @bar(i8* %a) readonly
; CHECK: @test7
-; CHECK-NEXT: %a = bitcast
-; CHECK-NEXT: call void @bar(i8* %a)
+; CHECK-NEXT: call void @bar(i8* bitcast ([2 x %U]* @H to i8*))
ret void
}
diff --git a/test/Transforms/InstCombine/memcpy_chk-1.ll b/test/Transforms/InstCombine/memcpy_chk-1.ll
new file mode 100644
index 000000000000..7c7d91808a37
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy_chk-1.ll
@@ -0,0 +1,60 @@
+; Test lib call simplification of __memcpy_chk calls with various values
+; for dstlen and len.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T3 = type { [100 x i32], [100 x i32], [2048 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+@t3 = common global %struct.T3 zeroinitializer
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
+ call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T3* @t3 to i8*
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64
+ call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+ ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = bitcast %struct.T3* @t3 to i8*
+ %src = bitcast %struct.T1* @t1 to i8*
+
+; CHECK-NEXT: call i8* @__memcpy_chk
+ call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+ ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memcpy_chk
+ call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+ ret void
+}
+
+declare i8* @__memcpy_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memcpy_chk-2.ll b/test/Transforms/InstCombine/memcpy_chk-2.ll
new file mode 100644
index 000000000000..aa43029d47fc
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy_chk-2.ll
@@ -0,0 +1,24 @@
+; Test that lib call simplification doesn't simplify __memcpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memcpy_chk
+ call i8* @__memcpy_chk(i8* %dst, i8* %src, i64 1824)
+ ret void
+}
+
+declare i8* @__memcpy_chk(i8*, i8*, i64)
diff --git a/test/Transforms/InstCombine/memmove-1.ll b/test/Transforms/InstCombine/memmove-1.ll
new file mode 100644
index 000000000000..53f2f116c777
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove-1.ll
@@ -0,0 +1,17 @@
+; Test that the memmove library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memmove(i8*, i8*, i32)
+
+; Check memmove(mem1, mem2, size) -> llvm.memmove(mem1, mem2, size, 1).
+
+define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i8* @memmove(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call void @llvm.memmove
+ ret i8* %ret
+; CHECK: ret i8* %mem1
+}
diff --git a/test/Transforms/InstCombine/memmove-2.ll b/test/Transforms/InstCombine/memmove-2.ll
new file mode 100644
index 000000000000..23887bce31d8
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove-2.ll
@@ -0,0 +1,17 @@
+; Test that the memmove library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memmove(i8*, i8*, i32)
+
+; Check that memmove functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i8 @memmove(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call i8 @memmove
+ ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/memmove_chk-1.ll b/test/Transforms/InstCombine/memmove_chk-1.ll
new file mode 100644
index 000000000000..f9ff9a103a30
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove_chk-1.ll
@@ -0,0 +1,60 @@
+; Test lib call simplification of __memmove_chk calls with various values
+; for dstlen and len.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T3 = type { [100 x i32], [100 x i32], [2048 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+@t3 = common global %struct.T3 zeroinitializer
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
+ call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 1824)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T3* @t3 to i8*
+
+; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64
+ call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824, i64 2848)
+ ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = bitcast %struct.T3* @t3 to i8*
+ %src = bitcast %struct.T1* @t1 to i8*
+
+; CHECK-NEXT: call i8* @__memmove_chk
+ call i8* @__memmove_chk(i8* %dst, i8* %src, i64 2848, i64 1824)
+ ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memmove_chk
+ call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1024, i64 0)
+ ret void
+}
+
+declare i8* @__memmove_chk(i8*, i8*, i64, i64)
diff --git a/test/Transforms/InstCombine/memmove_chk-2.ll b/test/Transforms/InstCombine/memmove_chk-2.ll
new file mode 100644
index 000000000000..f0a915fde2e9
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove_chk-2.ll
@@ -0,0 +1,24 @@
+; Test that lib call simplification doesn't simplify __memmove_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T1 = type { [100 x i32], [100 x i32], [1024 x i8] }
+%struct.T2 = type { [100 x i32], [100 x i32], [1024 x i8] }
+
+@t1 = common global %struct.T1 zeroinitializer
+@t2 = common global %struct.T2 zeroinitializer
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+ %dst = bitcast %struct.T1* @t1 to i8*
+ %src = bitcast %struct.T2* @t2 to i8*
+
+; CHECK-NEXT: call i8* @__memmove_chk
+ call i8* @__memmove_chk(i8* %dst, i8* %src, i64 1824)
+ ret void
+}
+
+declare i8* @__memmove_chk(i8*, i8*, i64)
diff --git a/test/Transforms/InstCombine/memset-1.ll b/test/Transforms/InstCombine/memset-1.ll
new file mode 100644
index 000000000000..48b433e137c0
--- /dev/null
+++ b/test/Transforms/InstCombine/memset-1.ll
@@ -0,0 +1,17 @@
+; Test that the memset library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memset(i8*, i32, i32)
+
+; Check memset(mem1, val, size) -> llvm.memset(mem1, val, size, 1).
+
+define i8* @test_simplify1(i8* %mem, i32 %val, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i8* @memset(i8* %mem, i32 %val, i32 %size)
+; CHECK: call void @llvm.memset
+ ret i8* %ret
+; CHECK: ret i8* %mem
+}
diff --git a/test/Transforms/InstCombine/memset-2.ll b/test/Transforms/InstCombine/memset-2.ll
new file mode 100644
index 000000000000..8a9033302d04
--- /dev/null
+++ b/test/Transforms/InstCombine/memset-2.ll
@@ -0,0 +1,17 @@
+; Test that the memset library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memset(i8*, i32, i32)
+
+; Check that memset functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem, i32 %val, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i8 @memset(i8* %mem, i32 %val, i32 %size)
+; CHECK: call i8 @memset
+ ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll
new file mode 100644
index 000000000000..be4c1cfccdb2
--- /dev/null
+++ b/test/Transforms/InstCombine/memset_chk-1.ll
@@ -0,0 +1,61 @@
+; Test lib call simplification of __memset_chk calls with various values
+; for dstlen and len.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+; rdar://7719085
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T = type { [100 x i32], [100 x i32], [1024 x i8] }
+@t = common global %struct.T zeroinitializer
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+ %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+ call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 1824)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+ call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 3648)
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+ %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+ call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 -1)
+ ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call i8* @__memset_chk
+ call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 400)
+ ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+ %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call i8* @__memset_chk
+ call i8* @__memset_chk(i8* %dst, i32 0, i64 1824, i64 0)
+ ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64, i64)
diff --git a/test/Transforms/InstCombine/memset_chk-2.ll b/test/Transforms/InstCombine/memset_chk-2.ll
new file mode 100644
index 000000000000..60fbf163c212
--- /dev/null
+++ b/test/Transforms/InstCombine/memset_chk-2.ll
@@ -0,0 +1,20 @@
+; Test that lib call simplification doesn't simplify __memset_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+
+%struct.T = type { [100 x i32], [100 x i32], [1024 x i8] }
+@t = common global %struct.T zeroinitializer
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+ %dst = bitcast %struct.T* @t to i8*
+
+; CHECK-NEXT: call i8* @__memset_chk
+ call i8* @__memset_chk(i8* %dst, i32 0, i64 1824)
+ ret void
+}
+
+declare i8* @__memset_chk(i8*, i32, i64)
diff --git a/test/Transforms/InstCombine/memset_chk.ll b/test/Transforms/InstCombine/memset_chk.ll
deleted file mode 100644
index 58ecda582fd1..000000000000
--- a/test/Transforms/InstCombine/memset_chk.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-; rdar://7719085
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
-
-%struct.data = type { [100 x i32], [100 x i32], [1024 x i8] }
-
-define i32 @t() nounwind ssp {
-; CHECK: @t
-; CHECK: @llvm.memset.p0i8.i64
-entry:
- %0 = alloca %struct.data, align 8 ; <%struct.data*> [#uses=1]
- %1 = bitcast %struct.data* %0 to i8* ; <i8*> [#uses=1]
- %2 = call i8* @__memset_chk(i8* %1, i32 0, i64 1824, i64 1824) nounwind ; <i8*> [#uses=0]
- ret i32 0
-}
-
-declare i8* @__memset_chk(i8*, i32, i64, i64) nounwind
diff --git a/test/Transforms/InstCombine/obfuscated_splat.ll b/test/Transforms/InstCombine/obfuscated_splat.ll
new file mode 100644
index 000000000000..c25dade168a4
--- /dev/null
+++ b/test/Transforms/InstCombine/obfuscated_splat.ll
@@ -0,0 +1,11 @@
+; RUN: opt -instcombine -S %s | FileCheck %s
+
+define void @test(<4 x float> *%in_ptr, <4 x float> *%out_ptr) {
+ %A = load <4 x float>* %in_ptr, align 16
+ %B = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
+ %C = shufflevector <4 x float> %B, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
+ %D = shufflevector <4 x float> %C, <4 x float> %A, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+; CHECK: %D = shufflevector <4 x float> %A, <4 x float> undef, <4 x i32> zeroinitializer
+ store <4 x float> %D, <4 x float> *%out_ptr
+ ret void
+}
diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index d7e292155cd7..31a3cb46e459 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll
@@ -247,7 +247,8 @@ entry:
; technically reachable, but this malformed IR may appear as a result of constant propagation
xpto:
- %gep = getelementptr i8* %gep, i32 1
+ %gep2 = getelementptr i8* %gep, i32 1
+ %gep = getelementptr i8* %gep2, i32 1
%o = call i32 @llvm.objectsize.i32(i8* %gep, i1 true)
; CHECK: ret i32 undef
ret i32 %o
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index 4baae2618dde..cc3aacdce3c8 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -829,3 +829,37 @@ define i1 @test63(i1 %A, i1 %B) {
; CHECK: %C = or i1 %B, %not
; CHECK: ret i1 %C
}
+
+; PR14131
+define void @test64(i32 %p, i16 %b) noreturn nounwind {
+entry:
+ %p.addr.0.insert.mask = and i32 %p, -65536
+ %conv2 = and i32 %p, 65535
+ br i1 undef, label %lor.rhs, label %lor.end
+
+lor.rhs:
+ %p.addr.0.extract.trunc = trunc i32 %p.addr.0.insert.mask to i16
+ %phitmp = zext i16 %p.addr.0.extract.trunc to i32
+ br label %lor.end
+
+lor.end:
+ %t.1 = phi i32 [ 0, %entry ], [ %phitmp, %lor.rhs ]
+ %conv6 = zext i16 %b to i32
+ %div = udiv i32 %conv6, %t.1
+ %tobool8 = icmp eq i32 %div, 0
+ %cmp = icmp eq i32 %t.1, 0
+ %cmp12 = icmp ult i32 %conv2, 2
+ %cmp.sink = select i1 %tobool8, i1 %cmp12, i1 %cmp
+ br i1 %cmp.sink, label %cond.end17, label %cond.false16
+
+cond.false16:
+ br label %cond.end17
+
+cond.end17:
+ br label %while.body
+
+while.body:
+ br label %while.body
+; CHECK: @test64
+; CHECK-NOT: select
+}
diff --git a/test/Transforms/InstCombine/stpcpy-1.ll b/test/Transforms/InstCombine/stpcpy-1.ll
new file mode 100644
index 000000000000..8b6bb0e0d509
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy-1.ll
@@ -0,0 +1,46 @@
+; Test that the stpcpy library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+@b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @stpcpy(i8*, i8*)
+
+define i8* @test_simplify1() {
+; CHECK: @test_simplify1
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ %ret = call i8* @stpcpy(i8* %dst, i8* %src)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+; CHECK-NEXT: getelementptr inbounds ([32 x i8]* @a, i32 0, i32 5)
+ ret i8* %ret
+}
+
+define i8* @test_simplify2() {
+; CHECK: @test_simplify2
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+
+ %ret = call i8* @stpcpy(i8* %dst, i8* %dst)
+; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
+; CHECK-NEXT: getelementptr inbounds [32 x i8]* @a, i32 0, i32 [[LEN]]
+ ret i8* %ret
+}
+
+define i8* @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [32 x i8]* @b, i32 0, i32 0
+
+ %ret = call i8* @stpcpy(i8* %dst, i8* %src)
+; CHECK: call i8* @stpcpy
+ ret i8* %ret
+}
diff --git a/test/Transforms/InstCombine/stpcpy-2.ll b/test/Transforms/InstCombine/stpcpy-2.ll
new file mode 100644
index 000000000000..2e92c0895ed4
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy-2.ll
@@ -0,0 +1,22 @@
+; Test that the stpcpy library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @stpcpy(i8*, i8*)
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ call i16* @stpcpy(i8* %dst, i8* %src)
+; CHECK: call i16* @stpcpy
+ ret void
+}
diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll
new file mode 100644
index 000000000000..05603918c642
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -0,0 +1,96 @@
+; Test lib call simplification of __stpcpy_chk calls with various values
+; for src, dst, and slen.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i8] zeroinitializer, align 1
+@b = common global [60 x i8] zeroinitializer, align 1
+@.str = private constant [12 x i8] c"abcdefghijk\00"
+
+; Check cases where slen >= strlen (src).
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 60)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 12)
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret void
+}
+
+; Check cases where there are no string constants.
+
+define void @test_simplify4() {
+; CHECK: @test_simplify4
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @stpcpy
+ call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret void
+}
+
+; Check case where the string length is not constant.
+
+define i8* @test_simplify5() {
+; CHECK: @test_simplify5
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK: @__memcpy_chk
+ %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len)
+; CHECK: ret i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 11)
+ ret i8* %ret
+}
+
+; Check case where the source and destination are the same.
+
+define i8* @test_simplify6() {
+; CHECK: @test_simplify6
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+
+; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen
+; CHECK-NEXT: getelementptr inbounds [60 x i8]* @a, i32 0, i32 [[LEN]]
+ %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+ %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len)
+ ret i8* %ret
+}
+
+; Check case where slen < strlen (src).
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__stpcpy_chk
+ call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 8)
+ ret void
+}
+
+declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
+declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/stpcpy_chk-2.ll b/test/Transforms/InstCombine/stpcpy_chk-2.ll
new file mode 100644
index 000000000000..46c2139276e2
--- /dev/null
+++ b/test/Transforms/InstCombine/stpcpy_chk-2.ll
@@ -0,0 +1,21 @@
+; Test that lib call simplification doesn't simplify __stpcpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i16] zeroinitializer, align 1
+@.str = private constant [8 x i8] c"abcdefg\00"
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+ %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call i16* @__strcpy_chk
+ call i16* @__strcpy_chk(i16* %dst, i8* %src, i32 8)
+ ret void
+}
+
+declare i16* @__strcpy_chk(i16*, i8*, i32)
diff --git a/test/Transforms/InstCombine/strcat-1.ll b/test/Transforms/InstCombine/strcat-1.ll
new file mode 100644
index 000000000000..3c05d6b06fa0
--- /dev/null
+++ b/test/Transforms/InstCombine/strcat-1.ll
@@ -0,0 +1,38 @@
+; Test that the strcat libcall simplifier works correctly per the
+; bug found in PR3661.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+
+declare i8* @strcat(i8*, i8*)
+declare i32 @puts(i8*)
+
+define i32 @main() {
+; CHECK: @main
+; CHECK-NOT: call i8* @strcat
+; CHECK: call i32 @puts
+
+ %target = alloca [1024 x i8]
+ %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+ store i8 0, i8* %arg1
+
+ ; rslt1 = strcat(target, "hello\00")
+ %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %rslt1 = call i8* @strcat(i8* %arg1, i8* %arg2)
+
+ ; rslt2 = strcat(rslt1, "\00")
+ %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %rslt2 = call i8* @strcat(i8* %rslt1, i8* %arg3)
+
+ ; rslt3 = strcat(rslt2, "\00hello\00")
+ %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+ %rslt3 = call i8* @strcat(i8* %rslt2, i8* %arg4)
+
+ call i32 @puts( i8* %rslt3 )
+ ret i32 0
+}
diff --git a/test/Transforms/InstCombine/strcat-2.ll b/test/Transforms/InstCombine/strcat-2.ll
new file mode 100644
index 000000000000..379ee7495317
--- /dev/null
+++ b/test/Transforms/InstCombine/strcat-2.ll
@@ -0,0 +1,32 @@
+; Test that the strcat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strcat(i8*, i8*)
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+; CHECK-NOT: call i8* @strcat
+; CHECK: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ call i8* @strcat(i8* %dst, i8* %src)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+; CHECK-NEXT: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ call i8* @strcat(i8* %dst, i8* %src)
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strcat-3.ll b/test/Transforms/InstCombine/strcat-3.ll
new file mode 100644
index 000000000000..15aff2f1aa28
--- /dev/null
+++ b/test/Transforms/InstCombine/strcat-3.ll
@@ -0,0 +1,22 @@
+; Test that the strcat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strcat(i8*, i8*)
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i16* @strcat
+; CHECK: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ call i16* @strcat(i8* %dst, i8* %src)
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strchr-1.ll b/test/Transforms/InstCombine/strchr-1.ll
new file mode 100644
index 000000000000..5efab9ec4bee
--- /dev/null
+++ b/test/Transforms/InstCombine/strchr-1.ll
@@ -0,0 +1,54 @@
+; Test that the strchr library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@null = constant [1 x i8] zeroinitializer
+@chp = global i8* zeroinitializer
+
+declare i8* @strchr(i8*, i32)
+
+define void @test_simplify1() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 6)
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+ %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8* @strchr(i8* %str, i32 119)
+ store i8* %dst, i8** @chp
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: store i8* null, i8** @chp, align 4
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %dst = call i8* @strchr(i8* %str, i32 119)
+ store i8* %dst, i8** @chp
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13)
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+ %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8* @strchr(i8* %src, i32 0)
+ store i8* %dst, i8** @chp
+ ret void
+}
+
+define void @test_simplify4(i32 %chr) {
+; CHECK: call i8* @memchr
+; CHECK-NOT: call i8* @strchr
+; CHECK: ret void
+
+ %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8* @strchr(i8* %src, i32 %chr)
+ store i8* %dst, i8** @chp
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strchr-2.ll b/test/Transforms/InstCombine/strchr-2.ll
new file mode 100644
index 000000000000..35bbd23e6d4d
--- /dev/null
+++ b/test/Transforms/InstCombine/strchr-2.ll
@@ -0,0 +1,21 @@
+; Test that the strchr libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@chr = global i8 zeroinitializer
+
+declare i8 @strchr(i8*, i32)
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i8 @strchr
+; CHECK: ret void
+
+ %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8 @strchr(i8* %str, i32 119)
+ store i8 %dst, i8* @chr
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strcmp-1.ll b/test/Transforms/InstCombine/strcmp-1.ll
new file mode 100644
index 000000000000..0679246e0915
--- /dev/null
+++ b/test/Transforms/InstCombine/strcmp-1.ll
@@ -0,0 +1,82 @@
+; Test that the strcmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+@bell = constant [5 x i8] c"bell\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i32 @strcmp(i8*, i8*)
+
+; strcmp("", x) -> -*x
+define i32 @test1(i8* %str2) {
+; CHECK: @test1
+; CHECK: %strcmpload = load i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: %2 = sub i32 0, %1
+; CHECK: ret i32 %2
+
+ %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+ ret i32 %temp1
+
+}
+
+; strcmp(x, "") -> *x
+define i32 @test2(i8* %str1) {
+; CHECK: @test2
+; CHECK: %strcmpload = load i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: ret i32 %1
+
+ %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+ ret i32 %temp1
+}
+
+; strcmp(x, y) -> cnst
+define i32 @test3() {
+; CHECK: @test3
+; CHECK: ret i32 -1
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+ %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+ ret i32 %temp1
+}
+
+define i32 @test4() {
+; CHECK: @test4
+; CHECK: ret i32 1
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %temp1 = call i32 @strcmp(i8* %str1, i8* %str2)
+ ret i32 %temp1
+}
+
+; strcmp(x, y) -> memcmp(x, y, <known length>)
+; (This transform is rather difficult to trigger in a useful manner)
+define i32 @test5(i1 %b) {
+; CHECK: @test5
+; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %str2, i32 5)
+; CHECK: ret i32 %memcmp
+
+ %str1 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+ %temp1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %temp2 = getelementptr inbounds [5 x i8]* @bell, i32 0, i32 0
+ %str2 = select i1 %b, i8* %temp1, i8* %temp2
+ %temp3 = call i32 @strcmp(i8* %str1, i8* %str2)
+ ret i32 %temp3
+}
+
+; strcmp(x,x) -> 0
+define i32 @test6(i8* %str) {
+; CHECK: @test6
+; CHECK: ret i32 0
+
+ %temp1 = call i32 @strcmp(i8* %str, i8* %str)
+ ret i32 %temp1
+}
diff --git a/test/Transforms/InstCombine/strcmp-2.ll b/test/Transforms/InstCombine/strcmp-2.ll
new file mode 100644
index 000000000000..20518960f302
--- /dev/null
+++ b/test/Transforms/InstCombine/strcmp-2.ll
@@ -0,0 +1,20 @@
+; Test that the strcmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+
+declare i16 @strcmp(i8*, i8*)
+
+define i16 @test_nosimplify() {
+; CHECK: @test_nosimplify
+; CHECK: call i16 @strcmp
+; CHECK: ret i16 %temp1
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+ %temp1 = call i16 @strcmp(i8* %str1, i8* %str2)
+ ret i16 %temp1
+}
diff --git a/test/Transforms/InstCombine/strcpy-1.ll b/test/Transforms/InstCombine/strcpy-1.ll
new file mode 100644
index 000000000000..b6cf048b2a81
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy-1.ll
@@ -0,0 +1,45 @@
+; Test that the strcpy library call simplifier works correctly.
+; rdar://6839935
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+@b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strcpy(i8*, i8*)
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ call i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: @llvm.memcpy.p0i8.p0i8.i32
+ ret void
+}
+
+define i8* @test_simplify2() {
+; CHECK: @test_simplify2
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+
+ %ret = call i8* @strcpy(i8* %dst, i8* %dst)
+; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0)
+ ret i8* %ret
+}
+
+define i8* @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [32 x i8]* @b, i32 0, i32 0
+
+ %ret = call i8* @strcpy(i8* %dst, i8* %src)
+; CHECK: call i8* @strcpy
+ ret i8* %ret
+}
diff --git a/test/Transforms/InstCombine/strcpy-2.ll b/test/Transforms/InstCombine/strcpy-2.ll
new file mode 100644
index 000000000000..779e9fdd9598
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy-2.ll
@@ -0,0 +1,22 @@
+; Test that the strcpy library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+;
+; This transformation requires the pointer size, as it assumes that size_t is
+; the size of a pointer.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strcpy(i8*, i8*)
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ call i16* @strcpy(i8* %dst, i8* %src)
+; CHECK: call i16* @strcpy
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll
new file mode 100644
index 000000000000..3e48f4fd3057
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy_chk-1.ll
@@ -0,0 +1,94 @@
+; Test lib call simplification of __strcpy_chk calls with various values
+; for src, dst, and slen.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i8] zeroinitializer, align 1
+@b = common global [60 x i8] zeroinitializer, align 1
+@.str = private constant [12 x i8] c"abcdefghijk\00"
+
+; Check cases where slen >= strlen (src).
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 60)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 12)
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret void
+}
+
+; Check cases where there are no string constants.
+
+define void @test_simplify4() {
+; CHECK: @test_simplify4
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @strcpy
+ call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 -1)
+ ret void
+}
+
+; Check case where the string length is not constant.
+
+define void @test_simplify5() {
+; CHECK: @test_simplify5
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK: @__memcpy_chk
+ %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+ call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
+ ret void
+}
+
+; Check case where the source and destination are the same.
+
+define i8* @test_simplify6() {
+; CHECK: @test_simplify6
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+
+; CHECK: getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0)
+ %len = call i32 @llvm.objectsize.i32(i8* %dst, i1 false)
+ %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len)
+ ret i8* %ret
+}
+
+; Check case where slen < strlen (src).
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__strcpy_chk
+ call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 8)
+ ret void
+}
+
+declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
+declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/strcpy_chk-2.ll b/test/Transforms/InstCombine/strcpy_chk-2.ll
new file mode 100644
index 000000000000..d76ea5d068bc
--- /dev/null
+++ b/test/Transforms/InstCombine/strcpy_chk-2.ll
@@ -0,0 +1,21 @@
+; Test that lib call simplification doesn't simplify __strcpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i16] zeroinitializer, align 1
+@.str = private constant [8 x i8] c"abcdefg\00"
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+ %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [8 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call i16* @__strcpy_chk
+ call i16* @__strcpy_chk(i16* %dst, i8* %src, i32 8)
+ ret void
+}
+
+declare i16* @__strcpy_chk(i16*, i8*, i32)
diff --git a/test/Transforms/InstCombine/strcpy_chk.ll b/test/Transforms/InstCombine/strcpy_chk.ll
deleted file mode 100644
index 8835a0ba467c..000000000000
--- a/test/Transforms/InstCombine/strcpy_chk.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-@a = common global [60 x i8] zeroinitializer, align 1 ; <[60 x i8]*> [#uses=1]
-@.str = private constant [8 x i8] c"abcdefg\00" ; <[8 x i8]*> [#uses=1]
-
-define i8* @foo() nounwind {
-; CHECK: @foo
-; CHECK-NEXT: call i8* @strcpy
- %call = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([8 x i8]* @.str, i32 0, i32 0), i32 60) ; <i8*> [#uses=1]
- ret i8* %call
-}
-
-declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
diff --git a/test/Transforms/InstCombine/strcspn-1.ll b/test/Transforms/InstCombine/strcspn-1.ll
new file mode 100644
index 000000000000..60fad897b2c8
--- /dev/null
+++ b/test/Transforms/InstCombine/strcspn-1.ll
@@ -0,0 +1,57 @@
+; Test that the strcspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@abcba = constant [6 x i8] c"abcba\00"
+@abc = constant [4 x i8] c"abc\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i64 @strcspn(i8*, i8*)
+
+; Check strcspn(s, "") -> strlen(s).
+
+define i64 @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: [[VAR:%[a-z]+]] = call i64 @strlen(i8* %str)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 [[VAR]]
+}
+
+; Check strcspn("", s) -> 0.
+
+define i64 @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strcspn(s1, s2), where s1 and s2 are constants.
+
+define i64 @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0
+ %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i64 @strcspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 %ret
+}
diff --git a/test/Transforms/InstCombine/strcspn-2.ll b/test/Transforms/InstCombine/strcspn-2.ll
new file mode 100644
index 000000000000..4e2393686c7d
--- /dev/null
+++ b/test/Transforms/InstCombine/strcspn-2.ll
@@ -0,0 +1,21 @@
+; Test that the strcspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@null = constant [1 x i8] zeroinitializer
+
+declare double @strcspn(i8*, i8*)
+
+; Check that strcspn functions with the wrong prototype aren't simplified.
+
+define double @test_no_simplify1(i8* %pat) {
+; CHECK: @test_no_simplify1
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call double @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: call double @strcspn
+ ret double %ret
+; CHECK-NEXT: ret double %ret
+}
diff --git a/test/Transforms/InstCombine/strlen-1.ll b/test/Transforms/InstCombine/strlen-1.ll
new file mode 100644
index 000000000000..6d7464a4cc80
--- /dev/null
+++ b/test/Transforms/InstCombine/strlen-1.ll
@@ -0,0 +1,97 @@
+; Test that the strlen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+@nullstring = constant i8 0
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i32 @strlen(i8*)
+
+; Check strlen(string constant) -> integer constant.
+
+define i32 @test_simplify1() {
+; CHECK: @test_simplify1
+ %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %hello_l = call i32 @strlen(i8* %hello_p)
+ ret i32 %hello_l
+; CHECK-NEXT: ret i32 5
+}
+
+define i32 @test_simplify2() {
+; CHECK: @test_simplify2
+ %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %null_l = call i32 @strlen(i8* %null_p)
+ ret i32 %null_l
+; CHECK-NEXT: ret i32 0
+}
+
+define i32 @test_simplify3() {
+; CHECK: @test_simplify3
+ %null_hello_p = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+ %null_hello_l = call i32 @strlen(i8* %null_hello_p)
+ ret i32 %null_hello_l
+; CHECK-NEXT: ret i32 0
+}
+
+define i32 @test_simplify4() {
+; CHECK: @test_simplify4
+ %len = tail call i32 @strlen(i8* @nullstring) nounwind
+ ret i32 %len
+; CHECK-NEXT: ret i32 0
+}
+
+; Check strlen(x) == 0 --> *x == 0.
+
+define i1 @test_simplify5() {
+; CHECK: @test_simplify5
+ %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %hello_l = call i32 @strlen(i8* %hello_p)
+ %eq_hello = icmp eq i32 %hello_l, 0
+ ret i1 %eq_hello
+; CHECK-NEXT: ret i1 false
+}
+
+define i1 @test_simplify6() {
+; CHECK: @test_simplify6
+ %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %null_l = call i32 @strlen(i8* %null_p)
+ %eq_null = icmp eq i32 %null_l, 0
+ ret i1 %eq_null
+; CHECK-NEXT: ret i1 true
+}
+
+; Check strlen(x) != 0 --> *x != 0.
+
+define i1 @test_simplify7() {
+; CHECK: @test_simplify7
+ %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %hello_l = call i32 @strlen(i8* %hello_p)
+ %ne_hello = icmp ne i32 %hello_l, 0
+ ret i1 %ne_hello
+; CHECK-NEXT: ret i1 true
+}
+
+define i1 @test_simplify8() {
+; CHECK: @test_simplify8
+ %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %null_l = call i32 @strlen(i8* %null_p)
+ %ne_null = icmp ne i32 %null_l, 0
+ ret i1 %ne_null
+; CHECK-NEXT: ret i1 false
+}
+
+; Check cases that shouldn't be simplified.
+
+define i32 @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %a_p = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %a_l = call i32 @strlen(i8* %a_p)
+; CHECK-NEXT: %a_l = call i32 @strlen
+ ret i32 %a_l
+; CHECK-NEXT: ret i32 %a_l
+}
diff --git a/test/Transforms/InstCombine/strlen-2.ll b/test/Transforms/InstCombine/strlen-2.ll
new file mode 100644
index 000000000000..c4fd54c06db9
--- /dev/null
+++ b/test/Transforms/InstCombine/strlen-2.ll
@@ -0,0 +1,18 @@
+; Test that the strlen library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+
+declare i32 @strlen(i8*, i32)
+
+define i32 @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %hello_l = call i32 @strlen(i8* %hello_p, i32 187)
+; CHECK-NEXT: %hello_l = call i32 @strlen
+ ret i32 %hello_l
+; CHECK-NEXT: ret i32 %hello_l
+}
diff --git a/test/Transforms/InstCombine/strncat-1.ll b/test/Transforms/InstCombine/strncat-1.ll
new file mode 100644
index 000000000000..ad2a18b1465d
--- /dev/null
+++ b/test/Transforms/InstCombine/strncat-1.ll
@@ -0,0 +1,37 @@
+; Test that the strncat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+
+declare i8* @strncat(i8*, i8*, i32)
+declare i32 @puts(i8*)
+
+define i32 @main() {
+; CHECK: @main
+; CHECK-NOT: call i8* @strncat
+; CHECK: call i32 @puts
+
+ %target = alloca [1024 x i8]
+ %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+ store i8 0, i8* %arg1
+
+ ; rslt1 = strncat(target, "hello\00", 6)
+ %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %rslt1 = call i8* @strncat(i8* %arg1, i8* %arg2, i32 6)
+
+ ; rslt2 = strncat(rslt1, "\00", 42)
+ %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %rslt2 = call i8* @strncat(i8* %rslt1, i8* %arg3, i32 42)
+
+ ; rslt3 = strncat(rslt2, "\00hello\00", 42)
+ %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+ %rslt3 = call i8* @strncat(i8* %rslt2, i8* %arg4, i32 42)
+
+ call i32 @puts(i8* %rslt3)
+ ret i32 0
+}
diff --git a/test/Transforms/InstCombine/strncat-2.ll b/test/Transforms/InstCombine/strncat-2.ll
new file mode 100644
index 000000000000..c56deacd39bb
--- /dev/null
+++ b/test/Transforms/InstCombine/strncat-2.ll
@@ -0,0 +1,53 @@
+; Test that the strncat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strncat(i8*, i8*, i32)
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+; CHECK-NOT: call i8* @strncat
+; CHECK: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ call i8* @strncat(i8* %dst, i8* %src, i32 13)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+; CHECK-NEXT: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ call i8* @strncat(i8* %dst, i8* %src, i32 13)
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+; CHECK-NEXT: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ call i8* @strncat(i8* %dst, i8* %src, i32 0)
+ ret void
+}
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i8* @strncat
+; CHECK: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ call i8* @strncat(i8* %dst, i8* %src, i32 1)
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strncat-3.ll b/test/Transforms/InstCombine/strncat-3.ll
new file mode 100644
index 000000000000..3cd797168705
--- /dev/null
+++ b/test/Transforms/InstCombine/strncat-3.ll
@@ -0,0 +1,22 @@
+; Test that the strncat libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@empty = constant [1 x i8] c"\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strncat(i8*, i8*, i32)
+
+define void @test_nosimplify1() {
+; CHECK: @test_nosimplify1
+; CHECK: call i16* @strncat
+; CHECK: ret void
+
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ call i16* @strncat(i8* %dst, i8* %src, i32 13)
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strncmp-1.ll b/test/Transforms/InstCombine/strncmp-1.ll
new file mode 100644
index 000000000000..187c2fa50e82
--- /dev/null
+++ b/test/Transforms/InstCombine/strncmp-1.ll
@@ -0,0 +1,99 @@
+; Test that the strncmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+@bell = constant [5 x i8] c"bell\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i32 @strncmp(i8*, i8*, i32)
+
+; strncmp("", x, n) -> -*x
+define i32 @test1(i8* %str2) {
+; CHECK: @test1
+; CHECK: %strcmpload = load i8* %str
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: %2 = sub i32 0, %1
+; CHECK: ret i32 %2
+
+ %str1 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+ ret i32 %temp1
+}
+
+; strncmp(x, "", n) -> *x
+define i32 @test2(i8* %str1) {
+; CHECK: @test2
+; CHECK: %strcmpload = load i8* %str1
+; CHECK: %1 = zext i8 %strcmpload to i32
+; CHECK: ret i32 %1
+
+ %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+ ret i32 %temp1
+}
+
+; strncmp(x, y, n) -> cnst
+define i32 @test3() {
+; CHECK: @test3
+; CHECK: ret i32 -1
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+ ret i32 %temp1
+}
+
+define i32 @test4() {
+; CHECK: @test4
+; CHECK: ret i32 1
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 10)
+ ret i32 %temp1
+}
+
+define i32 @test5() {
+; CHECK: @test5
+; CHECK: ret i32 0
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 4)
+ ret i32 %temp1
+}
+
+; strncmp(x,y,1) -> memcmp(x,y,1)
+define i32 @test6(i8* %str1, i8* %str2) {
+; CHECK: @test6
+; CHECK: [[LOAD1:%[a-z]+]] = load i8* %str1, align 1
+; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
+; CHECK: [[LOAD2:%[a-z]+]] = load i8* %str2, align 1
+; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
+; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+; CHECK: ret i32 [[RET]]
+
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1)
+ ret i32 %temp1
+}
+
+; strncmp(x,y,0) -> 0
+define i32 @test7(i8* %str1, i8* %str2) {
+; CHECK: @test7
+; CHECK: ret i32 0
+
+ %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 0)
+ ret i32 %temp1
+}
+
+; strncmp(x,x,n) -> 0
+define i32 @test8(i8* %str, i32 %n) {
+; CHECK: @test8
+; CHECK: ret i32 0
+
+ %temp1 = call i32 @strncmp(i8* %str, i8* %str, i32 %n)
+ ret i32 %temp1
+}
diff --git a/test/Transforms/InstCombine/strncmp-2.ll b/test/Transforms/InstCombine/strncmp-2.ll
new file mode 100644
index 000000000000..3fc43a6fd4f5
--- /dev/null
+++ b/test/Transforms/InstCombine/strncmp-2.ll
@@ -0,0 +1,20 @@
+; Test that the strncmp library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@hell = constant [5 x i8] c"hell\00"
+
+declare i16 @strncmp(i8*, i8*, i32)
+
+define i16 @test_nosimplify() {
+; CHECK: @test_nosimplify
+; CHECK: call i16 @strncmp
+; CHECK: ret i16 %temp1
+
+ %str1 = getelementptr inbounds [5 x i8]* @hell, i32 0, i32 0
+ %str2 = getelementptr inbounds [6 x i8]* @hello, i32 0, i32 0
+ %temp1 = call i16 @strncmp(i8* %str1, i8* %str2, i32 10)
+ ret i16 %temp1
+}
diff --git a/test/Transforms/InstCombine/strncpy-1.ll b/test/Transforms/InstCombine/strncpy-1.ll
new file mode 100644
index 000000000000..3ce2b9b5eecc
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy-1.ll
@@ -0,0 +1,95 @@
+; Test that the strncpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@null = constant [1 x i8] zeroinitializer
+@null_hello = constant [7 x i8] c"\00hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+@b = common global [32 x i8] zeroinitializer, align 1
+
+declare i8* @strncpy(i8*, i8*, i32)
+declare i32 @puts(i8*)
+
+; Check a bunch of strncpy invocations together.
+
+define i32 @test_simplify1() {
+; CHECK: @test_simplify1
+; CHECK-NOT: call i8* @strncpy
+; CHECK: call i32 @puts
+ %target = alloca [1024 x i8]
+ %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
+ store i8 0, i8* %arg1
+
+ %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
+ %rslt1 = call i8* @strncpy(i8* %arg1, i8* %arg2, i32 6)
+
+ %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %rslt2 = call i8* @strncpy(i8* %rslt1, i8* %arg3, i32 42)
+
+ %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0
+ %rslt3 = call i8* @strncpy(i8* %rslt2, i8* %arg4, i32 42)
+
+ call i32 @puts( i8* %rslt3 )
+ ret i32 0
+}
+
+; Check strncpy(x, "", y) -> memset(x, '\0', y, 1).
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ call i8* @strncpy(i8* %dst, i8* %src, i32 32)
+; CHECK: call void @llvm.memset.p0i8.i32
+ ret void
+}
+
+; Check strncpy(x, y, 0) -> x.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ %ret = call i8* @strncpy(i8* %dst, i8* %src, i32 0)
+ ret i8* %ret
+; CHECK: ret i8* getelementptr inbounds ([32 x i8]* @a, i32 0, i32 0)
+}
+
+; Check strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant].
+
+define void @test_simplify4() {
+; CHECK: @test_simplify4
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ call i8* @strncpy(i8* %dst, i8* %src, i32 6)
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i32
+ ret void
+}
+
+; Check cases that shouldn't be simplified.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [32 x i8]* @b, i32 0, i32 0
+
+ call i8* @strncpy(i8* %dst, i8* %src, i32 32)
+; CHECK: call i8* @strncpy
+ ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ call i8* @strncpy(i8* %dst, i8* %src, i32 8)
+; CHECK: call i8* @strncpy
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strncpy-2.ll b/test/Transforms/InstCombine/strncpy-2.ll
new file mode 100644
index 000000000000..ac28ea655009
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy-2.ll
@@ -0,0 +1,22 @@
+; Test that the strncpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [6 x i8] c"hello\00"
+@a = common global [32 x i8] zeroinitializer, align 1
+
+declare i16* @strncpy(i8*, i8*, i32)
+
+; Check that 'strncpy' functions with the wrong prototype aren't simplified.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = getelementptr [32 x i8]* @a, i32 0, i32 0
+ %src = getelementptr [6 x i8]* @hello, i32 0, i32 0
+
+ call i16* @strncpy(i8* %dst, i8* %src, i32 6)
+; CHECK: call i16* @strncpy
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strncpy_chk-1.ll b/test/Transforms/InstCombine/strncpy_chk-1.ll
new file mode 100644
index 000000000000..aadff4268ec2
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy_chk-1.ll
@@ -0,0 +1,66 @@
+; Test lib call simplification of __strncpy_chk calls with various values
+; for len and dstlen.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i8] zeroinitializer, align 1
+@b = common global [60 x i8] zeroinitializer, align 1
+@.str = private constant [12 x i8] c"abcdefghijk\00"
+
+; Check cases where dstlen >= len.
+
+define void @test_simplify1() {
+; CHECK: @test_simplify1
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: @test_simplify2
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32
+ call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 12)
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: @test_simplify3
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @strncpy
+ call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 12, i32 60)
+ ret void
+}
+
+; Check cases where dstlen < len.
+
+define void @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [12 x i8]* @.str, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__strncpy_chk
+ call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 4)
+ ret void
+}
+
+define void @test_no_simplify2() {
+; CHECK: @test_no_simplify2
+ %dst = getelementptr inbounds [60 x i8]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i8]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i8* @__strncpy_chk
+ call i8* @__strncpy_chk(i8* %dst, i8* %src, i32 8, i32 0)
+ ret void
+}
+
+declare i8* @__strncpy_chk(i8*, i8*, i32, i32)
diff --git a/test/Transforms/InstCombine/strncpy_chk-2.ll b/test/Transforms/InstCombine/strncpy_chk-2.ll
new file mode 100644
index 000000000000..a0f132ebf63b
--- /dev/null
+++ b/test/Transforms/InstCombine/strncpy_chk-2.ll
@@ -0,0 +1,21 @@
+; Test that lib call simplification doesn't simplify __strncpy_chk calls
+; with the wrong prototype.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@a = common global [60 x i16] zeroinitializer, align 1
+@b = common global [60 x i16] zeroinitializer, align 1
+
+define void @test_no_simplify() {
+; CHECK: @test_no_simplify
+ %dst = getelementptr inbounds [60 x i16]* @a, i32 0, i32 0
+ %src = getelementptr inbounds [60 x i16]* @b, i32 0, i32 0
+
+; CHECK-NEXT: call i16* @__strncpy_chk
+ call i16* @__strncpy_chk(i16* %dst, i16* %src, i32 60, i32 60)
+ ret void
+}
+
+declare i16* @__strncpy_chk(i16*, i16*, i32, i32)
diff --git a/test/Transforms/InstCombine/strpbrk-1.ll b/test/Transforms/InstCombine/strpbrk-1.ll
new file mode 100644
index 000000000000..a5d0d86501b1
--- /dev/null
+++ b/test/Transforms/InstCombine/strpbrk-1.ll
@@ -0,0 +1,68 @@
+; Test that the strpbrk library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [12 x i8] c"hello world\00"
+@w = constant [2 x i8] c"w\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i8* @strpbrk(i8*, i8*)
+
+; Check strpbrk(s, "") -> NULL.
+
+define i8* @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* null
+}
+
+; Check strpbrk("", s) -> NULL.
+
+define i8* @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* null
+}
+
+; Check strpbrk(s1, s2), where s1 and s2 are constants.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr [12 x i8]* @hello, i32 0, i32 0
+ %pat = getelementptr [2 x i8]* @w, i32 0, i32 0
+
+ %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* getelementptr inbounds ([12 x i8]* @hello, i32 0, i32 6)
+}
+
+; Check strpbrk(s, "a") -> strchr(s, 'a').
+
+define i8* @test_simplify4(i8* %str) {
+; CHECK: @test_simplify4
+ %pat = getelementptr [2 x i8]* @w, i32 0, i32 0
+
+ %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+; CHECK-NEXT: [[VAR:%[a-z]+]] = call i8* @strchr(i8* %str, i32 119)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* [[VAR]]
+}
+
+; Check cases that shouldn't be simplified.
+
+define i8* @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+ %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i8* @strpbrk(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* %ret
+}
diff --git a/test/Transforms/InstCombine/strpbrk-2.ll b/test/Transforms/InstCombine/strpbrk-2.ll
new file mode 100644
index 000000000000..31ac2905df2c
--- /dev/null
+++ b/test/Transforms/InstCombine/strpbrk-2.ll
@@ -0,0 +1,23 @@
+; Test that the strpbrk library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [12 x i8] c"hello world\00"
+@w = constant [2 x i8] c"w\00"
+
+declare i16* @strpbrk(i8*, i8*)
+
+; Check that 'strpbrk' functions with the wrong prototype aren't simplified.
+
+define i16* @test_no_simplify1() {
+; CHECK: @test_no_simplify1
+ %str = getelementptr [12 x i8]* @hello, i32 0, i32 0
+ %pat = getelementptr [2 x i8]* @w, i32 0, i32 0
+
+ %ret = call i16* @strpbrk(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i16* @strpbrk
+ ret i16* %ret
+; CHECK-NEXT: ret i16* %ret
+}
diff --git a/test/Transforms/InstCombine/strrchr-1.ll b/test/Transforms/InstCombine/strrchr-1.ll
new file mode 100644
index 000000000000..854ce45bffb2
--- /dev/null
+++ b/test/Transforms/InstCombine/strrchr-1.ll
@@ -0,0 +1,54 @@
+; Test that the strrchr library call simplifier works correctly.
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@null = constant [1 x i8] zeroinitializer
+@chp = global i8* zeroinitializer
+
+declare i8* @strrchr(i8*, i32)
+
+define void @test_simplify1() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 6)
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+ %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8* @strrchr(i8* %str, i32 119)
+ store i8* %dst, i8** @chp
+ ret void
+}
+
+define void @test_simplify2() {
+; CHECK: store i8* null, i8** @chp, align 4
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+ %dst = call i8* @strrchr(i8* %str, i32 119)
+ store i8* %dst, i8** @chp
+ ret void
+}
+
+define void @test_simplify3() {
+; CHECK: store i8* getelementptr inbounds ([14 x i8]* @hello, i32 0, i32 13)
+; CHECK-NOT: call i8* @strrchr
+; CHECK: ret void
+
+ %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8* @strrchr(i8* %src, i32 0)
+ store i8* %dst, i8** @chp
+ ret void
+}
+
+define void @test_nosimplify1(i32 %chr) {
+; CHECK: @test_nosimplify1
+; CHECK: call i8* @strrchr
+; CHECK: ret void
+
+ %src = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8* @strrchr(i8* %src, i32 %chr)
+ store i8* %dst, i8** @chp
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strrchr-2.ll b/test/Transforms/InstCombine/strrchr-2.ll
new file mode 100644
index 000000000000..1974f6ca6033
--- /dev/null
+++ b/test/Transforms/InstCombine/strrchr-2.ll
@@ -0,0 +1,21 @@
+; Test that the strrchr libcall simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@hello = constant [14 x i8] c"hello world\5Cn\00"
+@chr = global i8 zeroinitializer
+
+declare i8 @strrchr(i8*, i32)
+
+define void @test_nosimplify1() {
+; CHECK: test_nosimplify1
+; CHECK: call i8 @strrchr
+; CHECK: ret void
+
+ %str = getelementptr [14 x i8]* @hello, i32 0, i32 0
+ %dst = call i8 @strrchr(i8* %str, i32 119)
+ store i8 %dst, i8* @chr
+ ret void
+}
diff --git a/test/Transforms/InstCombine/strspn-1.ll b/test/Transforms/InstCombine/strspn-1.ll
new file mode 100644
index 000000000000..393f88735bd4
--- /dev/null
+++ b/test/Transforms/InstCombine/strspn-1.ll
@@ -0,0 +1,56 @@
+; Test that the strspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@abcba = constant [6 x i8] c"abcba\00"
+@abc = constant [4 x i8] c"abc\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i64 @strspn(i8*, i8*)
+
+; Check strspn(s, "") -> 0.
+
+define i64 @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strspn("", s) -> 0.
+
+define i64 @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strspn(s1, s2), where s1 and s2 are constants.
+
+define i64 @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0
+ %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 5
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 %ret
+}
diff --git a/test/Transforms/InstCombine/strstr-1.ll b/test/Transforms/InstCombine/strstr-1.ll
new file mode 100644
index 000000000000..81f52718747d
--- /dev/null
+++ b/test/Transforms/InstCombine/strstr-1.ll
@@ -0,0 +1,65 @@
+; Test that the strstr library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@.str = private constant [1 x i8] zeroinitializer
+@.str1 = private constant [2 x i8] c"a\00"
+@.str2 = private constant [6 x i8] c"abcde\00"
+@.str3 = private constant [4 x i8] c"bcd\00"
+
+declare i8* @strstr(i8*, i8*)
+
+; Check strstr(str, "") -> str.
+
+define i8* @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* %str
+}
+
+; Check strstr(str, "a") -> strchr(str, 'a').
+
+define i8* @test_simplify2(i8* %str) {
+; CHECK: @test_simplify2
+ %pat = getelementptr inbounds [2 x i8]* @.str1, i32 0, i32 0
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: @strchr(i8* %str, i32 97)
+}
+
+; Check strstr("abcde", "bcd") -> "abcde" + 1.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr inbounds [6 x i8]* @.str2, i32 0, i32 0
+ %pat = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 1)
+}
+
+; Check strstr(str, str) -> str.
+
+define i8* @test_simplify4(i8* %str) {
+; CHECK: @test_simplify4
+ %ret = call i8* @strstr(i8* %str, i8* %str)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* %str
+}
+
+; Check strstr(str, pat) == str -> strncmp(str, pat, strlen(pat)) == 0.
+
+define i1 @test_simplify5(i8* %str, i8* %pat) {
+; CHECK: @test_simplify5
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ %cmp = icmp eq i8* %ret, %str
+ ret i1 %cmp
+; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %pat)
+; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %str, i8* %pat, {{i[0-9]+}} [[LEN]])
+; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0
+; CHECK: ret i1
+}
diff --git a/test/Transforms/InstCombine/strstr-2.ll b/test/Transforms/InstCombine/strstr-2.ll
new file mode 100644
index 000000000000..5092f9b4f803
--- /dev/null
+++ b/test/Transforms/InstCombine/strstr-2.ll
@@ -0,0 +1,18 @@
+; Test that the strstr library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@null = private constant [1 x i8] zeroinitializer
+
+declare i8 @strstr(i8*, i8*)
+
+define i8 @test_no_simplify1(i8* %str) {
+; CHECK: @test_no_simplify1
+ %pat = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %ret = call i8 @strstr(i8* %str, i8* %pat)
+; CHECK-NEXT: call i8 @strstr
+ ret i8 %ret
+; CHECK-NEXT: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/strto-1.ll b/test/Transforms/InstCombine/strto-1.ll
new file mode 100644
index 000000000000..16c0c67970db
--- /dev/null
+++ b/test/Transforms/InstCombine/strto-1.ll
@@ -0,0 +1,82 @@
+; Test that the strto* library call simplifiers work correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i64 @strtol(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtol(i8*, i8**, i32)
+
+declare double @strtod(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare double @strtod(i8*, i8**, i32)
+
+declare float @strtof(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare float @strtof(i8*, i8**, i32)
+
+declare i64 @strtoul(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtoul(i8*, i8**, i32)
+
+declare i64 @strtoll(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtoll(i8*, i8**, i32)
+
+declare double @strtold(i8* %s, i8** %endptr)
+; CHECK: declare double @strtold(i8*, i8**)
+
+declare i64 @strtoull(i8* %s, i8** %endptr, i32 %base)
+; CHECK: declare i64 @strtoull(i8*, i8**, i32)
+
+define void @test_simplify1(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify1
+ call i64 @strtol(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtol(i8* nocapture %x, i8** null, i32 10)
+ ret void
+}
+
+define void @test_simplify2(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify2
+ call double @strtod(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call double @strtod(i8* nocapture %x, i8** null, i32 10)
+ ret void
+}
+
+define void @test_simplify3(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify3
+ call float @strtof(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call float @strtof(i8* nocapture %x, i8** null, i32 10)
+ ret void
+}
+
+define void @test_simplify4(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify4
+ call i64 @strtoul(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtoul(i8* nocapture %x, i8** null, i32 10)
+ ret void
+}
+
+define void @test_simplify5(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify5
+ call i64 @strtoll(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtoll(i8* nocapture %x, i8** null, i32 10)
+ ret void
+}
+
+define void @test_simplify6(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify6
+ call double @strtold(i8* %x, i8** null)
+; CHECK-NEXT: call double @strtold(i8* nocapture %x, i8** null)
+ ret void
+}
+
+define void @test_simplify7(i8* %x, i8** %endptr) {
+; CHECK: @test_simplify7
+ call i64 @strtoull(i8* %x, i8** null, i32 10)
+; CHECK-NEXT: call i64 @strtoull(i8* nocapture %x, i8** null, i32 10)
+ ret void
+}
+
+define void @test_no_simplify1(i8* %x, i8** %endptr) {
+; CHECK: @test_no_simplify1
+ call i64 @strtol(i8* %x, i8** %endptr, i32 10)
+; CHECK-NEXT: call i64 @strtol(i8* %x, i8** %endptr, i32 10)
+ ret void
+}
diff --git a/test/Transforms/InstCombine/struct-assign-tbaa.ll b/test/Transforms/InstCombine/struct-assign-tbaa.ll
new file mode 100644
index 000000000000..33a771e6d8b6
--- /dev/null
+++ b/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -0,0 +1,44 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+; Verify that instcombine preserves TBAA tags when converting a memcpy into
+; a scalar load and store.
+
+%struct.test1 = type { float }
+
+; CHECK: @test
+; CHECK: %2 = load float* %0, align 4, !tbaa !0
+; CHECK: store float %2, float* %1, align 4, !tbaa !0
+; CHECK: ret
+define void @test1(%struct.test1* nocapture %a, %struct.test1* nocapture %b) {
+entry:
+ %0 = bitcast %struct.test1* %a to i8*
+ %1 = bitcast %struct.test1* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 4, i32 4, i1 false), !tbaa.struct !3
+ ret void
+}
+
+%struct.test2 = type { i32 (i8*, i32*, double*)** }
+
+define i32 (i8*, i32*, double*)*** @test2() {
+; CHECK: @test2
+; CHECK-NOT: memcpy
+; CHECK: ret
+ %tmp = alloca %struct.test2, align 8
+ %tmp1 = bitcast %struct.test2* %tmp to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp1, i8* undef, i64 8, i32 8, i1 false), !tbaa.struct !4
+ %tmp2 = getelementptr %struct.test2* %tmp, i32 0, i32 0
+ %tmp3 = load i32 (i8*, i32*, double*)*** %tmp2
+ ret i32 (i8*, i32*, double*)*** %tmp2
+}
+
+; CHECK: !0 = metadata !{metadata !"float", metadata !1}
+
+!0 = metadata !{metadata !"Simple C/C++ TBAA"}
+!1 = metadata !{metadata !"omnipotent char", metadata !0}
+!2 = metadata !{metadata !"float", metadata !0}
+!3 = metadata !{i64 0, i64 4, metadata !2}
+!4 = metadata !{i64 0, i64 8, null}
diff --git a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
index d95e8f835908..74f2fdd7cc63 100644
--- a/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
+++ b/test/Transforms/InstCombine/udiv-simplify-bug-1.ll
@@ -6,9 +6,9 @@
; The udiv instructions shouldn't be optimized away, and the
; sext instructions should be optimized to zext.
-define i64 @bar(i32 %x) nounwind {
+define i64 @bar(i32 %x, i32 %g) nounwind {
%y = lshr i32 %x, 30
- %r = udiv i32 %y, 3
+ %r = udiv i32 %y, %g
%z = sext i32 %r to i64
ret i64 %z
}
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 0019a57627cb..2d90750a2f1e 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -196,7 +196,7 @@ define <4 x float> @test_select(float %f, float %g) {
; CHECK-NOT: insertelement
; CHECK: %a3 = insertelement <4 x float> %a0, float 3.000000e+00, i32 3
; CHECK-NOT: insertelement
-; CHECK: %ret = select <4 x i1> <i1 true, i1 false, i1 false, i1 true>, <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>
+; CHECK: shufflevector <4 x float> %a3, <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
%a0 = insertelement <4 x float> undef, float %f, i32 0
%a1 = insertelement <4 x float> %a0, float 1.000000e+00, i32 1
%a2 = insertelement <4 x float> %a1, float 2.000000e+00, i32 2
diff --git a/test/Transforms/InstCombine/vec_shuffle.ll b/test/Transforms/InstCombine/vec_shuffle.ll
index 8f78c2e6bd50..14f532195d7c 100644
--- a/test/Transforms/InstCombine/vec_shuffle.ll
+++ b/test/Transforms/InstCombine/vec_shuffle.ll
@@ -153,3 +153,46 @@ define <8 x i8> @test12a(<8 x i8> %tmp6, <8 x i8> %tmp2) nounwind {
ret <8 x i8> %tmp3
}
+; We should form a shuffle out of a select with a constant condition.
+define <4 x i16> @test13a(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13a
+; CHECK-NEXT: shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: ret
+ %A = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>,
+ <4 x i16> %lhs, <4 x i16> %rhs
+ ret <4 x i16> %A
+}
+
+define <4 x i16> @test13b(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13b
+; CHECK-NEXT: ret <4 x i16> %lhs
+ %A = select <4 x i1> <i1 true, i1 undef, i1 true, i1 true>,
+ <4 x i16> %lhs, <4 x i16> %rhs
+ ret <4 x i16> %A
+}
+
+define <4 x i16> @test13c(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13c
+; CHECK-NEXT: shufflevector <4 x i16> %lhs, <4 x i16> %rhs, <4 x i32> <i32 0, i32 undef, i32 2, i32 7>
+; CHECK-NEXT: ret
+ %A = select <4 x i1> <i1 true, i1 undef, i1 true, i1 false>,
+ <4 x i16> %lhs, <4 x i16> %rhs
+ ret <4 x i16> %A
+}
+
+define <4 x i16> @test13d(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13d
+; CHECK: select
+; CHECK-NEXT: ret
+ %A = select <4 x i1> <i1 true, i1 icmp ugt (<4 x i16>(<4 x i16>, <4 x i16>)* @test13a, <4 x i16>(<4 x i16>, <4 x i16>)* @test13b), i1 true, i1 false>,
+ <4 x i16> %lhs, <4 x i16> %rhs
+ ret <4 x i16> %A
+}
+
+define <4 x i16> @test13e(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: @test13e
+; CHECK-NEXT: ret <4 x i16> %rhs
+ %A = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>,
+ <4 x i16> %lhs, <4 x i16> %rhs
+ ret <4 x i16> %A
+}
diff --git a/test/Transforms/InstCombine/vector_gep2.ll b/test/Transforms/InstCombine/vector_gep2.ll
new file mode 100644
index 000000000000..20165b110016
--- /dev/null
+++ b/test/Transforms/InstCombine/vector_gep2.ll
@@ -0,0 +1,11 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x i8*> @testa(<2 x i8*> %a) {
+; CHECK: @testa
+ %g = getelementptr <2 x i8*> %a, <2 x i32> <i32 0, i32 1>
+; CHECK: getelementptr <2 x i8*> %a, <2 x i64> <i64 0, i64 1>
+ ret <2 x i8*> %g
+}
diff --git a/test/Transforms/InstCombine/weak-symbols.ll b/test/Transforms/InstCombine/weak-symbols.ll
new file mode 100644
index 000000000000..0039b5962f74
--- /dev/null
+++ b/test/Transforms/InstCombine/weak-symbols.ll
@@ -0,0 +1,33 @@
+; PR4738 - Test that the library call simplifier doesn't assume anything about
+; weak symbols.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+@real_init = weak_odr constant [2 x i8] c"y\00"
+@fake_init = weak constant [2 x i8] c"y\00"
+@.str = private constant [2 x i8] c"y\00"
+
+define i32 @foo() nounwind {
+; CHECK: define i32 @foo
+; CHECK: call i32 @strcmp
+; CHECK: ret i32 %temp1
+
+entry:
+ %str1 = getelementptr inbounds [2 x i8]* @fake_init, i64 0, i64 0
+ %str2 = getelementptr inbounds [2 x i8]* @.str, i64 0, i64 0
+ %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) nounwind readonly
+ ret i32 %temp1
+}
+
+define i32 @bar() nounwind {
+; CHECK: define i32 @bar
+; CHECK: ret i32 0
+
+entry:
+ %str1 = getelementptr inbounds [2 x i8]* @real_init, i64 0, i64 0
+ %str2 = getelementptr inbounds [2 x i8]* @.str, i64 0, i64 0
+ %temp1 = call i32 @strcmp(i8* %str1, i8* %str2) nounwind readonly
+ ret i32 %temp1
+}
+
+declare i32 @strcmp(i8*, i8*) nounwind readonly
diff --git a/test/Transforms/InstSimplify/compare.ll b/test/Transforms/InstSimplify/compare.ll
index ced74bd4be9b..ce2bb799c813 100644
--- a/test/Transforms/InstSimplify/compare.ll
+++ b/test/Transforms/InstSimplify/compare.ll
@@ -266,6 +266,15 @@ define i1 @add5(i32 %x, i32 %y) {
; CHECK: ret i1 true
}
+define i1 @add6(i64 %A, i64 %B) {
+; CHECK: @add6
+ %s1 = add i64 %A, %B
+ %s2 = add i64 %B, %A
+ %cmp = icmp eq i64 %s1, %s2
+ ret i1 %cmp
+; CHECK: ret i1 true
+}
+
define i1 @addpowtwo(i32 %x, i32 %y) {
; CHECK: @addpowtwo
%l = lshr i32 %x, 1
diff --git a/test/Transforms/Internalize/2008-05-09-AllButMain.ll b/test/Transforms/Internalize/2008-05-09-AllButMain.ll
index a85e834582d7..c07abb0c6365 100644
--- a/test/Transforms/Internalize/2008-05-09-AllButMain.ll
+++ b/test/Transforms/Internalize/2008-05-09-AllButMain.ll
@@ -1,27 +1,55 @@
-; No arguments means internalize all but main
-; RUN: opt < %s -internalize -S | grep internal | count 4
+; No arguments means internalize everything
+; RUN: opt < %s -internalize -S | FileCheck --check-prefix=NOARGS %s
+
; Internalize all but foo and j
-; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | grep internal | count 3
-; Non existent files should be treated as if they were empty (so internalize all but main)
-; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | grep internal | count 4
-; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file 2> /dev/null -S | grep internal | count 3
+; RUN: opt < %s -internalize -internalize-public-api-list foo -internalize-public-api-list j -S | FileCheck --check-prefix=LIST %s
+
+; Non-existent files should be treated as if they were empty (so internalize
+; everything)
+; RUN: opt < %s -internalize -internalize-public-api-file /nonexistent/file 2> /dev/null -S | FileCheck --check-prefix=EMPTYFILE %s
+
+; RUN: opt < %s -S -internalize -internalize-public-api-list bar -internalize-public-api-list foo -internalize-public-api-file /nonexistent/file 2> /dev/null | FileCheck --check-prefix=LIST2 %s
+
; -file and -list options should be merged, the .apifile contains foo and j
-; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | grep internal | count 2
+; RUN: opt < %s -internalize -internalize-public-api-list bar -internalize-public-api-file %s.apifile -S | FileCheck --check-prefix=MERGE %s
+
+; NOARGS: @i = internal global
+; LIST: @i = internal global
+; EMPTYFILE: @i = internal global
+; LIST2: @i = internal global
+; MERGE: @i = internal global
+@i = global i32 0
-@i = weak global i32 0 ; <i32*> [#uses=0]
-@j = weak global i32 0 ; <i32*> [#uses=0]
+; NOARGS: @j = internal global
+; LIST: @j = global
+; EMPTYFILE: @j = internal global
+; LIST2: @j = internal global
+; MERGE: @j = global
+@j = global i32 0
-define void @main(...) {
-entry:
+; NOARGS: define internal void @main
+; LIST: define internal void @main
+; EMPTYFILE: define internal void @main
+; LIST2: define internal void @main
+; MERGE: define internal void @main
+define void @main() {
ret void
}
-define void @foo(...) {
-entry:
+; NOARGS: define internal void @foo
+; LIST: define void @foo
+; EMPTYFILE: define internal void @foo
+; LIST2: define void @foo
+; MERGE: define void @foo
+define void @foo() {
ret void
}
-define void @bar(...) {
-entry:
+; NOARGS: define internal void @bar
+; LIST: define internal void @bar
+; EMPTYFILE: define internal void @bar
+; LIST2: define void @bar
+; MERGE: define void @bar
+define void @bar() {
ret void
}
diff --git a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
index 7b18a04e1160..47cf3f0373e4 100644
--- a/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
+++ b/test/Transforms/Internalize/2009-01-05-InternalizeAliases.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -internalize -S | grep internal | count 3
+; RUN: opt < %s -internalize -internalize-public-api-list main -S | grep internal | count 3
@A = global i32 0
@B = alias i32* @A
diff --git a/test/Transforms/JumpThreading/crash.ll b/test/Transforms/JumpThreading/crash.ll
index b9c03544db81..2fe87464c117 100644
--- a/test/Transforms/JumpThreading/crash.ll
+++ b/test/Transforms/JumpThreading/crash.ll
@@ -511,3 +511,56 @@ lbl_260: ; preds = %for.cond, %entry
if.end: ; preds = %for.cond
ret void
}
+
+define void @PR14233(i1 %cmp, i1 %cmp2, i1 %cmp3, i1 %cmp4) {
+entry:
+ br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:
+ br label %if.end
+
+cond.false:
+ br label %if.end
+
+if.end:
+ %A = phi i64 [ 0, %cond.true ], [ 1, %cond.false ]
+ br i1 %cmp2, label %bb, label %if.end2
+
+bb:
+ br label %if.end2
+
+if.end2:
+ %B = phi i64 [ ptrtoint (i8* ()* @PR14233.f1 to i64), %bb ], [ %A, %if.end ]
+ %cmp.ptr = icmp eq i64 %B, ptrtoint (i8* ()* @PR14233.f2 to i64)
+ br i1 %cmp.ptr, label %cond.true2, label %if.end3
+
+cond.true2:
+ br i1 %cmp3, label %bb2, label %ur
+
+bb2:
+ br i1 %cmp4, label %if.end4, label %if.end3
+
+if.end4:
+ unreachable
+
+if.end3:
+ %cmp.ptr2 = icmp eq i64 %B, ptrtoint (i8* ()* @PR14233.f2 to i64)
+ br i1 %cmp.ptr2, label %ur, label %if.then601
+
+if.then601:
+ %C = icmp eq i64 %B, 0
+ br i1 %C, label %bb3, label %bb4
+
+bb3:
+ unreachable
+
+bb4:
+ unreachable
+
+ur:
+ unreachable
+}
+
+declare i8* @PR14233.f1()
+
+declare i8* @PR14233.f2()
diff --git a/test/Transforms/JumpThreading/select.ll b/test/Transforms/JumpThreading/select.ll
index 8a81857736a7..9676efec9df2 100644
--- a/test/Transforms/JumpThreading/select.ll
+++ b/test/Transforms/JumpThreading/select.ll
@@ -121,3 +121,39 @@ L4:
call void @quux()
br label %L0
}
+
+; Make sure the edge value of %0 from entry to L2 includes 0 and L3 is
+; reachable.
+; CHECK: test_switch_default
+; CHECK: entry:
+; CHECK: load
+; CHECK: switch
+; CHECK: [[THREADED:[A-Za-z.0-9]+]]:
+; CHECK: store
+; CHECK: br
+; CHECK: L2:
+; CHECK: icmp
+define void @test_switch_default(i32* nocapture %status) nounwind {
+entry:
+ %0 = load i32* %status, align 4
+ switch i32 %0, label %L2 [
+ i32 5061, label %L1
+ i32 0, label %L2
+ ]
+
+L1:
+ store i32 10025, i32* %status, align 4
+ br label %L2
+
+L2:
+ %1 = load i32* %status, align 4
+ %cmp57.i = icmp eq i32 %1, 0
+ br i1 %cmp57.i, label %L3, label %L4
+
+L3:
+ store i32 10000, i32* %status, align 4
+ br label %L4
+
+L4:
+ ret void
+}
diff --git a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
index 67c3951d74e4..fe8d44531322 100644
--- a/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
+++ b/test/Transforms/LICM/2003-12-11-SinkingToPHI.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -licm | lli
+; RUN: opt < %s -licm | lli %defaultjit
define i32 @main() {
entry:
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index 6f28d53af66e..98f93345e3c3 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -29,7 +29,7 @@ Out: ; preds = %LoopTail
}
-declare void @foo2(i32)
+declare void @foo2(i32) nounwind
;; It is ok and desirable to hoist this potentially trapping instruction.
@@ -64,3 +64,29 @@ Out: ; preds = %Loop
%C = sub i32 %A, %B ; <i32> [#uses=1]
ret i32 %C
}
+
+; CHECK: @test4
+; CHECK: call
+; CHECK: sdiv
+; CHECK: ret
+define i32 @test4(i32 %x, i32 %y) nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.02 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %n.01 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ call void @foo_may_call_exit(i32 0)
+ %div = sdiv i32 %x, %y
+ %add = add nsw i32 %n.01, %div
+ %inc = add nsw i32 %i.02, 1
+ %cmp = icmp slt i32 %inc, 10000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %n.0.lcssa = phi i32 [ %add, %for.body ]
+ ret i32 %n.0.lcssa
+}
+
+declare void @foo_may_call_exit(i32)
+
diff --git a/test/Transforms/LoopIdiom/basic.ll b/test/Transforms/LoopIdiom/basic.ll
index 46ab7e5542b6..06a5bd90864d 100644
--- a/test/Transforms/LoopIdiom/basic.ll
+++ b/test/Transforms/LoopIdiom/basic.ll
@@ -383,4 +383,37 @@ for.end: ; preds = %for.inc
}
+define void @PR14241(i32* %s, i64 %size) {
+; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
+; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
+; instead of a memmove. If we get the memmove transform back, this will catch
+; regressions.
+;
+; CHECK: @PR14241
+entry:
+ %end.idx = add i64 %size, -1
+ %end.ptr = getelementptr inbounds i32* %s, i64 %end.idx
+ br label %while.body
+; CHECK-NOT: memcpy
+;
+; FIXME: When we regain the ability to form a memmove here, this test should be
+; reversed and turned into a positive assertion.
+; CHECK-NOT: memmove
+
+while.body:
+ %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
+ %src.ptr = getelementptr inbounds i32* %phi.ptr, i64 1
+ %val = load i32* %src.ptr, align 4
+; CHECK: load
+ %dst.ptr = getelementptr inbounds i32* %phi.ptr, i64 0
+ store i32 %val, i32* %dst.ptr, align 4
+; CHECK: store
+ %next.ptr = getelementptr inbounds i32* %phi.ptr, i64 1
+ %cmp = icmp eq i32* %next.ptr, %end.ptr
+ br i1 %cmp, label %exit, label %while.body
+
+exit:
+ ret void
+; CHECK: ret void
+}
diff --git a/test/Transforms/LoopIdiom/crash.ll b/test/Transforms/LoopIdiom/crash.ll
new file mode 100644
index 000000000000..969adbcd7635
--- /dev/null
+++ b/test/Transforms/LoopIdiom/crash.ll
@@ -0,0 +1,25 @@
+; RUN: opt -basicaa -loop-idiom -S < %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Don't crash inside DependenceAnalysis
+; PR14219
+define void @test1(i64* %iwork, i64 %x) {
+bb0:
+ %mul116 = mul nsw i64 %x, %x
+ %incdec.ptr6.sum175 = add i64 42, %x
+ %arrayidx135 = getelementptr inbounds i64* %iwork, i64 %incdec.ptr6.sum175
+ br label %bb1
+bb1:
+ %storemerge4226 = phi i64 [ 0, %bb0 ], [ %inc139, %bb1 ]
+ store i64 1, i64* %arrayidx135, align 8
+ %incdec.ptr6.sum176 = add i64 %mul116, %storemerge4226
+ %arrayidx137 = getelementptr inbounds i64* %iwork, i64 %incdec.ptr6.sum176
+ store i64 1, i64* %arrayidx137, align 8
+ %inc139 = add nsw i64 %storemerge4226, 1
+ %cmp131 = icmp sgt i64 %storemerge4226, 42
+ br i1 %cmp131, label %bb2, label %bb1
+bb2:
+ ret void
+}
+
diff --git a/test/Transforms/LoopIdiom/non-canonical-loop.ll b/test/Transforms/LoopIdiom/non-canonical-loop.ll
new file mode 100644
index 000000000000..a6a4f9227f9a
--- /dev/null
+++ b/test/Transforms/LoopIdiom/non-canonical-loop.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -loop-idiom < %s
+; Don't crash
+; PR13892
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i32* %currMB) nounwind uwtable {
+entry:
+ br i1 undef, label %start.exit, label %if.then.i
+
+if.then.i: ; preds = %entry
+ unreachable
+
+start.exit: ; preds = %entry
+ indirectbr i8* undef, [label %0, label %for.bodyprime]
+
+; <label>:0 ; preds = %start.exit
+ unreachable
+
+for.bodyprime: ; preds = %for.bodyprime, %start.exit
+ %i.057375 = phi i32 [ 0, %start.exit ], [ %1, %for.bodyprime ]
+ %arrayidx8prime = getelementptr inbounds i32* %currMB, i32 %i.057375
+ store i32 0, i32* %arrayidx8prime, align 4
+ %1 = add i32 %i.057375, 1
+ %cmp5prime = icmp slt i32 %1, 4
+ br i1 %cmp5prime, label %for.bodyprime, label %for.endprime
+
+for.endprime: ; preds = %for.bodyprime
+ br label %for.body23prime
+
+for.body23prime: ; preds = %for.body23prime, %for.endprime
+ br label %for.body23prime
+}
diff --git a/test/Transforms/LoopIdiom/scev-invalidation.ll b/test/Transforms/LoopIdiom/scev-invalidation.ll
new file mode 100644
index 000000000000..a244d9a280b9
--- /dev/null
+++ b/test/Transforms/LoopIdiom/scev-invalidation.ll
@@ -0,0 +1,74 @@
+; RUN: opt -S -indvars -loop-idiom < %s
+; PR14214
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @quote_arg() nounwind {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.inc, %entry
+ %backslashes.0 = phi i32 [ undef, %entry ], [ %backslashes.2, %for.inc ]
+ %p.0 = phi i8* [ undef, %entry ], [ %incdec.ptr3, %for.inc ]
+ %q.0 = phi i8* [ undef, %entry ], [ %q.2, %for.inc ]
+ %0 = load i8* %p.0, align 1
+ switch i8 %0, label %while.cond.preheader [
+ i8 0, label %for.cond4.preheader
+ i8 92, label %for.inc
+ ]
+
+while.cond.preheader: ; preds = %for.cond
+ %tobool210 = icmp eq i32 %backslashes.0, 0
+ br i1 %tobool210, label %for.inc.loopexit, label %while.body.lr.ph
+
+while.body.lr.ph: ; preds = %while.cond.preheader
+ %1 = add i32 %backslashes.0, -1
+ %2 = zext i32 %1 to i64
+ br label %while.body
+
+for.cond4.preheader: ; preds = %for.cond
+ %tobool57 = icmp eq i32 %backslashes.0, 0
+ br i1 %tobool57, label %for.end10, label %for.body6.lr.ph
+
+for.body6.lr.ph: ; preds = %for.cond4.preheader
+ br label %for.body6
+
+while.body: ; preds = %while.body.lr.ph, %while.body
+ %q.112 = phi i8* [ %q.0, %while.body.lr.ph ], [ %incdec.ptr, %while.body ]
+ %backslashes.111 = phi i32 [ %backslashes.0, %while.body.lr.ph ], [ %dec, %while.body ]
+ %incdec.ptr = getelementptr inbounds i8* %q.112, i64 1
+ store i8 92, i8* %incdec.ptr, align 1
+ %dec = add nsw i32 %backslashes.111, -1
+ %tobool2 = icmp eq i32 %dec, 0
+ br i1 %tobool2, label %while.cond.for.inc.loopexit_crit_edge, label %while.body
+
+while.cond.for.inc.loopexit_crit_edge: ; preds = %while.body
+ %scevgep.sum = add i64 %2, 1
+ %scevgep13 = getelementptr i8* %q.0, i64 %scevgep.sum
+ br label %for.inc.loopexit
+
+for.inc.loopexit: ; preds = %while.cond.for.inc.loopexit_crit_edge, %while.cond.preheader
+ %q.1.lcssa = phi i8* [ %scevgep13, %while.cond.for.inc.loopexit_crit_edge ], [ %q.0, %while.cond.preheader ]
+ br label %for.inc
+
+for.inc: ; preds = %for.inc.loopexit, %for.cond
+ %backslashes.2 = phi i32 [ %backslashes.0, %for.cond ], [ 0, %for.inc.loopexit ]
+ %q.2 = phi i8* [ %q.0, %for.cond ], [ %q.1.lcssa, %for.inc.loopexit ]
+ %incdec.ptr3 = getelementptr inbounds i8* %p.0, i64 1
+ br label %for.cond
+
+for.body6: ; preds = %for.body6.lr.ph, %for.body6
+ %q.39 = phi i8* [ %q.0, %for.body6.lr.ph ], [ %incdec.ptr7, %for.body6 ]
+ %backslashes.38 = phi i32 [ %backslashes.0, %for.body6.lr.ph ], [ %dec9, %for.body6 ]
+ %incdec.ptr7 = getelementptr inbounds i8* %q.39, i64 1
+ store i8 92, i8* %incdec.ptr7, align 1
+ %dec9 = add nsw i32 %backslashes.38, -1
+ %tobool5 = icmp eq i32 %dec9, 0
+ br i1 %tobool5, label %for.cond4.for.end10_crit_edge, label %for.body6
+
+for.cond4.for.end10_crit_edge: ; preds = %for.body6
+ br label %for.end10
+
+for.end10: ; preds = %for.cond4.for.end10_crit_edge, %for.cond4.preheader
+ ret i32 undef
+}
diff --git a/test/Transforms/LoopRotate/multiple-exits.ll b/test/Transforms/LoopRotate/multiple-exits.ll
new file mode 100644
index 000000000000..675d71f60da4
--- /dev/null
+++ b/test/Transforms/LoopRotate/multiple-exits.ll
@@ -0,0 +1,236 @@
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; PR7447
+define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %for.cond1, %entry
+ %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+ %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ]
+ br i1 %i.0, label %for.cond1, label %return
+
+for.cond1: ; preds = %for.cond, %land.rhs
+ %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ]
+ %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ]
+ %cmp2 = icmp ult i32 %i.1, 100
+ br i1 %cmp2, label %land.rhs, label %for.cond
+
+land.rhs: ; preds = %for.cond1
+ %conv = zext i32 %i.1 to i64
+ %arrayidx = getelementptr inbounds [100 x i32]* %a, i64 0, i64 %conv
+ %0 = load i32* %arrayidx, align 4
+ %add = add i32 %0, %sum.1
+ %cmp4 = icmp ugt i32 %add, 1000
+ %inc = add i32 %i.1, 1
+ br i1 %cmp4, label %return, label %for.cond1
+
+return: ; preds = %for.cond, %land.rhs
+ %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+ ret i32 %retval.0
+
+; CHECK: @test1
+; CHECK: for.cond1.preheader:
+; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
+; CHECK: br label %for.cond1
+
+; CHECK: for.cond1:
+; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
+; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
+; CHECK: %cmp2 = icmp ult i32 %i.1, 100
+; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
+}
+
+define void @test2(i32 %x) nounwind {
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %if.end, %entry
+ %i.0 = phi i32 [ 0, %entry ], [ %inc, %if.end ]
+ %cmp = icmp eq i32 %i.0, %x
+ br i1 %cmp, label %return.loopexit, label %for.body
+
+for.body: ; preds = %for.cond
+ %call = tail call i32 @foo(i32 %i.0) nounwind
+ %tobool = icmp eq i32 %call, 0
+ br i1 %tobool, label %if.end, label %a
+
+if.end: ; preds = %for.body
+ %call1 = tail call i32 @foo(i32 42) nounwind
+ %inc = add i32 %i.0, 1
+ br label %for.cond
+
+a: ; preds = %for.body
+ %call2 = tail call i32 @bar(i32 1) nounwind
+ br label %return
+
+return.loopexit: ; preds = %for.cond
+ br label %return
+
+return: ; preds = %return.loopexit, %a
+ ret void
+
+; CHECK: @test2
+; CHECK: if.end:
+; CHECK: %inc = add i32 %i.02, 1
+; CHECK: %cmp = icmp eq i32 %inc, %x
+; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+}
+
+declare i32 @foo(i32)
+
+declare i32 @bar(i32)
+
+@_ZTIi = external constant i8*
+
+; Verify dominators.
+define void @test3(i32 %x) {
+entry:
+ %cmp2 = icmp eq i32 0, %x
+ br i1 %cmp2, label %try.cont.loopexit, label %for.body.lr.ph
+
+for.body.lr.ph: ; preds = %entry
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.inc
+ %i.03 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+ invoke void @_Z3fooi(i32 %i.03)
+ to label %for.inc unwind label %lpad
+
+for.inc: ; preds = %for.body
+ %inc = add i32 %i.03, 1
+ %cmp = icmp eq i32 %inc, %x
+ br i1 %cmp, label %for.cond.try.cont.loopexit_crit_edge, label %for.body
+
+lpad: ; preds = %for.body
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %1 = extractvalue { i8*, i32 } %0, 0
+ %2 = extractvalue { i8*, i32 } %0, 1
+ %3 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind
+ %matches = icmp eq i32 %2, %3
+ br i1 %matches, label %catch, label %eh.resume
+
+catch: ; preds = %lpad
+ %4 = tail call i8* @__cxa_begin_catch(i8* %1) nounwind
+ br i1 true, label %invoke.cont2.loopexit, label %for.body.i.lr.ph
+
+for.body.i.lr.ph: ; preds = %catch
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i.lr.ph, %for.inc.i
+ %i.0.i1 = phi i32 [ 0, %for.body.i.lr.ph ], [ %inc.i, %for.inc.i ]
+ invoke void @_Z3fooi(i32 %i.0.i1)
+ to label %for.inc.i unwind label %lpad.i
+
+for.inc.i: ; preds = %for.body.i
+ %inc.i = add i32 %i.0.i1, 1
+ %cmp.i = icmp eq i32 %inc.i, 0
+ br i1 %cmp.i, label %for.cond.i.invoke.cont2.loopexit_crit_edge, label %for.body.i
+
+lpad.i: ; preds = %for.body.i
+ %5 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @_ZTIi to i8*)
+ %6 = extractvalue { i8*, i32 } %5, 0
+ %7 = extractvalue { i8*, i32 } %5, 1
+ %matches.i = icmp eq i32 %7, %3
+ br i1 %matches.i, label %catch.i, label %lpad1.body
+
+catch.i: ; preds = %lpad.i
+ %8 = tail call i8* @__cxa_begin_catch(i8* %6) nounwind
+ invoke void @test3(i32 0)
+ to label %invoke.cont2.i unwind label %lpad1.i
+
+invoke.cont2.i: ; preds = %catch.i
+ tail call void @__cxa_end_catch() nounwind
+ br label %invoke.cont2
+
+lpad1.i: ; preds = %catch.i
+ %9 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %10 = extractvalue { i8*, i32 } %9, 0
+ %11 = extractvalue { i8*, i32 } %9, 1
+ tail call void @__cxa_end_catch() nounwind
+ br label %lpad1.body
+
+for.cond.i.invoke.cont2.loopexit_crit_edge: ; preds = %for.inc.i
+ br label %invoke.cont2.loopexit
+
+invoke.cont2.loopexit: ; preds = %for.cond.i.invoke.cont2.loopexit_crit_edge, %catch
+ br label %invoke.cont2
+
+invoke.cont2: ; preds = %invoke.cont2.loopexit, %invoke.cont2.i
+ tail call void @__cxa_end_catch() nounwind
+ br label %try.cont
+
+for.cond.try.cont.loopexit_crit_edge: ; preds = %for.inc
+ br label %try.cont.loopexit
+
+try.cont.loopexit: ; preds = %for.cond.try.cont.loopexit_crit_edge, %entry
+ br label %try.cont
+
+try.cont: ; preds = %try.cont.loopexit, %invoke.cont2
+ ret void
+
+lpad1.body: ; preds = %lpad1.i, %lpad.i
+ %exn.slot.0.i = phi i8* [ %10, %lpad1.i ], [ %6, %lpad.i ]
+ %ehselector.slot.0.i = phi i32 [ %11, %lpad1.i ], [ %7, %lpad.i ]
+ tail call void @__cxa_end_catch() nounwind
+ br label %eh.resume
+
+eh.resume: ; preds = %lpad1.body, %lpad
+ %exn.slot.0 = phi i8* [ %exn.slot.0.i, %lpad1.body ], [ %1, %lpad ]
+ %ehselector.slot.0 = phi i32 [ %ehselector.slot.0.i, %lpad1.body ], [ %2, %lpad ]
+ %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0
+ %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1
+ resume { i8*, i32 } %lpad.val5
+}
+
+declare void @_Z3fooi(i32)
+
+declare i32 @__gxx_personality_v0(...)
+
+declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone
+
+declare i8* @__cxa_begin_catch(i8*)
+
+declare void @__cxa_end_catch()
+
+define void @test4() nounwind uwtable {
+entry:
+ br label %"7"
+
+"3": ; preds = %"7"
+ br i1 undef, label %"31", label %"4"
+
+"4": ; preds = %"3"
+ %. = select i1 undef, float 0x3F50624DE0000000, float undef
+ %0 = add i32 %1, 1
+ br label %"7"
+
+"7": ; preds = %"4", %entry
+ %1 = phi i32 [ %0, %"4" ], [ 0, %entry ]
+ %2 = icmp slt i32 %1, 100
+ br i1 %2, label %"3", label %"8"
+
+"8": ; preds = %"7"
+ br i1 undef, label %"9", label %"31"
+
+"9": ; preds = %"8"
+ br label %"33"
+
+"27": ; preds = %"31"
+ unreachable
+
+"31": ; preds = %"8", %"3"
+ br i1 undef, label %"27", label %"32"
+
+"32": ; preds = %"31"
+ br label %"33"
+
+"33": ; preds = %"32", %"9"
+ ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
index a6996a81fb07..af3a53708b49 100644
--- a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
+++ b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
@@ -1,15 +1,15 @@
; RUN: opt -loop-reduce -S < %s | FileCheck %s
;
; Test LSR's use of SplitCriticalEdge during phi rewriting.
-; Verify that identical edges are merged. rdar://problem/6453893
target triple = "x86-apple-darwin"
-; CHECK: @test
+; Verify that identical edges are merged. rdar://problem/6453893
+; CHECK: @test1
; CHECK: bb89:
; CHECK: phi i8* [ %lsr.iv.next1, %bbA.bb89_crit_edge ], [ %lsr.iv.next1, %bbB.bb89_crit_edge ]{{$}}
-define i8* @test() {
+define i8* @test1() {
entry:
br label %loop
@@ -41,3 +41,41 @@ bb89:
exit:
ret i8* %tmp75phi
}
+
+; Handle single-predecessor phis: PR13756
+; CHECK: @test2
+; CHECK: bb89:
+; CHECK: phi i8* [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ]{{$}}
+define i8* @test2() {
+entry:
+ br label %loop
+
+loop:
+ %rec = phi i32 [ %next, %loop ], [ 0, %entry ]
+ %next = add i32 %rec, 1
+ %tmp75 = getelementptr i8* null, i32 %next
+ br i1 false, label %loop, label %loopexit
+
+loopexit:
+ br i1 false, label %bbA, label %bbB
+
+bbA:
+ switch i32 0, label %bb89 [
+ i32 47, label %bb89
+ i32 58, label %bb89
+ ]
+
+bbB:
+ switch i8 0, label %exit [
+ i8 47, label %exit
+ i8 58, label %exit
+ ]
+
+bb89:
+ %tmp75phi = phi i8* [ %tmp75, %bbA ], [ %tmp75, %bbA ], [ %tmp75, %bbA ]
+ br label %exit
+
+exit:
+ %result = phi i8* [ %tmp75phi, %bb89 ], [ %tmp75, %bbB ], [ %tmp75, %bbB ], [ %tmp75, %bbB ]
+ ret i8* %result
+}
diff --git a/test/Transforms/LoopUnroll/pr11361.ll b/test/Transforms/LoopUnroll/pr11361.ll
index 7ce7f5fe4600..62de2f728d23 100644
--- a/test/Transforms/LoopUnroll/pr11361.ll
+++ b/test/Transforms/LoopUnroll/pr11361.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-unroll -disable-output
+; RUN: opt -loop-unroll -disable-output < %s
; PR11361
; This tests for an iterator invalidation issue.
diff --git a/test/Transforms/LoopUnroll/pr14167.ll b/test/Transforms/LoopUnroll/pr14167.ll
new file mode 100644
index 000000000000..205ae44b72e4
--- /dev/null
+++ b/test/Transforms/LoopUnroll/pr14167.ll
@@ -0,0 +1,44 @@
+; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+define void @test1() nounwind {
+; Ensure that we don't crash when the trip count == -1.
+; CHECK: @test1
+entry:
+ br label %for.cond2.preheader
+
+for.cond2.preheader: ; preds = %for.end, %entry
+ br i1 false, label %middle.block, label %vector.ph
+
+vector.ph: ; preds = %for.cond2.preheader
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ br i1 undef, label %middle.block.loopexit, label %vector.body
+
+middle.block.loopexit: ; preds = %vector.body
+ br label %middle.block
+
+middle.block: ; preds = %middle.block.loopexit, %for.cond2.preheader
+ br i1 true, label %for.end, label %scalar.preheader
+
+scalar.preheader: ; preds = %middle.block
+ br label %for.body4
+
+for.body4: ; preds = %for.body4, %scalar.preheader
+ %indvars.iv = phi i64 [ 16000, %scalar.preheader ], [ %indvars.iv.next, %for.body4 ]
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp ne i32 %lftr.wideiv, 16000
+ br i1 %exitcond, label %for.body4, label %for.end.loopexit
+
+for.end.loopexit: ; preds = %for.body4
+ br label %for.end
+
+for.end: ; preds = %for.end.loopexit, %middle.block
+ br i1 undef, label %for.cond2.preheader, label %for.end15
+
+for.end15: ; preds = %for.end
+ ret void
+}
diff --git a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
index 61c54ddb156b..609520064a7a 100644
--- a/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
+++ b/test/Transforms/LoopUnswitch/2011-06-02-CritSwitch.ll
@@ -1,4 +1,4 @@
-; RUN: opt -loop-unswitch -disable-output
+; RUN: opt -loop-unswitch -disable-output < %s
; PR10031
define i32 @test(i32 %command) {
diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
new file mode 100644
index 000000000000..0176c9a18966
--- /dev/null
+++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll
@@ -0,0 +1,27 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce
+
+; Check that we don't fall into an infinite loop.
+define void @test() nounwind {
+entry:
+ br label %for.body
+
+for.body:
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ br label %for.body
+}
+
+
+
+define void @test2() nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ %indvars.iv.next48 = add i64 %indvars.iv47, 1
+ br i1 undef, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ unreachable
+}
diff --git a/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
new file mode 100644
index 000000000000..2516e248bc96
--- /dev/null
+++ b/test/Transforms/LoopVectorize/2012-10-22-isconsec.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -loop-vectorize -dce -force-vector-width=4
+
+; Check that we don't crash.
+
+target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm "\09.ident\09\22GCC: (GNU) 4.6.3 LLVM: 3.2svn\22"
+
+@b = common global [32000 x float] zeroinitializer, align 16
+
+define i32 @set1ds(i32 %_n, float* nocapture %arr, float %value, i32 %stride) nounwind uwtable {
+entry:
+ %0 = icmp sgt i32 %_n, 0
+ br i1 %0, label %"3.lr.ph", label %"5"
+
+"3.lr.ph": ; preds = %entry
+ %1 = bitcast float* %arr to i8*
+ %2 = sext i32 %stride to i64
+ br label %"3"
+
+"3": ; preds = %"3.lr.ph", %"3"
+ %indvars.iv = phi i64 [ 0, %"3.lr.ph" ], [ %indvars.iv.next, %"3" ]
+ %3 = shl nsw i64 %indvars.iv, 2
+ %4 = getelementptr inbounds i8* %1, i64 %3
+ %5 = bitcast i8* %4 to float*
+ store float %value, float* %5, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, %2
+ %6 = trunc i64 %indvars.iv.next to i32
+ %7 = icmp slt i32 %6, %_n
+ br i1 %7, label %"3", label %"5"
+
+"5": ; preds = %"3", %entry
+ ret i32 0
+}
+
+define i32 @init(i8* nocapture %name) unnamed_addr nounwind uwtable {
+entry:
+ br label %"3"
+
+"3": ; preds = %"3", %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %"3" ]
+ %0 = shl nsw i64 %indvars.iv, 2
+ %1 = getelementptr inbounds i8* bitcast (float* getelementptr inbounds ([32000 x float]* @b, i64 0, i64 16000) to i8*), i64 %0
+ %2 = bitcast i8* %1 to float*
+ store float -1.000000e+00, float* %2, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 16000
+ br i1 %exitcond, label %"5", label %"3"
+
+"5": ; preds = %"3"
+ ret i32 0
+}
+
+!0 = metadata !{metadata !"alias set 7: float", metadata !1}
+!1 = metadata !{metadata !1}
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
new file mode 100644
index 000000000000..a2d176a534c9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_mod_write_single_ptr
+;CHECK: load <8 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds float* %a, i64 %indvars.iv
+ %3 = load float* %2, align 4
+ %4 = fmul float %3, 3.000000e+00
+ store float %4, float* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+
+;CHECK: @read_mod_i64
+;CHECK: load <8 x i64>
+;CHECK: ret i32
+define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i64* %a, i64 %indvars.iv
+ %3 = load i64* %2, align 4
+ %4 = mul i64 %3, 3
+ store i64 %4, i64* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
new file mode 100644
index 000000000000..8f1bb545fa01
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @conversion_cost1
+;CHECK: store <2 x i8>
+;CHECK: ret
+define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 3
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 3, %0 ]
+ %2 = trunc i64 %indvars.iv to i8
+ %3 = getelementptr inbounds i8* %A, i64 %indvars.iv
+ store i8 %2, i8* %3, align 1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+;CHECK: @conversion_cost2
+;CHECK: <2 x float>
+;CHECK: ret
+define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 9
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+ %2 = add nsw i64 %indvars.iv, 3
+ %3 = trunc i64 %2 to i32
+ %4 = sitofp i32 %3 to float
+ %5 = getelementptr inbounds float* %B, i64 %indvars.iv
+ store float %4, float* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
new file mode 100644
index 000000000000..628f9912c8c9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@c = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: cost_model_1
+;CHECK: <4 x i32>
+;CHECK: ret void
+define void @cost_model_1() nounwind uwtable noinline ssp {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %0 = shl nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %0
+ %1 = load i32* %arrayidx, align 8
+ %idxprom1 = sext i32 %1 to i64
+ %arrayidx2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %idxprom1
+ %2 = load i32* %arrayidx2, align 4
+ %arrayidx4 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv
+ %3 = load i32* %arrayidx4, align 4
+ %idxprom5 = sext i32 %3 to i64
+ %arrayidx6 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %idxprom5
+ store i32 %2, i32* %arrayidx6, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
new file mode 100644
index 000000000000..574c529834ac
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 8;
+;CHECK: @example1
+;CHECK: load <8 x i32>
+;CHECK: add nsw <8 x i32>
+;CHECK: store <8 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+
+; Select VF=4 because sext <8 x i16> to <8 x i32> is expensive.
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+ %3 = load i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/X86/lit.local.cfg b/test/Transforms/LoopVectorize/X86/lit.local.cfg
new file mode 100644
index 000000000000..a8ad0f1a28b2
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll
new file mode 100644
index 000000000000..26902eba9e29
--- /dev/null
+++ b/test/Transforms/LoopVectorize/cpp-new-array.ll
@@ -0,0 +1,46 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @cpp_new_arrays
+;CHECK: insertelement <4 x i32>
+;CHECK: load <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK: ret i32
+define i32 @cpp_new_arrays() uwtable ssp {
+entry:
+ %call = call noalias i8* @_Znwm(i64 4)
+ %0 = bitcast i8* %call to float*
+ store float 1.000000e+03, float* %0, align 4
+ %call1 = call noalias i8* @_Znwm(i64 4)
+ %1 = bitcast i8* %call1 to float*
+ store float 1.000000e+03, float* %1, align 4
+ %call3 = call noalias i8* @_Znwm(i64 4)
+ %2 = bitcast i8* %call3 to float*
+ store float 1.000000e+03, float* %2, align 4
+ br label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+ %idxprom = sext i32 %i.01 to i64
+ %arrayidx = getelementptr inbounds float* %0, i64 %idxprom
+ %3 = load float* %arrayidx, align 4
+ %idxprom5 = sext i32 %i.01 to i64
+ %arrayidx6 = getelementptr inbounds float* %1, i64 %idxprom5
+ %4 = load float* %arrayidx6, align 4
+ %add = fadd float %3, %4
+ %idxprom7 = sext i32 %i.01 to i64
+ %arrayidx8 = getelementptr inbounds float* %2, i64 %idxprom7
+ store float %add, float* %arrayidx8, align 4
+ %inc = add nsw i32 %i.01, 1
+ %cmp = icmp slt i32 %inc, 1000
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %5 = load float* %2, align 4
+ %conv10 = fptosi float %5 to i32
+ ret i32 %conv10
+}
+
+declare noalias i8* @_Znwm(i64)
diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll
new file mode 100644
index 000000000000..2f22a764572f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/flags.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @flags1
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags1(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 9
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = mul nsw i32 %3, 3
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
+
+
+;CHECK: @flags2
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 9
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = mul i32 %3, 3
+ store i32 %4, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
new file mode 100644
index 000000000000..fce29d240487
--- /dev/null
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -0,0 +1,650 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+@G = common global [32 x [1024 x i32]] zeroinitializer, align 16
+@ub = common global [1024 x i32] zeroinitializer, align 16
+@uc = common global [1024 x i32] zeroinitializer, align 16
+@d = common global [2048 x i32] zeroinitializer, align 16
+@fa = common global [1024 x float] zeroinitializer, align 16
+@fb = common global [1024 x float] zeroinitializer, align 16
+@ic = common global [1024 x i32] zeroinitializer, align 16
+@da = common global [1024 x float] zeroinitializer, align 16
+@db = common global [1024 x float] zeroinitializer, align 16
+@dc = common global [1024 x float] zeroinitializer, align 16
+@dd = common global [1024 x float] zeroinitializer, align 16
+@dj = common global [1024 x i32] zeroinitializer, align 16
+
+;CHECK: @example1
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK: @example2
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example2(i32 %n, i32 %x) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph5, label %.preheader
+
+..preheader_crit_edge: ; preds = %.lr.ph5
+ %phitmp = sext i32 %n to i64
+ br label %.preheader
+
+.preheader: ; preds = %..preheader_crit_edge, %0
+ %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ]
+ %2 = icmp eq i32 %n, 0
+ br i1 %2, label %._crit_edge, label %.lr.ph
+
+.lr.ph5: ; preds = %0, %.lr.ph5
+ %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ]
+ %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6
+ store i32 %x, i32* %3, align 4
+ %indvars.iv.next7 = add i64 %indvars.iv6, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5
+
+.lr.ph: ; preds = %.preheader, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ]
+ %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ]
+ %4 = add nsw i32 %.02, -1
+ %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %6 = load i32* %5, align 4
+ %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %8 = load i32* %7, align 4
+ %9 = and i32 %8, %6
+ %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %9, i32* %10, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %11 = icmp eq i32 %4, 0
+ br i1 %11, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %.preheader
+ ret void
+}
+
+; We can't vectorize this loop because it has non-constant loop bounds.
+;CHECK: @example3
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+ %1 = icmp eq i32 %n, 0
+ br i1 %1, label %._crit_edge, label %.lr.ph
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
+ %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
+ %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
+ %2 = add nsw i32 %.05, -1
+ %3 = getelementptr inbounds i32* %.023, i64 1
+ %4 = load i32* %.023, align 16
+ %5 = getelementptr inbounds i32* %.014, i64 1
+ store i32 %4, i32* %.014, align 16
+ %6 = icmp eq i32 %2, 0
+ br i1 %6, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+;CHECK: @example4
+;CHECK: load <4 x i32>
+;CHECK: ret void
+define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) nounwind uwtable ssp {
+ %1 = add nsw i32 %n, -1
+ %2 = icmp eq i32 %n, 0
+ br i1 %2, label %.preheader4, label %.lr.ph10
+
+.preheader4: ; preds = %0
+ %3 = icmp sgt i32 %1, 0
+ br i1 %3, label %.lr.ph6, label %._crit_edge
+
+.lr.ph10: ; preds = %0, %.lr.ph10
+ %4 = phi i32 [ %9, %.lr.ph10 ], [ %1, %0 ]
+ %.018 = phi i32* [ %8, %.lr.ph10 ], [ %p, %0 ]
+ %.027 = phi i32* [ %5, %.lr.ph10 ], [ %q, %0 ]
+ %5 = getelementptr inbounds i32* %.027, i64 1
+ %6 = load i32* %.027, align 16
+ %7 = add nsw i32 %6, 5
+ %8 = getelementptr inbounds i32* %.018, i64 1
+ store i32 %7, i32* %.018, align 16
+ %9 = add nsw i32 %4, -1
+ %10 = icmp eq i32 %4, 0
+ br i1 %10, label %._crit_edge, label %.lr.ph10
+
+.preheader: ; preds = %.lr.ph6
+ br i1 %3, label %.lr.ph, label %._crit_edge
+
+.lr.ph6: ; preds = %.preheader4, %.lr.ph6
+ %indvars.iv11 = phi i64 [ %indvars.iv.next12, %.lr.ph6 ], [ 0, %.preheader4 ]
+ %indvars.iv.next12 = add i64 %indvars.iv11, 1
+ %11 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv.next12
+ %12 = load i32* %11, align 4
+ %13 = add nsw i64 %indvars.iv11, 3
+ %14 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %13
+ %15 = load i32* %14, align 4
+ %16 = add nsw i32 %15, %12
+ %17 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv11
+ store i32 %16, i32* %17, align 4
+ %lftr.wideiv13 = trunc i64 %indvars.iv.next12 to i32
+ %exitcond14 = icmp eq i32 %lftr.wideiv13, %1
+ br i1 %exitcond14, label %.preheader, label %.lr.ph6
+
+.lr.ph: ; preds = %.preheader, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.preheader ]
+ %18 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %19 = load i32* %18, align 4
+ %20 = icmp sgt i32 %19, 4
+ %21 = select i1 %20, i32 4, i32 0
+ %22 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ store i32 %21, i32* %22, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %1
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph10, %.preheader4, %.lr.ph, %.preheader
+ ret void
+}
+
+;CHECK: @example8
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example8(i32 %x) nounwind uwtable ssp {
+ br label %.preheader
+
+.preheader: ; preds = %3, %0
+ %indvars.iv3 = phi i64 [ 0, %0 ], [ %indvars.iv.next4, %3 ]
+ br label %1
+
+; <label>:1 ; preds = %1, %.preheader
+ %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [32 x [1024 x i32]]* @G, i64 0, i64 %indvars.iv3, i64 %indvars.iv
+ store i32 %x, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %3, label %1
+
+; <label>:3 ; preds = %1
+ %indvars.iv.next4 = add i64 %indvars.iv3, 1
+ %lftr.wideiv5 = trunc i64 %indvars.iv.next4 to i32
+ %exitcond6 = icmp eq i32 %lftr.wideiv5, 32
+ br i1 %exitcond6, label %4, label %.preheader
+
+; <label>:4 ; preds = %3
+ ret void
+}
+
+;CHECK: @example9
+;CHECK: phi <4 x i32>
+;CHECK: ret i32
+define i32 @example9() nounwind uwtable readonly ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %diff.01 = phi i32 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds [1024 x i32]* @ub, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [1024 x i32]* @uc, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add i32 %3, %diff.01
+ %7 = sub i32 %6, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret i32 %7
+}
+
+;CHECK: @example10a
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: load <4 x i16>
+;CHECK: add <4 x i16>
+;CHECK: store <4 x i16>
+;CHECK: ret void
+define void @example10a(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i32* %ib, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds i32* %ic, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %8 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+ %9 = load i16* %8, align 2
+ %10 = getelementptr inbounds i16* %sc, i64 %indvars.iv
+ %11 = load i16* %10, align 2
+ %12 = add i16 %11, %9
+ %13 = getelementptr inbounds i16* %sa, i64 %indvars.iv
+ store i16 %12, i16* %13, align 2
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %14, label %1
+
+; <label>:14 ; preds = %1
+ ret void
+}
+
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+ %3 = load i16* %2, align 2
+ %4 = sext i16 %3 to i32
+ %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+ store i32 %4, i32* %5, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %6, label %1
+
+; <label>:6 ; preds = %1
+ ret void
+}
+
+;CHECK: @example11
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: load i32
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: insertelement
+;CHECK: ret void
+define void @example11() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = shl nsw i64 %indvars.iv, 1
+ %3 = or i64 %2, 1
+ %4 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %3
+ %5 = load i32* %4, align 4
+ %6 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %3
+ %7 = load i32* %6, align 4
+ %8 = mul nsw i32 %7, %5
+ %9 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %2
+ %10 = load i32* %9, align 8
+ %11 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %2
+ %12 = load i32* %11, align 8
+ %13 = mul nsw i32 %12, %10
+ %14 = sub nsw i32 %8, %13
+ %15 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %14, i32* %15, align 4
+ %16 = mul nsw i32 %7, %10
+ %17 = mul nsw i32 %12, %5
+ %18 = add nsw i32 %17, %16
+ %19 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv
+ store i32 %18, i32* %19, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 512
+ br i1 %exitcond, label %20, label %1
+
+; <label>:20 ; preds = %1
+ ret void
+}
+
+;CHECK: @example12
+;CHECK: trunc <4 x i64>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example12() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = trunc i64 %indvars.iv to i32
+ store i32 %3, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %4, label %1
+
+; <label>:4 ; preds = %1
+ ret void
+}
+
+; Can't vectorize because of reductions.
+;CHECK: @example13
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example13(i32** nocapture %A, i32** nocapture %B, i32* nocapture %out) nounwind uwtable ssp {
+ br label %.preheader
+
+.preheader: ; preds = %14, %0
+ %indvars.iv4 = phi i64 [ 0, %0 ], [ %indvars.iv.next5, %14 ]
+ %1 = getelementptr inbounds i32** %A, i64 %indvars.iv4
+ %2 = load i32** %1, align 8
+ %3 = getelementptr inbounds i32** %B, i64 %indvars.iv4
+ %4 = load i32** %3, align 8
+ br label %5
+
+; <label>:5 ; preds = %.preheader, %5
+ %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %5 ]
+ %diff.02 = phi i32 [ 0, %.preheader ], [ %11, %5 ]
+ %6 = getelementptr inbounds i32* %2, i64 %indvars.iv
+ %7 = load i32* %6, align 4
+ %8 = getelementptr inbounds i32* %4, i64 %indvars.iv
+ %9 = load i32* %8, align 4
+ %10 = add i32 %7, %diff.02
+ %11 = sub i32 %10, %9
+ %indvars.iv.next = add i64 %indvars.iv, 8
+ %12 = trunc i64 %indvars.iv.next to i32
+ %13 = icmp slt i32 %12, 1024
+ br i1 %13, label %5, label %14
+
+; <label>:14 ; preds = %5
+ %15 = getelementptr inbounds i32* %out, i64 %indvars.iv4
+ store i32 %11, i32* %15, align 4
+ %indvars.iv.next5 = add i64 %indvars.iv4, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next5 to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 32
+ br i1 %exitcond, label %16, label %.preheader
+
+; <label>:16 ; preds = %14
+ ret void
+}
+
+; Can't vectorize because of reductions.
+;CHECK: @example14
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example14(i32** nocapture %in, i32** nocapture %coeff, i32* nocapture %out) nounwind uwtable ssp {
+.preheader3:
+ br label %.preheader
+
+.preheader: ; preds = %11, %.preheader3
+ %indvars.iv7 = phi i64 [ 0, %.preheader3 ], [ %indvars.iv.next8, %11 ]
+ %sum.05 = phi i32 [ 0, %.preheader3 ], [ %10, %11 ]
+ br label %0
+
+; <label>:0 ; preds = %0, %.preheader
+ %indvars.iv = phi i64 [ 0, %.preheader ], [ %indvars.iv.next, %0 ]
+ %sum.12 = phi i32 [ %sum.05, %.preheader ], [ %10, %0 ]
+ %1 = getelementptr inbounds i32** %in, i64 %indvars.iv
+ %2 = load i32** %1, align 8
+ %3 = getelementptr inbounds i32* %2, i64 %indvars.iv7
+ %4 = load i32* %3, align 4
+ %5 = getelementptr inbounds i32** %coeff, i64 %indvars.iv
+ %6 = load i32** %5, align 8
+ %7 = getelementptr inbounds i32* %6, i64 %indvars.iv7
+ %8 = load i32* %7, align 4
+ %9 = mul nsw i32 %8, %4
+ %10 = add nsw i32 %9, %sum.12
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %11, label %0
+
+; <label>:11 ; preds = %0
+ %indvars.iv.next8 = add i64 %indvars.iv7, 1
+ %lftr.wideiv9 = trunc i64 %indvars.iv.next8 to i32
+ %exitcond10 = icmp eq i32 %lftr.wideiv9, 32
+ br i1 %exitcond10, label %.preheader3.1, label %.preheader
+
+.preheader3.1: ; preds = %11
+ store i32 %10, i32* %out, align 4
+ br label %.preheader.1
+
+.preheader.1: ; preds = %24, %.preheader3.1
+ %indvars.iv7.1 = phi i64 [ 0, %.preheader3.1 ], [ %indvars.iv.next8.1, %24 ]
+ %sum.05.1 = phi i32 [ 0, %.preheader3.1 ], [ %23, %24 ]
+ br label %12
+
+; <label>:12 ; preds = %12, %.preheader.1
+ %indvars.iv.1 = phi i64 [ 0, %.preheader.1 ], [ %13, %12 ]
+ %sum.12.1 = phi i32 [ %sum.05.1, %.preheader.1 ], [ %23, %12 ]
+ %13 = add nsw i64 %indvars.iv.1, 1
+ %14 = getelementptr inbounds i32** %in, i64 %13
+ %15 = load i32** %14, align 8
+ %16 = getelementptr inbounds i32* %15, i64 %indvars.iv7.1
+ %17 = load i32* %16, align 4
+ %18 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.1
+ %19 = load i32** %18, align 8
+ %20 = getelementptr inbounds i32* %19, i64 %indvars.iv7.1
+ %21 = load i32* %20, align 4
+ %22 = mul nsw i32 %21, %17
+ %23 = add nsw i32 %22, %sum.12.1
+ %lftr.wideiv.1 = trunc i64 %13 to i32
+ %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 1024
+ br i1 %exitcond.1, label %24, label %12
+
+; <label>:24 ; preds = %12
+ %indvars.iv.next8.1 = add i64 %indvars.iv7.1, 1
+ %lftr.wideiv9.1 = trunc i64 %indvars.iv.next8.1 to i32
+ %exitcond10.1 = icmp eq i32 %lftr.wideiv9.1, 32
+ br i1 %exitcond10.1, label %.preheader3.2, label %.preheader.1
+
+.preheader3.2: ; preds = %24
+ %25 = getelementptr inbounds i32* %out, i64 1
+ store i32 %23, i32* %25, align 4
+ br label %.preheader.2
+
+.preheader.2: ; preds = %38, %.preheader3.2
+ %indvars.iv7.2 = phi i64 [ 0, %.preheader3.2 ], [ %indvars.iv.next8.2, %38 ]
+ %sum.05.2 = phi i32 [ 0, %.preheader3.2 ], [ %37, %38 ]
+ br label %26
+
+; <label>:26 ; preds = %26, %.preheader.2
+ %indvars.iv.2 = phi i64 [ 0, %.preheader.2 ], [ %indvars.iv.next.2, %26 ]
+ %sum.12.2 = phi i32 [ %sum.05.2, %.preheader.2 ], [ %37, %26 ]
+ %27 = add nsw i64 %indvars.iv.2, 2
+ %28 = getelementptr inbounds i32** %in, i64 %27
+ %29 = load i32** %28, align 8
+ %30 = getelementptr inbounds i32* %29, i64 %indvars.iv7.2
+ %31 = load i32* %30, align 4
+ %32 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.2
+ %33 = load i32** %32, align 8
+ %34 = getelementptr inbounds i32* %33, i64 %indvars.iv7.2
+ %35 = load i32* %34, align 4
+ %36 = mul nsw i32 %35, %31
+ %37 = add nsw i32 %36, %sum.12.2
+ %indvars.iv.next.2 = add i64 %indvars.iv.2, 1
+ %lftr.wideiv.2 = trunc i64 %indvars.iv.next.2 to i32
+ %exitcond.2 = icmp eq i32 %lftr.wideiv.2, 1024
+ br i1 %exitcond.2, label %38, label %26
+
+; <label>:38 ; preds = %26
+ %indvars.iv.next8.2 = add i64 %indvars.iv7.2, 1
+ %lftr.wideiv9.2 = trunc i64 %indvars.iv.next8.2 to i32
+ %exitcond10.2 = icmp eq i32 %lftr.wideiv9.2, 32
+ br i1 %exitcond10.2, label %.preheader3.3, label %.preheader.2
+
+.preheader3.3: ; preds = %38
+ %39 = getelementptr inbounds i32* %out, i64 2
+ store i32 %37, i32* %39, align 4
+ br label %.preheader.3
+
+.preheader.3: ; preds = %52, %.preheader3.3
+ %indvars.iv7.3 = phi i64 [ 0, %.preheader3.3 ], [ %indvars.iv.next8.3, %52 ]
+ %sum.05.3 = phi i32 [ 0, %.preheader3.3 ], [ %51, %52 ]
+ br label %40
+
+; <label>:40 ; preds = %40, %.preheader.3
+ %indvars.iv.3 = phi i64 [ 0, %.preheader.3 ], [ %indvars.iv.next.3, %40 ]
+ %sum.12.3 = phi i32 [ %sum.05.3, %.preheader.3 ], [ %51, %40 ]
+ %41 = add nsw i64 %indvars.iv.3, 3
+ %42 = getelementptr inbounds i32** %in, i64 %41
+ %43 = load i32** %42, align 8
+ %44 = getelementptr inbounds i32* %43, i64 %indvars.iv7.3
+ %45 = load i32* %44, align 4
+ %46 = getelementptr inbounds i32** %coeff, i64 %indvars.iv.3
+ %47 = load i32** %46, align 8
+ %48 = getelementptr inbounds i32* %47, i64 %indvars.iv7.3
+ %49 = load i32* %48, align 4
+ %50 = mul nsw i32 %49, %45
+ %51 = add nsw i32 %50, %sum.12.3
+ %indvars.iv.next.3 = add i64 %indvars.iv.3, 1
+ %lftr.wideiv.3 = trunc i64 %indvars.iv.next.3 to i32
+ %exitcond.3 = icmp eq i32 %lftr.wideiv.3, 1024
+ br i1 %exitcond.3, label %52, label %40
+
+; <label>:52 ; preds = %40
+ %indvars.iv.next8.3 = add i64 %indvars.iv7.3, 1
+ %lftr.wideiv9.3 = trunc i64 %indvars.iv.next8.3 to i32
+ %exitcond10.3 = icmp eq i32 %lftr.wideiv9.3, 32
+ br i1 %exitcond10.3, label %53, label %.preheader.3
+
+; <label>:53 ; preds = %52
+ %54 = getelementptr inbounds i32* %out, i64 3
+ store i32 %51, i32* %54, align 4
+ ret void
+}
+
+; Can't vectorize because the src and dst pointers are not disjoint.
+;CHECK: @example21
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @example21(i32* nocapture %b, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = sext i32 %n to i64
+ br label %3
+
+; <label>:3 ; preds = %.lr.ph, %3
+ %indvars.iv = phi i64 [ %2, %.lr.ph ], [ %indvars.iv.next, %3 ]
+ %a.02 = phi i32 [ 0, %.lr.ph ], [ %6, %3 ]
+ %indvars.iv.next = add i64 %indvars.iv, -1
+ %4 = getelementptr inbounds i32* %b, i64 %indvars.iv.next
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %a.02
+ %7 = trunc i64 %indvars.iv.next to i32
+ %8 = icmp sgt i32 %7, 0
+ br i1 %8, label %3, label %._crit_edge
+
+._crit_edge: ; preds = %3, %0
+ %a.0.lcssa = phi i32 [ 0, %0 ], [ %6, %3 ]
+ ret i32 %a.0.lcssa
+}
+
+; Can't vectorize because there are multiple PHIs.
+;CHECK: @example23
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
+define void @example23(i16* nocapture %src, i32* nocapture %dst) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %.04 = phi i16* [ %src, %0 ], [ %2, %1 ]
+ %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ]
+ %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ]
+ %2 = getelementptr inbounds i16* %.04, i64 1
+ %3 = load i16* %.04, align 2
+ %4 = zext i16 %3 to i32
+ %5 = shl nuw nsw i32 %4, 7
+ %6 = getelementptr inbounds i32* %.013, i64 1
+ store i32 %5, i32* %.013, align 4
+ %7 = add nsw i32 %i.02, 1
+ %exitcond = icmp eq i32 %7, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
+;CHECK: @example24
+;CHECK: shufflevector <4 x i16>
+;CHECK: ret void
+define void @example24(i16 signext %x, i16 signext %y) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [1024 x float]* @fa, i64 0, i64 %indvars.iv
+ %3 = load float* %2, align 4
+ %4 = getelementptr inbounds [1024 x float]* @fb, i64 0, i64 %indvars.iv
+ %5 = load float* %4, align 4
+ %6 = fcmp olt float %3, %5
+ %x.y = select i1 %6, i16 %x, i16 %y
+ %7 = sext i16 %x.y to i32
+ %8 = getelementptr inbounds [1024 x i32]* @ic, i64 0, i64 %indvars.iv
+ store i32 %7, i32* %8, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %9, label %1
+
+; <label>:9 ; preds = %1
+ ret void
+}
+
+;CHECK: @example25
+;CHECK: and <4 x i1>
+;CHECK: zext <4 x i1>
+;CHECK: ret void
+define void @example25() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [1024 x float]* @da, i64 0, i64 %indvars.iv
+ %3 = load float* %2, align 4
+ %4 = getelementptr inbounds [1024 x float]* @db, i64 0, i64 %indvars.iv
+ %5 = load float* %4, align 4
+ %6 = fcmp olt float %3, %5
+ %7 = getelementptr inbounds [1024 x float]* @dc, i64 0, i64 %indvars.iv
+ %8 = load float* %7, align 4
+ %9 = getelementptr inbounds [1024 x float]* @dd, i64 0, i64 %indvars.iv
+ %10 = load float* %9, align 4
+ %11 = fcmp olt float %8, %10
+ %12 = and i1 %6, %11
+ %13 = zext i1 %12 to i32
+ %14 = getelementptr inbounds [1024 x i32]* @dj, i64 0, i64 %indvars.iv
+ store i32 %13, i32* %14, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 1024
+ br i1 %exitcond, label %15, label %1
+
+; <label>:15 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll
new file mode 100644
index 000000000000..71ea7689fc04
--- /dev/null
+++ b/test/Transforms/LoopVectorize/increment.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; This is the loop.
+; for (i=0; i<n; i++){
+; a[i] += i;
+; }
+;CHECK: @inc
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @inc(i32 %n) nounwind uwtable noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = trunc i64 %indvars.iv to i32
+ %5 = add nsw i32 %3, %4
+ store i32 %5, i32* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret void
+}
+
+; Can't vectorize this loop because the access to A[X] is non-linear.
+;
+; for (i = 0; i < n; ++i) {
+;   A[B[i]]++;
+; }
+;
+;CHECK: @histogram
+;CHECK-NOT: <4 x i32>
+;CHECK: ret i32
+define i32 @histogram(i32* nocapture noalias %A, i32* nocapture noalias %B, i32 %n) nounwind uwtable ssp {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %idxprom1 = sext i32 %0 to i64
+ %arrayidx2 = getelementptr inbounds i32* %A, i64 %idxprom1
+ %1 = load i32* %arrayidx2, align 4
+ %inc = add nsw i32 %1, 1
+ store i32 %inc, i32* %arrayidx2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i32 0
+}
diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll
new file mode 100644
index 000000000000..b31bceb50df6
--- /dev/null
+++ b/test/Transforms/LoopVectorize/induction_plus.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@array = common global [1024 x i32] zeroinitializer, align 16
+
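+; A plausible C-level source for @array_at_plus_one (illustrative sketch only, not
+; taken from the original test); the induction variable is stored at array[i + 12]:
+;
+;   extern int array[1024];
+;   int array_at_plus_one(int n) {
+;     for (int i = 0; i < n; ++i)
+;       array[i + 12] = i;
+;     return 0; /* the IR below returns undef; the value is unused */
+;   }
+;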
+;CHECK: @array_at_plus_one
+;CHECK: add <4 x i64>
+;CHECK: trunc <4 x i64>
+;CHECK: add i64 %index, 12
+;CHECK: ret i32
+define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = add nsw i64 %indvars.iv, 12
+ %3 = getelementptr inbounds [1024 x i32]* @array, i64 0, i64 %2
+ %4 = trunc i64 %indvars.iv to i32
+ store i32 %4, i32* %3, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/lit.local.cfg b/test/Transforms/LoopVectorize/lit.local.cfg
new file mode 100644
index 000000000000..19eebc0ac7ac
--- /dev/null
+++ b/test/Transforms/LoopVectorize/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll', '.c', '.cpp']
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
new file mode 100644
index 000000000000..1a6c15ed96c4
--- /dev/null
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -0,0 +1,38 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
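+; A plausible C-level source for @example1 (illustrative sketch only, not taken from
+; the original test); the point is that the trip count n*4 is not a compile-time
+; constant:
+;
+;   extern int a[2048], b[2048], c[2048];
+;   void example1(int n) {
+;     int n4 = n * 4;
+;     for (int i = 0; i < n4; ++i)
+;       a[i] = b[i] + c[i];
+;   }
+;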
+;CHECK: @example1
+;CHECK: shl i32
+;CHECK: zext i32
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i32 %n) nounwind uwtable ssp {
+ %n4 = shl i32 %n, 2
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n4
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll
new file mode 100644
index 000000000000..b4d1bac132f0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/read-only.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
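+; A plausible C-level source for @read_only_func (illustrative sketch only, not taken
+; from the original test); the loop only reads memory and accumulates into a scalar:
+;
+;   int read_only_func(int *A, int *B, int n) {
+;     int sum = 0;
+;     for (int i = 0; i < n; ++i)
+;       sum += A[i] + B[i + 13] * 2;
+;     return sum;
+;   }
+;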
+;CHECK: @read_only_func
+;CHECK: load <4 x i32>
+;CHECK: ret i32
+define i32 @read_only_func(i32* nocapture %A, i32* nocapture %B, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = add nsw i64 %indvars.iv, 13
+ %5 = getelementptr inbounds i32* %B, i64 %4
+ %6 = load i32* %5, align 4
+ %7 = shl i32 %6, 1
+ %8 = add i32 %3, %sum.02
+ %9 = add i32 %8, %7
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
new file mode 100644
index 000000000000..c1848b35fc6e
--- /dev/null
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -0,0 +1,232 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
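+; A plausible C-level source for @reduction_sum (illustrative sketch only, not taken
+; from the original test); the scalar accumulator is what makes this a reduction:
+;
+;   int reduction_sum(int n, int *A, int *B) {
+;     int sum = 0;
+;     for (int i = 0; i < n; ++i)
+;       sum += i + A[i] + B[i];
+;     return sum;
+;   }
+;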
+;CHECK: @reduction_sum
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_sum(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = add i32 %sum.02, %6
+ %8 = add i32 %7, %3
+ %9 = add i32 %8, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_prod
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %prod.02 = phi i32 [ %9, %.lr.ph ], [ 1, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = mul i32 %prod.02, %6
+ %8 = mul i32 %7, %3
+ %9 = mul i32 %8, %5
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %prod.0.lcssa = phi i32 [ 1, %0 ], [ %9, %.lr.ph ]
+ ret i32 %prod.0.lcssa
+}
+
+;CHECK: @reduction_mix
+;CHECK: phi <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = mul nsw i32 %5, %3
+ %7 = trunc i64 %indvars.iv to i32
+ %8 = add i32 %sum.02, %7
+ %9 = add i32 %8, %6
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_mul
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_mul(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %sum.02 = phi i32 [ %9, %.lr.ph ], [ 19, %0 ]
+ %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = trunc i64 %indvars.iv to i32
+ %7 = add i32 %3, %6
+ %8 = add i32 %7, %5
+ %9 = mul i32 %8, %sum.02
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %sum.0.lcssa = phi i32 [ 0, %0 ], [ %9, %.lr.ph ]
+ ret i32 %sum.0.lcssa
+}
+
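+; Roughly the C loop behind @start_at_non_zero (an illustrative sketch, not taken from
+; the original test); the reduction starts at 120 rather than 0, which is why 120 must
+; end up in one lane of the initial vector:
+;
+;   int start_at_non_zero(int *in, int *coeff, int *out, int n) {
+;     int sum = 120;
+;     for (int i = 0; i < n; ++i)
+;       sum += in[i] * coeff[i];
+;     return sum; /* 'out' is unused here, as it is in the IR below */
+;   }
+;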
+;CHECK: @start_at_non_zero
+;CHECK: phi <4 x i32>
+;CHECK: <i32 120, i32 0, i32 0, i32 0>
+;CHECK: ret i32
+define i32 @start_at_non_zero(i32* nocapture %in, i32* nocapture %coeff, i32* nocapture %out, i32 %n) nounwind uwtable readonly ssp {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %sum.09 = phi i32 [ %add, %for.body ], [ 120, %entry ]
+ %arrayidx = getelementptr inbounds i32* %in, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32* %coeff, i64 %indvars.iv
+ %1 = load i32* %arrayidx2, align 4
+ %mul = mul nsw i32 %1, %0
+ %add = add nsw i32 %mul, %sum.09
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %sum.0.lcssa = phi i32 [ 120, %entry ], [ %add, %for.body ]
+ ret i32 %sum.0.lcssa
+}
+
+;CHECK: @reduction_and
+;CHECK: and <4 x i32>
+;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
+;CHECK: ret i32
+define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %1 = load i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %and = and i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+;CHECK: @reduction_or
+;CHECK: or <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %1 = load i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %or = or i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ]
+ ret i32 %result.0.lcssa
+}
+
+;CHECK: @reduction_xor
+;CHECK: xor <4 x i32>
+;CHECK: ret i32
+define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
+entry:
+ %cmp7 = icmp sgt i32 %n, 0
+ br i1 %cmp7, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+ %0 = load i32* %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+ %1 = load i32* %arrayidx2, align 4
+ %add = add nsw i32 %1, %0
+ %xor = xor i32 %add, %result.08
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ]
+ ret i32 %result.0.lcssa
+}
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
new file mode 100644
index 000000000000..23933cf7c7db
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Make sure we vectorize this loop:
+; int foo(float *a, float *b, int n) {
+; for (int i=0; i<n; ++i)
+; a[i] = b[i] * 3;
+; }
+
+;CHECK: load <4 x float>
+define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+ %0 = load float* %arrayidx, align 4, !tbaa !0
+ %mul = fmul float %0, 3.000000e+00
+ %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv
+ store float %mul, float* %arrayidx2, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i32 undef
+}
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll
new file mode 100644
index 000000000000..e537bde31bb0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/scalar-select.ll
@@ -0,0 +1,37 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
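+; A plausible C-level source for @example1 (illustrative sketch only, not taken from
+; the original test); the select condition is loop-invariant, so it stays scalar while
+; the operands are vectorized:
+;
+;   extern int a[2048], b[2048], c[2048];
+;   void example1(int cond) {
+;     for (int i = 0; i < 256; ++i)
+;       a[i] = cond ? b[i] + c[i] : 0;
+;   }
+;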
+;CHECK: @example1
+;CHECK: load <4 x i32>
+; Make sure that we have a scalar condition and a vector operand.
+;CHECK: select i1 %cond, <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example1(i1 %cond) nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+ store i32 %sel, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 256
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll
new file mode 100644
index 000000000000..4a6e4b231dfe
--- /dev/null
+++ b/test/Transforms/LoopVectorize/small-loop.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
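+; Roughly the C loop behind @example1 (an illustrative sketch, not taken from the
+; original test); with only 8 iterations the loop is not worth vectorizing:
+;
+;   extern int a[2048], b[2048], c[2048];
+;   void example1(void) {
+;     for (int i = 0; i < 8; ++i)
+;       a[i] = b[i] + c[i];
+;   }
+;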
+;CHECK: @example1
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+ br label %1
+
+; <label>:1 ; preds = %1, %0
+ %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+ %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+ %3 = load i32* %2, align 4
+ %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+ %5 = load i32* %4, align 4
+ %6 = add nsw i32 %5, %3
+ %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+ store i32 %6, i32* %7, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 8 ; <----- A really small trip count.
+ br i1 %exitcond, label %8, label %1
+
+; <label>:8 ; preds = %1
+ ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll
new file mode 100644
index 000000000000..5aa3bc034d0b
--- /dev/null
+++ b/test/Transforms/LoopVectorize/start-non-zero.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
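+; A plausible C-level source for @start_at_nonzero (illustrative sketch only, not
+; taken from the original test); the induction variable starts at 'start' rather
+; than 0:
+;
+;   int start_at_nonzero(int *a, int start, int end) {
+;     for (int i = start; i < end; ++i)
+;       a[i] = a[i] * 333;
+;     return 4;
+;   }
+;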
+;CHECK: @start_at_nonzero
+;CHECK: mul nuw <4 x i32>
+;CHECK: ret i32
+define i32 @start_at_nonzero(i32* nocapture %a, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+ %cmp3 = icmp slt i32 %start, %end
+ br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph: ; preds = %entry
+ %0 = sext i32 %start to i64
+ br label %for.body
+
+for.body: ; preds = %for.body.lr.ph, %for.body
+ %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+ %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+ %1 = load i32* %arrayidx, align 4, !tbaa !0
+ %mul = mul nuw i32 %1, 333
+ store i32 %mul, i32* %arrayidx, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %2 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %2, %end
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body, %entry
+ ret i32 4
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll
new file mode 100644
index 000000000000..eb027604134f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/write-only.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
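+; Roughly the C loop behind @read_mod_write_single_ptr (an illustrative sketch, not
+; taken from the original test); each element is read, scaled, and stored back
+; through the same pointer:
+;
+;   int read_mod_write_single_ptr(float *a, int n) {
+;     for (int i = 0; i < n; ++i)
+;       a[i] = a[i] * 3.0f;
+;     return 0; /* the IR below returns undef; the value is unused */
+;   }
+;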
+;CHECK: @read_mod_write_single_ptr
+;CHECK: load <4 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds float* %a, i64 %indvars.iv
+ %3 = load float* %2, align 4
+ %4 = fmul float %3, 3.000000e+00
+ store float %4, float* %2, align 4
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ ret i32 undef
+}
diff --git a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
index 61ba3c7e6cc5..597b69dee3d4 100644
--- a/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
+++ b/test/Transforms/MemCpyOpt/2008-03-13-ReturnSlotBitcast.ll
@@ -9,11 +9,11 @@ declare void @g(%a*)
define float @f() {
entry:
%a_var = alloca %a
- %b_var = alloca %b
+ %b_var = alloca %b, align 1
call void @g(%a* %a_var)
%a_i8 = bitcast %a* %a_var to i8*
%b_i8 = bitcast %b* %b_var to i8*
- call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 4, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_i8, i8* %a_i8, i32 4, i32 1, i1 false)
%tmp1 = getelementptr %b* %b_var, i32 0, i32 0
%tmp2 = load float* %tmp1
ret float %tmp2
diff --git a/test/Transforms/MemCpyOpt/align.ll b/test/Transforms/MemCpyOpt/align.ll
index b1f900d9da4c..1b98f6ad383f 100644
--- a/test/Transforms/MemCpyOpt/align.ll
+++ b/test/Transforms/MemCpyOpt/align.ll
@@ -1,12 +1,15 @@
-; RUN: opt < %s -S -memcpyopt | FileCheck %s
+; RUN: opt < %s -S -basicaa -memcpyopt | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64"
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
; The resulting memset is only 4-byte aligned, despite containing
; a 16-byte aligned store in the middle.
-; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
-
define void @foo(i32* %p) {
+; CHECK: @foo
+; CHECK: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 0, i64 16, i32 4, i1 false)
%a0 = getelementptr i32* %p, i64 0
store i32 0, i32* %a0, align 4
%a1 = getelementptr i32* %p, i64 1
@@ -17,3 +20,18 @@ define void @foo(i32* %p) {
store i32 0, i32* %a3, align 4
ret void
}
+
+; Replacing %a8 with %a4 in the memset requires boosting the alignment of %a4.
+
+define void @bar() {
+; CHECK: @bar
+; CHECK: %a4 = alloca i32, align 8
+; CHECK-NOT: memcpy
+ %a4 = alloca i32, align 4
+ %a8 = alloca i32, align 8
+ %a8.cast = bitcast i32* %a8 to i8*
+ %a4.cast = bitcast i32* %a4 to i8*
+ call void @llvm.memset.p0i8.i64(i8* %a8.cast, i8 0, i64 4, i32 8, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a4.cast, i8* %a8.cast, i64 4, i32 4, i1 false)
+ ret void
+}
diff --git a/test/Transforms/MemCpyOpt/form-memset.ll b/test/Transforms/MemCpyOpt/form-memset.ll
index 8832f897b089..f63b1dcfdd5f 100644
--- a/test/Transforms/MemCpyOpt/form-memset.ll
+++ b/test/Transforms/MemCpyOpt/form-memset.ll
@@ -248,3 +248,27 @@ entry:
; CHECK: @test8
; CHECK: store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16
}
+
+@test9buf = internal unnamed_addr global [16 x i64] zeroinitializer, align 16
+
+define void @test9() nounwind {
+ store i8 -1, i8* bitcast ([16 x i64]* @test9buf to i8*), align 16
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 1), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 2), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 3), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 4), align 4
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 5), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 6), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 7), align 1
+ store i8 -1, i8* bitcast (i64* getelementptr inbounds ([16 x i64]* @test9buf, i64 0, i64 1) to i8*), align 8
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 9), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 10), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 11), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 12), align 4
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 13), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 14), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 15), align 1
+ ret void
+; CHECK: @test9(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i32 16, i1 false)
+}
diff --git a/test/Transforms/MetaRenamer/lit.local.cfg b/test/Transforms/MetaRenamer/lit.local.cfg
new file mode 100644
index 000000000000..c6106e4746f2
--- /dev/null
+++ b/test/Transforms/MetaRenamer/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll
new file mode 100644
index 000000000000..ad41bcf50f19
--- /dev/null
+++ b/test/Transforms/MetaRenamer/metarenamer.ll
@@ -0,0 +1,96 @@
+; RUN: opt %s -metarenamer -S | FileCheck %s
+
+; CHECK: target triple {{.*}}
+; CHECK-NOT: {{^x*}}xxx{{^x*}}
+; CHECK: ret i32 6
+
+target triple = "x86_64-pc-linux-gnu"
+
+%struct.bar_xxx = type { i32, double }
+%struct.foo_xxx = type { i32, float, %struct.bar_xxx }
+
+@func_5_xxx.static_local_3_xxx = internal global i32 3, align 4
+@global_3_xxx = common global i32 0, align 4
+
+@func_7_xxx = alias weak i32 (...)* @aliased_func_7_xxx
+
+declare i32 @aliased_func_7_xxx(...)
+
+define i32 @func_3_xxx() nounwind uwtable ssp {
+ ret i32 3
+}
+
+define void @func_4_xxx(%struct.foo_xxx* sret %agg.result) nounwind uwtable ssp {
+ %1 = alloca %struct.foo_xxx, align 8
+ %2 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 0
+ store i32 1, i32* %2, align 4
+ %3 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 1
+ store float 2.000000e+00, float* %3, align 4
+ %4 = getelementptr inbounds %struct.foo_xxx* %1, i32 0, i32 2
+ %5 = getelementptr inbounds %struct.bar_xxx* %4, i32 0, i32 0
+ store i32 3, i32* %5, align 4
+ %6 = getelementptr inbounds %struct.bar_xxx* %4, i32 0, i32 1
+ store double 4.000000e+00, double* %6, align 8
+ %7 = bitcast %struct.foo_xxx* %agg.result to i8*
+ %8 = bitcast %struct.foo_xxx* %1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %7, i8* %8, i64 24, i32 8, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define i32 @func_5_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, i32 %arg_3_xxx, i32 %arg_4_xxx) nounwind uwtable ssp {
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ %3 = alloca i32, align 4
+ %4 = alloca i32, align 4
+ %local_1_xxx = alloca i32, align 4
+ %local_2_xxx = alloca i32, align 4
+ %i = alloca i32, align 4
+ store i32 %arg_1_xxx, i32* %1, align 4
+ store i32 %arg_2_xxx, i32* %2, align 4
+ store i32 %arg_3_xxx, i32* %3, align 4
+ store i32 %arg_4_xxx, i32* %4, align 4
+ store i32 1, i32* %local_1_xxx, align 4
+ store i32 2, i32* %local_2_xxx, align 4
+ store i32 0, i32* %i, align 4
+ br label %5
+
+; <label>:5 ; preds = %9, %0
+ %6 = load i32* %i, align 4
+ %7 = icmp slt i32 %6, 10
+ br i1 %7, label %8, label %12
+
+; <label>:8 ; preds = %5
+ br label %9
+
+; <label>:9 ; preds = %8
+ %10 = load i32* %i, align 4
+ %11 = add nsw i32 %10, 1
+ store i32 %11, i32* %i, align 4
+ br label %5
+
+; <label>:12 ; preds = %5
+ %13 = load i32* %local_1_xxx, align 4
+ %14 = load i32* %1, align 4
+ %15 = add nsw i32 %13, %14
+ %16 = load i32* %local_2_xxx, align 4
+ %17 = add nsw i32 %15, %16
+ %18 = load i32* %2, align 4
+ %19 = add nsw i32 %17, %18
+ %20 = load i32* @func_5_xxx.static_local_3_xxx, align 4
+ %21 = add nsw i32 %19, %20
+ %22 = load i32* %3, align 4
+ %23 = add nsw i32 %21, %22
+ %24 = load i32* %4, align 4
+ %25 = add nsw i32 %23, %24
+ ret i32 %25
+}
+
+define i32 @varargs_func_6_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, ...) nounwind uwtable ssp {
+ %1 = alloca i32, align 4
+ %2 = alloca i32, align 4
+ store i32 %arg_1_xxx, i32* %1, align 4
+ store i32 %arg_2_xxx, i32* %2, align 4
+ ret i32 6
+}
diff --git a/test/Transforms/ObjCARC/basic.ll b/test/Transforms/ObjCARC/basic.ll
index 0a7ba5de71bc..7b64b1be7c62 100644
--- a/test/Transforms/ObjCARC/basic.ll
+++ b/test/Transforms/ObjCARC/basic.ll
@@ -1,4 +1,4 @@
-; RUN: opt -objc-arc -S < %s | FileCheck %s
+; RUN: opt -basicaa -objc-arc -S < %s | FileCheck %s
target datalayout = "e-p:64:64:64"
@@ -1498,7 +1498,7 @@ define i8* @test49(i8* %p) nounwind {
}
; Do delete retain+release with intervening stores of the
-; address value;
+; address value.
; CHECK: define void @test50(
; CHECK-NOT: @objc_
diff --git a/test/Transforms/ObjCARC/nested.ll b/test/Transforms/ObjCARC/nested.ll
index a618a21d8bb3..32be03ec6ae0 100644
--- a/test/Transforms/ObjCARC/nested.ll
+++ b/test/Transforms/ObjCARC/nested.ll
@@ -16,6 +16,10 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
declare i8* @objc_msgSend(i8*, i8*, ...) nonlazybind
declare void @use(i8*)
declare void @objc_release(i8*)
+declare i8* @def()
+declare void @__crasher_block_invoke(i8* nocapture)
+declare i8* @objc_retainBlock(i8*)
+declare void @__crasher_block_invoke1(i8* nocapture)
!0 = metadata !{}
@@ -279,11 +283,13 @@ forcoll.empty:
ret void
}
-; Delete a nested retain+release pair.
+; TODO: Delete a nested retain+release pair.
+; The optimizer currently can't do this, because it isn't sophisticated enough in
+; reasoning about nesting.
; CHECK: define void @test6(
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test6() nounwind {
entry:
@@ -345,11 +351,13 @@ forcoll.empty:
ret void
}
-; Delete a nested retain+release pair.
+; TODO: Delete a nested retain+release pair.
+; The optimizer currently can't do this, because it isn't sophisticated enough in
+; reasoning about nesting.
; CHECK: define void @test7(
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test7() nounwind {
entry:
@@ -553,12 +561,12 @@ forcoll.empty:
ret void
}
-; Like test9, but without a split backedge. This we can optimize.
+; Like test9, but without a split backedge. TODO: optimize this.
; CHECK: define void @test9b(
; CHECK: call i8* @objc_retain
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test9b() nounwind {
entry:
@@ -687,12 +695,12 @@ forcoll.empty:
ret void
}
-; Like test10, but without a split backedge. This we can optimize.
+; Like test10, but without a split backedge. TODO: optimize this.
; CHECK: define void @test10b(
; CHECK: call i8* @objc_retain
; CHECK: call i8* @objc_retain
-; CHECK-NOT: @objc_retain
+; CHECK: @objc_retain
; CHECK: }
define void @test10b() nounwind {
entry:
@@ -751,3 +759,64 @@ forcoll.empty:
call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
ret void
}
+
+; Pointers to strong pointers can obscure provenance relationships. Be conservative
+; in the face of escaping pointers. rdar://12150909.
+
+%struct.__block_d = type { i64, i64 }
+
+@_NSConcreteStackBlock = external global i8*
+@__block_d_tmp = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
+@__block_d_tmp5 = external hidden constant { i64, i64, i8*, i8*, i8*, i8* }
+
+; CHECK: define void @test11(
+; CHECK: tail call i8* @objc_retain(i8* %call) nounwind
+; CHECK: tail call i8* @objc_retain(i8* %call) nounwind
+; CHECK: call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+; CHECK: }
+define void @test11() {
+entry:
+ %block = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8
+ %block9 = alloca <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>, align 8
+ %call = call i8* @def(), !clang.arc.no_objc_arc_exceptions !0
+ %foo = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 5
+ %block.isa = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 0
+ store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa, align 8
+ %block.flags = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 1
+ store i32 1107296256, i32* %block.flags, align 8
+ %block.reserved = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 2
+ store i32 0, i32* %block.reserved, align 4
+ %block.invoke = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 3
+ store i8* bitcast (void (i8*)* @__crasher_block_invoke to i8*), i8** %block.invoke, align 8
+ %block.d = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block, i64 0, i32 4
+ store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp to %struct.__block_d*), %struct.__block_d** %block.d, align 8
+ %foo2 = tail call i8* @objc_retain(i8* %call) nounwind
+ store i8* %foo2, i8** %foo, align 8
+ %foo4 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block to i8*
+ %foo5 = call i8* @objc_retainBlock(i8* %foo4) nounwind
+ call void @use(i8* %foo5), !clang.arc.no_objc_arc_exceptions !0
+ call void @objc_release(i8* %foo5) nounwind
+ %strongdestroy = load i8** %foo, align 8
+ call void @objc_release(i8* %strongdestroy) nounwind, !clang.imprecise_release !0
+ %foo10 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 5
+ %block.isa11 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 0
+ store i8* bitcast (i8** @_NSConcreteStackBlock to i8*), i8** %block.isa11, align 8
+ %block.flags12 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 1
+ store i32 1107296256, i32* %block.flags12, align 8
+ %block.reserved13 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 2
+ store i32 0, i32* %block.reserved13, align 4
+ %block.invoke14 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 3
+ store i8* bitcast (void (i8*)* @__crasher_block_invoke1 to i8*), i8** %block.invoke14, align 8
+ %block.d15 = getelementptr inbounds <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9, i64 0, i32 4
+ store %struct.__block_d* bitcast ({ i64, i64, i8*, i8*, i8*, i8* }* @__block_d_tmp5 to %struct.__block_d*), %struct.__block_d** %block.d15, align 8
+ %foo18 = call i8* @objc_retain(i8* %call) nounwind
+ store i8* %call, i8** %foo10, align 8
+ %foo20 = bitcast <{ i8*, i32, i32, i8*, %struct.__block_d*, i8* }>* %block9 to i8*
+ %foo21 = call i8* @objc_retainBlock(i8* %foo20) nounwind
+ call void @use(i8* %foo21), !clang.arc.no_objc_arc_exceptions !0
+ call void @objc_release(i8* %foo21) nounwind
+ %strongdestroy25 = load i8** %foo10, align 8
+ call void @objc_release(i8* %strongdestroy25) nounwind, !clang.imprecise_release !0
+ call void @objc_release(i8* %call) nounwind, !clang.imprecise_release !0
+ ret void
+}
diff --git a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll
new file mode 100644
index 000000000000..e7866ed1b442
--- /dev/null
+++ b/test/Transforms/ObjCARC/path-overflow.ll
@@ -0,0 +1,329 @@
+; RUN: opt -objc-arc -S < %s
+; rdar://12277446
+
+; The total number of paths grows exponentially with the number of branches, and
+; computing this number can overflow any reasonable fixed-width integer.
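+; As an illustration (our arithmetic, not from the original test): a chain of k
+; independent if/else "diamonds" yields 2^k distinct paths, so roughly 64 such
+; branches (2^64 paths) already exceed what a 64-bit counter can represent.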
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.0.0"
+
+%struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768 = type { i32*, i32, i8*, i32 }
+
+@_unnamed_cfstring_591 = external constant %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768, section "__DATA,__cfstring"
+
+declare i8* @objc_retain(i8*) nonlazybind
+
+declare void @objc_release(i8*) nonlazybind
+
+define hidden void @foo() {
+entry:
+ br i1 undef, label %msgSend.nullinit, label %msgSend.call
+
+msgSend.call: ; preds = %entry
+ br label %msgSend.cont
+
+msgSend.nullinit: ; preds = %entry
+ br label %msgSend.cont
+
+msgSend.cont: ; preds = %msgSend.nullinit, %msgSend.call
+ %0 = bitcast %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768* @_unnamed_cfstring_591 to i8*
+ %1 = call i8* @objc_retain(i8* %0) nounwind
+ br i1 undef, label %msgSend.nullinit33, label %msgSend.call32
+
+msgSend.call32: ; preds = %if.end10
+ br label %msgSend.cont34
+
+msgSend.nullinit33: ; preds = %if.end10
+ br label %msgSend.cont34
+
+msgSend.cont34: ; preds = %msgSend.nullinit33, %msgSend.call32
+ br i1 undef, label %msgSend.nullinit38, label %msgSend.call37
+
+msgSend.call37: ; preds = %msgSend.cont34
+ br label %msgSend.cont39
+
+msgSend.nullinit38: ; preds = %msgSend.cont34
+ br label %msgSend.cont39
+
+msgSend.cont39: ; preds = %msgSend.nullinit38, %msgSend.call37
+ br i1 undef, label %msgSend.nullinit49, label %msgSend.call48
+
+msgSend.call48: ; preds = %msgSend.cont39
+ br label %msgSend.cont50
+
+msgSend.nullinit49: ; preds = %msgSend.cont39
+ br label %msgSend.cont50
+
+msgSend.cont50: ; preds = %msgSend.nullinit49, %msgSend.call48
+ br i1 undef, label %msgSend.nullinit61, label %msgSend.call60
+
+msgSend.call60: ; preds = %msgSend.cont50
+ br label %msgSend.cont62
+
+msgSend.nullinit61: ; preds = %msgSend.cont50
+ br label %msgSend.cont62
+
+msgSend.cont62: ; preds = %msgSend.nullinit61, %msgSend.call60
+ br i1 undef, label %msgSend.nullinit67, label %msgSend.call66
+
+msgSend.call66: ; preds = %msgSend.cont62
+ br label %msgSend.cont68
+
+msgSend.nullinit67: ; preds = %msgSend.cont62
+ br label %msgSend.cont68
+
+msgSend.cont68: ; preds = %msgSend.nullinit67, %msgSend.call66
+ br i1 undef, label %msgSend.nullinit84, label %msgSend.call83
+
+msgSend.call83: ; preds = %msgSend.cont68
+ br label %msgSend.cont85
+
+msgSend.nullinit84: ; preds = %msgSend.cont68
+ br label %msgSend.cont85
+
+msgSend.cont85: ; preds = %msgSend.nullinit84, %msgSend.call83
+ br i1 undef, label %msgSend.nullinit90, label %msgSend.call89
+
+msgSend.call89: ; preds = %msgSend.cont85
+ br label %msgSend.cont91
+
+msgSend.nullinit90: ; preds = %msgSend.cont85
+ br label %msgSend.cont91
+
+msgSend.cont91: ; preds = %msgSend.nullinit90, %msgSend.call89
+ br i1 undef, label %msgSend.nullinit104, label %msgSend.call103
+
+msgSend.call103: ; preds = %msgSend.cont91
+ br label %msgSend.cont105
+
+msgSend.nullinit104: ; preds = %msgSend.cont91
+ br label %msgSend.cont105
+
+msgSend.cont105: ; preds = %msgSend.nullinit104, %msgSend.call103
+ br i1 undef, label %land.lhs.true, label %if.end123
+
+land.lhs.true: ; preds = %msgSend.cont105
+ br i1 undef, label %if.then117, label %if.end123
+
+if.then117: ; preds = %land.lhs.true
+ br label %if.end123
+
+if.end123: ; preds = %if.then117, %land.lhs.true, %msgSend.cont105
+ br i1 undef, label %msgSend.nullinit132, label %msgSend.call131
+
+msgSend.call131: ; preds = %if.end123
+ br label %msgSend.cont133
+
+msgSend.nullinit132: ; preds = %if.end123
+ br label %msgSend.cont133
+
+msgSend.cont133: ; preds = %msgSend.nullinit132, %msgSend.call131
+ br i1 undef, label %msgSend.nullinit139, label %msgSend.call138
+
+msgSend.call138: ; preds = %msgSend.cont133
+ br label %msgSend.cont140
+
+msgSend.nullinit139: ; preds = %msgSend.cont133
+ br label %msgSend.cont140
+
+msgSend.cont140: ; preds = %msgSend.nullinit139, %msgSend.call138
+ br i1 undef, label %if.then151, label %if.end157
+
+if.then151: ; preds = %msgSend.cont140
+ br label %if.end157
+
+if.end157: ; preds = %if.then151, %msgSend.cont140
+ br i1 undef, label %msgSend.nullinit164, label %msgSend.call163
+
+msgSend.call163: ; preds = %if.end157
+ br label %msgSend.cont165
+
+msgSend.nullinit164: ; preds = %if.end157
+ br label %msgSend.cont165
+
+msgSend.cont165: ; preds = %msgSend.nullinit164, %msgSend.call163
+ br i1 undef, label %msgSend.nullinit176, label %msgSend.call175
+
+msgSend.call175: ; preds = %msgSend.cont165
+ br label %msgSend.cont177
+
+msgSend.nullinit176: ; preds = %msgSend.cont165
+ br label %msgSend.cont177
+
+msgSend.cont177: ; preds = %msgSend.nullinit176, %msgSend.call175
+ br i1 undef, label %land.lhs.true181, label %if.end202
+
+land.lhs.true181: ; preds = %msgSend.cont177
+ br i1 undef, label %if.then187, label %if.end202
+
+if.then187: ; preds = %land.lhs.true181
+ br i1 undef, label %msgSend.nullinit199, label %msgSend.call198
+
+msgSend.call198: ; preds = %if.then187
+ br label %msgSend.cont200
+
+msgSend.nullinit199: ; preds = %if.then187
+ br label %msgSend.cont200
+
+msgSend.cont200: ; preds = %msgSend.nullinit199, %msgSend.call198
+ br label %if.end202
+
+if.end202: ; preds = %msgSend.cont200, %land.lhs.true181, %msgSend.cont177
+ br i1 undef, label %msgSend.nullinit236, label %msgSend.call235
+
+msgSend.call235: ; preds = %if.end202
+ br label %msgSend.cont237
+
+msgSend.nullinit236: ; preds = %if.end202
+ br label %msgSend.cont237
+
+msgSend.cont237: ; preds = %msgSend.nullinit236, %msgSend.call235
+ br i1 undef, label %msgSend.nullinit254, label %msgSend.call253
+
+msgSend.call253: ; preds = %msgSend.cont237
+ br label %msgSend.cont255
+
+msgSend.nullinit254: ; preds = %msgSend.cont237
+ br label %msgSend.cont255
+
+msgSend.cont255: ; preds = %msgSend.nullinit254, %msgSend.call253
+ br i1 undef, label %msgSend.nullinit269, label %msgSend.call268
+
+msgSend.call268: ; preds = %msgSend.cont255
+ br label %msgSend.cont270
+
+msgSend.nullinit269: ; preds = %msgSend.cont255
+ br label %msgSend.cont270
+
+msgSend.cont270: ; preds = %msgSend.nullinit269, %msgSend.call268
+ br i1 undef, label %msgSend.nullinit281, label %msgSend.call280
+
+msgSend.call280: ; preds = %msgSend.cont270
+ br label %msgSend.cont282
+
+msgSend.nullinit281: ; preds = %msgSend.cont270
+ br label %msgSend.cont282
+
+msgSend.cont282: ; preds = %msgSend.nullinit281, %msgSend.call280
+ br i1 undef, label %msgSend.nullinit287, label %msgSend.call286
+
+msgSend.call286: ; preds = %msgSend.cont282
+ br label %msgSend.cont288
+
+msgSend.nullinit287: ; preds = %msgSend.cont282
+ br label %msgSend.cont288
+
+msgSend.cont288: ; preds = %msgSend.nullinit287, %msgSend.call286
+ br i1 undef, label %msgSend.nullinit303, label %msgSend.call302
+
+msgSend.call302: ; preds = %msgSend.cont288
+ br label %msgSend.cont304
+
+msgSend.nullinit303: ; preds = %msgSend.cont288
+ br label %msgSend.cont304
+
+msgSend.cont304: ; preds = %msgSend.nullinit303, %msgSend.call302
+ br i1 undef, label %msgSend.nullinit344, label %msgSend.call343
+
+msgSend.call343: ; preds = %msgSend.cont304
+ br label %msgSend.cont345
+
+msgSend.nullinit344: ; preds = %msgSend.cont304
+ br label %msgSend.cont345
+
+msgSend.cont345: ; preds = %msgSend.nullinit344, %msgSend.call343
+ br i1 undef, label %msgSend.nullinit350, label %msgSend.call349
+
+msgSend.call349: ; preds = %msgSend.cont345
+ br label %msgSend.cont351
+
+msgSend.nullinit350: ; preds = %msgSend.cont345
+ br label %msgSend.cont351
+
+msgSend.cont351: ; preds = %msgSend.nullinit350, %msgSend.call349
+ br i1 undef, label %msgSend.nullinit366, label %msgSend.call365
+
+msgSend.call365: ; preds = %msgSend.cont351
+ br label %msgSend.cont367
+
+msgSend.nullinit366: ; preds = %msgSend.cont351
+ br label %msgSend.cont367
+
+msgSend.cont367: ; preds = %msgSend.nullinit366, %msgSend.call365
+ br i1 undef, label %msgSend.nullinit376, label %msgSend.call375
+
+msgSend.call375: ; preds = %msgSend.cont367
+ br label %msgSend.cont377
+
+msgSend.nullinit376: ; preds = %msgSend.cont367
+ br label %msgSend.cont377
+
+msgSend.cont377: ; preds = %msgSend.nullinit376, %msgSend.call375
+ br i1 undef, label %if.then384, label %if.else401
+
+if.then384: ; preds = %msgSend.cont377
+ br i1 undef, label %msgSend.nullinit392, label %msgSend.call391
+
+msgSend.call391: ; preds = %if.then384
+ br label %msgSend.cont393
+
+msgSend.nullinit392: ; preds = %if.then384
+ br label %msgSend.cont393
+
+msgSend.cont393: ; preds = %msgSend.nullinit392, %msgSend.call391
+ br label %if.end418
+
+if.else401: ; preds = %msgSend.cont377
+ br i1 undef, label %msgSend.nullinit409, label %msgSend.call408
+
+msgSend.call408: ; preds = %if.else401
+ br label %msgSend.cont410
+
+msgSend.nullinit409: ; preds = %if.else401
+ br label %msgSend.cont410
+
+msgSend.cont410: ; preds = %msgSend.nullinit409, %msgSend.call408
+ br label %if.end418
+
+if.end418: ; preds = %msgSend.cont410, %msgSend.cont393
+ br i1 undef, label %msgSend.nullinit470, label %msgSend.call469
+
+msgSend.call469: ; preds = %if.end418
+ br label %msgSend.cont471
+
+msgSend.nullinit470: ; preds = %if.end418
+ br label %msgSend.cont471
+
+msgSend.cont471: ; preds = %msgSend.nullinit470, %msgSend.call469
+ br i1 undef, label %msgSend.nullinit484, label %msgSend.call483
+
+msgSend.call483: ; preds = %msgSend.cont471
+ br label %msgSend.cont485
+
+msgSend.nullinit484: ; preds = %msgSend.cont471
+ br label %msgSend.cont485
+
+msgSend.cont485: ; preds = %msgSend.nullinit484, %msgSend.call483
+ br i1 undef, label %msgSend.nullinit500, label %msgSend.call499
+
+msgSend.call499: ; preds = %msgSend.cont485
+ br label %msgSend.cont501
+
+msgSend.nullinit500: ; preds = %msgSend.cont485
+ br label %msgSend.cont501
+
+msgSend.cont501: ; preds = %msgSend.nullinit500, %msgSend.call499
+ br i1 undef, label %msgSend.nullinit506, label %msgSend.call505
+
+msgSend.call505: ; preds = %msgSend.cont501
+ br label %msgSend.cont507
+
+msgSend.nullinit506: ; preds = %msgSend.cont501
+ br label %msgSend.cont507
+
+msgSend.cont507: ; preds = %msgSend.nullinit506, %msgSend.call505
+ call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+ ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/PhaseOrdering/gdce.ll b/test/Transforms/PhaseOrdering/gdce.ll
new file mode 100644
index 000000000000..273e47e97cb4
--- /dev/null
+++ b/test/Transforms/PhaseOrdering/gdce.ll
@@ -0,0 +1,106 @@
+; RUN: opt -O2 -S %s | FileCheck %s
+
+; Run global DCE to eliminate unused ctor and dtor.
+; rdar://9142819
+
+; CHECK: main
+; CHECK-NOT: _ZN4BaseC1Ev
+; CHECK-NOT: _ZN4BaseD1Ev
+; CHECK-NOT: _ZN4BaseD2Ev
+; CHECK-NOT: _ZN4BaseC2Ev
+; CHECK-NOT: _ZN4BaseD0Ev
+
+%class.Base = type { i32 (...)** }
+
+@_ZTV4Base = linkonce_odr unnamed_addr constant [4 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI4Base to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD1Ev to i8*), i8* bitcast (void (%class.Base*)* @_ZN4BaseD0Ev to i8*)]
+@_ZTVN10__cxxabiv117__class_type_infoE = external global i8*
+@_ZTS4Base = linkonce_odr constant [6 x i8] c"4Base\00"
+@_ZTI4Base = linkonce_odr unnamed_addr constant { i8*, i8* } { i8* bitcast (i8** getelementptr inbounds (i8** @_ZTVN10__cxxabiv117__class_type_infoE, i64 2) to i8*), i8* getelementptr inbounds ([6 x i8]* @_ZTS4Base, i32 0, i32 0) }
+
+define i32 @main() uwtable ssp {
+entry:
+ %retval = alloca i32, align 4
+ %b = alloca %class.Base, align 8
+ %cleanup.dest.slot = alloca i32
+ store i32 0, i32* %retval
+ call void @_ZN4BaseC1Ev(%class.Base* %b)
+ store i32 0, i32* %retval
+ store i32 1, i32* %cleanup.dest.slot
+ call void @_ZN4BaseD1Ev(%class.Base* %b)
+ %0 = load i32* %retval
+ ret i32 %0
+}
+
+define linkonce_odr void @_ZN4BaseC1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.Base*, align 8
+ store %class.Base* %this, %class.Base** %this.addr, align 8
+ %this1 = load %class.Base** %this.addr
+ call void @_ZN4BaseC2Ev(%class.Base* %this1)
+ ret void
+}
+
+define linkonce_odr void @_ZN4BaseD1Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.Base*, align 8
+ store %class.Base* %this, %class.Base** %this.addr, align 8
+ %this1 = load %class.Base** %this.addr
+ call void @_ZN4BaseD2Ev(%class.Base* %this1)
+ ret void
+}
+
+define linkonce_odr void @_ZN4BaseD2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.Base*, align 8
+ store %class.Base* %this, %class.Base** %this.addr, align 8
+ %this1 = load %class.Base** %this.addr
+ ret void
+}
+
+define linkonce_odr void @_ZN4BaseC2Ev(%class.Base* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.Base*, align 8
+ store %class.Base* %this, %class.Base** %this.addr, align 8
+ %this1 = load %class.Base** %this.addr
+ %0 = bitcast %class.Base* %this1 to i8***
+ store i8** getelementptr inbounds ([4 x i8*]* @_ZTV4Base, i64 0, i64 2), i8*** %0
+ ret void
+}
+
+define linkonce_odr void @_ZN4BaseD0Ev(%class.Base* %this) unnamed_addr uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.Base*, align 8
+ %exn.slot = alloca i8*
+ %ehselector.slot = alloca i32
+ store %class.Base* %this, %class.Base** %this.addr, align 8
+ %this1 = load %class.Base** %this.addr
+ invoke void @_ZN4BaseD1Ev(%class.Base* %this1)
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont: ; preds = %entry
+ %0 = bitcast %class.Base* %this1 to i8*
+ call void @_ZdlPv(i8* %0) nounwind
+ ret void
+
+lpad: ; preds = %entry
+ %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ cleanup
+ %2 = extractvalue { i8*, i32 } %1, 0
+ store i8* %2, i8** %exn.slot
+ %3 = extractvalue { i8*, i32 } %1, 1
+ store i32 %3, i32* %ehselector.slot
+ %4 = bitcast %class.Base* %this1 to i8*
+ call void @_ZdlPv(i8* %4) nounwind
+ br label %eh.resume
+
+eh.resume: ; preds = %lpad
+ %exn = load i8** %exn.slot
+ %sel = load i32* %ehselector.slot
+ %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+ %lpad.val2 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+ resume { i8*, i32 } %lpad.val2
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @_ZdlPv(i8*) nounwind
diff --git a/test/Transforms/Reassociate/crash.ll b/test/Transforms/Reassociate/crash.ll
index ce586e15fbcf..e29b5dc9c0ce 100644
--- a/test/Transforms/Reassociate/crash.ll
+++ b/test/Transforms/Reassociate/crash.ll
@@ -144,3 +144,31 @@ define i32 @sozefx_(i32 %x, i32 %y) {
%t6 = add i32 %t4, %t5
ret i32 %t6
}
+
+define i32 @bar(i32 %arg, i32 %arg1, i32 %arg2) {
+ %tmp1 = mul i32 %arg1, 2
+ %tmp2 = mul i32 %tmp1, 3
+ %tmp3 = mul i32 %arg2, 2
+ %tmp4 = add i32 %tmp1, 1 ; dead code
+ %ret = add i32 %tmp2, %tmp3
+ ret i32 %ret
+}
+
+; PR14060
+define i8 @hang(i8 %p, i8 %p0, i8 %p1, i8 %p2, i8 %p3, i8 %p4, i8 %p5, i8 %p6, i8 %p7, i8 %p8, i8 %p9) {
+ %tmp = zext i1 false to i8
+ %tmp16 = or i8 %tmp, 1
+ %tmp22 = or i8 %p7, %p0
+ %tmp23 = or i8 %tmp16, %tmp22
+ %tmp28 = or i8 %p9, %p1
+ %tmp31 = or i8 %tmp23, %p2
+ %tmp32 = or i8 %tmp31, %tmp28
+ %tmp38 = or i8 %p8, %p3
+ %tmp39 = or i8 %tmp16, %tmp38
+ %tmp43 = or i8 %tmp39, %p4
+ %tmp44 = or i8 %tmp43, 1
+ %tmp47 = or i8 %tmp32, %p5
+ %tmp50 = or i8 %tmp47, %p6
+ %tmp51 = or i8 %tmp44, %tmp50
+ ret i8 %tmp51
+}
diff --git a/test/Transforms/SCCP/loadtest.ll b/test/Transforms/SCCP/loadtest.ll
index add2af483f56..dd1dba69143c 100644
--- a/test/Transforms/SCCP/loadtest.ll
+++ b/test/Transforms/SCCP/loadtest.ll
@@ -1,8 +1,9 @@
; This test makes sure that these instructions are properly constant propagated.
-target datalayout = "e-p:32:32"
+; RUN: opt < %s -default-data-layout="e-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32" -sccp -S | FileCheck %s
-; RUN: opt < %s -sccp -S | not grep load
+; CHECK-NOT: load
@X = constant i32 42 ; <i32*> [#uses=1]
diff --git a/test/Transforms/SROA/alignment.ll b/test/Transforms/SROA/alignment.ll
new file mode 100644
index 000000000000..ad5fb6c4a5d8
--- /dev/null
+++ b/test/Transforms/SROA/alignment.ll
@@ -0,0 +1,171 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
+
+define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) {
+; CHECK: @test1
+; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 0
+; CHECK: %[[a0:.*]] = load i8* %[[gep_a0]], align 16
+; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }* %a, i64 0, i32 1
+; CHECK: %[[a1:.*]] = load i8* %[[gep_a1]], align 1
+; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 0
+; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16
+; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }* %b, i64 0, i32 1
+; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1
+; CHECK: ret void
+
+entry:
+ %alloca = alloca { i8, i8 }, align 16
+ %gep_a = getelementptr { i8, i8 }* %a, i32 0, i32 0
+ %gep_alloca = getelementptr { i8, i8 }* %alloca, i32 0, i32 0
+ %gep_b = getelementptr { i8, i8 }* %b, i32 0, i32 0
+
+ store i8 420, i8* %gep_alloca, align 16
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_alloca, i8* %gep_a, i32 2, i32 16, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %gep_b, i8* %gep_alloca, i32 2, i32 16, i1 false)
+ ret void
+}
+
+define void @test2() {
+; CHECK: @test2
+; CHECK: alloca i16
+; CHECK: load i8* %{{.*}}
+; CHECK: store i8 42, i8* %{{.*}}
+; CHECK: ret void
+
+entry:
+ %a = alloca { i8, i8, i8, i8 }, align 2
+ %gep1 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 1
+ %cast1 = bitcast i8* %gep1 to i16*
+ store volatile i16 0, i16* %cast1
+ %gep2 = getelementptr { i8, i8, i8, i8 }* %a, i32 0, i32 2
+ %result = load i8* %gep2
+ store i8 42, i8* %gep2
+ ret void
+}
+
+define void @PR13920(<2 x i64>* %a, i16* %b) {
+; Test that alignments on memcpy intrinsics get propagated to loads and stores.
+; CHECK: @PR13920
+; CHECK: load <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
+
+entry:
+ %aa = alloca <2 x i64>, align 16
+ %aptr = bitcast <2 x i64>* %a to i8*
+ %aaptr = bitcast <2 x i64>* %aa to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %aaptr, i8* %aptr, i32 16, i32 2, i1 false)
+ %bptr = bitcast i16* %b to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %bptr, i8* %aaptr, i32 16, i32 2, i1 false)
+ ret void
+}
+
+define void @test3(i8* %x) {
+; Test that when we promote an alloca to a type with lower ABI alignment, we
+; provide the needed explicit alignment that code using the alloca may be
+; expecting. However, also check that any offset within an alloca can in turn
+; reduce the alignment.
+; CHECK: @test3
+; CHECK: alloca [22 x i8], align 8
+; CHECK: alloca [18 x i8], align 2
+; CHECK: ret void
+
+entry:
+ %a = alloca { i8*, i8*, i8* }
+ %b = alloca { i8*, i8*, i8* }
+ %a_raw = bitcast { i8*, i8*, i8* }* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a_raw, i8* %x, i32 22, i32 8, i1 false)
+ %b_raw = bitcast { i8*, i8*, i8* }* %b to i8*
+ %b_gep = getelementptr i8* %b_raw, i32 6
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b_gep, i8* %x, i32 18, i32 2, i1 false)
+ ret void
+}
+
+define void @test5() {
+; Test that we preserve underaligned loads and stores when splitting.
+; CHECK: @test5
+; CHECK: alloca [9 x i8]
+; CHECK: alloca [9 x i8]
+; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
+; CHECK: load i16* %{{.*}}, align 1
+; CHECK: load double* %{{.*}}, align 1
+; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
+; CHECK: load i16* %{{.*}}, align 1
+; CHECK: ret void
+
+entry:
+ %a = alloca [18 x i8]
+ %raw1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 0
+ %ptr1 = bitcast i8* %raw1 to double*
+ store volatile double 0.0, double* %ptr1, align 1
+ %weird_gep1 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 7
+ %weird_cast1 = bitcast i8* %weird_gep1 to i16*
+ %weird_load1 = load i16* %weird_cast1, align 1
+
+ %raw2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 9
+ %ptr2 = bitcast i8* %raw2 to double*
+ %d1 = load double* %ptr1, align 1
+ store volatile double %d1, double* %ptr2, align 1
+ %weird_gep2 = getelementptr inbounds [18 x i8]* %a, i32 0, i32 16
+ %weird_cast2 = bitcast i8* %weird_gep2 to i16*
+ %weird_load2 = load i16* %weird_cast2, align 1
+
+ ret void
+}
+
+define void @test6() {
+; Test that we promote alignment when the underlying alloca switches to one
+; that innately provides it.
+; CHECK: @test6
+; CHECK: alloca double
+; CHECK: alloca double
+; CHECK-NOT: align
+; CHECK: ret void
+
+entry:
+ %a = alloca [16 x i8]
+ %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0
+ %ptr1 = bitcast i8* %raw1 to double*
+ store volatile double 0.0, double* %ptr1, align 1
+
+ %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8
+ %ptr2 = bitcast i8* %raw2 to double*
+ %val = load double* %ptr1, align 1
+ store volatile double %val, double* %ptr2, align 1
+
+ ret void
+}
+
+define void @test7(i8* %out) {
+; Test that we properly compute the destination alignment when rewriting
+; memcpys as direct loads or stores.
+; CHECK: @test7
+; CHECK-NOT: alloca
+
+entry:
+ %a = alloca [16 x i8]
+ %raw1 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 0
+ %ptr1 = bitcast i8* %raw1 to double*
+ %raw2 = getelementptr inbounds [16 x i8]* %a, i32 0, i32 8
+ %ptr2 = bitcast i8* %raw2 to double*
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i32 0, i1 false)
+; CHECK: %[[val2:.*]] = load double* %{{.*}}, align 1
+; CHECK: %[[val1:.*]] = load double* %{{.*}}, align 1
+
+ %val1 = load double* %ptr2, align 1
+ %val2 = load double* %ptr1, align 1
+
+ store double %val1, double* %ptr1, align 1
+ store double %val2, double* %ptr2, align 1
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i32 0, i1 false)
+; CHECK: store double %[[val1]], double* %{{.*}}, align 1
+; CHECK: store double %[[val2]], double* %{{.*}}, align 1
+
+ ret void
+; CHECK: ret void
+}
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
new file mode 100644
index 000000000000..b363eefb3f9d
--- /dev/null
+++ b/test/Transforms/SROA/basictest.ll
@@ -0,0 +1,1136 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define i32 @test0() {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: ret i32
+
+entry:
+ %a1 = alloca i32
+ %a2 = alloca float
+
+ %a1.i8 = bitcast i32* %a1 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %a1.i8)
+
+ store i32 0, i32* %a1
+ %v1 = load i32* %a1
+
+ call void @llvm.lifetime.end(i64 4, i8* %a1.i8)
+
+ %a2.i8 = bitcast float* %a2 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %a2.i8)
+
+ store float 0.0, float* %a2
+ %v2 = load float * %a2
+ %v2.int = bitcast float %v2 to i32
+ %sum1 = add i32 %v1, %v2.int
+
+ call void @llvm.lifetime.end(i64 4, i8* %a2.i8)
+
+ ret i32 %sum1
+}
+
+define i32 @test1() {
+; CHECK: @test1
+; CHECK-NOT: alloca
+; CHECK: ret i32 0
+
+entry:
+ %X = alloca { i32, float }
+ %Y = getelementptr { i32, float }* %X, i64 0, i32 0
+ store i32 0, i32* %Y
+ %Z = load i32* %Y
+ ret i32 %Z
+}
+
+define i64 @test2(i64 %X) {
+; CHECK: @test2
+; CHECK-NOT: alloca
+; CHECK: ret i64 %X
+
+entry:
+ %A = alloca [8 x i8]
+ %B = bitcast [8 x i8]* %A to i64*
+ store i64 %X, i64* %B
+ br label %L2
+
+L2:
+ %Z = load i64* %B
+ ret i64 %Z
+}
+
+define void @test3(i8* %dst, i8* %src) {
+; CHECK: @test3
+
+entry:
+ %a = alloca [300 x i8]
+; CHECK-NOT: alloca
+; CHECK: %[[test3_a1:.*]] = alloca [42 x i8]
+; CHECK-NEXT: %[[test3_a2:.*]] = alloca [99 x i8]
+; CHECK-NEXT: %[[test3_a3:.*]] = alloca [16 x i8]
+; CHECK-NEXT: %[[test3_a4:.*]] = alloca [42 x i8]
+; CHECK-NEXT: %[[test3_a5:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test3_a6:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test3_a7:.*]] = alloca [85 x i8]
+
+ %b = getelementptr [300 x i8]* %a, i64 0, i64 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 300, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 42
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42
+; CHECK-NEXT: %[[test3_r1:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 142
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 158
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 200
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 207
+; CHECK-NEXT: %[[test3_r2:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 208
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 215
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85
+
+ ; Clobber a single element of the array; this should be promotable.
+ %c = getelementptr [300 x i8]* %a, i64 0, i64 42
+ store i8 0, i8* %c
+
+ ; Make a sequence of overlapping stores to the array. These overlap both in
+ ; forward strides and in shrinking accesses.
+ %overlap.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 142
+ %overlap.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 143
+ %overlap.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 144
+ %overlap.4.i8 = getelementptr [300 x i8]* %a, i64 0, i64 145
+ %overlap.5.i8 = getelementptr [300 x i8]* %a, i64 0, i64 146
+ %overlap.6.i8 = getelementptr [300 x i8]* %a, i64 0, i64 147
+ %overlap.7.i8 = getelementptr [300 x i8]* %a, i64 0, i64 148
+ %overlap.8.i8 = getelementptr [300 x i8]* %a, i64 0, i64 149
+ %overlap.9.i8 = getelementptr [300 x i8]* %a, i64 0, i64 150
+ %overlap.1.i16 = bitcast i8* %overlap.1.i8 to i16*
+ %overlap.1.i32 = bitcast i8* %overlap.1.i8 to i32*
+ %overlap.1.i64 = bitcast i8* %overlap.1.i8 to i64*
+ %overlap.2.i64 = bitcast i8* %overlap.2.i8 to i64*
+ %overlap.3.i64 = bitcast i8* %overlap.3.i8 to i64*
+ %overlap.4.i64 = bitcast i8* %overlap.4.i8 to i64*
+ %overlap.5.i64 = bitcast i8* %overlap.5.i8 to i64*
+ %overlap.6.i64 = bitcast i8* %overlap.6.i8 to i64*
+ %overlap.7.i64 = bitcast i8* %overlap.7.i8 to i64*
+ %overlap.8.i64 = bitcast i8* %overlap.8.i8 to i64*
+ %overlap.9.i64 = bitcast i8* %overlap.9.i8 to i64*
+ store i8 1, i8* %overlap.1.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+ store i16 1, i16* %overlap.1.i16
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+ store i32 1, i32* %overlap.1.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i64 1, i64* %overlap.1.i64
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i64*
+; CHECK-NEXT: store i64 1, i64* %[[bitcast]]
+ store i64 2, i64* %overlap.2.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 2, i64* %[[bitcast]]
+ store i64 3, i64* %overlap.3.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 3, i64* %[[bitcast]]
+ store i64 4, i64* %overlap.4.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 4, i64* %[[bitcast]]
+ store i64 5, i64* %overlap.5.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 4
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 5, i64* %[[bitcast]]
+ store i64 6, i64* %overlap.6.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 5
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 6, i64* %[[bitcast]]
+ store i64 7, i64* %overlap.7.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 6
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 7, i64* %[[bitcast]]
+ store i64 8, i64* %overlap.8.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 7
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 8, i64* %[[bitcast]]
+ store i64 9, i64* %overlap.9.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 8
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 9, i64* %[[bitcast]]
+
+ ; Make two sequences of overlapping stores with more gaps and irregularities.
+ %overlap2.1.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 200
+ %overlap2.1.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 201
+ %overlap2.1.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 202
+ %overlap2.1.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 203
+
+ %overlap2.2.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 208
+ %overlap2.2.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 209
+ %overlap2.2.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 210
+ %overlap2.2.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 211
+
+ %overlap2.1.0.i16 = bitcast i8* %overlap2.1.0.i8 to i16*
+ %overlap2.1.0.i32 = bitcast i8* %overlap2.1.0.i8 to i32*
+ %overlap2.1.1.i32 = bitcast i8* %overlap2.1.1.i8 to i32*
+ %overlap2.1.2.i32 = bitcast i8* %overlap2.1.2.i8 to i32*
+ %overlap2.1.3.i32 = bitcast i8* %overlap2.1.3.i8 to i32*
+ store i8 1, i8* %overlap2.1.0.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+ store i16 1, i16* %overlap2.1.0.i16
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+ store i32 1, i32* %overlap2.1.0.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i32 2, i32* %overlap2.1.1.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 2, i32* %[[bitcast]]
+ store i32 3, i32* %overlap2.1.2.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 3, i32* %[[bitcast]]
+ store i32 4, i32* %overlap2.1.3.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 4, i32* %[[bitcast]]
+
+ %overlap2.2.0.i32 = bitcast i8* %overlap2.2.0.i8 to i32*
+ %overlap2.2.1.i16 = bitcast i8* %overlap2.2.1.i8 to i16*
+ %overlap2.2.1.i32 = bitcast i8* %overlap2.2.1.i8 to i32*
+ %overlap2.2.2.i32 = bitcast i8* %overlap2.2.2.i8 to i32*
+ %overlap2.2.3.i32 = bitcast i8* %overlap2.2.3.i8 to i32*
+ store i32 1, i32* %overlap2.2.0.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a6]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i8 1, i8* %overlap2.2.1.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+ store i16 1, i16* %overlap2.2.1.i16
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+ store i32 1, i32* %overlap2.2.1.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i32 3, i32* %overlap2.2.2.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 3, i32* %[[bitcast]]
+ store i32 4, i32* %overlap2.2.3.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 4, i32* %[[bitcast]]
+
+ %overlap2.prefix = getelementptr i8* %overlap2.1.1.i8, i64 -4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.prefix, i8* %src, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 39
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 3
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 3
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 5
+
+ ; Bridge between the overlapping areas
+ call void @llvm.memset.p0i8.i32(i8* %overlap2.1.2.i8, i8 42, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 5
+; ...promoted i8 store...
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 2
+
+ ; Entirely within the second overlap.
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.1.i8, i8* %src, i32 5, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5
+
+ ; Trailing past the second overlap.
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.2.i8, i8* %src, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 5
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 3
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 300, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 42
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42
+; CHECK-NEXT: store i8 0, i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 142
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 158
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 200
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 207
+; CHECK-NEXT: store i8 42, i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 208
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 215
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85
+
+ ret void
+}
+
+define void @test4(i8* %dst, i8* %src) {
+; CHECK: @test4
+
+entry:
+ %a = alloca [100 x i8]
+; CHECK-NOT: alloca
+; CHECK: %[[test4_a1:.*]] = alloca [20 x i8]
+; CHECK-NEXT: %[[test4_a2:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a3:.*]] = alloca [10 x i8]
+; CHECK-NEXT: %[[test4_a4:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a5:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a6:.*]] = alloca [40 x i8]
+
+ %b = getelementptr [100 x i8]* %a, i64 0, i64 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 100, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 20
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 20
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r1:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 22
+; CHECK-NEXT: %[[test4_r2:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 23
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 30
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 40
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r3:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42
+; CHECK-NEXT: %[[test4_r4:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 50
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r5:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 52
+; CHECK-NEXT: %[[test4_r6:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 53
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 60
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40
+
+ %a.src.1 = getelementptr [100 x i8]* %a, i64 0, i64 20
+ %a.dst.1 = getelementptr [100 x i8]* %a, i64 0, i64 40
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.1, i32 10, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+
+ ; Clobber a single element of the array; this should be promotable and then deleted.
+ %c = getelementptr [100 x i8]* %a, i64 0, i64 42
+ store i8 0, i8* %c
+
+ %a.src.2 = getelementptr [100 x i8]* %a, i64 0, i64 50
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.2, i32 10, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 100, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 20
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 20
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r1]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 22
+; CHECK-NEXT: store i8 %[[test4_r2]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 23
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 30
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 40
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 50
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 52
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 53
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 60
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40
+
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define i16 @test5() {
+; CHECK: @test5
+; CHECK-NOT: alloca float
+; CHECK: %[[cast:.*]] = bitcast float 0.0{{.*}} to i32
+; CHECK-NEXT: %[[shr:.*]] = lshr i32 %[[cast]], 16
+; CHECK-NEXT: %[[trunc:.*]] = trunc i32 %[[shr]] to i16
+; CHECK-NEXT: ret i16 %[[trunc]]
+
+entry:
+ %a = alloca [4 x i8]
+ %fptr = bitcast [4 x i8]* %a to float*
+ store float 0.0, float* %fptr
+ %ptr = getelementptr [4 x i8]* %a, i32 0, i32 2
+ %iptr = bitcast i8* %ptr to i16*
+ %val = load i16* %iptr
+ ret i16 %val
+}
+
+define i32 @test6() {
+; CHECK: @test6
+; CHECK: alloca i32
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: load i32*
+; CHECK-NEXT: ret i32
+
+entry:
+ %a = alloca [4 x i8]
+ %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %ptr, i8 42, i32 4, i32 1, i1 true)
+ %iptr = bitcast i8* %ptr to i32*
+ %val = load i32* %iptr
+ ret i32 %val
+}
+
+define void @test7(i8* %src, i8* %dst) {
+; CHECK: @test7
+; CHECK: alloca i32
+; CHECK-NEXT: bitcast i8* %src to i32*
+; CHECK-NEXT: load volatile i32*
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: bitcast i8* %dst to i32*
+; CHECK-NEXT: load volatile i32*
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: ret
+
+entry:
+ %a = alloca [4 x i8]
+ %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true)
+ ret void
+}
+
+
+%S1 = type { i32, i32, [16 x i8] }
+%S2 = type { %S1*, %S2* }
+
+define %S2 @test8(%S2* %s2) {
+; CHECK: @test8
+entry:
+ %new = alloca %S2
+; CHECK-NOT: alloca
+
+ %s2.next.ptr = getelementptr %S2* %s2, i64 0, i32 1
+ %s2.next = load %S2** %s2.next.ptr
+; CHECK: %[[gep:.*]] = getelementptr %S2* %s2, i64 0, i32 1
+; CHECK-NEXT: %[[next:.*]] = load %S2** %[[gep]]
+
+ %s2.next.s1.ptr = getelementptr %S2* %s2.next, i64 0, i32 0
+ %s2.next.s1 = load %S1** %s2.next.s1.ptr
+ %new.s1.ptr = getelementptr %S2* %new, i64 0, i32 0
+ store %S1* %s2.next.s1, %S1** %new.s1.ptr
+ %s2.next.next.ptr = getelementptr %S2* %s2.next, i64 0, i32 1
+ %s2.next.next = load %S2** %s2.next.next.ptr
+ %new.next.ptr = getelementptr %S2* %new, i64 0, i32 1
+ store %S2* %s2.next.next, %S2** %new.next.ptr
+; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 0
+; CHECK-NEXT: %[[next_s1:.*]] = load %S1** %[[gep]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 1
+; CHECK-NEXT: %[[next_next:.*]] = load %S2** %[[gep]]
+
+ %new.s1 = load %S1** %new.s1.ptr
+ %result1 = insertvalue %S2 undef, %S1* %new.s1, 0
+; CHECK-NEXT: %[[result1:.*]] = insertvalue %S2 undef, %S1* %[[next_s1]], 0
+ %new.next = load %S2** %new.next.ptr
+ %result2 = insertvalue %S2 %result1, %S2* %new.next, 1
+; CHECK-NEXT: %[[result2:.*]] = insertvalue %S2 %[[result1]], %S2* %[[next_next]], 1
+ ret %S2 %result2
+; CHECK-NEXT: ret %S2 %[[result2]]
+}
+
+define i64 @test9() {
+; Ensure we can handle loads off the end of an alloca even when wrapped in
+; weird bit casts and types. The result is undef, but this shouldn't crash
+; anything.
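+; (The alloca is only 3 bytes, but the final load reads a full i64, so most of
+; the load lies past the end of the allocation and the result folds to undef,
+; as the CHECK line below expects.)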
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK: ret i64 undef
+
+entry:
+ %a = alloca { [3 x i8] }
+ %gep1 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 0
+ store i8 0, i8* %gep1, align 1
+ %gep2 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep2, align 1
+ %gep3 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 2
+ store i8 26, i8* %gep3, align 1
+ %cast = bitcast { [3 x i8] }* %a to { i64 }*
+ %elt = getelementptr inbounds { i64 }* %cast, i32 0, i32 0
+ %result = load i64* %elt
+ ret i64 %result
+}
+
+define %S2* @test10() {
+; CHECK: @test10
+; CHECK-NOT: alloca %S2*
+; CHECK: ret %S2* null
+
+entry:
+ %a = alloca [8 x i8]
+ %ptr = getelementptr [8 x i8]* %a, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %ptr, i8 0, i32 8, i32 1, i1 false)
+ %s2ptrptr = bitcast i8* %ptr to %S2**
+ %s2ptr = load %S2** %s2ptrptr
+ ret %S2* %s2ptr
+}
+
+define i32 @test11() {
+; CHECK: @test11
+; CHECK-NOT: alloca
+; CHECK: ret i32 0
+
+entry:
+ %X = alloca i32
+ br i1 undef, label %good, label %bad
+
+good:
+ %Y = getelementptr i32* %X, i64 0
+ store i32 0, i32* %Y
+ %Z = load i32* %Y
+ ret i32 %Z
+
+bad:
+ %Y2 = getelementptr i32* %X, i64 1
+ store i32 0, i32* %Y2
+ %Z2 = load i32* %Y2
+ ret i32 %Z2
+}
+
+define i8 @test12() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca.
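+; On this little-endian target, byte k of the [3 x i8] alloca maps to bits
+; [8*k, 8*k+8) of the promoted i24, so each i8 store becomes a zext to i24,
+; a shift left by 8*k, a mask clearing those 8 bits, and an or; each i8 load
+; becomes the reverse lshr/trunc, as the CHECK lines below verify.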
+;
+; CHECK: @test12
+
+entry:
+ %a = alloca [3 x i8]
+ %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+ %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+ store i8 0, i8* %a0ptr
+ %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+ store i8 0, i8* %a1ptr
+ %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+ store i8 0, i8* %a2ptr
+ %aiptr = bitcast [3 x i8]* %a to i24*
+ %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
+
+ %biptr = bitcast [3 x i8]* %b to i24*
+ store i24 %ai, i24* %biptr
+ %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+ %b0 = load i8* %b0ptr
+ %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+ %b1 = load i8* %b1ptr
+ %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+ %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
+
+ %bsum0 = add i8 %b0, %b1
+ %bsum1 = add i8 %bsum0, %b2
+ ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
+define i32 @test13() {
+; Ensure we handle undefined loads that straddle the end of the allocation
+; without crashing.
+; CHECK: @test13
+; CHECK: %[[ret:.*]] = zext i16 undef to i32
+; CHECK: ret i32 %[[ret]]
+
+entry:
+ %a = alloca [3 x i8]
+ %b0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+ store i8 0, i8* %b0ptr
+ %b1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+ store i8 0, i8* %b1ptr
+ %b2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+ store i8 0, i8* %b2ptr
+ %iptrcast = bitcast [3 x i8]* %a to i16*
+ %iptrgep = getelementptr i16* %iptrcast, i64 1
+ %i = load i16* %iptrgep
+ %ret = zext i16 %i to i32
+ ret i32 %ret
+}
+
+%test14.struct = type { [3 x i32] }
+
+define void @test14(...) nounwind uwtable {
+; This is a strange case where we split allocas into promotable partitions, but
+; also gain enough data to prove they must be dead allocas due to GEPs that walk
+; across two adjacent allocas. Test that we don't try to promote or otherwise
+; do bad things to these dead allocas; they should just be removed.
+; CHECK: @test14
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca %test14.struct
+ %p = alloca %test14.struct*
+ %0 = bitcast %test14.struct* %a to i8*
+ %1 = getelementptr i8* %0, i64 12
+ %2 = bitcast i8* %1 to %test14.struct*
+ %3 = getelementptr inbounds %test14.struct* %2, i32 0, i32 0
+ %4 = getelementptr inbounds %test14.struct* %a, i32 0, i32 0
+ %5 = bitcast [3 x i32]* %3 to i32*
+ %6 = bitcast [3 x i32]* %4 to i32*
+ %7 = load i32* %6, align 4
+ store i32 %7, i32* %5, align 4
+ %8 = getelementptr inbounds i32* %5, i32 1
+ %9 = getelementptr inbounds i32* %6, i32 1
+ %10 = load i32* %9, align 4
+ store i32 %10, i32* %8, align 4
+ %11 = getelementptr inbounds i32* %5, i32 2
+ %12 = getelementptr inbounds i32* %6, i32 2
+ %13 = load i32* %12, align 4
+ store i32 %13, i32* %11, align 4
+ ret void
+}
+
+define i32 @test15(i1 %flag) nounwind uwtable {
+; Ensure that when there are dead instructions using an alloca that are not
+; loads or stores, we still delete them during partitioning and rewriting.
+; Otherwise we'll try to promote them while they still have unpromotable uses.
+; CHECK: @test15
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: br label %loop
+
+entry:
+ %l0 = alloca i64
+ %l1 = alloca i64
+ %l2 = alloca i64
+ %l3 = alloca i64
+ br label %loop
+
+loop:
+ %dead3 = phi i8* [ %gep3, %loop ], [ null, %entry ]
+
+ store i64 1879048192, i64* %l0, align 8
+ %bc0 = bitcast i64* %l0 to i8*
+ %gep0 = getelementptr i8* %bc0, i64 3
+ %dead0 = bitcast i8* %gep0 to i64*
+
+ store i64 1879048192, i64* %l1, align 8
+ %bc1 = bitcast i64* %l1 to i8*
+ %gep1 = getelementptr i8* %bc1, i64 3
+ %dead1 = getelementptr i8* %gep1, i64 1
+
+ store i64 1879048192, i64* %l2, align 8
+ %bc2 = bitcast i64* %l2 to i8*
+ %gep2.1 = getelementptr i8* %bc2, i64 1
+ %gep2.2 = getelementptr i8* %bc2, i64 3
+ ; Note that this select should get visited multiple times due to using two
+ ; different GEPs off the same alloca. We should only delete it once.
+ %dead2 = select i1 %flag, i8* %gep2.1, i8* %gep2.2
+
+ store i64 1879048192, i64* %l3, align 8
+ %bc3 = bitcast i64* %l3 to i8*
+ %gep3 = getelementptr i8* %bc3, i64 3
+
+ br label %loop
+}
+
+define void @test16(i8* %src, i8* %dst) {
+; Ensure that we can promote an alloca of [3 x i8] to an i24 SSA value.
+; CHECK: @test16
+; CHECK-NOT: alloca
+; CHECK: %[[srccast:.*]] = bitcast i8* %src to i24*
+; CHECK-NEXT: load i24* %[[srccast]]
+; CHECK-NEXT: %[[dstcast:.*]] = bitcast i8* %dst to i24*
+; CHECK-NEXT: store i24 0, i24* %[[dstcast]]
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca [3 x i8]
+ %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 false)
+ %cast = bitcast i8* %ptr to i24*
+ store i24 0, i24* %cast
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 false)
+ ret void
+}
+
+define void @test17(i8* %src, i8* %dst) {
+; Ensure that we can rewrite unpromotable memcpys which extend past the end of
+; the alloca.
+; CHECK: @test17
+; CHECK: %[[a:.*]] = alloca [3 x i8]
+; CHECK-NEXT: %[[ptr:.*]] = getelementptr [3 x i8]* %[[a]], i32 0, i32 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[ptr]], i8* %src,
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[ptr]],
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca [3 x i8]
+ %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true)
+ ret void
+}
+
+define void @test18(i8* %src, i8* %dst, i32 %size) {
+; Preserve transfer intrinsics with a variable size, even if they overlap with
+; fixed-size operations. Further, continue to split and promote allocas preceding
+; the variable-sized intrinsic.
+; CHECK: @test18
+; CHECK: %[[a:.*]] = alloca [34 x i8]
+; CHECK: %[[srcgep1:.*]] = getelementptr inbounds i8* %src, i64 4
+; CHECK-NEXT: %[[srccast1:.*]] = bitcast i8* %[[srcgep1]] to i32*
+; CHECK-NEXT: %[[srcload:.*]] = load i32* %[[srccast1]]
+; CHECK-NEXT: %[[agep1:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[agep1]], i8* %src, i32 %size,
+; CHECK-NEXT: %[[agep2:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[agep2]], i8 42, i32 %size,
+; CHECK-NEXT: %[[dstcast1:.*]] = bitcast i8* %dst to i32*
+; CHECK-NEXT: store i32 42, i32* %[[dstcast1]]
+; CHECK-NEXT: %[[dstgep1:.*]] = getelementptr inbounds i8* %dst, i64 4
+; CHECK-NEXT: %[[dstcast2:.*]] = bitcast i8* %[[dstgep1]] to i32*
+; CHECK-NEXT: store i32 %[[srcload]], i32* %[[dstcast2]]
+; CHECK-NEXT: %[[agep3:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[agep3]], i32 %size,
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca [42 x i8]
+ %ptr = getelementptr [42 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 8, i32 1, i1 false)
+ %ptr2 = getelementptr [42 x i8]* %a, i32 0, i32 8
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr2, i8* %src, i32 %size, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8* %ptr2, i8 42, i32 %size, i32 1, i1 false)
+ %cast = bitcast i8* %ptr to i32*
+ store i32 42, i32* %cast
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 8, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr2, i32 %size, i32 1, i1 false)
+ ret void
+}
+
+%opaque = type opaque
+
+define i32 @test19(%opaque* %x) {
+; This input will cause us to try to compute a natural GEP when rewriting
+; pointers in such a way that we try to GEP through the opaque type. Previously,
+; a check for an unsized type was missing and this crashed. Ensure it behaves
+; reasonably now.
+; CHECK: @test19
+; CHECK-NOT: alloca
+; CHECK: ret i32 undef
+
+entry:
+ %a = alloca { i64, i8* }
+ %cast1 = bitcast %opaque* %x to i8*
+ %cast2 = bitcast { i64, i8* }* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast2, i8* %cast1, i32 16, i32 1, i1 false)
+ %gep = getelementptr inbounds { i64, i8* }* %a, i32 0, i32 0
+ %val = load i64* %gep
+ ret i32 undef
+}
+
+define i32 @test20() {
+; Ensure we can track negative offsets (before the beginning of the alloca) and
+; negative relative offsets from offsets starting past the end of the alloca.
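+; (The net offsets still land inside the alloca: %gep2.2 is -2 + 3 = element 1
+; and %gep3.2 is 14 - 12 = element 2, so the three stores hit elements 0, 1,
+; and 2 and the loads fold to the stored constants 1, 2, and 3.)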
+; CHECK: @test20
+; CHECK-NOT: alloca
+; CHECK: %[[sum1:.*]] = add i32 1, 2
+; CHECK: %[[sum2:.*]] = add i32 %[[sum1]], 3
+; CHECK: ret i32 %[[sum2]]
+
+entry:
+ %a = alloca [3 x i32]
+ %gep1 = getelementptr [3 x i32]* %a, i32 0, i32 0
+ store i32 1, i32* %gep1
+ %gep2.1 = getelementptr [3 x i32]* %a, i32 0, i32 -2
+ %gep2.2 = getelementptr i32* %gep2.1, i32 3
+ store i32 2, i32* %gep2.2
+ %gep3.1 = getelementptr [3 x i32]* %a, i32 0, i32 14
+ %gep3.2 = getelementptr i32* %gep3.1, i32 -12
+ store i32 3, i32* %gep3.2
+
+ %load1 = load i32* %gep1
+ %load2 = load i32* %gep2.2
+ %load3 = load i32* %gep3.2
+ %sum1 = add i32 %load1, %load2
+ %sum2 = add i32 %sum1, %load3
+ ret i32 %sum2
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+define i8 @test21() {
+; Test allocations and offsets that border on overflow of the int64_t used
+; internally. This is awkward to test thoroughly, as LLVM doesn't really
+; support such extreme constructs cleanly.
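+; (For reference: the array length 2305843009213693951 is 2^61 - 1, the value
+; 9223372036854775807 is 2^63 - 1, and the memset length 18446744073709551615
+; is 2^64 - 1, i.e. an all-ones i64.)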
+; CHECK: @test21
+; CHECK-NOT: alloca
+; CHECK: or i8 -1, -1
+
+entry:
+ %a = alloca [2305843009213693951 x i8]
+ %gep0 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 2305843009213693949
+ store i8 255, i8* %gep0
+ %gep1 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 -9223372036854775807
+ %gep2 = getelementptr i8* %gep1, i64 -1
+ call void @llvm.memset.p0i8.i64(i8* %gep2, i8 0, i64 18446744073709551615, i32 1, i1 false)
+ %gep3 = getelementptr i8* %gep1, i64 9223372036854775807
+ %gep4 = getelementptr i8* %gep3, i64 9223372036854775807
+ %gep5 = getelementptr i8* %gep4, i64 -6917529027641081857
+ store i8 255, i8* %gep5
+ %cast1 = bitcast i8* %gep4 to i32*
+ store i32 0, i32* %cast1
+ %load = load i8* %gep0
+ %gep6 = getelementptr i8* %gep0, i32 1
+ %load2 = load i8* %gep6
+ %result = or i8 %load, %load2
+ ret i8 %result
+}
+
+%PR13916.struct = type { i8 }
+
+define void @PR13916.1() {
+; Ensure that we handle overlapping memcpy intrinsics correctly, especially in
+; the case where there is a directly identical value for both source and dest.
+; CHECK: @PR13916.1
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+ %a = alloca i8
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %a, i32 1, i32 1, i1 false)
+ %tmp2 = load i8* %a
+ ret void
+}
+
+define void @PR13916.2() {
+; Check whether we continue to handle them correctly when they start off with
+; different pointer value chains, but during rewriting we coalesce them into the
+; same value.
+; CHECK: @PR13916.2
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+ %a = alloca %PR13916.struct, align 1
+ br i1 undef, label %if.then, label %if.end
+
+if.then:
+ %tmp0 = bitcast %PR13916.struct* %a to i8*
+ %tmp1 = bitcast %PR13916.struct* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %tmp0, i8* %tmp1, i32 1, i32 1, i1 false)
+ br label %if.end
+
+if.end:
+ %gep = getelementptr %PR13916.struct* %a, i32 0, i32 0
+ %tmp2 = load i8* %gep
+ ret void
+}
+
+define void @PR13990() {
+; Ensure we can handle cases where processing one alloca causes the other
+; alloca to become dead and get deleted. This might crash or fail under
+; Valgrind if we regress.
+; CHECK: @PR13990
+; CHECK-NOT: alloca
+; CHECK: unreachable
+; CHECK: unreachable
+
+entry:
+ %tmp1 = alloca i8*
+ %tmp2 = alloca i8*
+ br i1 undef, label %bb1, label %bb2
+
+bb1:
+ store i8* undef, i8** %tmp2
+ br i1 undef, label %bb2, label %bb3
+
+bb2:
+ %tmp50 = select i1 undef, i8** %tmp2, i8** %tmp1
+ br i1 undef, label %bb3, label %bb4
+
+bb3:
+ unreachable
+
+bb4:
+ unreachable
+}
+
+define double @PR13969(double %x) {
+; Check that we detect when promotion will un-escape an alloca and iterate to
+; re-try running SROA over that alloca. Without that, the two allocas that are
+; stored into a dead alloca don't get rewritten and promoted.
+; CHECK: @PR13969
+
+entry:
+ %a = alloca double
+ %b = alloca double*
+ %c = alloca double
+; CHECK-NOT: alloca
+
+ store double %x, double* %a
+ store double* %c, double** %b
+ store double* %a, double** %b
+ store double %x, double* %c
+ %ret = load double* %a
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ ret double %ret
+; CHECK: ret double %x
+}
+
+%PR14034.struct = type { { {} }, i32, %PR14034.list }
+%PR14034.list = type { %PR14034.list*, %PR14034.list* }
+
+define void @PR14034() {
+; This test case tries to form GEPs into the empty leading struct members, and
+; subsequently crashed (under valgrind) before we fixed the PR. The important
+; thing is to handle empty structs gracefully.
+; CHECK: @PR14034
+
+entry:
+ %a = alloca %PR14034.struct
+ %list = getelementptr %PR14034.struct* %a, i32 0, i32 2
+ %prev = getelementptr %PR14034.list* %list, i32 0, i32 1
+ store %PR14034.list* undef, %PR14034.list** %prev
+ %cast0 = bitcast %PR14034.struct* undef to i8*
+ %cast1 = bitcast %PR14034.struct* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast0, i8* %cast1, i32 12, i32 0, i1 false)
+ ret void
+}
+
+define i32 @test22(i32 %x) {
+; Test that SROA and promotion are not confused by a grab bag mixture of pointer
+; types involving wrapper aggregates and zero-length aggregate members.
+; CHECK: @test22
+
+entry:
+ %a1 = alloca { { [1 x { i32 }] } }
+ %a2 = alloca { {}, { float }, [0 x i8] }
+ %a3 = alloca { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }
+; CHECK-NOT: alloca
+
+ %wrap1 = insertvalue [1 x { i32 }] undef, i32 %x, 0, 0
+ %gep1 = getelementptr { { [1 x { i32 }] } }* %a1, i32 0, i32 0, i32 0
+ store [1 x { i32 }] %wrap1, [1 x { i32 }]* %gep1
+
+ %gep2 = getelementptr { { [1 x { i32 }] } }* %a1, i32 0, i32 0
+ %ptrcast1 = bitcast { [1 x { i32 }] }* %gep2 to { [1 x { float }] }*
+ %load1 = load { [1 x { float }] }* %ptrcast1
+ %unwrap1 = extractvalue { [1 x { float }] } %load1, 0, 0
+
+ %wrap2 = insertvalue { {}, { float }, [0 x i8] } undef, { float } %unwrap1, 1
+ store { {}, { float }, [0 x i8] } %wrap2, { {}, { float }, [0 x i8] }* %a2
+
+ %gep3 = getelementptr { {}, { float }, [0 x i8] }* %a2, i32 0, i32 1, i32 0
+ %ptrcast2 = bitcast float* %gep3 to <4 x i8>*
+ %load3 = load <4 x i8>* %ptrcast2
+ %valcast1 = bitcast <4 x i8> %load3 to i32
+
+ %wrap3 = insertvalue [1 x [1 x i32]] undef, i32 %valcast1, 0, 0
+ %wrap4 = insertvalue { [1 x [1 x i32]], {} } undef, [1 x [1 x i32]] %wrap3, 0
+ %gep4 = getelementptr { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }* %a3, i32 0, i32 1
+ %ptrcast3 = bitcast { [0 x double], [1 x [1 x <4 x i8>]], {} }* %gep4 to { [1 x [1 x i32]], {} }*
+ store { [1 x [1 x i32]], {} } %wrap4, { [1 x [1 x i32]], {} }* %ptrcast3
+
+ %gep5 = getelementptr { [0 x i8], { [0 x double], [1 x [1 x <4 x i8>]], {} }, { { {} } } }* %a3, i32 0, i32 1, i32 1, i32 0
+ %ptrcast4 = bitcast [1 x <4 x i8>]* %gep5 to { {}, float, {} }*
+ %load4 = load { {}, float, {} }* %ptrcast4
+ %unwrap2 = extractvalue { {}, float, {} } %load4, 1
+ %valcast2 = bitcast float %unwrap2 to i32
+
+ ret i32 %valcast2
+; CHECK: ret i32
+}
+
+define void @PR14059.1(double* %d) {
+; In PR14059 a peculiar construct was identified as something that is used
+; pervasively in ARM's ABI-calling-convention lowering: the passing of a struct
+; of doubles via an array of i32 in order to place the data into integer
+; registers. This in turn was missed as an optimization by SROA due to the
+; partial loads and stores of integers to the double alloca we were trying to
+; form and promote. The solution is to widen the integer operations to be
+; whole-alloca operations, and perform the appropriate bitcasting on the
+; *values* rather than the pointers. When this works, partial reads and writes
+; via integers can be promoted away.
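+; (Roughly: each partial integer store into the 8-byte double alloca becomes an
+; insert into a whole-alloca i64 value (zext, shift, mask, or), and that value
+; is only bitcast back to double where the double itself is read.)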
+; CHECK: @PR14059.1
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+ %X.sroa.0.i = alloca double, align 8
+ %0 = bitcast double* %X.sroa.0.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0)
+
+ ; Store to the low 32-bits...
+ %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32*
+ store i32 0, i32* %X.sroa.0.0.cast2.i, align 8
+
+ ; Also use a memset to the middle 32-bits for fun.
+ %X.sroa.0.2.raw_idx2.i = getelementptr inbounds i8* %0, i32 2
+ call void @llvm.memset.p0i8.i64(i8* %X.sroa.0.2.raw_idx2.i, i8 0, i64 4, i32 1, i1 false)
+
+ ; Or a memset of the whole thing.
+ call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 8, i32 1, i1 false)
+
+ ; Write to the high 32-bits with a memcpy.
+ %X.sroa.0.4.raw_idx4.i = getelementptr inbounds i8* %0, i32 4
+ %d.raw = bitcast double* %d to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %X.sroa.0.4.raw_idx4.i, i8* %d.raw, i32 4, i32 1, i1 false)
+
+ ; Store to the high 32-bits...
+ %X.sroa.0.4.cast5.i = bitcast i8* %X.sroa.0.4.raw_idx4.i to i32*
+ store i32 1072693248, i32* %X.sroa.0.4.cast5.i, align 4
+
+ ; Do the actual math...
+ %X.sroa.0.0.load1.i = load double* %X.sroa.0.i, align 8
+ %accum.real.i = load double* %d, align 8
+ %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i
+ store double %add.r.i, double* %d, align 8
+ call void @llvm.lifetime.end(i64 -1, i8* %0)
+ ret void
+}
+
+define i64 @PR14059.2({ float, float }* %phi) {
+; Check that SROA can split up alloca-wide integer loads and stores where the
+; underlying alloca has smaller components that are accessed independently. This
+; shows up particularly with ABI lowering patterns coming out of Clang that rely
+; on the particular register placement of a single large integer return value.
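+; (Here the i64 load of the { float, float } alloca is rebuilt from the two
+; promoted float values: each is bitcast to i32 and zext'd to i64, the imag
+; half is shifted up by 32, and the halves are masked and or'd together,
+; exactly as the CHECK lines below spell out.)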
+; CHECK: @PR14059.2
+
+entry:
+ %retval = alloca { float, float }, align 4
+ ; CHECK-NOT: alloca
+
+ %0 = bitcast { float, float }* %retval to i64*
+ store i64 0, i64* %0
+ ; CHECK-NOT: store
+
+ %phi.realp = getelementptr inbounds { float, float }* %phi, i32 0, i32 0
+ %phi.real = load float* %phi.realp
+ %phi.imagp = getelementptr inbounds { float, float }* %phi, i32 0, i32 1
+ %phi.imag = load float* %phi.imagp
+ ; CHECK: %[[realp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 0
+ ; CHECK-NEXT: %[[real:.*]] = load float* %[[realp]]
+ ; CHECK-NEXT: %[[imagp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 1
+ ; CHECK-NEXT: %[[imag:.*]] = load float* %[[imagp]]
+
+ %real = getelementptr inbounds { float, float }* %retval, i32 0, i32 0
+ %imag = getelementptr inbounds { float, float }* %retval, i32 0, i32 1
+ store float %phi.real, float* %real
+ store float %phi.imag, float* %imag
+ ; CHECK-NEXT: %[[real_convert:.*]] = bitcast float %[[real]] to i32
+ ; CHECK-NEXT: %[[imag_convert:.*]] = bitcast float %[[imag]] to i32
+ ; CHECK-NEXT: %[[imag_ext:.*]] = zext i32 %[[imag_convert]] to i64
+ ; CHECK-NEXT: %[[imag_shift:.*]] = shl i64 %[[imag_ext]], 32
+ ; CHECK-NEXT: %[[imag_mask:.*]] = and i64 undef, 4294967295
+ ; CHECK-NEXT: %[[imag_insert:.*]] = or i64 %[[imag_mask]], %[[imag_shift]]
+ ; CHECK-NEXT: %[[real_ext:.*]] = zext i32 %[[real_convert]] to i64
+ ; CHECK-NEXT: %[[real_mask:.*]] = and i64 %[[imag_insert]], -4294967296
+ ; CHECK-NEXT: %[[real_insert:.*]] = or i64 %[[real_mask]], %[[real_ext]]
+
+ %1 = load i64* %0, align 1
+ ret i64 %1
+ ; CHECK-NEXT: ret i64 %[[real_insert]]
+}
+
+define void @PR14105({ [16 x i8] }* %ptr) {
+; Ensure that when rewriting the GEP index '-1' for this alloca we preserve its
+; sign as negative. We use a volatile memcpy to ensure promotion never actually
+; occurs.
+; CHECK: @PR14105
+
+entry:
+ %a = alloca { [16 x i8] }, align 8
+; CHECK: alloca [16 x i8], align 8
+
+ %gep = getelementptr inbounds { [16 x i8] }* %ptr, i64 -1
+; CHECK-NEXT: getelementptr inbounds { [16 x i8] }* %ptr, i64 -1, i32 0, i64 0
+
+ %cast1 = bitcast { [16 x i8 ] }* %gep to i8*
+ %cast2 = bitcast { [16 x i8 ] }* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast1, i8* %cast2, i32 16, i32 8, i1 true)
+ ret void
+; CHECK: ret
+}
diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll
new file mode 100644
index 000000000000..ce82d1f30b57
--- /dev/null
+++ b/test/Transforms/SROA/big-endian.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define i8 @test1() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca. Note this is
+; the same as test12 in basictest.ll, but here we assert big-endian byte
+; ordering.
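+; With big-endian byte ordering, byte 0 of the [3 x i8] alloca is the most
+; significant byte of the promoted i24, so byte k maps to bits
+; [8*(2-k), 8*(2-k)+8); the shifts and masks below are the mirror image of the
+; little-endian sequence in test12.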
+;
+; CHECK: @test1
+
+entry:
+ %a = alloca [3 x i8]
+ %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+ %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+ store i8 0, i8* %a0ptr
+ %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+ store i8 0, i8* %a1ptr
+ %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+ store i8 0, i8* %a2ptr
+ %aiptr = bitcast [3 x i8]* %a to i24*
+ %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
+
+ %biptr = bitcast [3 x i8]* %b to i24*
+ store i24 %ai, i24* %biptr
+ %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+ %b0 = load i8* %b0ptr
+ %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+ %b1 = load i8* %b1ptr
+ %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+ %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
+
+ %bsum0 = add i8 %b0, %b1
+ %bsum1 = add i8 %bsum0, %b2
+ ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
+define i64 @test2() {
+; Test for various mixed sizes of integer loads and stores all getting
+; promoted.
+;
+; CHECK: @test2
+
+entry:
+ %a = alloca [7 x i8]
+; CHECK-NOT: alloca
+
+ %a0ptr = getelementptr [7 x i8]* %a, i64 0, i32 0
+ %a1ptr = getelementptr [7 x i8]* %a, i64 0, i32 1
+ %a2ptr = getelementptr [7 x i8]* %a, i64 0, i32 2
+ %a3ptr = getelementptr [7 x i8]* %a, i64 0, i32 3
+
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %a0i16ptr = bitcast i8* %a0ptr to i16*
+ store i16 1, i16* %a0i16ptr
+; CHECK: %[[mask0:.*]] = and i16 1, -16
+
+ %a1i4ptr = bitcast i8* %a1ptr to i4*
+ store i4 1, i4* %a1i4ptr
+; CHECK-NEXT: %[[insert0:.*]] = or i16 %[[mask0]], 1
+
+ store i8 1, i8* %a2ptr
+; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, 4294967295
+; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 4294967296
+
+ %a3i24ptr = bitcast i8* %a3ptr to i24*
+ store i24 1, i24* %a3i24ptr
+; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041
+; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], 256
+
+ %a2i40ptr = bitcast i8* %a2ptr to i40*
+ store i40 1, i40* %a2i40ptr
+; CHECK-NEXT: %[[ext3:.*]] = zext i40 1 to i56
+; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776
+; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]]
+
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %aiptr = bitcast [7 x i8]* %a to i56*
+ %ai = load i56* %aiptr
+ %ret = zext i56 %ai to i64
+ ret i64 %ret
+; CHECK-NEXT: %[[ext4:.*]] = zext i16 %[[insert0]] to i56
+; CHECK-NEXT: %[[shift4:.*]] = shl i56 %[[ext4]], 40
+; CHECK-NEXT: %[[mask4:.*]] = and i56 %[[insert3]], 1099511627775
+; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[shift4]]
+; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64
+; CHECK-NEXT: ret i64 %[[ret]]
+}
diff --git a/test/Transforms/SROA/fca.ll b/test/Transforms/SROA/fca.ll
new file mode 100644
index 000000000000..c30a5cc974fc
--- /dev/null
+++ b/test/Transforms/SROA/fca.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define { i32, i32 } @test0(i32 %x, i32 %y) {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+ %a = alloca { i32, i32 }
+
+ store { i32, i32 } undef, { i32, i32 }* %a
+
+ %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+ store i32 %x, i32* %gep1
+ %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+ store i32 %y, i32* %gep2
+
+ %result = load { i32, i32 }* %a
+ ret { i32, i32 } %result
+}
+
+define { i32, i32 } @test1(i32 %x, i32 %y) {
+; FIXME: This may be too conservative. Duncan argues that we are allowed to
+; split the volatile load and store here but must produce volatile scalar loads
+; and stores from them.
+; CHECK: @test1
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+ %a = alloca { i32, i32 }
+ %b = alloca { i32, i32 }
+
+ %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+ store i32 %x, i32* %gep1
+ %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+ store i32 %y, i32* %gep2
+
+ %result = load volatile { i32, i32 }* %a
+ store volatile { i32, i32 } %result, { i32, i32 }* %b
+ ret { i32, i32 } %result
+}
diff --git a/test/Transforms/SROA/lit.local.cfg b/test/Transforms/SROA/lit.local.cfg
new file mode 100644
index 000000000000..c6106e4746f2
--- /dev/null
+++ b/test/Transforms/SROA/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll
new file mode 100644
index 000000000000..921016a9c24b
--- /dev/null
+++ b/test/Transforms/SROA/phi-and-select.ll
@@ -0,0 +1,427 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define i32 @test1() {
+; CHECK: @test1
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+ %v0 = load i32* %a0
+ %v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %cond = icmp sle i32 %v0, %v1
+ br i1 %cond, label %then, label %exit
+
+then:
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %a1, %then ], [ %a0, %entry ]
+; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ]
+
+ %result = load i32* %phi
+ ret i32 %result
+}
+
+define i32 @test2() {
+; CHECK: @test2
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+ %v0 = load i32* %a0
+ %v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %cond = icmp sle i32 %v0, %v1
+ %select = select i1 %cond, i32* %a1, i32* %a0
+; CHECK: select i1 %{{.*}}, i32 1, i32 0
+
+ %result = load i32* %select
+ ret i32 %result
+}
+
+define i32 @test3(i32 %x) {
+; CHECK: @test3
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ ; Note that we build redundant GEPs here to ensure that having different GEPs
+  ; into the same alloca partition continues to work with PHI speculation. This
+ ; was the underlying cause of PR13926.
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a0b = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ %a1b = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+; CHECK-NOT: store
+
+ switch i32 %x, label %bb0 [ i32 1, label %bb1
+ i32 2, label %bb2
+ i32 3, label %bb3
+ i32 4, label %bb4
+ i32 5, label %bb5
+ i32 6, label %bb6
+ i32 7, label %bb7 ]
+
+bb0:
+ br label %exit
+bb1:
+ br label %exit
+bb2:
+ br label %exit
+bb3:
+ br label %exit
+bb4:
+ br label %exit
+bb5:
+ br label %exit
+bb6:
+ br label %exit
+bb7:
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %a1, %bb0 ], [ %a0, %bb1 ], [ %a0, %bb2 ], [ %a1, %bb3 ],
+ [ %a1b, %bb4 ], [ %a0b, %bb5 ], [ %a0b, %bb6 ], [ %a1b, %bb7 ]
+; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ], [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ]
+
+ %result = load i32* %phi
+ ret i32 %result
+}
+
+define i32 @test4() {
+; CHECK: @test4
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+ %v0 = load i32* %a0
+ %v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %cond = icmp sle i32 %v0, %v1
+ %select = select i1 %cond, i32* %a0, i32* %a0
+; CHECK-NOT: select
+
+ %result = load i32* %select
+ ret i32 %result
+; CHECK: ret i32 0
+}
+
+define i32 @test5(i32* %b) {
+; CHECK: @test5
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 1, i32* %a1
+; CHECK-NOT: store
+
+ %select = select i1 true, i32* %a1, i32* %b
+; CHECK-NOT: select
+
+ %result = load i32* %select
+; CHECK-NOT: load
+
+ ret i32 %result
+; CHECK: ret i32 1
+}
+
+declare void @f(i32*, i32*)
+
+define i32 @test6(i32* %b) {
+; CHECK: @test6
+entry:
+ %a = alloca [2 x i32]
+ %c = alloca i32
+; CHECK-NOT: alloca
+
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 1, i32* %a1
+
+ %select = select i1 true, i32* %a1, i32* %b
+ %select2 = select i1 false, i32* %a1, i32* %b
+ %select3 = select i1 false, i32* %c, i32* %b
+; CHECK: %[[select2:.*]] = select i1 false, i32* undef, i32* %b
+; CHECK: %[[select3:.*]] = select i1 false, i32* undef, i32* %b
+
+ ; Note, this would potentially escape the alloca pointer except for the
+ ; constant folding of the select.
+ call void @f(i32* %select2, i32* %select3)
+; CHECK: call void @f(i32* %[[select2]], i32* %[[select3]])
+
+
+ %result = load i32* %select
+; CHECK-NOT: load
+
+ %dead = load i32* %c
+
+ ret i32 %result
+; CHECK: ret i32 1
+}
+
+define i32 @test7() {
+; CHECK: @test7
+; CHECK-NOT: alloca
+
+entry:
+ %X = alloca i32
+ br i1 undef, label %good, label %bad
+
+good:
+ %Y1 = getelementptr i32* %X, i64 0
+ store i32 0, i32* %Y1
+ br label %exit
+
+bad:
+ %Y2 = getelementptr i32* %X, i64 1
+ store i32 0, i32* %Y2
+ br label %exit
+
+exit:
+ %P = phi i32* [ %Y1, %good ], [ %Y2, %bad ]
+; CHECK: %[[phi:.*]] = phi i32 [ 0, %good ],
+ %Z2 = load i32* %P
+ ret i32 %Z2
+; CHECK: ret i32 %[[phi]]
+}
+
+define i32 @test8(i32 %b, i32* %ptr) {
+; Ensure that we rewrite allocas to the used type when that use is hidden by
+; a PHI that can be speculated.
+; CHECK: @test8
+; CHECK-NOT: alloca
+; CHECK-NOT: load
+; CHECK: %[[value:.*]] = load i32* %ptr
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i32 [ undef, %else ], [ %[[value]], %then ]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+ %f = alloca float
+ %test = icmp ne i32 %b, 0
+ br i1 %test, label %then, label %else
+
+then:
+ br label %exit
+
+else:
+ %bitcast = bitcast float* %f to i32*
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ]
+ %loaded = load i32* %phi, align 4
+ ret i32 %loaded
+}
+
+define i32 @test9(i32 %b, i32* %ptr) {
+; Same as @test8 but for a select rather than a PHI node.
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK-NOT: load
+; CHECK: %[[value:.*]] = load i32* %ptr
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 undef, i32 %[[value]]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+ %f = alloca float
+ store i32 0, i32* %ptr
+ %test = icmp ne i32 %b, 0
+ %bitcast = bitcast float* %f to i32*
+ %select = select i1 %test, i32* %bitcast, i32* %ptr
+ %loaded = load i32* %select, align 4
+ ret i32 %loaded
+}
+
+define float @test10(i32 %b, float* %ptr) {
+; Don't try to promote allocas which are not eligible for it even after
+; rewriting due to the necessity of inserting bitcasts when speculating a PHI
+; node.
+; CHECK: @test10
+; CHECK: %[[alloca:.*]] = alloca
+; CHECK: %[[argvalue:.*]] = load float* %ptr
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float*
+; CHECK: %[[allocavalue:.*]] = load float* %[[cast]]
+; CHECK: %[[result:.*]] = phi float [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ]
+; CHECK-NEXT: ret float %[[result]]
+
+entry:
+ %f = alloca double
+ store double 0.0, double* %f
+ %test = icmp ne i32 %b, 0
+ br i1 %test, label %then, label %else
+
+then:
+ br label %exit
+
+else:
+ %bitcast = bitcast double* %f to float*
+ br label %exit
+
+exit:
+ %phi = phi float* [ %bitcast, %else ], [ %ptr, %then ]
+ %loaded = load float* %phi, align 4
+ ret float %loaded
+}
+
+define float @test11(i32 %b, float* %ptr) {
+; Same as @test10 but for a select rather than a PHI node.
+; CHECK: @test11
+; CHECK: %[[alloca:.*]] = alloca
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float*
+; CHECK: %[[allocavalue:.*]] = load float* %[[cast]]
+; CHECK: %[[argvalue:.*]] = load float* %ptr
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, float %[[allocavalue]], float %[[argvalue]]
+; CHECK-NEXT: ret float %[[result]]
+
+entry:
+ %f = alloca double
+ store double 0.0, double* %f
+ store float 0.0, float* %ptr
+ %test = icmp ne i32 %b, 0
+ %bitcast = bitcast double* %f to float*
+ %select = select i1 %test, float* %bitcast, float* %ptr
+ %loaded = load float* %select, align 4
+ ret float %loaded
+}
+
+define i32 @test12(i32 %x, i32* %p) {
+; Ensure we don't crash or fail to nuke dead selects of allocas if no load is
+; ever found.
+; CHECK: @test12
+; CHECK-NOT: alloca
+; CHECK-NOT: select
+; CHECK: ret i32 %x
+
+entry:
+ %a = alloca i32
+ store i32 %x, i32* %a
+ %dead = select i1 undef, i32* %a, i32* %p
+ %load = load i32* %a
+ ret i32 %load
+}
+
+define i32 @test13(i32 %x, i32* %p) {
+; Ensure we don't crash or fail to nuke dead phis of allocas if no load is ever
+; found.
+; CHECK: @test13
+; CHECK-NOT: alloca
+; CHECK-NOT: phi
+; CHECK: ret i32 %x
+
+entry:
+ %a = alloca i32
+ store i32 %x, i32* %a
+ br label %loop
+
+loop:
+ %phi = phi i32* [ %p, %entry ], [ %a, %loop ]
+ br i1 undef, label %loop, label %exit
+
+exit:
+ %load = load i32* %a
+ ret i32 %load
+}
+
+define i32 @PR13905() {
+; Check a pattern where we have a chain of dead phi nodes to ensure they are
+; deleted and promotion can proceed.
+; CHECK: @PR13905
+; CHECK-NOT: alloca i32
+; CHECK: ret i32 undef
+
+entry:
+ %h = alloca i32
+ store i32 0, i32* %h
+ br i1 undef, label %loop1, label %exit
+
+loop1:
+ %phi1 = phi i32* [ null, %entry ], [ %h, %loop1 ], [ %h, %loop2 ]
+ br i1 undef, label %loop1, label %loop2
+
+loop2:
+ br i1 undef, label %loop1, label %exit
+
+exit:
+ %phi2 = phi i32* [ %phi1, %loop2 ], [ null, %entry ]
+ ret i32 undef
+}
+
+define i32 @PR13906() {
+; Another pattern which can lead to crashes due to failing to clear out dead
+; PHI nodes or select nodes. This triggers subtly differently from the above
+; cases because the PHI node is (recursively) alive, but the select is dead.
+; CHECK: @PR13906
+; CHECK-NOT: alloca
+
+entry:
+ %c = alloca i32
+ store i32 0, i32* %c
+ br label %for.cond
+
+for.cond:
+ %d.0 = phi i32* [ undef, %entry ], [ %c, %if.then ], [ %d.0, %for.cond ]
+ br i1 undef, label %if.then, label %for.cond
+
+if.then:
+ %tmpcast.d.0 = select i1 undef, i32* %c, i32* %d.0
+ br label %for.cond
+}
+
+define i64 @PR14132(i1 %flag) {
+; CHECK: @PR14132
+; Here we form a PHI-node by promoting the pointer alloca first, and then in
+; order to promote the other two allocas, we speculate the load of the
+; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8
+; alloca, which is completely bogus. However, we were asserting on trying to
+; rewrite it. Now it is replaced with undef. Eventually we may replace it with
+; unreachable and even the CFG will go away here.
+entry:
+ %a = alloca i64
+ %b = alloca i8
+ %ptr = alloca i64*
+; CHECK-NOT: alloca
+
+ %ptr.cast = bitcast i64** %ptr to i8**
+ store i64 0, i64* %a
+ store i8 1, i8* %b
+ store i64* %a, i64** %ptr
+ br i1 %flag, label %if.then, label %if.end
+
+if.then:
+ store i8* %b, i8** %ptr.cast
+ br label %if.end
+
+if.end:
+ %tmp = load i64** %ptr
+ %result = load i64* %tmp
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i64 [ undef, %if.then ], [ 0, %entry ]
+
+ ret i64 %result
+; CHECK-NEXT: ret i64 %[[result]]
+}
diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll
new file mode 100644
index 000000000000..ea28f5d1a647
--- /dev/null
+++ b/test/Transforms/SROA/vector-promotion.ll
@@ -0,0 +1,267 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+%S1 = type { i64, [42 x float] }
+
+define i32 @test1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test1
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK: extractelement <4 x i32> %x, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test2
+; FIXME: This should be handled!
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK: alloca <4 x i32>
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
+ %tmp3.vec = load <2 x i32>* %a.tmp3.cast
+ %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+}
+
+define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test3
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.y.cast, i8 0, i32 16, i32 1, i1 false)
+; CHECK-NOT: memset
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.tmp1.cast, i8 -1, i32 4, i32 1, i1 false)
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK: %[[insert:.*]] = insertelement <4 x i32> %x, i32 -1, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 3
+; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test4(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
+; CHECK: @test4
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+ %z.cast = bitcast <4 x i32>* %z to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.y.cast, i8* %z.cast, i32 16, i32 1, i1 false)
+; CHECK-NOT: memcpy
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+ %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+ %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.tmp1.cast, i8* %z.tmp1.cast, i32 4, i32 1, i1 false)
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: memcpy
+; CHECK: %[[load:.*]] = load <4 x i32>* %z
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+; CHECK-NEXT: %[[element_load:.*]] = load i32* %[[gep]]
+; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
+; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test5(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
+; CHECK: @test5
+; The same as the above, but with reversed source and destination for the
+; element memcpy, and a self copy.
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+ %a.x.cast = bitcast <4 x i32>* %a.x to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.x.cast, i8* %a.y.cast, i32 16, i32 1, i1 false)
+; CHECK-NOT: memcpy
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+ %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+ %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %z.tmp1.cast, i8* %a.tmp1.cast, i32 4, i32 1, i1 false)
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: memcpy
+; CHECK: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+; CHECK-NEXT: %[[extract:.*]] = extractelement <4 x i32> %y, i32 2
+; CHECK-NEXT: store i32 %[[extract]], i32* %[[gep]]
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define i64 @test6(<4 x i64> %x, <4 x i64> %y, i64 %n) {
+; CHECK: @test6
+; The old scalarrepl pass would wrongly drop the store to the second alloca.
+; PR13254
+ %tmp = alloca { <4 x i64>, <4 x i64> }
+ %p0 = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0
+ store <4 x i64> %x, <4 x i64>* %p0
+; CHECK: store <4 x i64> %x,
+ %p1 = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 1
+ store <4 x i64> %y, <4 x i64>* %p1
+; CHECK: store <4 x i64> %y,
+ %addr = getelementptr inbounds { <4 x i64>, <4 x i64> }* %tmp, i32 0, i32 0, i64 %n
+ %res = load i64* %addr, align 4
+ ret i64 %res
+}
+
+define i32 @PR14212() {
+; CHECK: @PR14212
+; This caused a crash when "splitting" the load of the i32 in order to promote
+; the store of <3 x i8> properly. Heavily reduced from an OpenCL test case.
+entry:
+ %retval = alloca <3 x i8>, align 4
+; CHECK-NOT: alloca
+
+ store <3 x i8> undef, <3 x i8>* %retval, align 4
+ %cast = bitcast <3 x i8>* %retval to i32*
+ %load = load i32* %cast, align 4
+ ret i32 %load
+; CHECK: ret i32
+}
+
+define <2 x i8> @PR14349.1(i32 %x) {
+; CHECK: @PR14349.1
+; The first testcase for broken SROA rewriting of split integer loads and
+; stores due to smaller vector loads and stores. This particular test ensures
+; that we can rewrite a split store of an integer to a store of a vector.
+entry:
+ %a = alloca i32
+; CHECK-NOT: alloca
+
+ store i32 %x, i32* %a
+; CHECK-NOT: store
+
+ %cast = bitcast i32* %a to <2 x i8>*
+ %vec = load <2 x i8>* %cast
+; CHECK-NOT: load
+
+ ret <2 x i8> %vec
+; CHECK: %[[trunc:.*]] = trunc i32 %x to i16
+; CHECK: %[[cast:.*]] = bitcast i16 %[[trunc]] to <2 x i8>
+; CHECK: ret <2 x i8> %[[cast]]
+}
+
+define i32 @PR14349.2(<2 x i8> %x) {
+; CHECK: @PR14349.2
+; A second testcase for broken SROA rewriting of split integer loads and
+; stores due to smaller vector loads and stores. This particular test ensures
+; that we can rewrite a split load of an integer to a load of a vector.
+entry:
+ %a = alloca i32
+; CHECK-NOT: alloca
+
+ %cast = bitcast i32* %a to <2 x i8>*
+ store <2 x i8> %x, <2 x i8>* %cast
+; CHECK-NOT: store
+
+ %int = load i32* %a
+; CHECK-NOT: load
+
+ ret i32 %int
+; CHECK: %[[cast:.*]] = bitcast <2 x i8> %x to i16
+; CHECK: %[[trunc:.*]] = zext i16 %[[cast]] to i32
+; CHECK: %[[insert:.*]] = or i32 %{{.*}}, %[[trunc]]
+; CHECK: ret i32 %[[insert]]
+}
diff --git a/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
new file mode 100644
index 000000000000..786fee9e6610
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/SPARC/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'Sparc' in targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll
new file mode 100644
index 000000000000..9d1568557f30
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/SPARC/switch_to_lookup_table.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -simplifycfg -S -mtriple=sparc-unknown-unknown | FileCheck %s
+
+; Check that switches are not turned into lookup tables, as this is not
+; considered profitable on the target.
+
+define i32 @f(i32 %c) nounwind uwtable readnone {
+entry:
+ switch i32 %c, label %sw.default [
+ i32 42, label %return
+ i32 43, label %sw.bb1
+ i32 44, label %sw.bb2
+ i32 45, label %sw.bb3
+ i32 46, label %sw.bb4
+ i32 47, label %sw.bb5
+ i32 48, label %sw.bb6
+ ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.default: br label %return
+return:
+ %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ]
+ ret i32 %retval.0
+
+; CHECK: @f
+; CHECK-NOT: getelementptr
+; CHECK: switch i32 %c
+}
diff --git a/test/Transforms/SimplifyCFG/X86/lit.local.cfg b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
new file mode 100644
index 000000000000..a8ad0f1a28b2
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
new file mode 100644
index 000000000000..8a59992f5e64
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -0,0 +1,779 @@
+; RUN: opt < %s -simplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; The table for @f
+; CHECK: @switch.table = private unnamed_addr constant [7 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 27, i32 62, i32 1]
+
+; The float table for @h
+; CHECK: @switch.table1 = private unnamed_addr constant [4 x float] [float 0x40091EB860000000, float 0x3FF3BE76C0000000, float 0x4012449BA0000000, float 0x4001AE1480000000]
+
+; The table for @foostring
+; CHECK: @switch.table2 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)]
+
+; The table for @earlyreturncrash
+; CHECK: @switch.table3 = private unnamed_addr constant [4 x i32] [i32 42, i32 9, i32 88, i32 5]
+
+; The table for @large.
+; CHECK: @switch.table4 = private unnamed_addr constant [199 x i32] [i32 1, i32 4, i32 9,
+
+; The table for @cprop
+; CHECK: @switch.table5 = private unnamed_addr constant [7 x i32] [i32 5, i32 42, i32 126, i32 -452, i32 128, i32 6, i32 7]
+
+; The table for @unreachable
+; CHECK: @switch.table6 = private unnamed_addr constant [5 x i32] [i32 0, i32 0, i32 0, i32 1, i32 -1]
+
+; A simple int-to-int selection switch.
+; It is dense enough to be replaced by table lookup.
+; The result is returned directly by a ret from an otherwise empty bb,
+; so we return early, directly from the lookup bb.
+
+define i32 @f(i32 %c) {
+entry:
+ switch i32 %c, label %sw.default [
+ i32 42, label %return
+ i32 43, label %sw.bb1
+ i32 44, label %sw.bb2
+ i32 45, label %sw.bb3
+ i32 46, label %sw.bb4
+ i32 47, label %sw.bb5
+ i32 48, label %sw.bb6
+ ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.default: br label %return
+return:
+ %retval.0 = phi i32 [ 15, %sw.default ], [ 1, %sw.bb6 ], [ 62, %sw.bb5 ], [ 27, %sw.bb4 ], [ -1, %sw.bb3 ], [ 0, %sw.bb2 ], [ 123, %sw.bb1 ], [ 55, %entry ]
+ ret i32 %retval.0
+
+; CHECK: @f
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %c, 42
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 7
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
+; CHECK: return:
+; CHECK-NEXT: ret i32 15
+}
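+
+; For reference, and assuming the table is laid out in case order (42 through
+; 48), the entries of @switch.table are just the phi values above: 55, 123, 0,
+; -1, 27, 62, 1. The default value 15 is not placed in the table; it is
+; returned directly from the %return block.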
+
+; A switch used to initialize two variables, an i8 and a float.
+
+declare void @dummy(i8 signext, float)
+define void @h(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.epilog
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+
+sw.epilog:
+ %a.0 = phi i8 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ]
+ %b.0 = phi float [ 0x4023FAE140000000, %sw.default ], [ 0x4001AE1480000000, %sw.bb3 ], [ 0x4012449BA0000000, %sw.bb2 ], [ 0x3FF3BE76C0000000, %sw.bb1 ], [ 0x40091EB860000000, %entry ]
+ call void @dummy(i8 signext %a.0, float %b.0)
+ ret void
+
+; CHECK: @h
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %sw.epilog
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.shiftamt = mul i32 %switch.tableidx, 8
+; CHECK-NEXT: %switch.downshift = lshr i32 89655594, %switch.shiftamt
+; CHECK-NEXT: %switch.masked = trunc i32 %switch.downshift to i8
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x float]* @switch.table1, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load float* %switch.gep
+; CHECK-NEXT: br label %sw.epilog
+; CHECK: sw.epilog:
+; CHECK-NEXT: %a.0 = phi i8 [ %switch.masked, %switch.lookup ], [ 7, %entry ]
+; CHECK-NEXT: %b.0 = phi float [ %switch.load, %switch.lookup ], [ 0x4023FAE140000000, %entry ]
+; CHECK-NEXT: call void @dummy(i8 signext %a.0, float %b.0)
+; CHECK-NEXT: ret void
+}
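+
+; A sketch of the packing used for the i8 results above (assuming one byte per
+; case, with case 0 in the low byte): the i8 column of the table is 42, 9, 88,
+; 5 for x = 0..3, and
+;   42 + (9 << 8) + (88 << 16) + (5 << 24) = 89655594 = 0x0558092A
+; which is the constant the lshr/trunc sequence indexes into. The float column
+; cannot be bit-packed, so it still gets a real lookup table (@switch.table1).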
+
+
+; Switch used to return a string.
+
+@.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+@.str1 = private unnamed_addr constant [4 x i8] c"bar\00", align 1
+@.str2 = private unnamed_addr constant [4 x i8] c"baz\00", align 1
+@.str3 = private unnamed_addr constant [4 x i8] c"qux\00", align 1
+@.str4 = private unnamed_addr constant [6 x i8] c"error\00", align 1
+
+define i8* @foostring(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %return
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: br label %return
+
+return:
+ %retval.0 = phi i8* [ getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0), %sw.default ],
+ [ getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0), %sw.bb3 ],
+ [ getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), %sw.bb2 ],
+ [ getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), %sw.bb1 ],
+ [ getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), %entry ]
+ ret i8* %retval.0
+
+; CHECK: @foostring
+; CHECK: entry:
+; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
+; CHECK-NEXT: %0 = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %return
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i8*]* @switch.table2, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i8** %switch.gep
+; CHECK-NEXT: ret i8* %switch.load
+}
+
+; Switch used to initialize two values. The first value is returned, the second
+; value is not used. This used to make the transformation generate illegal code.
+
+define i32 @earlyreturncrash(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.epilog
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+
+sw.epilog:
+ %a.0 = phi i32 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ]
+ %b.0 = phi i32 [ 10, %sw.default ], [ 5, %sw.bb3 ], [ 1, %sw.bb2 ], [ 4, %sw.bb1 ], [ 3, %entry ]
+ ret i32 %a.0
+
+; CHECK: @earlyreturncrash
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i32]* @switch.table3, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
+; CHECK: sw.epilog:
+; CHECK-NEXT: ret i32 7
+}
+
+
+; Example 7 from http://blog.regehr.org/archives/320
+; It is not dense enough for a regular table, but the results
+; can be packed into a bitmap.
+
+define i32 @crud(i8 zeroext %c) {
+entry:
+ %cmp = icmp ult i8 %c, 33
+ br i1 %cmp, label %lor.end, label %switch.early.test
+
+switch.early.test:
+ switch i8 %c, label %lor.rhs [
+ i8 92, label %lor.end
+ i8 62, label %lor.end
+ i8 60, label %lor.end
+ i8 59, label %lor.end
+ i8 58, label %lor.end
+ i8 46, label %lor.end
+ i8 44, label %lor.end
+ i8 34, label %lor.end
+ i8 39, label %switch.edge
+ ]
+
+switch.edge: br label %lor.end
+lor.rhs: br label %lor.end
+
+lor.end:
+ %0 = phi i1 [ true, %switch.early.test ],
+ [ false, %lor.rhs ],
+ [ true, %entry ],
+ [ true, %switch.early.test ],
+ [ true, %switch.early.test ],
+ [ true, %switch.early.test ],
+ [ true, %switch.early.test ],
+ [ true, %switch.early.test ],
+ [ true, %switch.early.test ],
+ [ true, %switch.early.test ],
+ [ true, %switch.edge ]
+ %lor.ext = zext i1 %0 to i32
+ ret i32 %lor.ext
+
+; CHECK: @crud
+; CHECK: entry:
+; CHECK-NEXT: %cmp = icmp ult i8 %c, 33
+; CHECK-NEXT: br i1 %cmp, label %lor.end, label %switch.early.test
+; CHECK: switch.early.test:
+; CHECK-NEXT: %switch.tableidx = sub i8 %c, 34
+; CHECK-NEXT: %0 = icmp ult i8 %switch.tableidx, 59
+; CHECK-NEXT: br i1 %0, label %switch.lookup, label %lor.end
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.cast = zext i8 %switch.tableidx to i59
+; CHECK-NEXT: %switch.shiftamt = mul i59 %switch.cast, 1
+; CHECK-NEXT: %switch.downshift = lshr i59 -288230375765830623, %switch.shiftamt
+; CHECK-NEXT: %switch.masked = trunc i59 %switch.downshift to i1
+; CHECK-NEXT: br label %lor.end
+; CHECK: lor.end:
+; CHECK-NEXT: %1 = phi i1 [ true, %entry ], [ %switch.masked, %switch.lookup ], [ false, %switch.early.test ]
+; CHECK-NEXT: %lor.ext = zext i1 %1 to i32
+; CHECK-NEXT: ret i32 %lor.ext
+}
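+
+; A sketch of the bitmap above (assuming one bit per case, bit i standing for
+; the character 34 + i): the cases yielding true are 34, 39, 44, 46, 58, 59,
+; 60, 62 and 92, i.e. bits 0, 5, 10, 12, 24, 25, 26, 28 and 58. The sum of
+; those powers of two is 288230376537592865, which printed as a signed i59
+; constant is -288230375765830623, the value fed to the lshr in the CHECK
+; lines above.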
+
+; PR13946
+define i32 @overflow(i32 %type) {
+entry:
+ switch i32 %type, label %sw.default [
+ i32 -2147483648, label %sw.bb
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 -2147483645, label %sw.bb3
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb: br label %if.end
+sw.bb1: br label %if.end
+sw.bb2: br label %if.end
+sw.bb3: br label %if.end
+sw.default: br label %if.end
+if.else: br label %if.end
+
+if.end:
+ %dirent_type.0 = phi i32 [ 3, %sw.default ], [ 6, %sw.bb3 ], [ 5, %sw.bb2 ], [ 0, %sw.bb1 ], [ 3, %sw.bb ], [ 0, %if.else ]
+ ret i32 %dirent_type.0
+; CHECK: define i32 @overflow
+; CHECK: switch
+; CHECK: phi
+}
+
+; PR13985
+define i1 @undef(i32 %tmp) {
+bb:
+ switch i32 %tmp, label %bb3 [
+ i32 0, label %bb1
+ i32 1, label %bb1
+ i32 7, label %bb2
+ i32 8, label %bb2
+ ]
+
+bb1: br label %bb3
+bb2: br label %bb3
+
+bb3:
+ %tmp4 = phi i1 [ undef, %bb ], [ false, %bb2 ], [ true, %bb1 ]
+ ret i1 %tmp4
+; CHECK: define i1 @undef
+; CHECK: %switch.cast = trunc i32 %switch.tableidx to i9
+; CHECK: %switch.downshift = lshr i9 3, %switch.shiftamt
+}
+
+; Also handle large switches that would be rejected by
+; isValueEqualityComparison()
+; CHECK: large
+; CHECK-NOT: switch i32
+define i32 @large(i32 %x) {
+entry:
+ %cmp = icmp slt i32 %x, 0
+ br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+ %mul = mul i32 %x, -10
+ br label %if.end
+
+if.end:
+ %x.addr.0 = phi i32 [ %mul, %if.then ], [ %x, %entry ]
+ switch i32 %x.addr.0, label %return [
+ i32 199, label %sw.bb203
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ i32 4, label %sw.bb4
+ i32 5, label %sw.bb5
+ i32 6, label %sw.bb6
+ i32 7, label %sw.bb7
+ i32 8, label %sw.bb8
+ i32 9, label %sw.bb9
+ i32 10, label %sw.bb10
+ i32 11, label %sw.bb11
+ i32 12, label %sw.bb12
+ i32 13, label %sw.bb13
+ i32 14, label %sw.bb14
+ i32 15, label %sw.bb15
+ i32 16, label %sw.bb16
+ i32 17, label %sw.bb17
+ i32 18, label %sw.bb18
+ i32 19, label %sw.bb19
+ i32 20, label %sw.bb20
+ i32 21, label %sw.bb21
+ i32 22, label %sw.bb22
+ i32 23, label %sw.bb23
+ i32 24, label %sw.bb24
+ i32 25, label %sw.bb25
+ i32 26, label %sw.bb26
+ i32 27, label %sw.bb27
+ i32 28, label %sw.bb28
+ i32 29, label %sw.bb29
+ i32 30, label %sw.bb30
+ i32 31, label %sw.bb31
+ i32 32, label %sw.bb32
+ i32 33, label %sw.bb33
+ i32 34, label %sw.bb34
+ i32 35, label %sw.bb35
+ i32 36, label %sw.bb37
+ i32 37, label %sw.bb38
+ i32 38, label %sw.bb39
+ i32 39, label %sw.bb40
+ i32 40, label %sw.bb41
+ i32 41, label %sw.bb42
+ i32 42, label %sw.bb43
+ i32 43, label %sw.bb44
+ i32 44, label %sw.bb45
+ i32 45, label %sw.bb47
+ i32 46, label %sw.bb48
+ i32 47, label %sw.bb49
+ i32 48, label %sw.bb50
+ i32 49, label %sw.bb51
+ i32 50, label %sw.bb52
+ i32 51, label %sw.bb53
+ i32 52, label %sw.bb54
+ i32 53, label %sw.bb55
+ i32 54, label %sw.bb56
+ i32 55, label %sw.bb58
+ i32 56, label %sw.bb59
+ i32 57, label %sw.bb60
+ i32 58, label %sw.bb61
+ i32 59, label %sw.bb62
+ i32 60, label %sw.bb63
+ i32 61, label %sw.bb64
+ i32 62, label %sw.bb65
+ i32 63, label %sw.bb66
+ i32 64, label %sw.bb67
+ i32 65, label %sw.bb68
+ i32 66, label %sw.bb69
+ i32 67, label %sw.bb70
+ i32 68, label %sw.bb71
+ i32 69, label %sw.bb72
+ i32 70, label %sw.bb73
+ i32 71, label %sw.bb74
+ i32 72, label %sw.bb76
+ i32 73, label %sw.bb77
+ i32 74, label %sw.bb78
+ i32 75, label %sw.bb79
+ i32 76, label %sw.bb80
+ i32 77, label %sw.bb81
+ i32 78, label %sw.bb82
+ i32 79, label %sw.bb83
+ i32 80, label %sw.bb84
+ i32 81, label %sw.bb85
+ i32 82, label %sw.bb86
+ i32 83, label %sw.bb87
+ i32 84, label %sw.bb88
+ i32 85, label %sw.bb89
+ i32 86, label %sw.bb90
+ i32 87, label %sw.bb91
+ i32 88, label %sw.bb92
+ i32 89, label %sw.bb93
+ i32 90, label %sw.bb94
+ i32 91, label %sw.bb95
+ i32 92, label %sw.bb96
+ i32 93, label %sw.bb97
+ i32 94, label %sw.bb98
+ i32 95, label %sw.bb99
+ i32 96, label %sw.bb100
+ i32 97, label %sw.bb101
+ i32 98, label %sw.bb102
+ i32 99, label %sw.bb103
+ i32 100, label %sw.bb104
+ i32 101, label %sw.bb105
+ i32 102, label %sw.bb106
+ i32 103, label %sw.bb107
+ i32 104, label %sw.bb108
+ i32 105, label %sw.bb109
+ i32 106, label %sw.bb110
+ i32 107, label %sw.bb111
+ i32 108, label %sw.bb112
+ i32 109, label %sw.bb113
+ i32 110, label %sw.bb114
+ i32 111, label %sw.bb115
+ i32 112, label %sw.bb116
+ i32 113, label %sw.bb117
+ i32 114, label %sw.bb118
+ i32 115, label %sw.bb119
+ i32 116, label %sw.bb120
+ i32 117, label %sw.bb121
+ i32 118, label %sw.bb122
+ i32 119, label %sw.bb123
+ i32 120, label %sw.bb124
+ i32 121, label %sw.bb125
+ i32 122, label %sw.bb126
+ i32 123, label %sw.bb127
+ i32 124, label %sw.bb128
+ i32 125, label %sw.bb129
+ i32 126, label %sw.bb130
+ i32 127, label %sw.bb131
+ i32 128, label %sw.bb132
+ i32 129, label %sw.bb133
+ i32 130, label %sw.bb134
+ i32 131, label %sw.bb135
+ i32 132, label %sw.bb136
+ i32 133, label %sw.bb137
+ i32 134, label %sw.bb138
+ i32 135, label %sw.bb139
+ i32 136, label %sw.bb140
+ i32 137, label %sw.bb141
+ i32 138, label %sw.bb142
+ i32 139, label %sw.bb143
+ i32 140, label %sw.bb144
+ i32 141, label %sw.bb145
+ i32 142, label %sw.bb146
+ i32 143, label %sw.bb147
+ i32 144, label %sw.bb148
+ i32 145, label %sw.bb149
+ i32 146, label %sw.bb150
+ i32 147, label %sw.bb151
+ i32 148, label %sw.bb152
+ i32 149, label %sw.bb153
+ i32 150, label %sw.bb154
+ i32 151, label %sw.bb155
+ i32 152, label %sw.bb156
+ i32 153, label %sw.bb157
+ i32 154, label %sw.bb158
+ i32 155, label %sw.bb159
+ i32 156, label %sw.bb160
+ i32 157, label %sw.bb161
+ i32 158, label %sw.bb162
+ i32 159, label %sw.bb163
+ i32 160, label %sw.bb164
+ i32 161, label %sw.bb165
+ i32 162, label %sw.bb166
+ i32 163, label %sw.bb167
+ i32 164, label %sw.bb168
+ i32 165, label %sw.bb169
+ i32 166, label %sw.bb170
+ i32 167, label %sw.bb171
+ i32 168, label %sw.bb172
+ i32 169, label %sw.bb173
+ i32 170, label %sw.bb174
+ i32 171, label %sw.bb175
+ i32 172, label %sw.bb176
+ i32 173, label %sw.bb177
+ i32 174, label %sw.bb178
+ i32 175, label %sw.bb179
+ i32 176, label %sw.bb180
+ i32 177, label %sw.bb181
+ i32 178, label %sw.bb182
+ i32 179, label %sw.bb183
+ i32 180, label %sw.bb184
+ i32 181, label %sw.bb185
+ i32 182, label %sw.bb186
+ i32 183, label %sw.bb187
+ i32 184, label %sw.bb188
+ i32 185, label %sw.bb189
+ i32 186, label %sw.bb190
+ i32 187, label %sw.bb191
+ i32 188, label %sw.bb192
+ i32 189, label %sw.bb193
+ i32 190, label %sw.bb194
+ i32 191, label %sw.bb195
+ i32 192, label %sw.bb196
+ i32 193, label %sw.bb197
+ i32 194, label %sw.bb198
+ i32 195, label %sw.bb199
+ i32 196, label %sw.bb200
+ i32 197, label %sw.bb201
+ i32 198, label %sw.bb202
+ ]
+
+sw.bb1: br label %return
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.bb4: br label %return
+sw.bb5: br label %return
+sw.bb6: br label %return
+sw.bb7: br label %return
+sw.bb8: br label %return
+sw.bb9: br label %return
+sw.bb10: br label %return
+sw.bb11: br label %return
+sw.bb12: br label %return
+sw.bb13: br label %return
+sw.bb14: br label %return
+sw.bb15: br label %return
+sw.bb16: br label %return
+sw.bb17: br label %return
+sw.bb18: br label %return
+sw.bb19: br label %return
+sw.bb20: br label %return
+sw.bb21: br label %return
+sw.bb22: br label %return
+sw.bb23: br label %return
+sw.bb24: br label %return
+sw.bb25: br label %return
+sw.bb26: br label %return
+sw.bb27: br label %return
+sw.bb28: br label %return
+sw.bb29: br label %return
+sw.bb30: br label %return
+sw.bb31: br label %return
+sw.bb32: br label %return
+sw.bb33: br label %return
+sw.bb34: br label %return
+sw.bb35: br label %return
+sw.bb37: br label %return
+sw.bb38: br label %return
+sw.bb39: br label %return
+sw.bb40: br label %return
+sw.bb41: br label %return
+sw.bb42: br label %return
+sw.bb43: br label %return
+sw.bb44: br label %return
+sw.bb45: br label %return
+sw.bb47: br label %return
+sw.bb48: br label %return
+sw.bb49: br label %return
+sw.bb50: br label %return
+sw.bb51: br label %return
+sw.bb52: br label %return
+sw.bb53: br label %return
+sw.bb54: br label %return
+sw.bb55: br label %return
+sw.bb56: br label %return
+sw.bb58: br label %return
+sw.bb59: br label %return
+sw.bb60: br label %return
+sw.bb61: br label %return
+sw.bb62: br label %return
+sw.bb63: br label %return
+sw.bb64: br label %return
+sw.bb65: br label %return
+sw.bb66: br label %return
+sw.bb67: br label %return
+sw.bb68: br label %return
+sw.bb69: br label %return
+sw.bb70: br label %return
+sw.bb71: br label %return
+sw.bb72: br label %return
+sw.bb73: br label %return
+sw.bb74: br label %return
+sw.bb76: br label %return
+sw.bb77: br label %return
+sw.bb78: br label %return
+sw.bb79: br label %return
+sw.bb80: br label %return
+sw.bb81: br label %return
+sw.bb82: br label %return
+sw.bb83: br label %return
+sw.bb84: br label %return
+sw.bb85: br label %return
+sw.bb86: br label %return
+sw.bb87: br label %return
+sw.bb88: br label %return
+sw.bb89: br label %return
+sw.bb90: br label %return
+sw.bb91: br label %return
+sw.bb92: br label %return
+sw.bb93: br label %return
+sw.bb94: br label %return
+sw.bb95: br label %return
+sw.bb96: br label %return
+sw.bb97: br label %return
+sw.bb98: br label %return
+sw.bb99: br label %return
+sw.bb100: br label %return
+sw.bb101: br label %return
+sw.bb102: br label %return
+sw.bb103: br label %return
+sw.bb104: br label %return
+sw.bb105: br label %return
+sw.bb106: br label %return
+sw.bb107: br label %return
+sw.bb108: br label %return
+sw.bb109: br label %return
+sw.bb110: br label %return
+sw.bb111: br label %return
+sw.bb112: br label %return
+sw.bb113: br label %return
+sw.bb114: br label %return
+sw.bb115: br label %return
+sw.bb116: br label %return
+sw.bb117: br label %return
+sw.bb118: br label %return
+sw.bb119: br label %return
+sw.bb120: br label %return
+sw.bb121: br label %return
+sw.bb122: br label %return
+sw.bb123: br label %return
+sw.bb124: br label %return
+sw.bb125: br label %return
+sw.bb126: br label %return
+sw.bb127: br label %return
+sw.bb128: br label %return
+sw.bb129: br label %return
+sw.bb130: br label %return
+sw.bb131: br label %return
+sw.bb132: br label %return
+sw.bb133: br label %return
+sw.bb134: br label %return
+sw.bb135: br label %return
+sw.bb136: br label %return
+sw.bb137: br label %return
+sw.bb138: br label %return
+sw.bb139: br label %return
+sw.bb140: br label %return
+sw.bb141: br label %return
+sw.bb142: br label %return
+sw.bb143: br label %return
+sw.bb144: br label %return
+sw.bb145: br label %return
+sw.bb146: br label %return
+sw.bb147: br label %return
+sw.bb148: br label %return
+sw.bb149: br label %return
+sw.bb150: br label %return
+sw.bb151: br label %return
+sw.bb152: br label %return
+sw.bb153: br label %return
+sw.bb154: br label %return
+sw.bb155: br label %return
+sw.bb156: br label %return
+sw.bb157: br label %return
+sw.bb158: br label %return
+sw.bb159: br label %return
+sw.bb160: br label %return
+sw.bb161: br label %return
+sw.bb162: br label %return
+sw.bb163: br label %return
+sw.bb164: br label %return
+sw.bb165: br label %return
+sw.bb166: br label %return
+sw.bb167: br label %return
+sw.bb168: br label %return
+sw.bb169: br label %return
+sw.bb170: br label %return
+sw.bb171: br label %return
+sw.bb172: br label %return
+sw.bb173: br label %return
+sw.bb174: br label %return
+sw.bb175: br label %return
+sw.bb176: br label %return
+sw.bb177: br label %return
+sw.bb178: br label %return
+sw.bb179: br label %return
+sw.bb180: br label %return
+sw.bb181: br label %return
+sw.bb182: br label %return
+sw.bb183: br label %return
+sw.bb184: br label %return
+sw.bb185: br label %return
+sw.bb186: br label %return
+sw.bb187: br label %return
+sw.bb188: br label %return
+sw.bb189: br label %return
+sw.bb190: br label %return
+sw.bb191: br label %return
+sw.bb192: br label %return
+sw.bb193: br label %return
+sw.bb194: br label %return
+sw.bb195: br label %return
+sw.bb196: br label %return
+sw.bb197: br label %return
+sw.bb198: br label %return
+sw.bb199: br label %return
+sw.bb200: br label %return
+sw.bb201: br label %return
+sw.bb202: br label %return
+sw.bb203: br label %return
+
+return:
+ %retval.0 = phi i32 [ 39204, %sw.bb202 ], [ 38809, %sw.bb201 ], [ 38416, %sw.bb200 ], [ 38025, %sw.bb199 ], [ 37636, %sw.bb198 ], [ 37249, %sw.bb197 ], [ 36864, %sw.bb196 ], [ 36481, %sw.bb195 ], [ 36100, %sw.bb194 ], [ 35721, %sw.bb193 ], [ 35344, %sw.bb192 ], [ 34969, %sw.bb191 ], [ 34596, %sw.bb190 ], [ 34225, %sw.bb189 ], [ 33856, %sw.bb188 ], [ 33489, %sw.bb187 ], [ 33124, %sw.bb186 ], [ 32761, %sw.bb185 ], [ 32400, %sw.bb184 ], [ 32041, %sw.bb183 ], [ 31684, %sw.bb182 ], [ 31329, %sw.bb181 ], [ 30976, %sw.bb180 ], [ 30625, %sw.bb179 ], [ 30276, %sw.bb178 ], [ 29929, %sw.bb177 ], [ 29584, %sw.bb176 ], [ 29241, %sw.bb175 ], [ 28900, %sw.bb174 ], [ 28561, %sw.bb173 ], [ 28224, %sw.bb172 ], [ 27889, %sw.bb171 ], [ 27556, %sw.bb170 ], [ 27225, %sw.bb169 ], [ 26896, %sw.bb168 ], [ 26569, %sw.bb167 ], [ 26244, %sw.bb166 ], [ 25921, %sw.bb165 ], [ 25600, %sw.bb164 ], [ 25281, %sw.bb163 ], [ 24964, %sw.bb162 ], [ 24649, %sw.bb161 ], [ 24336, %sw.bb160 ], [ 24025, %sw.bb159 ], [ 23716, %sw.bb158 ], [ 23409, %sw.bb157 ], [ 23104, %sw.bb156 ], [ 22801, %sw.bb155 ], [ 22500, %sw.bb154 ], [ 22201, %sw.bb153 ], [ 21904, %sw.bb152 ], [ 21609, %sw.bb151 ], [ 21316, %sw.bb150 ], [ 21025, %sw.bb149 ], [ 20736, %sw.bb148 ], [ 20449, %sw.bb147 ], [ 20164, %sw.bb146 ], [ 19881, %sw.bb145 ], [ 19600, %sw.bb144 ], [ 19321, %sw.bb143 ], [ 19044, %sw.bb142 ], [ 18769, %sw.bb141 ], [ 18496, %sw.bb140 ], [ 18225, %sw.bb139 ], [ 17956, %sw.bb138 ], [ 17689, %sw.bb137 ], [ 17424, %sw.bb136 ], [ 17161, %sw.bb135 ], [ 16900, %sw.bb134 ], [ 16641, %sw.bb133 ], [ 16384, %sw.bb132 ], [ 16129, %sw.bb131 ], [ 15876, %sw.bb130 ], [ 15625, %sw.bb129 ], [ 15376, %sw.bb128 ], [ 15129, %sw.bb127 ], [ 14884, %sw.bb126 ], [ 14641, %sw.bb125 ], [ 14400, %sw.bb124 ], [ 14161, %sw.bb123 ], [ 13924, %sw.bb122 ], [ 13689, %sw.bb121 ], [ 13456, %sw.bb120 ], [ 13225, %sw.bb119 ], [ 12996, %sw.bb118 ], [ 12769, %sw.bb117 ], [ 12544, %sw.bb116 ], [ 12321, %sw.bb115 ], [ 12100, %sw.bb114 ], [ 11881, %sw.bb113 ], [ 11664, %sw.bb112 ], [ 11449, %sw.bb111 ], [ 11236, %sw.bb110 ], [ 11025, %sw.bb109 ], [ 10816, %sw.bb108 ], [ 10609, %sw.bb107 ], [ 10404, %sw.bb106 ], [ 10201, %sw.bb105 ], [ 10000, %sw.bb104 ], [ 9801, %sw.bb103 ], [ 9604, %sw.bb102 ], [ 9409, %sw.bb101 ], [ 9216, %sw.bb100 ], [ 9025, %sw.bb99 ], [ 8836, %sw.bb98 ], [ 8649, %sw.bb97 ], [ 8464, %sw.bb96 ], [ 8281, %sw.bb95 ], [ 8100, %sw.bb94 ], [ 7921, %sw.bb93 ], [ 7744, %sw.bb92 ], [ 7569, %sw.bb91 ], [ 7396, %sw.bb90 ], [ 7225, %sw.bb89 ], [ 7056, %sw.bb88 ], [ 6889, %sw.bb87 ], [ 6724, %sw.bb86 ], [ 6561, %sw.bb85 ], [ 6400, %sw.bb84 ], [ 6241, %sw.bb83 ], [ 6084, %sw.bb82 ], [ 5929, %sw.bb81 ], [ 5776, %sw.bb80 ], [ 5625, %sw.bb79 ], [ 5476, %sw.bb78 ], [ 5329, %sw.bb77 ], [ 5184, %sw.bb76 ], [ 5112, %sw.bb74 ], [ 4900, %sw.bb73 ], [ 4761, %sw.bb72 ], [ 4624, %sw.bb71 ], [ 4489, %sw.bb70 ], [ 4356, %sw.bb69 ], [ 4225, %sw.bb68 ], [ 4096, %sw.bb67 ], [ 3969, %sw.bb66 ], [ 3844, %sw.bb65 ], [ 3721, %sw.bb64 ], [ 3600, %sw.bb63 ], [ 3481, %sw.bb62 ], [ 3364, %sw.bb61 ], [ 3249, %sw.bb60 ], [ 3136, %sw.bb59 ], [ 3025, %sw.bb58 ], [ 2970, %sw.bb56 ], [ 2809, %sw.bb55 ], [ 2704, %sw.bb54 ], [ 2601, %sw.bb53 ], [ 2500, %sw.bb52 ], [ 2401, %sw.bb51 ], [ 2304, %sw.bb50 ], [ 2209, %sw.bb49 ], [ 2116, %sw.bb48 ], [ 2025, %sw.bb47 ], [ 1980, %sw.bb45 ], [ 1849, %sw.bb44 ], [ 1764, %sw.bb43 ], [ 1681, %sw.bb42 ], [ 1600, %sw.bb41 ], [ 1521, %sw.bb40 ], [ 1444, %sw.bb39 ], [ 1369, %sw.bb38 ], [ 1296, %sw.bb37 ], [ 1260, %sw.bb35 ], [ 1156, %sw.bb34 ], [ 1089, %sw.bb33 ], [ 1024, 
+    %sw.bb32 ], [ 961, %sw.bb31 ], [ 900, %sw.bb30 ], [ 841, %sw.bb29 ], [ 784, %sw.bb28 ], [ 729, %sw.bb27 ], [ 676, %sw.bb26 ], [ 625, %sw.bb25 ], [ 576, %sw.bb24 ], [ 529, %sw.bb23 ], [ 484, %sw.bb22 ], [ 441, %sw.bb21 ], [ 400, %sw.bb20 ], [ 361, %sw.bb19 ], [ 342, %sw.bb18 ], [ 289, %sw.bb17 ], [ 256, %sw.bb16 ], [ 225, %sw.bb15 ], [ 196, %sw.bb14 ], [ 169, %sw.bb13 ], [ 144, %sw.bb12 ], [ 121, %sw.bb11 ], [ 100, %sw.bb10 ], [ 81, %sw.bb9 ], [ 64, %sw.bb8 ], [ 49, %sw.bb7 ], [ 36, %sw.bb6 ], [ 25, %sw.bb5 ], [ 16, %sw.bb4 ], [ 9, %sw.bb3 ], [ 4, %sw.bb2 ], [ 1, %sw.bb1 ], [ 39601, %sw.bb203 ], [ 0, %if.end ]
+ ret i32 %retval.0
+}
+
+define i32 @cprop(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 1, label %return
+ i32 2, label %sw.bb1
+ i32 3, label %sw.bb2
+ i32 4, label %sw.bb2
+ i32 5, label %sw.bb2
+ i32 6, label %sw.bb3
+ i32 7, label %sw.bb3
+ ]
+
+sw.bb1: br label %return
+
+sw.bb2:
+ %and = and i32 %x, 1
+ %tobool = icmp ne i32 %and, 0
+ %cond = select i1 %tobool, i32 -123, i32 456
+ %sub = sub nsw i32 %x, %cond
+ br label %return
+
+sw.bb3:
+ %trunc = trunc i32 %x to i8
+ %sext = sext i8 %trunc to i32
+ br label %return
+
+sw.default:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 123, %sw.default ], [ %sext, %sw.bb3 ], [ %sub, %sw.bb2 ], [ 42, %sw.bb1 ], [ 5, %entry ]
+ ret i32 %retval.0
+
+; CHECK: @cprop
+; CHECK: switch.lookup:
+; CHECK: %switch.gep = getelementptr inbounds [7 x i32]* @switch.table5, i32 0, i32 %switch.tableidx
+}
+
+define i32 @unreachable(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb
+ i32 2, label %sw.bb
+ i32 3, label %sw.bb1
+ i32 4, label %sw.bb2
+ i32 5, label %sw.bb3
+ i32 6, label %sw.bb3
+ i32 7, label %sw.bb3
+ i32 8, label %sw.bb3
+ ]
+
+sw.bb: br label %return
+sw.bb1: unreachable
+sw.bb2: br label %return
+sw.bb3: br label %return
+sw.default: unreachable
+
+return:
+ %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 0, %sw.bb ]
+ ret i32 %retval.0
+
+; CHECK: @unreachable
+; CHECK: switch.lookup:
+; CHECK: getelementptr inbounds [5 x i32]* @switch.table6, i32 0, i32 %switch.tableidx
+}
diff --git a/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll b/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
index 65d888ea01e1..028fb0745631 100644
--- a/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
+++ b/test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
@@ -85,3 +85,31 @@ if.end7: ; preds = %if.else, %if.then4,
; CHECK: if.end7:
; CHECK: phi i32* [ %a, %if.then ], [ null, %if.then4 ], [ null, %if.else ]
}
+
+define i32 @test4(i32* %a, i32 %b, i32* %c, i32 %d) nounwind {
+entry:
+ %tobool = icmp eq i32 %b, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ tail call void @bar() nounwind
+ br label %if.end7
+
+if.else: ; preds = %entry
+ %tobool3 = icmp eq i32 %d, 0
+ br i1 %tobool3, label %if.end7, label %if.then4
+
+if.then4: ; preds = %if.else
+ tail call void @bar() nounwind
+ br label %if.end7
+
+if.end7: ; preds = %if.else, %if.then4, %if.then
+ %x.0 = phi i32* [ %a, %if.then ], [ null, %if.then4 ], [ null, %if.else ]
+ %gep = getelementptr i32* %x.0, i32 10
+ %tmp9 = load i32* %gep
+ %tmp10 = or i32 %tmp9, 1
+ store i32 %tmp10, i32* %gep
+ ret i32 %tmp9
+; CHECK: @test4
+; CHECK-NOT: phi
+}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
new file mode 100644
index 000000000000..53d5448372da
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-partial.ll
@@ -0,0 +1,37 @@
+; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s
+
+; This test case was written to trigger an incorrect assert statement in
+; -simplifycfg. Thus we don't actually want to check the output, just that
+; -simplifycfg ran successfully. Therefore we only check that the function still
+; exists, and that it still calls foo().
+;
+; NOTE: There are some obviously dead blocks and missing branch weight
+; metadata. Both of these features were key to triggering the assert.
+; Additionally, the not-taken weight of the branch with a weight had to
+; be 0 to trigger the assert.
+
+declare void @foo() nounwind uwtable
+
+define void @func(i32 %A) nounwind uwtable {
+; CHECK: define void @func
+entry:
+ %cmp11 = icmp eq i32 %A, 1
+ br i1 %cmp11, label %if.then, label %if.else, !prof !0
+
+if.then:
+ call void @foo()
+; CHECK: call void @foo()
+ br label %if.else
+
+if.else:
+ %cmp17 = icmp eq i32 %A, 2
+ br i1 %cmp17, label %if.then2, label %if.end
+
+if.then2:
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 1, i32 0}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
new file mode 100644
index 000000000000..941f5ad9d5b6
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
@@ -0,0 +1,140 @@
+; RUN: opt -simplifycfg -S -o - < %s | FileCheck %s
+
+declare void @func2(i32)
+declare void @func4(i32)
+declare void @func6(i32)
+declare void @func8(i32)
+
+;; test1 - create a switch with case 2 and case 4 from two branches: N == 2
+;; and N == 4.
+define void @test1(i32 %N) nounwind uwtable {
+entry:
+ %cmp = icmp eq i32 %N, 2
+ br i1 %cmp, label %if.then, label %if.else, !prof !0
+; CHECK: test1
+; CHECK: switch i32 %N
+; CHECK: ], !prof !0
+
+if.then:
+ call void @func2(i32 %N) nounwind
+ br label %if.end9
+
+if.else:
+ %cmp2 = icmp eq i32 %N, 4
+ br i1 %cmp2, label %if.then7, label %if.else8, !prof !1
+
+if.then7:
+ call void @func4(i32 %N) nounwind
+ br label %if.end
+
+if.else8:
+ call void @func8(i32 %N) nounwind
+ br label %if.end
+
+if.end:
+ br label %if.end9
+
+if.end9:
+ ret void
+}
+
+;; test2 - Merge two switches where PredDefault == BB.
+define void @test2(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ %cmp = icmp sgt i32 %M, 2
+ br i1 %cmp, label %sw1, label %sw2
+
+sw1:
+ switch i32 %N, label %sw2 [
+ i32 2, label %sw.bb
+ i32 3, label %sw.bb1
+ ], !prof !2
+; CHECK: test2
+; CHECK: switch i32 %N, label %sw.epilog
+; CHECK: i32 2, label %sw.bb
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 4, label %sw.bb5
+; CHECK: ], !prof !1
+
+sw.bb:
+ call void @func2(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.bb1:
+ call void @func4(i32 %N) nounwind
+ br label %sw.epilog
+
+sw2:
+;; Here "case 2" is invalidated if control is transferred through default case
+;; of the first switch.
+ switch i32 %N, label %sw.epilog [
+ i32 2, label %sw.bb4
+ i32 4, label %sw.bb5
+ ], !prof !3
+
+sw.bb4:
+ call void @func6(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.bb5:
+ call void @func8(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
+;; test3 - Merge two switches where PredDefault != BB.
+define void @test3(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ %cmp = icmp sgt i32 %M, 2
+ br i1 %cmp, label %sw1, label %sw2
+
+sw1:
+ switch i32 %N, label %sw.bb [
+ i32 2, label %sw2
+ i32 3, label %sw2
+ i32 1, label %sw.bb1
+ ], !prof !4
+; CHECK: test3
+; CHECK: switch i32 %N, label %sw.bb
+; CHECK: i32 1, label %sw.bb1
+; CHECK: i32 3, label %sw.bb4
+; CHECK: i32 2, label %sw.epilog
+; CHECK: ], !prof !3
+
+sw.bb:
+ call void @func2(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.bb1:
+ call void @func4(i32 %N) nounwind
+ br label %sw.epilog
+
+sw2:
+ switch i32 %N, label %sw.epilog [
+ i32 3, label %sw.bb4
+ i32 4, label %sw.bb5
+ ], !prof !5
+
+sw.bb4:
+ call void @func6(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.bb5:
+ call void @func8(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
+!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
+; CHECK: !0 = metadata !{metadata !"branch_weights", i32 256, i32 4352, i32 16}
+!2 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 8}
+!3 = metadata !{metadata !"branch_weights", i32 8, i32 8, i32 4}
+; CHECK: !1 = metadata !{metadata !"branch_weights", i32 32, i32 48, i32 96, i32 16}
+!4 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 3}
+!5 = metadata !{metadata !"branch_weights", i32 17, i32 13, i32 9}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7, i32 3, i32 4, i32 6}
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
index c7917857ee60..beef52700820 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
@@ -79,10 +79,238 @@ Z:
ret void
}
+;; test5 - The case that jumps to the default target will be removed.
+define void @test5(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ switch i32 %N, label %sw2 [
+ i32 1, label %sw2
+ i32 2, label %sw.bb
+ i32 3, label %sw.bb1
+ ], !prof !3
+; CHECK: test5
+; CHECK: switch i32 %N, label %sw2 [
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 2, label %sw.bb
+; CHECK: ], !prof !2
+
+sw.bb:
+ call void @helper(i32 0)
+ br label %sw.epilog
+
+sw.bb1:
+ call void @helper(i32 1)
+ br label %sw.epilog
+
+sw2:
+ call void @helper(i32 2)
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
+;; test6 - Some cases of the second switch are pruned during optimization.
+;; Then the second switch will be converted to a branch; finally, the first
+;; switch and the branch will be merged into a single switch.
+define void @test6(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ switch i32 %N, label %sw2 [
+ i32 1, label %sw2
+ i32 2, label %sw.bb
+ i32 3, label %sw.bb1
+ ], !prof !4
+; CHECK: test6
+; CHECK: switch i32 %N, label %sw.epilog
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 2, label %sw.bb
+; CHECK: i32 4, label %sw.bb5
+; CHECK: ], !prof !3
+
+sw.bb:
+ call void @helper(i32 0)
+ br label %sw.epilog
+
+sw.bb1:
+ call void @helper(i32 1)
+ br label %sw.epilog
+
+sw2:
+;; Here "case 2" is invalidated since the default case of the first switch
+;; does not include "case 2".
+ switch i32 %N, label %sw.epilog [
+ i32 2, label %sw.bb4
+ i32 4, label %sw.bb5
+ ], !prof !5
+
+sw.bb4:
+ call void @helper(i32 2)
+ br label %sw.epilog
+
+sw.bb5:
+ call void @helper(i32 3)
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
+;; This test is based on test1, but with the targets of the second branch swapped.
+define void @test1_swap(i1 %a, i1 %b) {
+; CHECK: @test1_swap
+entry:
+ br i1 %a, label %Y, label %X, !prof !0
+; CHECK: br i1 %or.cond, label %Y, label %Z, !prof !4
+
+X:
+ %c = or i1 %b, false
+ br i1 %c, label %Y, label %Z, !prof !1
+
+Y:
+ call void @helper(i32 0)
+ ret void
+
+Z:
+ call void @helper(i32 1)
+ ret void
+}
+
+define void @test7(i1 %a, i1 %b) {
+; CHECK: @test7
+entry:
+ %c = or i1 %b, false
+ br i1 %a, label %Y, label %X, !prof !0
+; CHECK: br i1 %brmerge, label %Y, label %Z, !prof !5
+
+X:
+ br i1 %c, label %Y, label %Z, !prof !6
+
+Y:
+ call void @helper(i32 0)
+ ret void
+
+Z:
+ call void @helper(i32 1)
+ ret void
+}
+
+; Test basic folding to a conditional branch.
+define void @test8(i64 %x, i64 %y) nounwind {
+; CHECK: @test8
+entry:
+ %lt = icmp slt i64 %x, %y
+; CHECK: br i1 %lt, label %a, label %b, !prof !6
+ %qux = select i1 %lt, i32 0, i32 2
+ switch i32 %qux, label %bees [
+ i32 0, label %a
+ i32 1, label %b
+ i32 2, label %b
+ ], !prof !7
+a:
+ call void @helper(i32 0) nounwind
+ ret void
+b:
+ call void @helper(i32 1) nounwind
+ ret void
+bees:
+ call void @helper(i32 2) nounwind
+ ret void
+}
+
+; Test edge splitting when the default target has an icmp and an unconditional
+; branch.
+define i1 @test9(i32 %x, i32 %y) nounwind {
+; CHECK: @test9
+entry:
+ switch i32 %x, label %bees [
+ i32 0, label %a
+ i32 1, label %end
+ i32 2, label %end
+ ], !prof !7
+; CHECK: switch i32 %x, label %bees [
+; CHECK: i32 0, label %a
+; CHECK: i32 1, label %end
+; CHECK: i32 2, label %end
+; CHECK: i32 92, label %end
+; CHECK: ], !prof !7
+
+a:
+ call void @helper(i32 0) nounwind
+ %reta = icmp slt i32 %x, %y
+ ret i1 %reta
+
+bees:
+ %tmp = icmp eq i32 %x, 92
+ br label %end
+
+end:
+; CHECK: end:
+; CHECK: %ret = phi i1 [ true, %entry ], [ false, %bees ], [ true, %entry ], [ true, %entry ]
+ %ret = phi i1 [ true, %entry ], [%tmp, %bees], [true, %entry]
+ call void @helper(i32 2) nounwind
+ ret i1 %ret
+}
+
+define void @test10(i32 %x) nounwind readnone ssp noredzone {
+entry:
+ switch i32 %x, label %lor.rhs [
+ i32 2, label %lor.end
+ i32 1, label %lor.end
+ i32 3, label %lor.end
+ ], !prof !7
+
+lor.rhs:
+ call void @helper(i32 1) nounwind
+ ret void
+
+lor.end:
+ call void @helper(i32 0) nounwind
+ ret void
+
+; CHECK: test10
+; CHECK: %x.off = add i32 %x, -1
+; CHECK: %switch = icmp ult i32 %x.off, 3
+; CHECK: br i1 %switch, label %lor.end, label %lor.rhs, !prof !8
+}
+
+; Remove dead cases from the switch.
+define void @test11(i32 %x) nounwind {
+ %i = shl i32 %x, 1
+ switch i32 %i, label %a [
+ i32 21, label %b
+ i32 24, label %c
+ ], !prof !8
+; CHECK: %cond = icmp eq i32 %i, 24
+; CHECK: br i1 %cond, label %c, label %a, !prof !9
+
+a:
+ call void @helper(i32 0) nounwind
+ ret void
+b:
+ call void @helper(i32 1) nounwind
+ ret void
+c:
+ call void @helper(i32 2) nounwind
+ ret void
+}
+
!0 = metadata !{metadata !"branch_weights", i32 3, i32 5}
!1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
!2 = metadata !{metadata !"branch_weights", i32 1, i32 2}
+!3 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!4 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!5 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 5}
+!6 = metadata !{metadata !"branch_weights", i32 1, i32 3}
+!7 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8, i32 7}
+!8 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8}
; CHECK: !0 = metadata !{metadata !"branch_weights", i32 5, i32 11}
; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 5}
-; CHECK-NOT: !2
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 7, i32 1, i32 2}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 49, i32 12, i32 24, i32 35}
+; CHECK: !4 = metadata !{metadata !"branch_weights", i32 11, i32 5}
+; CHECK: !5 = metadata !{metadata !"branch_weights", i32 17, i32 15}
+; CHECK: !6 = metadata !{metadata !"branch_weights", i32 9, i32 7}
+; CHECK: !7 = metadata !{metadata !"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17}
+; CHECK: !8 = metadata !{metadata !"branch_weights", i32 24, i32 33}
+; CHECK: !9 = metadata !{metadata !"branch_weights", i32 8, i32 33}
+; CHECK-NOT: !9
diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll
new file mode 100644
index 000000000000..28d727938288
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+ br i1 %flag, label %if.then, label %if.else
+
+; CHECK: test1
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+ %cmp = icmp uge i32 %blksA, %nblks
+ %frombool1 = zext i1 %cmp to i8
+ br label %if.end
+
+if.else:
+ %add = add i32 %nblks, %blksB
+ %cmp2 = icmp ule i32 %add, %blksA
+ %frombool3 = zext i1 %cmp2 to i8
+ br label %if.end
+
+if.end:
+ %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+ %tobool4 = icmp ne i8 %obeys.0, 0
+ ret i1 %tobool4
+}
+
+define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+ br i1 %flag, label %if.then, label %if.else
+
+; CHECK: test2
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+ %cmp = icmp uge i32 %blksA, %nblks
+ %frombool1 = zext i1 %cmp to i8
+ br label %if.end
+
+if.else:
+ %add = add i32 %nblks, %blksB
+ %cmp2 = icmp uge i32 %blksA, %add
+ %frombool3 = zext i1 %cmp2 to i8
+ br label %if.end
+
+if.end:
+ %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+ %tobool4 = icmp ne i8 %obeys.0, 0
+ ret i1 %tobool4
+}
diff --git a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll b/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
deleted file mode 100644
index 2717228f7ee1..000000000000
--- a/test/Transforms/SimplifyLibCalls/2009-02-12-StrTo.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; Test that we add nocapture to the declaration, and to the second call only.
-
-; CHECK: declare float @strtol(i8*, i8** nocapture, i32) nounwind
-declare float @strtol(i8* %s, i8** %endptr, i32 %base)
-
-define void @foo(i8* %x, i8** %endptr) {
-; CHECK: call float @strtol(i8* %x, i8** %endptr, i32 10)
- call float @strtol(i8* %x, i8** %endptr, i32 10)
-; CHECK: %2 = call float @strtol(i8* nocapture %x, i8** null, i32 10)
- call float @strtol(i8* %x, i8** null, i32 10)
- ret void
-}
diff --git a/test/Transforms/SimplifyLibCalls/FFS.ll b/test/Transforms/SimplifyLibCalls/FFS.ll
index e38d78349d43..6aecbeacd7e6 100644
--- a/test/Transforms/SimplifyLibCalls/FFS.ll
+++ b/test/Transforms/SimplifyLibCalls/FFS.ll
@@ -1,6 +1,7 @@
-; Test that the ToAsciiOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*@ffs"
+; Test that FFSOpt works correctly
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
+
+; CHECK-NOT: call{{.*}}@ffs
@non_const = external global i32 ; <i32*> [#uses=1]
@@ -34,3 +35,11 @@ define i32 @a(i64) nounwind {
%2 = call i32 @ffsll(i64 %0) ; <i32> [#uses=1]
ret i32 %2
}
+
+; PR13028
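+; ffsll(0) is defined to return 0, so the call below folds to a constant.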
+define i32 @b() nounwind {
+ %ffs = call i32 @ffsll(i64 0)
+ ret i32 %ffs
+; CHECK: @b
+; CHECK-NEXT: ret i32 0
+}
diff --git a/test/Transforms/SimplifyLibCalls/StpCpy.ll b/test/Transforms/SimplifyLibCalls/StpCpy.ll
deleted file mode 100644
index 914b0955bc90..000000000000
--- a/test/Transforms/SimplifyLibCalls/StpCpy.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; Test that the StpCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-
-@hello = constant [6 x i8] c"hello\00"
-
-declare i8* @stpcpy(i8*, i8*)
-
-declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
-
-define i32 @t1() {
-; CHECK: @t1
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %rslt1 = call i8* @stpcpy( i8* %arg1, i8* %arg2 )
-; CHECK: @llvm.memcpy.p0i8.p0i8.i32
- ret i32 0
-}
-
-define i32 @t2() {
-; CHECK: @t2
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
- %rslt1 = call i8* @__stpcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
-; CHECK: @__memcpy_chk
- ret i32 0
-}
-
-define i8* @t3(i8* %arg) {
-; CHECK: @t3
- %stpcpy = tail call i8* @stpcpy(i8* %arg, i8* %arg)
-; CHECK: [[LEN:%[a-z]+]] = call i32 @strlen(i8* %arg)
-; CHECK-NEXT: getelementptr inbounds i8* %arg, i32 [[LEN]]
- ret i8* %stpcpy
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrCat.ll b/test/Transforms/SimplifyLibCalls/StrCat.ll
deleted file mode 100644
index 3ea691a3cfbe..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrCat.ll
+++ /dev/null
@@ -1,33 +0,0 @@
-; Test that the StrCatOptimizer works correctly
-; PR3661
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*strcat"
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: grep "puts.*%arg1"
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1]
-
-declare i8* @strcat(i8*, i8*)
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
- %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; <i8*> [#uses=2]
- store i8 0, i8* %arg1
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt1 = call i8* @strcat( i8* %arg1, i8* %arg2 ) ; <i8*> [#uses=1]
- %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt2 = call i8* @strcat( i8* %rslt1, i8* %arg3 ) ; <i8*> [#uses=1]
- %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt3 = call i8* @strcat( i8* %rslt2, i8* %arg4 ) ; <i8*> [#uses=1]
- call i32 @puts( i8* %rslt3 ) ; <i32>:1 [#uses=0]
- ret i32 0
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/StrChr.ll b/test/Transforms/SimplifyLibCalls/StrChr.ll
deleted file mode 100644
index eaabeb2feb8f..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrChr.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; Test that the StrChrOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [14 x i8] c"hello world\5Cn\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i8* @strchr(i8*, i32)
-
-define i32 @foo(i32 %index) {
- %hello_p = getelementptr [14 x i8]* @hello, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %world = call i8* @strchr(i8* %hello_p, i32 119)
-; CHECK: getelementptr i8* %hello_p, i64 6
- %ignore = call i8* @strchr(i8* %null_p, i32 119)
-; CHECK-NOT: call i8* strchr
- %null = call i8* @strchr(i8* %hello_p, i32 0)
-; CHECK: getelementptr i8* %hello_p, i64 13
- %result = call i8* @strchr(i8* %hello_p, i32 %index)
-; CHECK: call i8* @memchr(i8* %hello_p, i32 %index, i64 14)
- ret i32 %index
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/StrCmp.ll b/test/Transforms/SimplifyLibCalls/StrCmp.ll
deleted file mode 100644
index 60854d76c97a..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrCmp.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; Test that the StrCmpOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-
-@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1]
-@hell = constant [5 x i8] c"hell\00" ; <[5 x i8]*> [#uses=1]
-@bell = constant [5 x i8] c"bell\00" ; <[5 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-
-declare i32 @strcmp(i8*, i8*)
-
-; strcmp("", x) -> -*x
-define i32 @test1(i8* %str) {
- %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i8* %str)
- ret i32 %temp1
- ; CHECK: @test1
- ; CHECK: %strcmpload = load i8* %str
- ; CHECK: %1 = zext i8 %strcmpload to i32
- ; CHECK: %temp1 = sub i32 0, %1
- ; CHECK: ret i32 %temp1
-}
-
-; strcmp(x, "") -> *x
-define i32 @test2(i8* %str) {
- %temp1 = call i32 @strcmp(i8* %str, i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0))
- ret i32 %temp1
- ; CHECK: @test2
- ; CHECK: %strcmpload = load i8* %str
- ; CHECK: %temp1 = zext i8 %strcmpload to i32
- ; CHECK: ret i32 %temp1
-}
-
-; strcmp(x, y) -> cnst
-define i32 @test3() {
- %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0))
- ret i32 %temp1
- ; CHECK: @test3
- ; CHECK: ret i32 -1
-}
-define i32 @test4() {
- %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0))
- ret i32 %temp1
- ; CHECK: @test4
- ; CHECK: ret i32 1
-}
-
-; strcmp(x, y) -> memcmp(x, y, <known length>)
-; (This transform is rather difficult to trigger in a useful manner)
-define i32 @test5(i1 %b) {
- %sel = select i1 %b, i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([5 x i8]* @bell, i32 0, i32 0)
- %temp1 = call i32 @strcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %sel)
- ret i32 %temp1
- ; CHECK: @test5
- ; CHECK: %memcmp = call i32 @memcmp(i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i8* %sel, i32 5)
- ; CHECK: ret i32 %memcmp
-}
-
-; strcmp(x,x) -> 0
-define i32 @test6(i8* %str) {
- %temp1 = call i32 @strcmp(i8* %str, i8* %str)
- ret i32 %temp1
- ; CHECK: @test6
- ; CHECK: ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrCpy.ll b/test/Transforms/SimplifyLibCalls/StrCpy.ll
deleted file mode 100644
index 83406ff8f868..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrCpy.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; Test that the StrCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-
-@hello = constant [6 x i8] c"hello\00"
-
-declare i8* @strcpy(i8*, i8*)
-
-declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
-
-declare i32 @llvm.objectsize.i32(i8*, i1) nounwind readonly
-
-; rdar://6839935
-
-define i32 @t1() {
-; CHECK: @t1
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %rslt1 = call i8* @strcpy( i8* %arg1, i8* %arg2 )
-; CHECK: @llvm.memcpy.p0i8.p0i8.i32
- ret i32 0
-}
-
-define i32 @t2() {
-; CHECK: @t2
- %target = alloca [1024 x i8]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0
- %tmp1 = call i32 @llvm.objectsize.i32(i8* %arg1, i1 false)
- %rslt1 = call i8* @__strcpy_chk(i8* %arg1, i8* %arg2, i32 %tmp1)
-; CHECK: @__memcpy_chk
- ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrLen.ll b/test/Transforms/SimplifyLibCalls/StrLen.ll
deleted file mode 100644
index 4a20bbd2ce81..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrLen.ll
+++ /dev/null
@@ -1,62 +0,0 @@
-; Test that the StrCatOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*strlen"
-
-target datalayout = "e-p:32:32"
-@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=3]
-@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=3]
-@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1]
-@nullstring = constant i8 0
-
-declare i32 @strlen(i8*)
-
-define i32 @test1() {
- %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %hello_l = call i32 @strlen( i8* %hello_p ) ; <i32> [#uses=1]
- ret i32 %hello_l
-}
-
-define i32 @test2() {
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1]
- %null_l = call i32 @strlen( i8* %null_p ) ; <i32> [#uses=1]
- ret i32 %null_l
-}
-
-define i32 @test3() {
- %null_hello_p = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %null_hello_l = call i32 @strlen( i8* %null_hello_p ) ; <i32> [#uses=1]
- ret i32 %null_hello_l
-}
-
-define i1 @test4() {
- %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %hello_l = call i32 @strlen( i8* %hello_p ) ; <i32> [#uses=1]
- %eq_hello = icmp eq i32 %hello_l, 0 ; <i1> [#uses=1]
- ret i1 %eq_hello
-}
-
-define i1 @test5() {
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1]
- %null_l = call i32 @strlen( i8* %null_p ) ; <i32> [#uses=1]
- %eq_null = icmp eq i32 %null_l, 0 ; <i1> [#uses=1]
- ret i1 %eq_null
-}
-
-define i1 @test6() {
- %hello_p = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %hello_l = call i32 @strlen( i8* %hello_p ) ; <i32> [#uses=1]
- %ne_hello = icmp ne i32 %hello_l, 0 ; <i1> [#uses=1]
- ret i1 %ne_hello
-}
-
-define i1 @test7() {
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1]
- %null_l = call i32 @strlen( i8* %null_p ) ; <i32> [#uses=1]
- %ne_null = icmp ne i32 %null_l, 0 ; <i1> [#uses=1]
- ret i1 %ne_null
-}
-
-define i32 @test8() {
- %len = tail call i32 @strlen(i8* @nullstring) nounwind
- ret i32 %len
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrNCat.ll b/test/Transforms/SimplifyLibCalls/StrNCat.ll
deleted file mode 100644
index 073792b96a1b..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrNCat.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; Test that the StrNCatOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*strncat"
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: grep "puts.*%arg1"
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1]
-
-declare i8* @strncat(i8*, i8*, i32)
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
- %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; <i8*> [#uses=2]
- store i8 0, i8* %arg1
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt1 = call i8* @strncat( i8* %arg1, i8* %arg2, i32 6 ) ; <i8*> [#uses=1]
- %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt2 = call i8* @strncat( i8* %rslt1, i8* %arg3, i32 42 ) ; <i8*> [#uses=1]
- %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt3 = call i8* @strncat( i8* %rslt2, i8* %arg4, i32 42 ) ; <i8*> [#uses=1]
- call i32 @puts( i8* %rslt3 ) ; <i32>:1 [#uses=0]
- ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrNCmp.ll b/test/Transforms/SimplifyLibCalls/StrNCmp.ll
deleted file mode 100644
index 0b2a501a3c8a..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrNCmp.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; Test that the StrCmpOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-
-@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1]
-@hell = constant [5 x i8] c"hell\00" ; <[5 x i8]*> [#uses=1]
-@bell = constant [5 x i8] c"bell\00" ; <[5 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-
-declare i32 @strncmp(i8*, i8*, i32)
-
-; strcmp("", x) -> -*x
-define i32 @test1(i8* %str) {
- %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i8* %str, i32 10)
- ret i32 %temp1
- ; CHECK: @test1
- ; CHECK: %strcmpload = load i8* %str
- ; CHECK: %1 = zext i8 %strcmpload to i32
- ; CHECK: %temp1 = sub i32 0, %1
- ; CHECK: ret i32 %temp1
-}
-
-; strcmp(x, "") -> *x
-define i32 @test2(i8* %str) {
- %temp1 = call i32 @strncmp(i8* %str, i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i32 10)
- ret i32 %temp1
- ; CHECK: @test2
- ; CHECK: %strcmpload = load i8* %str
- ; CHECK: %temp1 = zext i8 %strcmpload to i32
- ; CHECK: ret i32 %temp1
-}
-
-; strncmp(x, y, n) -> cnst
-define i32 @test3() {
- %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i32 10)
- ret i32 %temp1
- ; CHECK: @test3
- ; CHECK: ret i32 -1
-}
-define i32 @test4() {
- %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([1 x i8]* @null, i32 0, i32 0), i32 10)
- ret i32 %temp1
- ; CHECK: @test4
- ; CHECK: ret i32 1
-}
-define i32 @test5() {
- %temp1 = call i32 @strncmp(i8* getelementptr inbounds ([5 x i8]* @hell, i32 0, i32 0), i8* getelementptr inbounds ([6 x i8]* @hello, i32 0, i32 0), i32 4)
- ret i32 %temp1
- ; CHECK: @test5
- ; CHECK: ret i32 0
-}
-
-; strncmp(x,y,1) -> memcmp(x,y,1)
-define i32 @test6(i8* %str1, i8* %str2) {
- %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1)
- ret i32 %temp1
- ; CHECK: @test6
- ; CHECK: load i8*
- ; CHECK: load i8*
- ; CHECK: sub i32
-}
-
-; strncmp(x,y,0) -> 0
-define i32 @test7(i8* %str1, i8* %str2) {
- %temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 0)
- ret i32 %temp1
- ; CHECK: @test7
- ; CHECK: ret i32 0
-}
-
-; strncmp(x,x,n) -> 0
-define i32 @test8(i8* %str, i32 %n) {
- %temp1 = call i32 @strncmp(i8* %str, i8* %str, i32 %n)
- ret i32 %temp1
- ; CHECK: @test8
- ; CHECK: ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrNCpy.ll b/test/Transforms/SimplifyLibCalls/StrNCpy.ll
deleted file mode 100644
index 4e47b31a6afa..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrNCpy.ll
+++ /dev/null
@@ -1,29 +0,0 @@
-; Test that the StrNCpyOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | \
-; RUN: not grep "call.*strncpy"
-
-; This transformation requires the pointer size, as it assumes that size_t is
-; the size of a pointer.
-target datalayout = "-p:64:64:64"
-
-@hello = constant [6 x i8] c"hello\00" ; <[6 x i8]*> [#uses=1]
-@null = constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-@null_hello = constant [7 x i8] c"\00hello\00" ; <[7 x i8]*> [#uses=1]
-
-declare i8* @strncpy(i8*, i8*, i32)
-
-declare i32 @puts(i8*)
-
-define i32 @main() {
- %target = alloca [1024 x i8] ; <[1024 x i8]*> [#uses=1]
- %arg1 = getelementptr [1024 x i8]* %target, i32 0, i32 0 ; <i8*> [#uses=2]
- store i8 0, i8* %arg1
- %arg2 = getelementptr [6 x i8]* @hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt1 = call i8* @strncpy( i8* %arg1, i8* %arg2, i32 6 ) ; <i8*> [#uses=1]
- %arg3 = getelementptr [1 x i8]* @null, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt2 = call i8* @strncpy( i8* %rslt1, i8* %arg3, i32 42 ) ; <i8*> [#uses=1]
- %arg4 = getelementptr [7 x i8]* @null_hello, i32 0, i32 0 ; <i8*> [#uses=1]
- %rslt3 = call i8* @strncpy( i8* %rslt2, i8* %arg4, i32 42 ) ; <i8*> [#uses=1]
- call i32 @puts( i8* %rslt3 ) ; <i32>:1 [#uses=0]
- ret i32 0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrPBrk.ll b/test/Transforms/SimplifyLibCalls/StrPBrk.ll
deleted file mode 100644
index 29c3b7477b47..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrPBrk.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@hello = constant [12 x i8] c"hello world\00"
-@w = constant [2 x i8] c"w\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i8* @strpbrk(i8*, i8*)
-
-define void @test(i8* %s1, i8* %s2) {
- %hello_p = getelementptr [12 x i8]* @hello, i32 0, i32 0
- %w_p = getelementptr [2 x i8]* @w, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %test1 = call i8* @strpbrk(i8* %null_p, i8* %s2)
- %test2 = call i8* @strpbrk(i8* %s1, i8* %null_p)
-; CHECK-NOT: call i8* @strpbrk
- %test3 = call i8* @strpbrk(i8* %s1, i8* %w_p)
-; CHECK: call i8* @strchr(i8* %s1, i32 119)
- %test4 = call i8* @strpbrk(i8* %hello_p, i8* %w_p)
-; CHECK: getelementptr i8* %hello_p, i64 6
- %test5 = call i8* @strpbrk(i8* %s1, i8* %s2)
-; CHECK: call i8* @strpbrk(i8* %s1, i8* %s2)
- ret void
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrRChr.ll b/test/Transforms/SimplifyLibCalls/StrRChr.ll
deleted file mode 100644
index 2259fc0289fb..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrRChr.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; Test that the StrRChrOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@hello = constant [14 x i8] c"hello world\5Cn\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i8* @strrchr(i8*, i32)
-
-define void @foo(i8* %bar) {
- %hello_p = getelementptr [14 x i8]* @hello, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %world = call i8* @strrchr(i8* %hello_p, i32 119)
-; CHECK: getelementptr i8* %hello_p, i64 6
- %ignore = call i8* @strrchr(i8* %null_p, i32 119)
-; CHECK-NOT: call i8* strrchr
- %null = call i8* @strrchr(i8* %hello_p, i32 0)
-; CHECK: getelementptr i8* %hello_p, i64 13
- %strchr = call i8* @strrchr(i8* %bar, i32 0)
-; CHECK: call i8* @strchr(i8* %bar, i32 0)
- ret void
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrSpn.ll b/test/Transforms/SimplifyLibCalls/StrSpn.ll
deleted file mode 100644
index 800c19088337..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrSpn.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@abcba = constant [6 x i8] c"abcba\00"
-@abc = constant [4 x i8] c"abc\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i64 @strspn(i8*, i8*)
-
-define i64 @testspn(i8* %s1, i8* %s2) {
- %abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0
- %abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %test1 = call i64 @strspn(i8* %s1, i8* %null_p)
- %test2 = call i64 @strspn(i8* %null_p, i8* %s2)
- %test3 = call i64 @strspn(i8* %abcba_p, i8* %abc_p)
-; CHECK-NOT: call i64 @strspn
- %test4 = call i64 @strspn(i8* %s1, i8* %s2)
-; CHECK: call i64 @strspn(i8* %s1, i8* %s2)
- ret i64 %test3
-; CHECK: ret i64 5
-}
-
-declare i64 @strcspn(i8*, i8*)
-
-define i64 @testcspn(i8* %s1, i8* %s2) {
- %abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0
- %abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %test1 = call i64 @strcspn(i8* %s1, i8* %null_p)
-; CHECK: call i64 @strlen(i8* %s1)
- %test2 = call i64 @strcspn(i8* %null_p, i8* %s2)
- %test3 = call i64 @strcspn(i8* %abcba_p, i8* %abc_p)
-; CHECK-NOT: call i64 @strcspn
- %test4 = call i64 @strcspn(i8* %s1, i8* %s2)
-; CHECK: call i64 @strcspn(i8* %s1, i8* %s2)
- %add0 = add i64 %test1, %test3
-; CHECK: add i64 %{{.+}}, 0
- ret i64 %add0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrStr.ll b/test/Transforms/SimplifyLibCalls/StrStr.ll
deleted file mode 100644
index eefd2e8006ab..000000000000
--- a/test/Transforms/SimplifyLibCalls/StrStr.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-; PR5783
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-target triple = "i386-apple-darwin9.0"
-
-@.str = private constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-@.str1 = private constant [2 x i8] c"a\00" ; <[2 x i8]*> [#uses=1]
-@.str2 = private constant [6 x i8] c"abcde\00" ; <[6 x i8]*> [#uses=1]
-@.str3 = private constant [4 x i8] c"bcd\00" ; <[4 x i8]*> [#uses=1]
-
-define i8* @test1(i8* %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr(P, "") -> P
-; CHECK: @test1
-; CHECK: ret i8* %P
-}
-
-declare i8* @strstr(i8*, i8* nocapture) nounwind readonly
-
-define i8* @test2(i8* %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([2 x i8]* @.str1, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr(P, "a") -> strchr(P, 'a')
-; CHECK: @test2
-; CHECK: @strchr(i8* %P, i32 97)
-}
-
-define i8* @test3(i8* nocapture %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* getelementptr inbounds ([6 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr("abcde", "bcd") -> "abcde"+1
-; CHECK: @test3
-; CHECK: getelementptr inbounds ([6 x i8]* @.str2, i32 0, i64 1)
-}
-
-define i8* @test4(i8* %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* %P) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr(P, P) -> P
-; CHECK: @test4
-; CHECK: ret i8* %P
-}
-
-define i1 @test5(i8* %P, i8* %Q) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* %Q) nounwind ; <i8*> [#uses=1]
- %cmp = icmp eq i8* %call, %P
- ret i1 %cmp
-; CHECK: @test5
-; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %Q)
-; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %P, i8* %Q, {{i[0-9]+}} [[LEN]])
-; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0
-; CHECK: ret i1
-}
diff --git a/test/Transforms/SimplifyLibCalls/double-float-shrink.ll b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll
new file mode 100644
index 000000000000..b4ab8b4ceb9d
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/double-float-shrink.ll
@@ -0,0 +1,333 @@
+; RUN: opt < %s -simplify-libcalls -enable-double-float-shrink -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
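+; Each function pair below checks that the double libcall is shrunk to its
+; float counterpart only when the result is immediately truncated back to
+; float; when the double result is used directly, the call is not changed.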
+define float @acos_test(float %f) nounwind readnone {
+; CHECK: acos_test
+ %conv = fpext float %f to double
+ %call = call double @acos(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @acosf(float %f)
+}
+
+define double @acos_test2(float %f) nounwind readnone {
+; CHECK: acos_test2
+ %conv = fpext float %f to double
+ %call = call double @acos(double %conv)
+ ret double %call
+; CHECK: call double @acos(double %conv)
+}
+
+define float @acosh_test(float %f) nounwind readnone {
+; CHECK: acosh_test
+ %conv = fpext float %f to double
+ %call = call double @acosh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @acoshf(float %f)
+}
+
+define double @acosh_test2(float %f) nounwind readnone {
+; CHECK: acosh_test2
+ %conv = fpext float %f to double
+ %call = call double @acosh(double %conv)
+ ret double %call
+; CHECK: call double @acosh(double %conv)
+}
+
+define float @asin_test(float %f) nounwind readnone {
+; CHECK: asin_test
+ %conv = fpext float %f to double
+ %call = call double @asin(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @asinf(float %f)
+}
+
+define double @asin_test2(float %f) nounwind readnone {
+; CHECK: asin_test2
+ %conv = fpext float %f to double
+ %call = call double @asin(double %conv)
+ ret double %call
+; CHECK: call double @asin(double %conv)
+}
+
+define float @asinh_test(float %f) nounwind readnone {
+; CHECK: asinh_test
+ %conv = fpext float %f to double
+ %call = call double @asinh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @asinhf(float %f)
+}
+
+define double @asinh_test2(float %f) nounwind readnone {
+; CHECK: asinh_test2
+ %conv = fpext float %f to double
+ %call = call double @asinh(double %conv)
+ ret double %call
+; CHECK: call double @asinh(double %conv)
+}
+
+define float @atan_test(float %f) nounwind readnone {
+; CHECK: atan_test
+ %conv = fpext float %f to double
+ %call = call double @atan(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @atanf(float %f)
+}
+
+define double @atan_test2(float %f) nounwind readnone {
+; CHECK: atan_test2
+ %conv = fpext float %f to double
+ %call = call double @atan(double %conv)
+ ret double %call
+; CHECK: call double @atan(double %conv)
+}
+define float @atanh_test(float %f) nounwind readnone {
+; CHECK: atanh_test
+ %conv = fpext float %f to double
+ %call = call double @atanh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @atanhf(float %f)
+}
+
+define double @atanh_test2(float %f) nounwind readnone {
+; CHECK: atanh_test2
+ %conv = fpext float %f to double
+ %call = call double @atanh(double %conv)
+ ret double %call
+; CHECK: call double @atanh(double %conv)
+}
+define float @cbrt_test(float %f) nounwind readnone {
+; CHECK: cbrt_test
+ %conv = fpext float %f to double
+ %call = call double @cbrt(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @cbrtf(float %f)
+}
+
+define double @cbrt_test2(float %f) nounwind readnone {
+; CHECK: cbrt_test2
+ %conv = fpext float %f to double
+ %call = call double @cbrt(double %conv)
+ ret double %call
+; CHECK: call double @cbrt(double %conv)
+}
+define float @exp_test(float %f) nounwind readnone {
+; CHECK: exp_test
+ %conv = fpext float %f to double
+ %call = call double @exp(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @expf(float %f)
+}
+
+define double @exp_test2(float %f) nounwind readnone {
+; CHECK: exp_test2
+ %conv = fpext float %f to double
+ %call = call double @exp(double %conv)
+ ret double %call
+; CHECK: call double @exp(double %conv)
+}
+define float @expm1_test(float %f) nounwind readnone {
+; CHECK: expm1_test
+ %conv = fpext float %f to double
+ %call = call double @expm1(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @expm1f(float %f)
+}
+
+define double @expm1_test2(float %f) nounwind readnone {
+; CHECK: expm1_test2
+ %conv = fpext float %f to double
+ %call = call double @expm1(double %conv)
+ ret double %call
+; CHECK: call double @expm1(double %conv)
+}
+define float @exp10_test(float %f) nounwind readnone {
+; CHECK: exp10_test
+ %conv = fpext float %f to double
+ %call = call double @exp10(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @exp10f(float %f)
+}
+
+define double @exp10_test2(float %f) nounwind readnone {
+; CHECK: exp10_test2
+ %conv = fpext float %f to double
+ %call = call double @exp10(double %conv)
+ ret double %call
+; CHECK: call double @exp10(double %conv)
+}
+define float @log_test(float %f) nounwind readnone {
+; CHECK: log_test
+ %conv = fpext float %f to double
+ %call = call double @log(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @logf(float %f)
+}
+
+define double @log_test2(float %f) nounwind readnone {
+; CHECK: log_test2
+ %conv = fpext float %f to double
+ %call = call double @log(double %conv)
+ ret double %call
+; CHECK: call double @log(double %conv)
+}
+define float @log10_test(float %f) nounwind readnone {
+; CHECK: log10_test
+ %conv = fpext float %f to double
+ %call = call double @log10(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @log10f(float %f)
+}
+
+define double @log10_test2(float %f) nounwind readnone {
+; CHECK: log10_test2
+ %conv = fpext float %f to double
+ %call = call double @log10(double %conv)
+ ret double %call
+; CHECK: call double @log10(double %conv)
+}
+define float @log1p_test(float %f) nounwind readnone {
+; CHECK: log1p_test
+ %conv = fpext float %f to double
+ %call = call double @log1p(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @log1pf(float %f)
+}
+
+define double @log1p_test2(float %f) nounwind readnone {
+; CHECK: log1p_test2
+ %conv = fpext float %f to double
+ %call = call double @log1p(double %conv)
+ ret double %call
+; CHECK: call double @log1p(double %conv)
+}
+define float @log2_test(float %f) nounwind readnone {
+; CHECK: log2_test
+ %conv = fpext float %f to double
+ %call = call double @log2(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @log2f(float %f)
+}
+
+define double @log2_test2(float %f) nounwind readnone {
+; CHECK: log2_test2
+ %conv = fpext float %f to double
+ %call = call double @log2(double %conv)
+ ret double %call
+; CHECK: call double @log2(double %conv)
+}
+define float @logb_test(float %f) nounwind readnone {
+; CHECK: logb_test
+ %conv = fpext float %f to double
+ %call = call double @logb(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @logbf(float %f)
+}
+
+define double @logb_test2(float %f) nounwind readnone {
+; CHECK: logb_test2
+ %conv = fpext float %f to double
+ %call = call double @logb(double %conv)
+ ret double %call
+; CHECK: call double @logb(double %conv)
+}
+define float @sin_test(float %f) nounwind readnone {
+; CHECK: sin_test
+ %conv = fpext float %f to double
+ %call = call double @sin(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @sinf(float %f)
+}
+
+define double @sin_test2(float %f) nounwind readnone {
+; CHECK: sin_test2
+ %conv = fpext float %f to double
+ %call = call double @sin(double %conv)
+ ret double %call
+; CHECK: call double @sin(double %conv)
+}
+define float @sqrt_test(float %f) nounwind readnone {
+; CHECK: sqrt_test
+ %conv = fpext float %f to double
+ %call = call double @sqrt(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @sqrtf(float %f)
+}
+
+define double @sqrt_test2(float %f) nounwind readnone {
+; CHECK: sqrt_test2
+ %conv = fpext float %f to double
+ %call = call double @sqrt(double %conv)
+ ret double %call
+; CHECK: call double @sqrt(double %conv)
+}
+define float @tan_test(float %f) nounwind readnone {
+; CHECK: tan_test
+ %conv = fpext float %f to double
+ %call = call double @tan(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @tanf(float %f)
+}
+
+define double @tan_test2(float %f) nounwind readnone {
+; CHECK: tan_test2
+ %conv = fpext float %f to double
+ %call = call double @tan(double %conv)
+ ret double %call
+; CHECK: call double @tan(double %conv)
+}
+define float @tanh_test(float %f) nounwind readnone {
+; CHECK: tanh_test
+ %conv = fpext float %f to double
+ %call = call double @tanh(double %conv)
+ %conv1 = fptrunc double %call to float
+ ret float %conv1
+; CHECK: call float @tanhf(float %f)
+}
+
+define double @tanh_test2(float %f) nounwind readnone {
+; CHECK: tanh_test2
+ %conv = fpext float %f to double
+ %call = call double @tanh(double %conv)
+ ret double %call
+; CHECK: call double @tanh(double %conv)
+}
+
+declare double @tanh(double) nounwind readnone
+declare double @tan(double) nounwind readnone
+declare double @sqrt(double) nounwind readnone
+declare double @sin(double) nounwind readnone
+declare double @log2(double) nounwind readnone
+declare double @log1p(double) nounwind readnone
+declare double @log10(double) nounwind readnone
+declare double @log(double) nounwind readnone
+declare double @logb(double) nounwind readnone
+declare double @exp10(double) nounwind readnone
+declare double @expm1(double) nounwind readnone
+declare double @exp(double) nounwind readnone
+declare double @cbrt(double) nounwind readnone
+declare double @atanh(double) nounwind readnone
+declare double @atan(double) nounwind readnone
+declare double @acos(double) nounwind readnone
+declare double @acosh(double) nounwind readnone
+declare double @asin(double) nounwind readnone
+declare double @asinh(double) nounwind readnone
diff --git a/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
new file mode 100644
index 000000000000..aecb887beb3a
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/float-shrink-compare.ll
@@ -0,0 +1,179 @@
+; RUN: opt -S -simplify-libcalls -instcombine %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
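+; Each test checks that a double libcall on an fpext'ed float, compared
+; against another fpext'ed float, is shrunk to the corresponding float
+; libcall plus a float compare.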
+define i32 @test1(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @ceil(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test1
+; CHECK-NEXT: %ceilf = call float @ceilf(float %x)
+; CHECK-NEXT: fcmp oeq float %ceilf, %y
+}
+
+define i32 @test2(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @fabs(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test2
+; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
+; CHECK-NEXT: fcmp oeq float %fabsf, %y
+}
+
+define i32 @test3(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @floor(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test3
+; CHECK-NEXT: %floorf = call float @floorf(float %x)
+; CHECK-NEXT: fcmp oeq float %floorf, %y
+}
+
+define i32 @test4(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @nearbyint(double %1) nounwind
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test4
+; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x)
+; CHECK-NEXT: fcmp oeq float %nearbyintf, %y
+}
+
+define i32 @test5(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @rint(double %1) nounwind
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test5
+; CHECK-NEXT: %rintf = call float @rintf(float %x)
+; CHECK-NEXT: fcmp oeq float %rintf, %y
+}
+
+define i32 @test6(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @round(double %1) nounwind readnone
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test6
+; CHECK-NEXT: %roundf = call float @roundf(float %x)
+; CHECK-NEXT: fcmp oeq float %roundf, %y
+}
+
+define i32 @test7(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %x to double
+ %2 = call double @trunc(double %1) nounwind
+ %3 = fpext float %y to double
+ %4 = fcmp oeq double %2, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test7
+; CHECK-NEXT: %truncf = call float @truncf(float %x)
+; CHECK-NEXT: fcmp oeq float %truncf, %y
+}
+
+define i32 @test8(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @ceil(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test8
+; CHECK-NEXT: %ceilf = call float @ceilf(float %x)
+; CHECK-NEXT: fcmp oeq float %ceilf, %y
+}
+
+define i32 @test9(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @fabs(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test9
+; CHECK-NEXT: %fabsf = call float @fabsf(float %x)
+; CHECK-NEXT: fcmp oeq float %fabsf, %y
+}
+
+define i32 @test10(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @floor(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test10
+; CHECK-NEXT: %floorf = call float @floorf(float %x)
+; CHECK-NEXT: fcmp oeq float %floorf, %y
+}
+
+define i32 @test11(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @nearbyint(double %2) nounwind
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test11
+; CHECK-NEXT: %nearbyintf = call float @nearbyintf(float %x)
+; CHECK-NEXT: fcmp oeq float %nearbyintf, %y
+}
+
+define i32 @test12(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @rint(double %2) nounwind
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test12
+; CHECK-NEXT: %rintf = call float @rintf(float %x)
+; CHECK-NEXT: fcmp oeq float %rintf, %y
+}
+
+define i32 @test13(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @round(double %2) nounwind readnone
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test13
+; CHECK-NEXT: %roundf = call float @roundf(float %x)
+; CHECK-NEXT: fcmp oeq float %roundf, %y
+}
+
+define i32 @test14(float %x, float %y) nounwind uwtable {
+ %1 = fpext float %y to double
+ %2 = fpext float %x to double
+ %3 = call double @trunc(double %2) nounwind
+ %4 = fcmp oeq double %1, %3
+ %5 = zext i1 %4 to i32
+ ret i32 %5
+; CHECK: @test14
+; CHECK-NEXT: %truncf = call float @truncf(float %x)
+; CHECK-NEXT: fcmp oeq float %truncf, %y
+}
+
+declare double @fabs(double) nounwind readnone
+declare double @ceil(double) nounwind readnone
+declare double @floor(double) nounwind readnone
+declare double @nearbyint(double) nounwind readnone
+declare double @rint(double) nounwind readnone
+declare double @round(double) nounwind readnone
+declare double @trunc(double) nounwind readnone
diff --git a/test/Transforms/SimplifyLibCalls/floor.ll b/test/Transforms/SimplifyLibCalls/floor.ll
index 03dcdf585f9a..93c62c20023d 100644
--- a/test/Transforms/SimplifyLibCalls/floor.ll
+++ b/test/Transforms/SimplifyLibCalls/floor.ll
@@ -9,6 +9,8 @@
; DO-SIMPLIFY: call float @ceilf(
; DO-SIMPLIFY: call float @roundf(
; DO-SIMPLIFY: call float @nearbyintf(
+; DO-SIMPLIFY: call float @truncf(
+; DO-SIMPLIFY: call float @fabsf(
; C89-SIMPLIFY: call float @floorf(
; C89-SIMPLIFY: call float @ceilf(
@@ -19,6 +21,8 @@
; DONT-SIMPLIFY: call double @ceil(
; DONT-SIMPLIFY: call double @round(
; DONT-SIMPLIFY: call double @nearbyint(
+; DONT-SIMPLIFY: call double @trunc(
+; DONT-SIMPLIFY: call double @fabs(
declare double @floor(double)
@@ -28,6 +32,10 @@ declare double @round(double)
declare double @nearbyint(double)
+declare double @trunc(double)
+
+declare double @fabs(double)
+
define float @test_floor(float %C) {
%D = fpext float %C to double ; <double> [#uses=1]
; --> floorf
@@ -60,3 +68,18 @@ define float @test_nearbyint(float %C) {
ret float %F
}
+define float @test_trunc(float %C) {
+ %D = fpext float %C to double
+ ; --> truncf
+ %E = call double @trunc(double %D)
+ %F = fptrunc double %E to float
+ ret float %F
+}
+
+define float @test_fabs(float %C) {
+ %D = fpext float %C to double
+ ; --> fabsf
+ %E = call double @fabs(double %D)
+ %F = fptrunc double %E to float
+ ret float %F
+}
diff --git a/test/Transforms/SimplifyLibCalls/memcmp.ll b/test/Transforms/SimplifyLibCalls/memcmp.ll
deleted file mode 100644
index 6ca4dc97a194..000000000000
--- a/test/Transforms/SimplifyLibCalls/memcmp.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; Test that the memcmpOptimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-@h = constant [2 x i8] c"h\00" ; <[2 x i8]*> [#uses=0]
-@hel = constant [4 x i8] c"hel\00" ; <[4 x i8]*> [#uses=0]
-@hello_u = constant [8 x i8] c"hello_u\00" ; <[8 x i8]*> [#uses=0]
-
-declare i32 @memcmp(i8*, i8*, i32)
-
-define void @test(i8* %P, i8* %Q, i32 %N, i32* %IP, i1* %BP) {
- %A = call i32 @memcmp( i8* %P, i8* %P, i32 %N ) ; <i32> [#uses=1]
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
- store volatile i32 %A, i32* %IP
- %B = call i32 @memcmp( i8* %P, i8* %Q, i32 0 ) ; <i32> [#uses=1]
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
- store volatile i32 %B, i32* %IP
- %C = call i32 @memcmp( i8* %P, i8* %Q, i32 1 ) ; <i32> [#uses=1]
-; CHECK: load
-; CHECK: zext
-; CHECK: load
-; CHECK: zext
-; CHECK: sub
-; CHECK: store volatile
- store volatile i32 %C, i32* %IP
- %F = call i32 @memcmp(i8* getelementptr ([4 x i8]* @hel, i32 0, i32 0),
- i8* getelementptr ([8 x i8]* @hello_u, i32 0, i32 0),
- i32 3)
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
- store volatile i32 %F, i32* %IP
- ret void
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/memmove.ll b/test/Transforms/SimplifyLibCalls/memmove.ll
deleted file mode 100644
index 5aaeeeb024f7..000000000000
--- a/test/Transforms/SimplifyLibCalls/memmove.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memmove"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-
-define i8* @test(i8* %a, i8* %b, i32 %x) {
-entry:
- %call = call i8* @memmove(i8* %a, i8* %b, i32 %x )
- ret i8* %call
-}
-
-declare i8* @memmove(i8*,i8*,i32)
-
diff --git a/test/Transforms/SimplifyLibCalls/memset-64.ll b/test/Transforms/SimplifyLibCalls/memset-64.ll
deleted file mode 100644
index 92412dee71ad..000000000000
--- a/test/Transforms/SimplifyLibCalls/memset-64.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-pc-linux-gnu"
-
-define void @a(i8* %x) nounwind {
-entry:
- %call = call i8* @memset(i8* %x, i32 1, i64 100) ; <i8*> [#uses=0]
- ret void
-}
-
-declare i8* @memset(i8*, i32, i64)
-
diff --git a/test/Transforms/SimplifyLibCalls/memset.ll b/test/Transforms/SimplifyLibCalls/memset.ll
deleted file mode 100644
index 853215a4d24c..000000000000
--- a/test/Transforms/SimplifyLibCalls/memset.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-
-define i8* @test(i8* %a, i32 %b, i32 %x) {
-entry:
- %call = call i8* @memset(i8* %a, i32 %b, i32 %x )
- ret i8* %call
-}
-
-declare i8* @memset(i8*,i32,i32)
-
diff --git a/test/Transforms/SimplifyLibCalls/weak-symbols.ll b/test/Transforms/SimplifyLibCalls/weak-symbols.ll
deleted file mode 100644
index 5875b211f776..000000000000
--- a/test/Transforms/SimplifyLibCalls/weak-symbols.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-; PR4738
-
-; SimplifyLibcalls shouldn't assume anything about weak symbols.
-
-@real_init = weak_odr constant [2 x i8] c"y\00"
-@fake_init = weak constant [2 x i8] c"y\00"
-@.str = private constant [2 x i8] c"y\00"
-
-; CHECK: define i32 @foo
-; CHECK: call i32 @strcmp
-define i32 @foo() nounwind {
-entry:
- %t0 = call i32 @strcmp(i8* getelementptr inbounds ([2 x i8]* @fake_init, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) nounwind readonly
- ret i32 %t0
-}
-
-; CHECK: define i32 @bar
-; CHECK: ret i32 0
-define i32 @bar() nounwind {
-entry:
- %t0 = call i32 @strcmp(i8* getelementptr inbounds ([2 x i8]* @real_init, i64 0, i64 0), i8* getelementptr inbounds ([2 x i8]* @.str, i64 0, i64 0)) nounwind readonly
- ret i32 %t0
-}
-
-declare i32 @strcmp(i8*, i8*) nounwind readonly
diff --git a/test/Verifier/invoke.ll b/test/Verifier/invoke.ll
index a48f9b60feb4..c2750bb121f2 100644
--- a/test/Verifier/invoke.ll
+++ b/test/Verifier/invoke.ll
@@ -19,7 +19,6 @@ L2: ; preds = %0
br label %L
L: ; preds = %L2, %L1, %L1
; CHECK: The unwind destination does not have a landingpad instruction
-; CHECK: Instruction does not dominate all uses
ret i32 %A
}
diff --git a/test/lit.cfg b/test/lit.cfg
index 6f44bb3d8c66..79eaa23c8ba9 100644
--- a/test/lit.cfg
+++ b/test/lit.cfg
@@ -5,6 +5,7 @@
import os
import sys
import re
+import platform
# name: The name of this test suite.
config.name = 'LLVM'
@@ -139,9 +140,22 @@ if config.test_exec_root is None:
###
-# When running under valgrind, we mangle '-vg' or '-vg_leak' onto the end of the
-# triple so we can check it with XFAIL and XTARGET.
-config.target_triple += lit.valgrindTriple
+# Provide a target triple for mcjit tests
+mcjit_triple = config.target_triple
+# Force ELF format on Windows
+if re.search(r'cygwin|mingw32|win32', mcjit_triple):
+ mcjit_triple += "-elf"
+config.substitutions.append( ('%mcjit_triple', mcjit_triple) )
+
+# Provide a substitution for those tests that need to run the jit to obtain data
+# but simply want to use the currently most reliable jit for the platform.
+# FIXME: ppc32 is not ready for mcjit.
+if 'arm' in config.target_triple \
+ or 'powerpc64' in config.target_triple:
+ defaultIsMCJIT = 'true'
+else:
+ defaultIsMCJIT = 'false'
+config.substitutions.append( ('%defaultjit', '-use-mcjit='+defaultIsMCJIT) )
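+# Tests reference this as %defaultjit in their RUN lines, for example:
+#   RUN: lli %defaultjit %s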
# Process jit implementation option
jit_impl_cfg = lit.params.get('jit_impl', None)
@@ -230,6 +244,10 @@ else:
if loadable_module:
config.available_features.add('loadable_module')
+# LTO on OS X
+if config.lto_is_enabled == "1" and platform.system() == "Darwin":
+ config.available_features.add('lto_on_osx')
+
# llc knows whether it is compiled with -DNDEBUG.
import subprocess
try:
diff --git a/test/lit.site.cfg.in b/test/lit.site.cfg.in
index 178b22f10f33..2bbe63e6348e 100644
--- a/test/lit.site.cfg.in
+++ b/test/lit.site.cfg.in
@@ -11,6 +11,7 @@ config.python_executable = "@PYTHON_EXECUTABLE@"
config.ocamlopt_executable = "@OCAMLOPT@"
config.enable_shared = @ENABLE_SHARED@
config.enable_assertions = @ENABLE_ASSERTIONS@
+config.lto_is_enabled = "@LTO_IS_ENABLED@"
config.targets_to_build = "@TARGETS_TO_BUILD@"
config.llvm_bindings = "@LLVM_BINDINGS@"
config.host_os = "@HOST_OS@"
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 1bfc2fe3e868..691828562203 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -36,6 +36,7 @@ add_subdirectory(bugpoint)
add_subdirectory(bugpoint-passes)
add_subdirectory(llvm-bcanalyzer)
add_subdirectory(llvm-stress)
+add_subdirectory(llvm-mcmarkup)
if( NOT WIN32 )
add_subdirectory(lto)
diff --git a/tools/LLVMBuild.txt b/tools/LLVMBuild.txt
index df4aa9ff4e4d..64164792a77f 100644
--- a/tools/LLVMBuild.txt
+++ b/tools/LLVMBuild.txt
@@ -16,7 +16,7 @@
;===------------------------------------------------------------------------===;
[common]
-subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-link llvm-mc llvm-nm llvm-objdump llvm-prof llvm-ranlib llvm-rtdyld llvm-size macho-dump opt
+subdirectories = bugpoint llc lli llvm-ar llvm-as llvm-bcanalyzer llvm-cov llvm-diff llvm-dis llvm-dwarfdump llvm-extract llvm-link llvm-mc llvm-nm llvm-objdump llvm-prof llvm-ranlib llvm-rtdyld llvm-size macho-dump opt llvm-mcmarkup
[component_0]
type = Group
diff --git a/tools/Makefile b/tools/Makefile
index 2b4b9b7878be..a29e49f0a1f8 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -34,7 +34,7 @@ PARALLEL_DIRS := opt llvm-as llvm-dis \
bugpoint llvm-bcanalyzer \
llvm-diff macho-dump llvm-objdump llvm-readobj \
llvm-rtdyld llvm-dwarfdump llvm-cov \
- llvm-size llvm-stress
+ llvm-size llvm-stress llvm-mcmarkup
# Let users override the set of tools to build from the command line.
ifdef ONLY_TOOLS
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index 888d2c8e9262..b40b4f10db99 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -14,6 +14,7 @@
#include "BugDriver.h"
#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
@@ -25,7 +26,6 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Target/TargetData.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileUtilities.h"
diff --git a/tools/bugpoint/OptimizerDriver.cpp b/tools/bugpoint/OptimizerDriver.cpp
index fb090ee17697..c56911a32e85 100644
--- a/tools/bugpoint/OptimizerDriver.cpp
+++ b/tools/bugpoint/OptimizerDriver.cpp
@@ -16,11 +16,11 @@
//===----------------------------------------------------------------------===//
#include "BugDriver.h"
+#include "llvm/DataLayout.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Bitcode/ReaderWriter.h"
-#include "llvm/Target/TargetData.h"
#include "llvm/Support/FileUtilities.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SystemUtils.h"
diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 02f66d73eedc..496e31cc391a 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile
@@ -24,6 +24,8 @@ include $(LEVEL)/Makefile.config
# Because off_t is used in the public API, the largefile parts are required for
# ABI compatibility.
CXXFLAGS += -I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
-CXXFLAGS += -L$(SharedLibDir)/$(SharedPrefix) -lLTO
+LDFLAGS += -L$(SharedLibDir)/$(SharedPrefix)
include $(LEVEL)/Makefile.common
+
+LIBS += -lLTO
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index 9c17da6a4cb6..b0a0dd2a4057 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -378,9 +378,6 @@ static ld_plugin_status all_symbols_read_hook(void) {
}
}
- // If we don't preserve any symbols, libLTO will assume that all symbols are
- // needed. Keep all symbols unless we're producing a final executable.
- bool anySymbolsPreserved = false;
for (std::list<claimed_file>::iterator I = Modules.begin(),
E = Modules.end(); I != E; ++I) {
if (I->syms.empty())
@@ -389,7 +386,6 @@ static ld_plugin_status all_symbols_read_hook(void) {
for (unsigned i = 0, e = I->syms.size(); i != e; i++) {
if (I->syms[i].resolution == LDPR_PREVAILING_DEF) {
lto_codegen_add_must_preserve_symbol(code_gen, I->syms[i].name);
- anySymbolsPreserved = true;
if (options::generate_api_file)
api_file << I->syms[i].name << "\n";
@@ -400,12 +396,6 @@ static ld_plugin_status all_symbols_read_hook(void) {
if (options::generate_api_file)
api_file.close();
- if (!anySymbolsPreserved) {
- // All of the IL is unnecessary!
- lto_codegen_dispose(code_gen);
- return LDPS_OK;
- }
-
lto_codegen_set_pic_model(code_gen, output_type);
lto_codegen_set_debug_model(code_gen, LTO_DEBUG_MODEL_DWARF);
if (!options::mcpu.empty())
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 8951050c07cd..4d4a74c009e0 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -14,12 +14,14 @@
//===----------------------------------------------------------------------===//
#include "llvm/LLVMContext.h"
+#include "llvm/DataLayout.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Pass.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/Support/IRReader.h"
+#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/CodeGen/LinkAllAsmWriterComponents.h"
#include "llvm/CodeGen/LinkAllCodegenComponents.h"
#include "llvm/MC/SubtargetFeature.h"
@@ -34,7 +36,6 @@
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetMachine.h"
#include <memory>
@@ -62,211 +63,13 @@ OptLevel("O",
static cl::opt<std::string>
TargetTriple("mtriple", cl::desc("Override target triple for module"));
-static cl::opt<std::string>
-MArch("march", cl::desc("Architecture to generate code for (see --version)"));
-
-static cl::opt<std::string>
-MCPU("mcpu",
- cl::desc("Target a specific cpu type (-mcpu=help for details)"),
- cl::value_desc("cpu-name"),
- cl::init(""));
-
-static cl::list<std::string>
-MAttrs("mattr",
- cl::CommaSeparated,
- cl::desc("Target specific attributes (-mattr=help for details)"),
- cl::value_desc("a1,+a2,-a3,..."));
-
-static cl::opt<Reloc::Model>
-RelocModel("relocation-model",
- cl::desc("Choose relocation model"),
- cl::init(Reloc::Default),
- cl::values(
- clEnumValN(Reloc::Default, "default",
- "Target default relocation model"),
- clEnumValN(Reloc::Static, "static",
- "Non-relocatable code"),
- clEnumValN(Reloc::PIC_, "pic",
- "Fully relocatable, position independent code"),
- clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
- "Relocatable external references, non-relocatable code"),
- clEnumValEnd));
-
-static cl::opt<llvm::CodeModel::Model>
-CMModel("code-model",
- cl::desc("Choose code model"),
- cl::init(CodeModel::Default),
- cl::values(clEnumValN(CodeModel::Default, "default",
- "Target default code model"),
- clEnumValN(CodeModel::Small, "small",
- "Small code model"),
- clEnumValN(CodeModel::Kernel, "kernel",
- "Kernel code model"),
- clEnumValN(CodeModel::Medium, "medium",
- "Medium code model"),
- clEnumValN(CodeModel::Large, "large",
- "Large code model"),
- clEnumValEnd));
-
-static cl::opt<bool>
-RelaxAll("mc-relax-all",
- cl::desc("When used with filetype=obj, "
- "relax all fixups in the emitted object file"));
-
-cl::opt<TargetMachine::CodeGenFileType>
-FileType("filetype", cl::init(TargetMachine::CGFT_AssemblyFile),
- cl::desc("Choose a file type (not all types are supported by all targets):"),
- cl::values(
- clEnumValN(TargetMachine::CGFT_AssemblyFile, "asm",
- "Emit an assembly ('.s') file"),
- clEnumValN(TargetMachine::CGFT_ObjectFile, "obj",
- "Emit a native object ('.o') file"),
- clEnumValN(TargetMachine::CGFT_Null, "null",
- "Emit nothing, for performance testing"),
- clEnumValEnd));
-
cl::opt<bool> NoVerify("disable-verify", cl::Hidden,
cl::desc("Do not verify input module"));
-cl::opt<bool> DisableDotLoc("disable-dot-loc", cl::Hidden,
- cl::desc("Do not use .loc entries"));
-
-cl::opt<bool> DisableCFI("disable-cfi", cl::Hidden,
- cl::desc("Do not use .cfi_* directives"));
-
-cl::opt<bool> EnableDwarfDirectory("enable-dwarf-directory", cl::Hidden,
- cl::desc("Use .file directives with an explicit directory."));
-
-static cl::opt<bool>
-DisableRedZone("disable-red-zone",
- cl::desc("Do not emit code that uses the red zone."),
- cl::init(false));
-
-static cl::opt<bool>
-EnableFPMAD("enable-fp-mad",
- cl::desc("Enable less precise MAD instructions to be generated"),
- cl::init(false));
-
-static cl::opt<bool>
-DisableFPElim("disable-fp-elim",
- cl::desc("Disable frame pointer elimination optimization"),
- cl::init(false));
-
-static cl::opt<bool>
-DisableFPElimNonLeaf("disable-non-leaf-fp-elim",
- cl::desc("Disable frame pointer elimination optimization for non-leaf funcs"),
- cl::init(false));
-
-static cl::opt<bool>
-EnableUnsafeFPMath("enable-unsafe-fp-math",
- cl::desc("Enable optimizations that may decrease FP precision"),
- cl::init(false));
-
-static cl::opt<bool>
-EnableNoInfsFPMath("enable-no-infs-fp-math",
- cl::desc("Enable FP math optimizations that assume no +-Infs"),
- cl::init(false));
-
-static cl::opt<bool>
-EnableNoNaNsFPMath("enable-no-nans-fp-math",
- cl::desc("Enable FP math optimizations that assume no NaNs"),
- cl::init(false));
-
-static cl::opt<bool>
-EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math",
- cl::Hidden,
- cl::desc("Force codegen to assume rounding mode can change dynamically"),
- cl::init(false));
-
-static cl::opt<bool>
-GenerateSoftFloatCalls("soft-float",
- cl::desc("Generate software floating point library calls"),
- cl::init(false));
-
-static cl::opt<llvm::FloatABI::ABIType>
-FloatABIForCalls("float-abi",
- cl::desc("Choose float ABI type"),
- cl::init(FloatABI::Default),
- cl::values(
- clEnumValN(FloatABI::Default, "default",
- "Target default float ABI type"),
- clEnumValN(FloatABI::Soft, "soft",
- "Soft float ABI (implied by -soft-float)"),
- clEnumValN(FloatABI::Hard, "hard",
- "Hard float ABI (uses FP registers)"),
- clEnumValEnd));
-
-static cl::opt<llvm::FPOpFusion::FPOpFusionMode>
-FuseFPOps("fp-contract",
- cl::desc("Enable aggresive formation of fused FP ops"),
- cl::init(FPOpFusion::Standard),
- cl::values(
- clEnumValN(FPOpFusion::Fast, "fast",
- "Fuse FP ops whenever profitable"),
- clEnumValN(FPOpFusion::Standard, "on",
- "Only fuse 'blessed' FP ops."),
- clEnumValN(FPOpFusion::Strict, "off",
- "Only fuse FP ops when the result won't be effected."),
- clEnumValEnd));
-
-static cl::opt<bool>
-DontPlaceZerosInBSS("nozero-initialized-in-bss",
- cl::desc("Don't place zero-initialized symbols into bss section"),
- cl::init(false));
-
-static cl::opt<bool>
+cl::opt<bool>
DisableSimplifyLibCalls("disable-simplify-libcalls",
- cl::desc("Disable simplify-libcalls"),
- cl::init(false));
-
-static cl::opt<bool>
-EnableGuaranteedTailCallOpt("tailcallopt",
- cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
- cl::init(false));
-
-static cl::opt<bool>
-DisableTailCalls("disable-tail-calls",
- cl::desc("Never emit tail calls"),
- cl::init(false));
-
-static cl::opt<unsigned>
-OverrideStackAlignment("stack-alignment",
- cl::desc("Override default stack alignment"),
- cl::init(0));
-
-static cl::opt<bool>
-EnableRealignStack("realign-stack",
- cl::desc("Realign stack if needed"),
- cl::init(true));
-
-static cl::opt<std::string>
-TrapFuncName("trap-func", cl::Hidden,
- cl::desc("Emit a call to trap function rather than a trap instruction"),
- cl::init(""));
-
-static cl::opt<bool>
-EnablePIE("enable-pie",
- cl::desc("Assume the creation of a position independent executable."),
- cl::init(false));
-
-static cl::opt<bool>
-SegmentedStacks("segmented-stacks",
- cl::desc("Use segmented stacks if possible."),
- cl::init(false));
-
-static cl::opt<bool>
-UseInitArray("use-init-array",
- cl::desc("Use .init_array instead of .ctors."),
- cl::init(false));
-
-static cl::opt<std::string> StopAfter("stop-after",
- cl::desc("Stop compilation after a specific pass"),
- cl::value_desc("pass-name"),
- cl::init(""));
-static cl::opt<std::string> StartAfter("start-after",
- cl::desc("Resume compilation after a specific pass"),
- cl::value_desc("pass-name"),
- cl::init(""));
+ cl::desc("Disable simplify-libcalls"),
+ cl::init(false));
// GetFileNameRoot - Helper function to get the basename of a filename.
static inline std::string
@@ -459,6 +262,7 @@ int main(int argc, char **argv) {
Options.PositionIndependentExecutable = EnablePIE;
Options.EnableSegmentedStacks = SegmentedStacks;
Options.UseInitArray = UseInitArray;
+ Options.SSPBufferSize = SSPBufferSize;
std::auto_ptr<TargetMachine>
target(TheTarget->createTargetMachine(TheTriple.getTriple(),
@@ -499,11 +303,16 @@ int main(int argc, char **argv) {
TLI->disableAllFunctions();
PM.add(TLI);
+ if (target.get()) {
+ PM.add(new TargetTransformInfo(target->getScalarTargetTransformInfo(),
+ target->getVectorTargetTransformInfo()));
+ }
+
// Add the target data from the target machine, if it exists, or the module.
- if (const TargetData *TD = Target.getTargetData())
- PM.add(new TargetData(*TD));
+ if (const DataLayout *TD = Target.getDataLayout())
+ PM.add(new DataLayout(*TD));
else
- PM.add(new TargetData(mod));
+ PM.add(new DataLayout(mod));
// Override default to generate verbose assembly.
Target.setAsmVerbosityDefault(true);
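The hunk above is the mechanical part of the TargetData -> DataLayout rename: llc now asks the TargetMachine for its DataLayout and only falls back to the module's own layout string when the target has none. A minimal standalone sketch of that fall-back pattern, assuming the 3.2-era headers this patch already uses (llvm/DataLayout.h, llvm/PassManager.h); illustrative only, not taken from this commit:

    #include "llvm/DataLayout.h"
    #include "llvm/Module.h"
    #include "llvm/PassManager.h"
    #include "llvm/Target/TargetMachine.h"
    using namespace llvm;

    // Prefer the target machine's layout; otherwise derive one from the module.
    static void addDataLayout(PassManager &PM, TargetMachine &TM, Module &M) {
      if (const DataLayout *DL = TM.getDataLayout())
        PM.add(new DataLayout(*DL));   // the PassManager owns the copy
      else
        PM.add(new DataLayout(&M));    // built from the module's datalayout string
    }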
diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index a5d2e61ea24c..ed479f5323dc 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt
@@ -1,7 +1,5 @@
-link_directories( ${LLVM_INTEL_JITEVENTS_LIBDIR} )
-
-set(LLVM_LINK_COMPONENTS mcjit jit interpreter nativecodegen bitreader asmparser selectiondag)
+set(LLVM_LINK_COMPONENTS mcjit jit interpreter nativecodegen bitreader asmparser selectiondag native)
if( LLVM_USE_OPROFILE )
set(LLVM_LINK_COMPONENTS
@@ -19,4 +17,6 @@ endif( LLVM_USE_INTEL_JITEVENTS )
add_llvm_tool(lli
lli.cpp
+ RecordingMemoryManager.cpp
+ RemoteTarget.cpp
)
diff --git a/tools/lli/LLVMBuild.txt b/tools/lli/LLVMBuild.txt
index 4eb82bd9e1c5..36ceb39b1270 100644
--- a/tools/lli/LLVMBuild.txt
+++ b/tools/lli/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Tool
name = lli
parent = Tools
-required_libraries = AsmParser BitReader Interpreter JIT MCJIT NativeCodeGen SelectionDAG
+required_libraries = AsmParser BitReader Interpreter JIT MCJIT NativeCodeGen SelectionDAG Native
diff --git a/tools/lli/Makefile b/tools/lli/Makefile
index 100fc2e415aa..31f3ab8a1e69 100644
--- a/tools/lli/Makefile
+++ b/tools/lli/Makefile
@@ -12,7 +12,7 @@ TOOLNAME := lli
include $(LEVEL)/Makefile.config
-LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser selectiondag
+LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser selectiondag native
# If Intel JIT Events support is configured, link against the LLVM Intel JIT
# Events interface library
diff --git a/tools/lli/RecordingMemoryManager.cpp b/tools/lli/RecordingMemoryManager.cpp
new file mode 100644
index 000000000000..9e1cff55277d
--- /dev/null
+++ b/tools/lli/RecordingMemoryManager.cpp
@@ -0,0 +1,87 @@
+//===- RecordingMemoryManager.cpp - Recording memory manager --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This memory manager allocates local storage and keeps a record of each
+// allocation. Iterators are provided for all data and code allocations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RecordingMemoryManager.h"
+using namespace llvm;
+
+uint8_t *RecordingMemoryManager::
+allocateCodeSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) {
+ // The recording memory manager is just a local copy of the remote target.
+ // The alignment requirement is just stored here for later use. Regular
+ // heap storage is sufficient here.
+ void *Addr = malloc(Size);
+ assert(Addr && "malloc() failure!");
+ sys::MemoryBlock Block(Addr, Size);
+ AllocatedCodeMem.push_back(Allocation(Block, Alignment));
+ return (uint8_t*)Addr;
+}
+
+uint8_t *RecordingMemoryManager::
+allocateDataSection(uintptr_t Size, unsigned Alignment, unsigned SectionID) {
+ // The recording memory manager is just a local copy of the remote target.
+ // The alignment requirement is just stored here for later use. Regular
+ // heap storage is sufficient here.
+ void *Addr = malloc(Size);
+ assert(Addr && "malloc() failure!");
+ sys::MemoryBlock Block(Addr, Size);
+ AllocatedDataMem.push_back(Allocation(Block, Alignment));
+ return (uint8_t*)Addr;
+}
+void RecordingMemoryManager::setMemoryWritable() { llvm_unreachable("Unexpected!"); }
+void RecordingMemoryManager::setMemoryExecutable() { llvm_unreachable("Unexpected!"); }
+void RecordingMemoryManager::setPoisonMemory(bool poison) { llvm_unreachable("Unexpected!"); }
+void RecordingMemoryManager::AllocateGOT() { llvm_unreachable("Unexpected!"); }
+uint8_t *RecordingMemoryManager::getGOTBase() const {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+uint8_t *RecordingMemoryManager::startFunctionBody(const Function *F, uintptr_t &ActualSize){
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+uint8_t *RecordingMemoryManager::allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+void RecordingMemoryManager::endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd) {
+ llvm_unreachable("Unexpected!");
+}
+uint8_t *RecordingMemoryManager::allocateSpace(intptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+uint8_t *RecordingMemoryManager::allocateGlobal(uintptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+void RecordingMemoryManager::deallocateFunctionBody(void *Body) {
+ llvm_unreachable("Unexpected!");
+}
+uint8_t* RecordingMemoryManager::startExceptionTable(const Function* F, uintptr_t &ActualSize) {
+ llvm_unreachable("Unexpected!");
+ return 0;
+}
+void RecordingMemoryManager::endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t* FrameRegister) {
+ llvm_unreachable("Unexpected!");
+}
+void RecordingMemoryManager::deallocateExceptionTable(void *ET) {
+ llvm_unreachable("Unexpected!");
+}
+void *RecordingMemoryManager::getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure) {
+ return NULL;
+}
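The point of this manager is only to record what MCJIT allocated so the sizes and alignments can be replayed against the remote target later. A short sketch of walking those records through the iterators declared in RecordingMemoryManager.h (illustrative only; the real consumer is layoutRemoteTargetMemory() in lli.cpp below):

    #include "RecordingMemoryManager.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void dumpRecordedSections(const RecordingMemoryManager &MM) {
      // Each Allocation is a (sys::MemoryBlock, required alignment) pair.
      for (RecordingMemoryManager::const_code_iterator I = MM.code_begin(),
             E = MM.code_end(); I != E; ++I)
        errs() << "code: " << I->first.size() << " bytes, align " << I->second << "\n";
      for (RecordingMemoryManager::const_data_iterator I = MM.data_begin(),
             E = MM.data_end(); I != E; ++I)
        errs() << "data: " << I->first.size() << " bytes, align " << I->second << "\n";
    }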
diff --git a/tools/lli/RecordingMemoryManager.h b/tools/lli/RecordingMemoryManager.h
new file mode 100644
index 000000000000..1590235a793c
--- /dev/null
+++ b/tools/lli/RecordingMemoryManager.h
@@ -0,0 +1,78 @@
+//===- RecordingMemoryManager.h - LLI MCJIT recording memory manager ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This memory manager allocates local storage and keeps a record of each
+// allocation. Iterators are provided for all data and code allocations.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef RECORDINGMEMORYMANAGER_H
+#define RECORDINGMEMORYMANAGER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+#include <utility>
+
+namespace llvm {
+
+class RecordingMemoryManager : public JITMemoryManager {
+public:
+ typedef std::pair<sys::MemoryBlock, unsigned> Allocation;
+
+private:
+ SmallVector<Allocation, 16> AllocatedDataMem;
+ SmallVector<Allocation, 16> AllocatedCodeMem;
+
+public:
+ RecordingMemoryManager() {}
+ virtual ~RecordingMemoryManager() {}
+
+ typedef SmallVectorImpl<Allocation>::const_iterator const_data_iterator;
+ typedef SmallVectorImpl<Allocation>::const_iterator const_code_iterator;
+
+ const_data_iterator data_begin() const { return AllocatedDataMem.begin(); }
+ const_data_iterator data_end() const { return AllocatedDataMem.end(); }
+ const_code_iterator code_begin() const { return AllocatedCodeMem.begin(); }
+ const_code_iterator code_end() const { return AllocatedCodeMem.end(); }
+
+ uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ void *getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure = true);
+ // The following obsolete JITMemoryManager calls are stubbed out for
+ // this model.
+ void setMemoryWritable();
+ void setMemoryExecutable();
+ void setPoisonMemory(bool poison);
+ void AllocateGOT();
+ uint8_t *getGOTBase() const;
+ uint8_t *startFunctionBody(const Function *F, uintptr_t &ActualSize);
+ uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment);
+ void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd);
+ uint8_t *allocateSpace(intptr_t Size, unsigned Alignment);
+ uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment);
+ void deallocateFunctionBody(void *Body);
+ uint8_t* startExceptionTable(const Function* F, uintptr_t &ActualSize);
+ void endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t* FrameRegister);
+ void deallocateExceptionTable(void *ET);
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/tools/lli/RemoteTarget.cpp b/tools/lli/RemoteTarget.cpp
new file mode 100644
index 000000000000..212bdfda1cdb
--- /dev/null
+++ b/tools/lli/RemoteTarget.cpp
@@ -0,0 +1,61 @@
+//===- RemoteTarget.cpp - LLVM Remote process JIT execution --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the RemoteTarget class which executes JITed code in a
+// separate address range from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RemoteTarget.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
+#include <stdlib.h>
+#include <string>
+using namespace llvm;
+
+bool RemoteTarget::allocateSpace(size_t Size, unsigned Alignment,
+ uint64_t &Address) {
+ sys::MemoryBlock *Prev = Allocations.size() ? &Allocations.back() : NULL;
+ sys::MemoryBlock Mem = sys::Memory::AllocateRWX(Size, Prev, &ErrorMsg);
+ if (Mem.base() == NULL)
+ return true;
+ if ((uintptr_t)Mem.base() % Alignment) {
+ ErrorMsg = "unable to allocate sufficiently aligned memory";
+ return true;
+ }
+ Address = reinterpret_cast<uint64_t>(Mem.base());
+ return false;
+}
+
+bool RemoteTarget::loadData(uint64_t Address, const void *Data, size_t Size) {
+ memcpy ((void*)Address, Data, Size);
+ return false;
+}
+
+bool RemoteTarget::loadCode(uint64_t Address, const void *Data, size_t Size) {
+ memcpy ((void*)Address, Data, Size);
+ sys::MemoryBlock Mem((void*)Address, Size);
+ sys::Memory::setExecutable(Mem, &ErrorMsg);
+ return false;
+}
+
+bool RemoteTarget::executeCode(uint64_t Address, int &RetVal) {
+ int (*fn)(void) = (int(*)(void))Address;
+ RetVal = fn();
+ return false;
+}
+
+void RemoteTarget::create() {
+}
+
+void RemoteTarget::stop() {
+ for (unsigned i = 0, e = Allocations.size(); i != e; ++i)
+ sys::Memory::ReleaseRWX(Allocations[i]);
+}
diff --git a/tools/lli/RemoteTarget.h b/tools/lli/RemoteTarget.h
new file mode 100644
index 000000000000..d05d3c6f4568
--- /dev/null
+++ b/tools/lli/RemoteTarget.h
@@ -0,0 +1,101 @@
+//===- RemoteTarget.h - LLVM Remote process JIT execution ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of the RemoteTarget class which executes JITed code in a
+// separate address range from where it was built.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef REMOTEPROCESS_H
+#define REMOTEPROCESS_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
+#include <stdlib.h>
+#include <string>
+
+namespace llvm {
+
+class RemoteTarget {
+ std::string ErrorMsg;
+ bool IsRunning;
+
+ SmallVector<sys::MemoryBlock, 16> Allocations;
+
+public:
+ StringRef getErrorMsg() const { return ErrorMsg; }
+
+ /// Allocate space in the remote target address space.
+ ///
+ /// @param Size Amount of space, in bytes, to allocate.
+ /// @param Alignment Required minimum alignment for allocated space.
+ /// @param[out] Address Remote address of the allocated memory.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool allocateSpace(size_t Size, unsigned Alignment, uint64_t &Address);
+
+ /// Load data into the target address space.
+ ///
+ /// @param Address Destination address in the target process.
+ /// @param Data Source address in the host process.
+ /// @param Size Number of bytes to copy.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool loadData(uint64_t Address, const void *Data, size_t Size);
+
+ /// Load code into the target address space and prepare it for execution.
+ ///
+ /// @param Address Destination address in the target process.
+ /// @param Data Source address in the host process.
+ /// @param Size Number of bytes to copy.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool loadCode(uint64_t Address, const void *Data, size_t Size);
+
+ /// Execute code in the target process. The called function is required
+ /// to be of signature "int (*)(void)".
+ ///
+ /// @param Address Address of the loaded function in the target
+ /// process.
+ /// @param[out] RetVal The integer return value of the called function.
+ ///
+ /// @returns False on success. On failure, ErrorMsg is updated with
+ /// descriptive text of the encountered error.
+ bool executeCode(uint64_t Address, int &RetVal);
+
+ /// Minimum alignment for memory permissions. Used to separate code and
+ /// data regions to make sure data doesn't get marked as code or vice
+ /// versa.
+ ///
+ /// @returns Page alignment return value. Default of 4k.
+ unsigned getPageAlignment() { return 4096; }
+
+ /// Start the remote process.
+ void create();
+
+ /// Terminate the remote process.
+ void stop();
+
+ RemoteTarget() : ErrorMsg(""), IsRunning(false) {}
+ ~RemoteTarget() { if (IsRunning) stop(); }
+
+private:
+ // Main processing function for the remote target process. Command messages
+ // are received on file descriptor CmdFD and responses come back on OutFD.
+ static void doRemoteTargeting(int CmdFD, int OutFD);
+};
+
+} // end namespace llvm
+
+#endif
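Taken together, the documented calls above follow the usual convention of returning false on success and leaving a message in ErrorMsg otherwise. A hedged usage sketch (standalone, not from this commit) that allocates, loads, and runs a code blob of the required "int (*)(void)" shape:

    #include "RemoteTarget.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Returns true on failure, mirroring the RemoteTarget convention.
    static bool runBlob(const void *Code, size_t Size, int &RetVal) {
      RemoteTarget Target;
      Target.create();
      uint64_t Addr;
      if (Target.allocateSpace(Size, Target.getPageAlignment(), Addr) ||
          Target.loadCode(Addr, Code, Size) ||
          Target.executeCode(Addr, RetVal)) {
        errs() << "remote target error: " << Target.getErrorMsg() << "\n";
        Target.stop();
        return true;
      }
      Target.stop();
      return false;
    }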
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index b6c9299c65b5..d41a595de857 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -13,6 +13,9 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "lli"
+#include "RecordingMemoryManager.h"
+#include "RemoteTarget.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
#include "llvm/Type.h"
@@ -32,11 +35,14 @@
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/Memory.h"
+#include "llvm/Support/MathExtras.h"
#include <cerrno>
#ifdef __linux__
@@ -73,6 +79,13 @@ namespace {
"use-mcjit", cl::desc("Enable use of the MC-based JIT (if available)"),
cl::init(false));
+ // The MCJIT supports building for a target address space separate from
+ // the JIT compilation process. Use a forked process and a copying
+ // memory manager with IPC to execute using this functionality.
+ cl::opt<bool> RemoteMCJIT("remote-mcjit",
+ cl::desc("Execute MCJIT'ed code in a separate process."),
+ cl::init(false));
+
// Determine optimization level.
cl::opt<char>
OptLevel("O",
@@ -159,6 +172,23 @@ namespace {
cl::init(false));
cl::opt<bool>
+ GenerateSoftFloatCalls("soft-float",
+ cl::desc("Generate software floating point library calls"),
+ cl::init(false));
+
+ cl::opt<llvm::FloatABI::ABIType>
+ FloatABIForCalls("float-abi",
+ cl::desc("Choose float ABI type"),
+ cl::init(FloatABI::Default),
+ cl::values(
+ clEnumValN(FloatABI::Default, "default",
+ "Target default float ABI type"),
+ clEnumValN(FloatABI::Soft, "soft",
+ "Soft float ABI (implied by -soft-float)"),
+ clEnumValN(FloatABI::Hard, "hard",
+ "Hard float ABI (uses FP registers)"),
+ clEnumValEnd));
+ cl::opt<bool>
// In debug builds, make this default to true.
#ifdef NDEBUG
#define EMIT_DEBUG false
@@ -212,7 +242,7 @@ public:
// the data cache but not to the instruction cache.
virtual void invalidateInstructionCache();
- // The MCJITMemoryManager doesn't use the following functions, so we don't
+ // The RTDyldMemoryManager doesn't use the following functions, so we don't
// need to implement them.
virtual void setMemoryWritable() {
llvm_unreachable("Unexpected call!");
@@ -274,9 +304,16 @@ uint8_t *LLIMCJITMemoryManager::allocateDataSection(uintptr_t Size,
unsigned SectionID) {
if (!Alignment)
Alignment = 16;
- uint8_t *Addr = (uint8_t*)calloc((Size + Alignment - 1)/Alignment, Alignment);
- AllocatedDataMem.push_back(sys::MemoryBlock(Addr, Size));
- return Addr;
+ // Ensure that enough memory is requested to allow aligning.
+ size_t NumElementsAligned = 1 + (Size + Alignment - 1)/Alignment;
+ uint8_t *Addr = (uint8_t*)calloc(NumElementsAligned, Alignment);
+
+ // Honour the alignment requirement.
+ uint8_t *AlignedAddr = (uint8_t*)RoundUpToAlignment((uint64_t)Addr, Alignment);
+
+ // Store the original address from calloc so we can free it later.
+ AllocatedDataMem.push_back(sys::MemoryBlock(Addr, NumElementsAligned*Alignment));
+ return AlignedAddr;
}
uint8_t *LLIMCJITMemoryManager::allocateCodeSection(uintptr_t Size,
@@ -326,6 +363,10 @@ void LLIMCJITMemoryManager::invalidateInstructionCache() {
AllocatedCodeMem[i].size());
}
+static int jit_noop() {
+ return 0;
+}
+
void *LLIMCJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
bool AbortOnFailure) {
#if defined(__linux__)
@@ -348,6 +389,14 @@ void *LLIMCJITMemoryManager::getPointerToNamedFunction(const std::string &Name,
if (Name == "mknod") return (void*)(intptr_t)&mknod;
#endif // __linux__
+ // We should not invoke parent's ctors/dtors from generated main()!
+ // On MinGW and Cygwin, the symbol __main is resolved to the
+ // callee's (e.g. tools/lli) copy, which would invoke the wrong, duplicated
+ // ctors (and register the wrong callee's dtors with atexit(3)).
+ // We expect ExecutionEngine::runStaticConstructorsDestructors()
+ // is called before ExecutionEngine::runFunctionAsMain() is called.
+ if (Name == "__main") return (void*)(intptr_t)&jit_noop;
+
const char *NameStr = Name.c_str();
void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
if (Ptr) return Ptr;
@@ -372,6 +421,83 @@ LLIMCJITMemoryManager::~LLIMCJITMemoryManager() {
free(AllocatedDataMem[i].base());
}
+
+void layoutRemoteTargetMemory(RemoteTarget *T, RecordingMemoryManager *JMM) {
+ // Lay out our sections in order, with all the code sections first, then
+ // all the data sections.
+ uint64_t CurOffset = 0;
+ unsigned MaxAlign = T->getPageAlignment();
+ SmallVector<std::pair<const void*, uint64_t>, 16> Offsets;
+ SmallVector<unsigned, 16> Sizes;
+ for (RecordingMemoryManager::const_code_iterator I = JMM->code_begin(),
+ E = JMM->code_end();
+ I != E; ++I) {
+ DEBUG(dbgs() << "code region: size " << I->first.size()
+ << ", alignment " << I->second << "\n");
+ // Align the current offset up to whatever is needed for the next
+ // section.
+ unsigned Align = I->second;
+ CurOffset = (CurOffset + Align - 1) / Align * Align;
+ // Save off the address of the new section and allocate its space.
+ Offsets.push_back(std::pair<const void*,uint64_t>(I->first.base(), CurOffset));
+ Sizes.push_back(I->first.size());
+ CurOffset += I->first.size();
+ }
+ // Adjust to keep code and data aligned on separate pages.
+ CurOffset = (CurOffset + MaxAlign - 1) / MaxAlign * MaxAlign;
+ unsigned FirstDataIndex = Offsets.size();
+ for (RecordingMemoryManager::const_data_iterator I = JMM->data_begin(),
+ E = JMM->data_end();
+ I != E; ++I) {
+ DEBUG(dbgs() << "data region: size " << I->first.size()
+ << ", alignment " << I->second << "\n");
+ // Align the current offset up to whatever is needed for the next
+ // section.
+ unsigned Align = I->second;
+ CurOffset = (CurOffset + Align - 1) / Align * Align;
+ // Save off the address of the new section and allocate its space.
+ Offsets.push_back(std::pair<const void*,uint64_t>(I->first.base(), CurOffset));
+ Sizes.push_back(I->first.size());
+ CurOffset += I->first.size();
+ }
+
+ // Allocate space in the remote target.
+ uint64_t RemoteAddr;
+ if (T->allocateSpace(CurOffset, MaxAlign, RemoteAddr))
+ report_fatal_error(T->getErrorMsg());
+ // Map the section addresses so relocations will get updated in the local
+ // copies of the sections.
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ uint64_t Addr = RemoteAddr + Offsets[i].second;
+ EE->mapSectionAddress(const_cast<void*>(Offsets[i].first), Addr);
+
+ DEBUG(dbgs() << " Mapping local: " << Offsets[i].first
+ << " to remote: " << format("%p", Addr) << "\n");
+
+ }
+
+ // Trigger application of relocations
+ EE->finalizeObject();
+
+ // Now load it all to the target.
+ for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
+ uint64_t Addr = RemoteAddr + Offsets[i].second;
+
+ if (i < FirstDataIndex) {
+ T->loadCode(Addr, Offsets[i].first, Sizes[i]);
+
+ DEBUG(dbgs() << " loading code: " << Offsets[i].first
+ << " to remote: " << format("%p", Addr) << "\n");
+ } else {
+ T->loadData(Addr, Offsets[i].first, Sizes[i]);
+
+ DEBUG(dbgs() << " loading data: " << Offsets[i].first
+ << " to remote: " << format("%p", Addr) << "\n");
+ }
+
+ }
+}
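layoutRemoteTargetMemory() rounds every section offset up to that section's alignment and inserts a page-sized gap between the code and data runs. The round-up idiom it relies on, shown in isolation with a couple of worked values (plain C++, illustrative only):

    #include <cassert>
    #include <stdint.h>

    static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
      // Same arithmetic as CurOffset = (CurOffset + Align - 1) / Align * Align.
      return (Offset + Align - 1) / Align * Align;
    }

    int main() {
      assert(alignTo(23, 16) == 32);     // next 16-byte boundary after 23
      assert(alignTo(32, 16) == 32);     // already aligned: unchanged
      assert(alignTo(1, 4096) == 4096);  // the code/data page separation case
      return 0;
    }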
+
//===----------------------------------------------------------------------===//
// main Driver function
//
@@ -386,6 +512,7 @@ int main(int argc, char **argv, char * const *envp) {
// usable by the JIT.
InitializeNativeTarget();
InitializeNativeTargetAsmPrinter();
+ InitializeNativeTargetAsmParser();
cl::ParseCommandLineOptions(argc, argv,
"llvm interpreter & dynamic compiler\n");
@@ -428,12 +555,19 @@ int main(int argc, char **argv, char * const *envp) {
Mod->setTargetTriple(Triple::normalize(TargetTriple));
// Enable MCJIT if desired.
- LLIMCJITMemoryManager *JMM = 0;
+ JITMemoryManager *JMM = 0;
if (UseMCJIT && !ForceInterpreter) {
builder.setUseMCJIT(true);
- JMM = new LLIMCJITMemoryManager();
+ if (RemoteMCJIT)
+ JMM = new RecordingMemoryManager();
+ else
+ JMM = new LLIMCJITMemoryManager();
builder.setJITMemoryManager(JMM);
} else {
+ if (RemoteMCJIT) {
+ errs() << "error: Remote process execution requires -use-mcjit\n";
+ exit(1);
+ }
builder.setJITMemoryManager(ForceInterpreter ? 0 :
JITMemoryManager::CreateDefaultMemManager());
}
@@ -452,9 +586,19 @@ int main(int argc, char **argv, char * const *envp) {
builder.setOptLevel(OLvl);
TargetOptions Options;
- Options.JITExceptionHandling = EnableJITExceptionHandling;
- Options.JITEmitDebugInfo = EmitJitDebugInfo;
- Options.JITEmitDebugInfoToDisk = EmitJitDebugInfoToDisk;
+ Options.UseSoftFloat = GenerateSoftFloatCalls;
+ if (FloatABIForCalls != FloatABI::Default)
+ Options.FloatABIType = FloatABIForCalls;
+ if (GenerateSoftFloatCalls)
+ FloatABIForCalls = FloatABI::Soft;
+
+ // Remote target execution doesn't handle EH or debug registration.
+ if (!RemoteMCJIT) {
+ Options.JITExceptionHandling = EnableJITExceptionHandling;
+ Options.JITEmitDebugInfo = EmitJitDebugInfo;
+ Options.JITEmitDebugInfoToDisk = EmitJitDebugInfoToDisk;
+ }
+
builder.setTargetOptions(Options);
EE = builder.create();
@@ -466,10 +610,6 @@ int main(int argc, char **argv, char * const *envp) {
exit(1);
}
- // Clear instruction cache before code will be executed.
- if (JMM)
- JMM->invalidateInstructionCache();
-
// The following functions have no effect if their respective profiling
// support wasn't enabled in the build configuration.
EE->RegisterJITEventListener(
@@ -477,6 +617,10 @@ int main(int argc, char **argv, char * const *envp) {
EE->RegisterJITEventListener(
JITEventListener::createIntelJITEventListener());
+ if (!NoLazyCompilation && RemoteMCJIT) {
+ errs() << "warning: remote mcjit does not support lazy compilation\n";
+ NoLazyCompilation = true;
+ }
EE->DisableLazyCompilation(NoLazyCompilation);
// If the user specifically requested an argv[0] to pass into the program,
@@ -513,8 +657,13 @@ int main(int argc, char **argv, char * const *envp) {
// Reset errno to zero on entry to main.
errno = 0;
+ // Remote target MCJIT doesn't (yet) support static constructors. No reason
+ // it couldn't. This is a limitation of the LLI implementation, not the
+ // MCJIT itself. FIXME.
+ //
// Run static constructors.
- EE->runStaticConstructorsDestructors(false);
+ if (!RemoteMCJIT)
+ EE->runStaticConstructorsDestructors(false);
if (NoLazyCompilation) {
for (Module::iterator I = Mod->begin(), E = Mod->end(); I != E; ++I) {
@@ -524,24 +673,69 @@ int main(int argc, char **argv, char * const *envp) {
}
}
- // Run main.
- int Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp);
-
- // Run static destructors.
- EE->runStaticConstructorsDestructors(true);
-
- // If the program didn't call exit explicitly, we should call it now.
- // This ensures that any atexit handlers get called correctly.
- if (Function *ExitF = dyn_cast<Function>(Exit)) {
- std::vector<GenericValue> Args;
- GenericValue ResultGV;
- ResultGV.IntVal = APInt(32, Result);
- Args.push_back(ResultGV);
- EE->runFunction(ExitF, Args);
- errs() << "ERROR: exit(" << Result << ") returned!\n";
- abort();
+ int Result;
+ if (RemoteMCJIT) {
+ RecordingMemoryManager *MM = static_cast<RecordingMemoryManager*>(JMM);
+ // Everything is prepared now, so lay out our program for the target
+ // address space, assign the section addresses to resolve any relocations,
+ // and send it to the target.
+ RemoteTarget Target;
+ Target.create();
+
+ // Ask for a pointer to the entry function. This triggers the actual
+ // compilation.
+ (void)EE->getPointerToFunction(EntryFn);
+
+ // Enough has been compiled to execute the entry function now, so
+ // layout the target memory.
+ layoutRemoteTargetMemory(&Target, MM);
+
+ // Since we're executing in a (at least simulated) remote address space,
+ // we can't use the ExecutionEngine::runFunctionAsMain(). We have to
+ // grab the function address directly here and tell the remote target
+ // to execute the function.
+ // FIXME: argv and envp handling.
+ uint64_t Entry = (uint64_t)EE->getPointerToFunction(EntryFn);
+
+ DEBUG(dbgs() << "Executing '" << EntryFn->getName() << "' at "
+ << format("%p", Entry) << "\n");
+
+ if (Target.executeCode(Entry, Result))
+ errs() << "ERROR: " << Target.getErrorMsg() << "\n";
+
+ Target.stop();
} else {
- errs() << "ERROR: exit defined with wrong prototype!\n";
- abort();
+ // Trigger compilation separately so code regions that need to be
+ // invalidated will be known.
+ (void)EE->getPointerToFunction(EntryFn);
+ // Clear instruction cache before code will be executed.
+ if (JMM)
+ static_cast<LLIMCJITMemoryManager*>(JMM)->invalidateInstructionCache();
+
+ // Run main.
+ Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp);
+ }
+
+ // Like static constructors, the remote target MCJIT support doesn't handle
+ // this yet. It could. FIXME.
+ if (!RemoteMCJIT) {
+ // Run static destructors.
+ EE->runStaticConstructorsDestructors(true);
+
+ // If the program didn't call exit explicitly, we should call it now.
+ // This ensures that any atexit handlers get called correctly.
+ if (Function *ExitF = dyn_cast<Function>(Exit)) {
+ std::vector<GenericValue> Args;
+ GenericValue ResultGV;
+ ResultGV.IntVal = APInt(32, Result);
+ Args.push_back(ResultGV);
+ EE->runFunction(ExitF, Args);
+ errs() << "ERROR: exit(" << Result << ") returned!\n";
+ abort();
+ } else {
+ errs() << "ERROR: exit defined with wrong prototype!\n";
+ abort();
+ }
}
+ return Result;
}
diff --git a/tools/llvm-ar/CMakeLists.txt b/tools/llvm-ar/CMakeLists.txt
index c8b0b725d83c..70eb7603fdc6 100644
--- a/tools/llvm-ar/CMakeLists.txt
+++ b/tools/llvm-ar/CMakeLists.txt
@@ -1,5 +1,4 @@
set(LLVM_LINK_COMPONENTS archive)
-set(LLVM_REQUIRES_EH 1)
add_llvm_tool(llvm-ar
llvm-ar.cpp
diff --git a/tools/llvm-ar/Makefile b/tools/llvm-ar/Makefile
index 6ee6f34942d7..fafb14bc12a1 100644
--- a/tools/llvm-ar/Makefile
+++ b/tools/llvm-ar/Makefile
@@ -10,7 +10,6 @@
LEVEL := ../..
TOOLNAME := llvm-ar
LINK_COMPONENTS := archive
-REQUIRES_EH := 1
# This tool has no plugins, optimize startup time.
TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index 7c53701f009e..a8a5013a9a4c 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -23,6 +23,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Signals.h"
#include <algorithm>
+#include <cstdlib>
#include <memory>
#include <fstream>
using namespace llvm;
@@ -126,40 +127,57 @@ std::set<sys::Path> Paths;
// The Archive object to which all the editing operations will be sent.
Archive* TheArchive = 0;
+// The name this program was invoked as.
+static const char *program_name;
+
+// show_help - Show the error message, the help message and exit.
+LLVM_ATTRIBUTE_NORETURN static void
+show_help(const std::string &msg) {
+ errs() << program_name << ": " << msg << "\n\n";
+ cl::PrintHelpMessage();
+ if (TheArchive)
+ delete TheArchive;
+ std::exit(1);
+}
+
+// fail - Show the error message and exit.
+LLVM_ATTRIBUTE_NORETURN static void
+fail(const std::string &msg) {
+ errs() << program_name << ": " << msg << "\n\n";
+ if (TheArchive)
+ delete TheArchive;
+ std::exit(1);
+}
+
// getRelPos - Extract the member filename from the command line for
// the [relpos] argument associated with a, b, and i modifiers
void getRelPos() {
- if(RestOfArgs.size() > 0) {
- RelPos = RestOfArgs[0];
- RestOfArgs.erase(RestOfArgs.begin());
- }
- else
- throw "Expected [relpos] for a, b, or i modifier";
+ if(RestOfArgs.size() == 0)
+ show_help("Expected [relpos] for a, b, or i modifier");
+ RelPos = RestOfArgs[0];
+ RestOfArgs.erase(RestOfArgs.begin());
}
// getCount - Extract the [count] argument associated with the N modifier
// from the command line and check its value.
void getCount() {
- if(RestOfArgs.size() > 0) {
- Count = atoi(RestOfArgs[0].c_str());
- RestOfArgs.erase(RestOfArgs.begin());
- }
- else
- throw "Expected [count] value with N modifier";
+ if(RestOfArgs.size() == 0)
+ show_help("Expected [count] value with N modifier");
+
+ Count = atoi(RestOfArgs[0].c_str());
+ RestOfArgs.erase(RestOfArgs.begin());
// Non-positive counts are not allowed
if (Count < 1)
- throw "Invalid [count] value (not a positive integer)";
+ show_help("Invalid [count] value (not a positive integer)");
}
// getArchive - Get the archive file name from the command line
void getArchive() {
- if(RestOfArgs.size() > 0) {
- ArchiveName = RestOfArgs[0];
- RestOfArgs.erase(RestOfArgs.begin());
- }
- else
- throw "An archive name must be specified.";
+ if(RestOfArgs.size() == 0)
+ show_help("An archive name must be specified");
+ ArchiveName = RestOfArgs[0];
+ RestOfArgs.erase(RestOfArgs.begin());
}
// getMembers - Copy over remaining items in RestOfArgs to our Members vector
@@ -240,25 +258,27 @@ ArchiveOperation parseCommandLine() {
// Perform various checks on the operation/modifier specification
// to make sure we are dealing with a legal request.
if (NumOperations == 0)
- throw "You must specify at least one of the operations";
+ show_help("You must specify at least one of the operations");
if (NumOperations > 1)
- throw "Only one operation may be specified";
+ show_help("Only one operation may be specified");
if (NumPositional > 1)
- throw "You may only specify one of a, b, and i modifiers";
- if (AddAfter || AddBefore || InsertBefore)
+ show_help("You may only specify one of a, b, and i modifiers");
+ if (AddAfter || AddBefore || InsertBefore) {
if (Operation != Move && Operation != ReplaceOrInsert)
- throw "The 'a', 'b' and 'i' modifiers can only be specified with "
- "the 'm' or 'r' operations";
+ show_help("The 'a', 'b' and 'i' modifiers can only be specified with "
+ "the 'm' or 'r' operations");
+ }
if (RecurseDirectories && Operation != ReplaceOrInsert)
- throw "The 'R' modifiers is only applicabe to the 'r' operation";
+ show_help("The 'R' modifiers is only applicabe to the 'r' operation");
if (OriginalDates && Operation != Extract)
- throw "The 'o' modifier is only applicable to the 'x' operation";
+ show_help("The 'o' modifier is only applicable to the 'x' operation");
if (TruncateNames && Operation!=QuickAppend && Operation!=ReplaceOrInsert)
- throw "The 'f' modifier is only applicable to the 'q' and 'r' operations";
+ show_help("The 'f' modifier is only applicable to the 'q' and 'r' "
+ "operations");
if (OnlyUpdate && Operation != ReplaceOrInsert)
- throw "The 'u' modifier is only applicable to the 'r' operation";
+ show_help("The 'u' modifier is only applicable to the 'r' operation");
if (Count > 1 && Members.size() > 1)
- throw "Only one member name may be specified with the 'N' modifier";
+ show_help("Only one member name may be specified with the 'N' modifier");
// Return the parsed operation to the caller
return Operation;
@@ -304,16 +324,16 @@ bool buildPaths(bool checkExistence, std::string* ErrMsg) {
for (unsigned i = 0; i < Members.size(); i++) {
sys::Path aPath;
if (!aPath.set(Members[i]))
- throw std::string("File member name invalid: ") + Members[i];
+ fail(std::string("File member name invalid: ") + Members[i]);
if (checkExistence) {
bool Exists;
if (sys::fs::exists(aPath.str(), Exists) || !Exists)
- throw std::string("File does not exist: ") + Members[i];
+ fail(std::string("File does not exist: ") + Members[i]);
std::string Err;
sys::PathWithStatus PwS(aPath);
const sys::FileStatus *si = PwS.getFileStatus(false, &Err);
if (!si)
- throw Err;
+ fail(Err);
if (si->isDir) {
std::set<sys::Path> dirpaths;
if (recurseDirectories(aPath, dirpaths, ErrMsg))
@@ -683,6 +703,7 @@ doReplaceOrInsert(std::string* ErrMsg) {
// main - main program for llvm-ar .. see comments in the code
int main(int argc, char **argv) {
+ program_name = argv[0];
// Print a stack trace if we signal out.
sys::PrintStackTraceOnErrorSignal();
PrettyStackTraceProgram X(argc, argv);
@@ -698,77 +719,61 @@ int main(int argc, char **argv) {
int exitCode = 0;
- // Make sure we don't exit with "unhandled exception".
- try {
- // Do our own parsing of the command line because the CommandLine utility
- // can't handle the grouped positional parameters without a dash.
- ArchiveOperation Operation = parseCommandLine();
-
- // Check the path name of the archive
- sys::Path ArchivePath;
- if (!ArchivePath.set(ArchiveName))
- throw std::string("Archive name invalid: ") + ArchiveName;
-
- // Create or open the archive object.
- bool Exists;
- if (llvm::sys::fs::exists(ArchivePath.str(), Exists) || !Exists) {
- // Produce a warning if we should and we're creating the archive
- if (!Create)
- errs() << argv[0] << ": creating " << ArchivePath.str() << "\n";
- TheArchive = Archive::CreateEmpty(ArchivePath, Context);
- TheArchive->writeToDisk();
- } else {
- std::string Error;
- TheArchive = Archive::OpenAndLoad(ArchivePath, Context, &Error);
- if (TheArchive == 0) {
- errs() << argv[0] << ": error loading '" << ArchivePath.str() << "': "
- << Error << "!\n";
- return 1;
- }
- }
+ // Do our own parsing of the command line because the CommandLine utility
+ // can't handle the grouped positional parameters without a dash.
+ ArchiveOperation Operation = parseCommandLine();
- // Make sure we're not fooling ourselves.
- assert(TheArchive && "Unable to instantiate the archive");
-
- // Make sure we clean up the archive even on failure.
- std::auto_ptr<Archive> AutoArchive(TheArchive);
-
- // Perform the operation
- std::string ErrMsg;
- bool haveError = false;
- switch (Operation) {
- case Print: haveError = doPrint(&ErrMsg); break;
- case Delete: haveError = doDelete(&ErrMsg); break;
- case Move: haveError = doMove(&ErrMsg); break;
- case QuickAppend: haveError = doQuickAppend(&ErrMsg); break;
- case ReplaceOrInsert: haveError = doReplaceOrInsert(&ErrMsg); break;
- case DisplayTable: haveError = doDisplayTable(&ErrMsg); break;
- case Extract: haveError = doExtract(&ErrMsg); break;
- case NoOperation:
- errs() << argv[0] << ": No operation was selected.\n";
- break;
- }
- if (haveError) {
- errs() << argv[0] << ": " << ErrMsg << "\n";
+ // Check the path name of the archive
+ sys::Path ArchivePath;
+ if (!ArchivePath.set(ArchiveName)) {
+ errs() << argv[0] << ": Archive name invalid: " << ArchiveName << "\n";
+ return 1;
+ }
+
+ // Create or open the archive object.
+ bool Exists;
+ if (llvm::sys::fs::exists(ArchivePath.str(), Exists) || !Exists) {
+ // Warn that we're creating a new archive unless creation was explicitly requested
+ if (!Create)
+ errs() << argv[0] << ": creating " << ArchivePath.str() << "\n";
+ TheArchive = Archive::CreateEmpty(ArchivePath, Context);
+ TheArchive->writeToDisk();
+ } else {
+ std::string Error;
+ TheArchive = Archive::OpenAndLoad(ArchivePath, Context, &Error);
+ if (TheArchive == 0) {
+ errs() << argv[0] << ": error loading '" << ArchivePath.str() << "': "
+ << Error << "!\n";
return 1;
}
- } catch (const char*msg) {
- // These errors are usage errors, thrown only by the various checks in the
- // code above.
- errs() << argv[0] << ": " << msg << "\n\n";
- cl::PrintHelpMessage();
- exitCode = 1;
- } catch (const std::string& msg) {
- // These errors are thrown by LLVM libraries (e.g. lib System) and represent
- // a more serious error so we bump the exitCode and don't print the usage.
- errs() << argv[0] << ": " << msg << "\n";
- exitCode = 2;
- } catch (...) {
- // This really shouldn't happen, but just in case ....
- errs() << argv[0] << ": An unexpected unknown exception occurred.\n";
- exitCode = 3;
}
+ // Make sure we're not fooling ourselves.
+ assert(TheArchive && "Unable to instantiate the archive");
+
+ // Perform the operation
+ std::string ErrMsg;
+ bool haveError = false;
+ switch (Operation) {
+ case Print: haveError = doPrint(&ErrMsg); break;
+ case Delete: haveError = doDelete(&ErrMsg); break;
+ case Move: haveError = doMove(&ErrMsg); break;
+ case QuickAppend: haveError = doQuickAppend(&ErrMsg); break;
+ case ReplaceOrInsert: haveError = doReplaceOrInsert(&ErrMsg); break;
+ case DisplayTable: haveError = doDisplayTable(&ErrMsg); break;
+ case Extract: haveError = doExtract(&ErrMsg); break;
+ case NoOperation:
+ errs() << argv[0] << ": No operation was selected.\n";
+ break;
+ }
+ if (haveError) {
+ errs() << argv[0] << ": " << ErrMsg << "\n";
+ return 1;
+ }
+
+ delete TheArchive;
+ TheArchive = 0;
+
// Return result code back to operating system.
return exitCode;
}
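The rewrite above trades throw/catch for small noreturn helpers, which is presumably what lets the CMake and Makefile hunks drop LLVM_REQUIRES_EH / REQUIRES_EH. A stripped-down sketch of the same pattern, using only constructs that already appear in the patch (LLVM_ATTRIBUTE_NORETURN, errs(), std::exit); the helper names here are illustrative:

    #include "llvm/Support/Compiler.h"
    #include "llvm/Support/raw_ostream.h"
    #include <cstdlib>
    #include <string>

    static const char *program_name = "llvm-ar";  // normally taken from argv[0]

    LLVM_ATTRIBUTE_NORETURN static void fail(const std::string &Msg) {
      llvm::errs() << program_name << ": " << Msg << "\n";
      std::exit(1);  // never returns, so callers need no recovery path
    }

    static unsigned parseCount(const std::string &Arg) {
      int N = std::atoi(Arg.c_str());
      if (N < 1)
        fail("Invalid [count] value (not a positive integer)");
      return (unsigned)N;
    }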
diff --git a/tools/llvm-as/CMakeLists.txt b/tools/llvm-as/CMakeLists.txt
index eef4a13e29dc..d5620e72971b 100644
--- a/tools/llvm-as/CMakeLists.txt
+++ b/tools/llvm-as/CMakeLists.txt
@@ -1,5 +1,4 @@
set(LLVM_LINK_COMPONENTS asmparser bitwriter)
-set(LLVM_REQUIRES_EH 1)
add_llvm_tool(llvm-as
llvm-as.cpp
diff --git a/tools/llvm-bcanalyzer/CMakeLists.txt b/tools/llvm-bcanalyzer/CMakeLists.txt
index 732bc3296f2c..0151ea9b4f65 100644
--- a/tools/llvm-bcanalyzer/CMakeLists.txt
+++ b/tools/llvm-bcanalyzer/CMakeLists.txt
@@ -1,5 +1,4 @@
set(LLVM_LINK_COMPONENTS bitreader)
-set(LLVM_REQUIRES_EH 1)
add_llvm_tool(llvm-bcanalyzer
llvm-bcanalyzer.cpp
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index d6300878d510..8109ca4d5be7 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -40,7 +40,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/system_error.h"
-#include <cstdio>
+
#include <map>
#include <algorithm>
using namespace llvm;
@@ -463,11 +463,11 @@ static bool ParseBlock(BitstreamCursor &Stream, unsigned IndentLevel) {
}
static void PrintSize(double Bits) {
- fprintf(stderr, "%.2f/%.2fB/%luW", Bits, Bits/8,(unsigned long)(Bits/32));
+ outs() << format("%.2f/%.2fB/%luW", Bits, Bits/8,(unsigned long)(Bits/32));
}
static void PrintSize(uint64_t Bits) {
- fprintf(stderr, "%lub/%.2fB/%luW", (unsigned long)Bits,
- (double)Bits/8, (unsigned long)(Bits/32));
+ outs() << format("%lub/%.2fB/%luW", (unsigned long)Bits,
+ (double)Bits/8, (unsigned long)(Bits/32));
}
@@ -483,7 +483,7 @@ static int AnalyzeBitcode() {
if (MemBuf->getBufferSize() & 3)
return Error("Bitcode stream should be a multiple of 4 bytes in length");
- const unsigned char *BufPtr = (unsigned char *)MemBuf->getBufferStart();
+ const unsigned char *BufPtr = (const unsigned char *)MemBuf->getBufferStart();
const unsigned char *EndBufPtr = BufPtr+MemBuf->getBufferSize();
// If we have a wrapper header, parse it and ignore the non-bc file contents.
@@ -556,7 +556,7 @@ static int AnalyzeBitcode() {
PrintSize(Stats.NumBits);
outs() << "\n";
double pct = (Stats.NumBits * 100.0) / BufferSizeBits;
- errs() << " Percent of file: " << format("%2.4f%%", pct) << "\n";
+ outs() << " Percent of file: " << format("%2.4f%%", pct) << "\n";
if (Stats.NumInstances > 1) {
outs() << " Average Size: ";
PrintSize(Stats.NumBits/(double)Stats.NumInstances);
@@ -588,24 +588,26 @@ static int AnalyzeBitcode() {
std::reverse(FreqPairs.begin(), FreqPairs.end());
outs() << "\tRecord Histogram:\n";
- fprintf(stderr, "\t\t Count # Bits %% Abv Record Kind\n");
+ outs() << "\t\t Count # Bits %% Abv Record Kind\n";
for (unsigned i = 0, e = FreqPairs.size(); i != e; ++i) {
const PerRecordStats &RecStats = Stats.CodeFreq[FreqPairs[i].second];
- fprintf(stderr, "\t\t%7d %9lu ", RecStats.NumInstances,
- (unsigned long)RecStats.TotalBits);
+ outs() << format("\t\t%7d %9lu",
+ RecStats.NumInstances,
+ (unsigned long)RecStats.TotalBits);
if (RecStats.NumAbbrev)
- fprintf(stderr, "%7.2f ",
- (double)RecStats.NumAbbrev/RecStats.NumInstances*100);
+ outs() <<
+ format("%7.2f ",
+ (double)RecStats.NumAbbrev/RecStats.NumInstances*100);
else
- fprintf(stderr, " ");
+ outs() << " ";
if (const char *CodeName =
GetCodeName(FreqPairs[i].second, I->first, StreamFile))
- fprintf(stderr, "%s\n", CodeName);
+ outs() << CodeName << "\n";
else
- fprintf(stderr, "UnknownCode%d\n", FreqPairs[i].second);
+ outs() << "UnknownCode" << FreqPairs[i].second << "\n";
}
outs() << "\n";
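The conversion above replaces fprintf(stderr, ...) with outs() plus llvm::format(), moving the histogram onto the same stream as the rest of the report. A tiny sketch of the idiom, assuming only llvm/Support/Format.h and raw_ostream.h (the helper below is illustrative, not part of the tool):

    #include "llvm/Support/DataTypes.h"
    #include "llvm/Support/Format.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    static void printPercent(uint64_t Part, uint64_t Whole) {
      double Pct = (Part * 100.0) / Whole;
      // format() wraps printf-style formatting in an object raw_ostream can print.
      outs() << format("%2.4f%%", Pct) << "\n";
    }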
diff --git a/tools/llvm-config/Makefile b/tools/llvm-config/Makefile
index e8c86929e188..b20b6bf4a4be 100644
--- a/tools/llvm-config/Makefile
+++ b/tools/llvm-config/Makefile
@@ -63,5 +63,5 @@ ifeq ($(LLVM_CROSS_COMPILING),1)
install:: $(DESTDIR)$(PROJ_bindir)
$(Echo) Installing llvm-config-host
$(Verb) $(ProgInstall) $(BuildLLVMToolDir)/llvm-config \
- $(DESTDIR)$(PROJ_bindir)/llvm-config-host
+ $(DESTDIR)$(PROJ_bindir)/$(program_prefix)llvm-config-host
endif
diff --git a/tools/llvm-dis/CMakeLists.txt b/tools/llvm-dis/CMakeLists.txt
index 3125f8a5c6bb..9f12ecb66641 100644
--- a/tools/llvm-dis/CMakeLists.txt
+++ b/tools/llvm-dis/CMakeLists.txt
@@ -1,5 +1,4 @@
set(LLVM_LINK_COMPONENTS bitreader analysis)
-set(LLVM_REQUIRES_EH 1)
add_llvm_tool(llvm-dis
llvm-dis.cpp
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index ec0b4aeb63c6..e73300a0cd8d 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -1,4 +1,4 @@
-//===-- llvm-dwarfdump.cpp - Debug info dumping utility for llvm -----------===//
+//===-- llvm-dwarfdump.cpp - Debug info dumping utility for llvm ----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -15,6 +15,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/RelocVisitor.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -28,6 +29,9 @@
#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cstring>
+#include <list>
+#include <string>
+
using namespace llvm;
using namespace object;
@@ -44,6 +48,18 @@ PrintFunctions("functions", cl::init(false),
cl::desc("Print function names as well as line information "
"for a given address"));
+static cl::opt<bool>
+PrintInlining("inlining", cl::init(false),
+ cl::desc("Print all inlined frames for a given address"));
+
+static void PrintDILineInfo(DILineInfo dli) {
+ if (PrintFunctions)
+ outs() << (dli.getFunctionName() ? dli.getFunctionName() : "<unknown>")
+ << "\n";
+ outs() << (dli.getFileName() ? dli.getFileName() : "<unknown>") << ':'
+ << dli.getLine() << ':' << dli.getColumn() << '\n';
+}
+
static void DumpInput(const StringRef &Filename) {
OwningPtr<MemoryBuffer> Buff;
@@ -55,10 +71,12 @@ static void DumpInput(const StringRef &Filename) {
OwningPtr<ObjectFile> Obj(ObjectFile::createObjectFile(Buff.take()));
StringRef DebugInfoSection;
+ RelocAddrMap RelocMap;
StringRef DebugAbbrevSection;
StringRef DebugLineSection;
StringRef DebugArangesSection;
StringRef DebugStringSection;
+ StringRef DebugRangesSection;
error_code ec;
for (section_iterator i = Obj->begin_sections(),
@@ -82,6 +100,59 @@ static void DumpInput(const StringRef &Filename) {
DebugArangesSection = data;
else if (name == "debug_str")
DebugStringSection = data;
+ else if (name == "debug_ranges")
+ DebugRangesSection = data;
+ // Any more debug info sections go here.
+ else
+ continue;
+
+ // TODO: For now only handle relocations for the debug_info section.
+ if (name != "debug_info")
+ continue;
+
+ if (i->begin_relocations() != i->end_relocations()) {
+ uint64_t SectionSize;
+ i->getSize(SectionSize);
+ for (relocation_iterator reloc_i = i->begin_relocations(),
+ reloc_e = i->end_relocations();
+ reloc_i != reloc_e; reloc_i.increment(ec)) {
+ uint64_t Address;
+ reloc_i->getAddress(Address);
+ uint64_t Type;
+ reloc_i->getType(Type);
+
+ RelocVisitor V(Obj->getFileFormatName());
+ // The section address is always 0 for debug sections.
+ RelocToApply R(V.visit(Type, *reloc_i));
+ if (V.error()) {
+ SmallString<32> Name;
+ error_code ec(reloc_i->getTypeName(Name));
+ if (ec) {
+ errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n";
+ }
+ errs() << "error: failed to compute relocation: "
+ << Name << "\n";
+ continue;
+ }
+
+ if (Address + R.Width > SectionSize) {
+ errs() << "error: " << R.Width << "-byte relocation starting "
+ << Address << " bytes into section " << name << " which is "
+ << SectionSize << " bytes long.\n";
+ continue;
+ }
+ if (R.Width > 8) {
+ errs() << "error: can't handle a relocation of more than 8 bytes at "
+ "a time.\n";
+ continue;
+ }
+ DEBUG(dbgs() << "Writing " << format("%p", R.Value)
+ << " at " << format("%p", Address)
+ << " with width " << format("%d", R.Width)
+ << "\n");
+ RelocMap[Address] = std::make_pair(R.Width, R.Value);
+ }
+ }
}
OwningPtr<DIContext> dictx(DIContext::getDWARFContext(/*FIXME*/true,
@@ -89,7 +160,9 @@ static void DumpInput(const StringRef &Filename) {
DebugAbbrevSection,
DebugArangesSection,
DebugLineSection,
- DebugStringSection));
+ DebugStringSection,
+ DebugRangesSection,
+ RelocMap));
if (Address == -1ULL) {
outs() << Filename
<< ":\tfile format " << Obj->getFileFormatName() << "\n\n";
@@ -97,16 +170,27 @@ static void DumpInput(const StringRef &Filename) {
dictx->dump(outs());
} else {
// Print line info for the specified address.
- int spec_flags = DILineInfoSpecifier::FileLineInfo |
- DILineInfoSpecifier::AbsoluteFilePath;
- if (PrintFunctions)
- spec_flags |= DILineInfoSpecifier::FunctionName;
- DILineInfo dli = dictx->getLineInfoForAddress(Address, spec_flags);
+ int SpecFlags = DILineInfoSpecifier::FileLineInfo |
+ DILineInfoSpecifier::AbsoluteFilePath;
if (PrintFunctions)
- outs() << (dli.getFunctionName() ? dli.getFunctionName() : "<unknown>")
- << "\n";
- outs() << (dli.getFileName() ? dli.getFileName() : "<unknown>") << ':'
- << dli.getLine() << ':' << dli.getColumn() << '\n';
+ SpecFlags |= DILineInfoSpecifier::FunctionName;
+ if (PrintInlining) {
+ DIInliningInfo InliningInfo =
+ dictx->getInliningInfoForAddress(Address, SpecFlags);
+ uint32_t n = InliningInfo.getNumberOfFrames();
+ if (n == 0) {
+ // Print one empty debug line info in any case.
+ PrintDILineInfo(DILineInfo());
+ } else {
+ for (uint32_t i = 0; i < n; i++) {
+ DILineInfo dli = InliningInfo.getFrame(i);
+ PrintDILineInfo(dli);
+ }
+ }
+ } else {
+ DILineInfo dli = dictx->getLineInfoForAddress(Address, SpecFlags);
+ PrintDILineInfo(dli);
+ }
}
}
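
The llvm-dwarfdump change above threads relocations into the DWARF context and adds an -inlining option that walks every inlined frame at an address. Below is a minimal sketch of that walk using only the DIContext calls visible in the hunk (getInliningInfoForAddress, getNumberOfFrames, getFrame); the helper name and the fallback text are illustrative, not part of the patch.

    #include "llvm/DebugInfo/DIContext.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    // Sketch: print every inlined frame at Address, mirroring the -inlining
    // path above. Ctx is assumed to be a DIContext built the same way
    // DumpInput builds one via DIContext::getDWARFContext.
    static void dumpInlinedFrames(DIContext &Ctx, uint64_t Address) {
      int SpecFlags = DILineInfoSpecifier::FileLineInfo |
                      DILineInfoSpecifier::AbsoluteFilePath |
                      DILineInfoSpecifier::FunctionName;
      DIInliningInfo Info = Ctx.getInliningInfoForAddress(Address, SpecFlags);
      if (Info.getNumberOfFrames() == 0) {
        outs() << "<no line info>\n";      // no debug info for this address
        return;
      }
      for (uint32_t i = 0, n = Info.getNumberOfFrames(); i != n; ++i) {
        DILineInfo Frame = Info.getFrame(i);
        outs() << (Frame.getFunctionName() ? Frame.getFunctionName()
                                           : "<unknown>")
               << " at "
               << (Frame.getFileName() ? Frame.getFileName() : "<unknown>")
               << ':' << Frame.getLine() << ':' << Frame.getColumn() << '\n';
      }
    }
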
diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp
index 2ed11c52b2b3..ac82d98b3b77 100644
--- a/tools/llvm-extract/llvm-extract.cpp
+++ b/tools/llvm-extract/llvm-extract.cpp
@@ -18,7 +18,7 @@
#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/Transforms/IPO.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/IRReader.h"
#include "llvm/Support/ManagedStatic.h"
@@ -59,6 +59,19 @@ ExtractRegExpFuncs("rfunc", cl::desc("Specify function(s) to extract using a "
"regular expression"),
cl::ZeroOrMore, cl::value_desc("rfunction"));
+// ExtractAlias - The alias to extract from the module.
+static cl::list<std::string>
+ExtractAliases("alias", cl::desc("Specify alias to extract"),
+ cl::ZeroOrMore, cl::value_desc("alias"));
+
+
+// ExtractRegExpAliases - The aliases, matched via regular expression, to
+// extract from the module.
+static cl::list<std::string>
+ExtractRegExpAliases("ralias", cl::desc("Specify alias(es) to extract using a "
+ "regular expression"),
+ cl::ZeroOrMore, cl::value_desc("ralias"));
+
// ExtractGlobals - The globals to extract from the module.
static cl::list<std::string>
ExtractGlobals("glob", cl::desc("Specify global to extract"),
@@ -97,6 +110,40 @@ int main(int argc, char **argv) {
// Use SetVector to avoid duplicates.
SetVector<GlobalValue *> GVs;
+ // Figure out which aliases we should extract.
+ for (size_t i = 0, e = ExtractAliases.size(); i != e; ++i) {
+ GlobalAlias *GA = M->getNamedAlias(ExtractAliases[i]);
+ if (!GA) {
+ errs() << argv[0] << ": program doesn't contain alias named '"
+ << ExtractAliases[i] << "'!\n";
+ return 1;
+ }
+ GVs.insert(GA);
+ }
+
+ // Extract aliases via regular expression matching.
+ for (size_t i = 0, e = ExtractRegExpAliases.size(); i != e; ++i) {
+ std::string Error;
+ Regex RegEx(ExtractRegExpAliases[i]);
+ if (!RegEx.isValid(Error)) {
+ errs() << argv[0] << ": '" << ExtractRegExpAliases[i] << "' "
+ "invalid regex: " << Error;
+ }
+ bool match = false;
+ for (Module::alias_iterator GA = M->alias_begin(), E = M->alias_end();
+ GA != E; GA++) {
+ if (RegEx.match(GA->getName())) {
+ GVs.insert(&*GA);
+ match = true;
+ }
+ }
+ if (!match) {
+      errs() << argv[0] << ": program doesn't contain an alias matching '"
+ << ExtractRegExpAliases[i] << "'!\n";
+ return 1;
+ }
+ }
+
// Figure out which globals we should extract.
for (size_t i = 0, e = ExtractGlobals.size(); i != e; ++i) {
GlobalValue *GV = M->getNamedGlobal(ExtractGlobals[i]);
@@ -206,7 +253,7 @@ int main(int argc, char **argv) {
// In addition to deleting all other functions, we also want to spiff it
// up a little bit. Do this now.
PassManager Passes;
- Passes.add(new TargetData(M.get())); // Use correct TargetData
+ Passes.add(new DataLayout(M.get())); // Use correct DataLayout
std::vector<GlobalValue*> Gvs(GVs.begin(), GVs.end());
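
The new -alias/-ralias handling reuses the validate-then-match flow llvm-extract already uses for -rfunc: construct an llvm::Regex, check isValid, then match each name. A small self-contained sketch of that flow; the function name and the pattern literal are illustrative only.

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Regex.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>
    using namespace llvm;

    // Returns true if Name matches Pattern; an invalid pattern is reported once.
    static bool matchesPattern(StringRef Pattern, StringRef Name) {
      std::string Error;
      Regex RegEx(Pattern);
      if (!RegEx.isValid(Error)) {     // same validity check as the -ralias loop
        errs() << "invalid regex '" << Pattern << "': " << Error << "\n";
        return false;
      }
      return RegEx.match(Name);
    }

    int main() {
      outs() << (matchesPattern("^foo_.*", "foo_alias") ? "match" : "no match")
             << "\n";
      return 0;
    }
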
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 3bceb1462416..f7c3748f079b 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -158,7 +158,8 @@ enum ActionType {
AC_AsLex,
AC_Assemble,
AC_Disassemble,
- AC_EDisassemble
+ AC_EDisassemble,
+ AC_MDisassemble
};
static cl::opt<ActionType>
@@ -172,6 +173,8 @@ Action(cl::desc("Action to perform:"),
"Disassemble strings of hex bytes"),
clEnumValN(AC_EDisassemble, "edis",
"Enhanced disassembly of strings of hex bytes"),
+ clEnumValN(AC_MDisassemble, "mdis",
+ "Marked up disassembly of strings of hex bytes"),
clEnumValEnd));
static const Target *GetTarget(const char *ProgName) {
@@ -402,14 +405,15 @@ int main(int argc, char **argv) {
OwningPtr<MCSubtargetInfo>
STI(TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr));
+ MCInstPrinter *IP;
if (FileType == OFT_AssemblyFile) {
- MCInstPrinter *IP =
+ IP =
TheTarget->createMCInstPrinter(OutputAsmVariant, *MAI, *MCII, *MRI, *STI);
MCCodeEmitter *CE = 0;
MCAsmBackend *MAB = 0;
if (ShowEncoding) {
CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
- MAB = TheTarget->createMCAsmBackend(TripleName);
+ MAB = TheTarget->createMCAsmBackend(TripleName, MCPU);
}
Str.reset(TheTarget->createAsmStreamer(Ctx, FOS, /*asmverbose*/true,
/*useLoc*/ true,
@@ -422,7 +426,7 @@ int main(int argc, char **argv) {
} else {
assert(FileType == OFT_ObjectFile && "Invalid file type!");
MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
- MCAsmBackend *MAB = TheTarget->createMCAsmBackend(TripleName);
+ MCAsmBackend *MAB = TheTarget->createMCAsmBackend(TripleName, MCPU);
Str.reset(TheTarget->createMCObjectStreamer(TripleName, Ctx, *MAB,
FOS, CE, RelaxAll,
NoExecStack));
@@ -436,6 +440,9 @@ int main(int argc, char **argv) {
case AC_Assemble:
Res = AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI);
break;
+ case AC_MDisassemble:
+ IP->setUseMarkup(1);
+ // Fall through to do disassembly.
case AC_Disassemble:
Res = Disassembler::disassemble(*TheTarget, TripleName, *STI, *Str,
*Buffer, SrcMgr, Out->os());
diff --git a/tools/llvm-mcmarkup/CMakeLists.txt b/tools/llvm-mcmarkup/CMakeLists.txt
new file mode 100644
index 000000000000..0a51e99f1953
--- /dev/null
+++ b/tools/llvm-mcmarkup/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_LINK_COMPONENTS support)
+
+add_llvm_tool(llvm-mcmarkup
+ llvm-mcmarkup.cpp
+ )
diff --git a/tools/llvm-mcmarkup/LLVMBuild.txt b/tools/llvm-mcmarkup/LLVMBuild.txt
new file mode 100644
index 000000000000..6423493a543d
--- /dev/null
+++ b/tools/llvm-mcmarkup/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./tools/llvm-mcmarkup/LLVMBuild.txt ----------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+; http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Tool
+name = llvm-mcmarkup
+parent = Tools
+required_libraries = Support
diff --git a/tools/llvm-mcmarkup/Makefile b/tools/llvm-mcmarkup/Makefile
new file mode 100644
index 000000000000..5633a9c301a3
--- /dev/null
+++ b/tools/llvm-mcmarkup/Makefile
@@ -0,0 +1,17 @@
+##===- tools/llvm-mcmarkup/Makefile ------------------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL := ../..
+TOOLNAME := llvm-mcmarkup
+LINK_COMPONENTS := support
+
+# This tool has no plugins, optimize startup time.
+TOOL_NO_EXPORTS = 1
+
+include $(LEVEL)/Makefile.common
diff --git a/tools/llvm-mcmarkup/llvm-mcmarkup.cpp b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
new file mode 100644
index 000000000000..888761f10f0a
--- /dev/null
+++ b/tools/llvm-mcmarkup/llvm-mcmarkup.cpp
@@ -0,0 +1,225 @@
+//===-- llvm-mcmarkup.cpp - Parse the MC assembly markup tags -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A simple example parser implementation for the MC assembly markup language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/PrettyStackTrace.h"
+#include "llvm/Support/Signals.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+using namespace llvm;
+
+static cl::list<std::string>
+ InputFilenames(cl::Positional, cl::desc("<input files>"),
+ cl::ZeroOrMore);
+static cl::opt<bool>
+DumpTags("dump-tags", cl::desc("List all tags encountered in input"));
+
+static StringRef ToolName;
+
+/// Trivial lexer for the markup parser. Input is always handled a character
+/// at a time. The lexer just encapsulates EOF and lookahead handling.
+class MarkupLexer {
+ StringRef::const_iterator Start;
+ StringRef::const_iterator CurPtr;
+ StringRef::const_iterator End;
+public:
+ MarkupLexer(StringRef Source)
+ : Start(Source.begin()), CurPtr(Source.begin()), End(Source.end()) {}
+ // When processing non-markup, input is consumed a character at a time.
+ bool isEOF() { return CurPtr == End; }
+ int getNextChar() {
+ if (CurPtr == End) return EOF;
+ return *CurPtr++;
+ }
+ int peekNextChar() {
+ if (CurPtr == End) return EOF;
+ return *CurPtr;
+ }
+ StringRef::const_iterator getPosition() const { return CurPtr; }
+};
+
+/// A markup tag is a name and a (usually empty) list of modifiers.
+class MarkupTag {
+ StringRef Name;
+ StringRef Modifiers;
+ SMLoc StartLoc;
+public:
+ MarkupTag(StringRef n, StringRef m, SMLoc Loc)
+ : Name(n), Modifiers(m), StartLoc(Loc) {}
+ StringRef getName() const { return Name; }
+ StringRef getModifiers() const { return Modifiers; }
+ SMLoc getLoc() const { return StartLoc; }
+};
+
+/// A simple parser implementation for creating MarkupTags from input text.
+class MarkupParser {
+ MarkupLexer &Lex;
+ SourceMgr &SM;
+public:
+ MarkupParser(MarkupLexer &lex, SourceMgr &SrcMgr) : Lex(lex), SM(SrcMgr) {}
+ /// Create a MarkupTag from the current position in the MarkupLexer.
+ /// The parseTag() method should be called when the lexer has processed
+ /// the opening '<' character. Input will be consumed up to and including
+ /// the ':' which terminates the tag open.
+ MarkupTag parseTag();
+ /// Issue a diagnostic and terminate program execution.
+ void FatalError(SMLoc Loc, StringRef Msg);
+};
+
+void MarkupParser::FatalError(SMLoc Loc, StringRef Msg) {
+ SM.PrintMessage(Loc, SourceMgr::DK_Error, Msg);
+ exit(1);
+}
+
+// Example handler for when a tag is recognized.
+static void processStartTag(MarkupTag &Tag) {
+  // If we're just printing the tags, do that; otherwise do some simple
+  // colorization.
+ if (DumpTags) {
+ outs() << Tag.getName();
+ if (Tag.getModifiers().size())
+ outs() << " " << Tag.getModifiers();
+ outs() << "\n";
+ return;
+ }
+
+ if (!outs().has_colors())
+ return;
+ // Color registers as red and immediates as cyan. Those don't have nested
+ // tags, so don't bother keeping a stack of colors to reset to.
+ if (Tag.getName() == "reg")
+ outs().changeColor(raw_ostream::RED);
+ else if (Tag.getName() == "imm")
+ outs().changeColor(raw_ostream::CYAN);
+}
+
+// Example handler for when the end of a tag is recognized.
+static void processEndTag(MarkupTag &Tag) {
+ // If we're printing the tags, there's nothing more to do here. Otherwise,
+  // set the color back to normal.
+ if (DumpTags)
+ return;
+ if (!outs().has_colors())
+ return;
+ // Just reset to basic white.
+ outs().changeColor(raw_ostream::WHITE, false);
+}
+
+MarkupTag MarkupParser::parseTag() {
+  // First off, extract the tag into its own StringRef so we can look at it
+ // outside of the context of consuming input.
+ StringRef::const_iterator Start = Lex.getPosition();
+ SMLoc Loc = SMLoc::getFromPointer(Start - 1);
+ while(Lex.getNextChar() != ':') {
+ // EOF is an error.
+ if (Lex.isEOF())
+ FatalError(SMLoc::getFromPointer(Start), "unterminated markup tag");
+ }
+ StringRef RawTag(Start, Lex.getPosition() - Start - 1);
+ std::pair<StringRef, StringRef> SplitTag = RawTag.split(' ');
+ return MarkupTag(SplitTag.first, SplitTag.second, Loc);
+}
+
+static void parseMCMarkup(StringRef Filename) {
+ OwningPtr<MemoryBuffer> BufferPtr;
+ if (error_code ec = MemoryBuffer::getFileOrSTDIN(Filename, BufferPtr)) {
+ errs() << ToolName << ": " << ec.message() << '\n';
+ return;
+ }
+ MemoryBuffer *Buffer = BufferPtr.take();
+
+ SourceMgr SrcMgr;
+
+ // Tell SrcMgr about this buffer, which is what the parser will pick up.
+ SrcMgr.AddNewSourceBuffer(Buffer, SMLoc());
+
+ StringRef InputSource = Buffer->getBuffer();
+ MarkupLexer Lex(InputSource);
+ MarkupParser Parser(Lex, SrcMgr);
+
+ SmallVector<MarkupTag, 4> TagStack;
+
+ for (int CurChar = Lex.getNextChar();
+ CurChar != EOF;
+ CurChar = Lex.getNextChar()) {
+ switch (CurChar) {
+ case '<': {
+ // A "<<" is output as a literal '<' and does not start a markup tag.
+ if (Lex.peekNextChar() == '<') {
+ (void)Lex.getNextChar();
+ break;
+ }
+ // Parse the markup entry.
+ TagStack.push_back(Parser.parseTag());
+
+ // Do any special handling for the start of a tag.
+ processStartTag(TagStack.back());
+ continue;
+ }
+ case '>': {
+ SMLoc Loc = SMLoc::getFromPointer(Lex.getPosition() - 1);
+ // A ">>" is output as a literal '>' and does not end a markup tag.
+ if (Lex.peekNextChar() == '>') {
+ (void)Lex.getNextChar();
+ break;
+ }
+ // Close out the innermost tag.
+ if (TagStack.empty())
+ Parser.FatalError(Loc, "'>' without matching '<'");
+
+ // Do any special handling for the end of a tag.
+ processEndTag(TagStack.back());
+
+ TagStack.pop_back();
+ continue;
+ }
+ default:
+ break;
+ }
+ // For anything else, just echo the character back out.
+ if (!DumpTags && CurChar != EOF)
+ outs() << (char)CurChar;
+ }
+
+ // If there are any unterminated markup tags, issue diagnostics for them.
+ while (!TagStack.empty()) {
+ MarkupTag &Tag = TagStack.back();
+ SrcMgr.PrintMessage(Tag.getLoc(), SourceMgr::DK_Error,
+ "unterminated markup tag");
+ TagStack.pop_back();
+ }
+}
+
+int main(int argc, char **argv) {
+ // Print a stack trace if we signal out.
+ sys::PrintStackTraceOnErrorSignal();
+ PrettyStackTraceProgram X(argc, argv);
+
+ llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
+ cl::ParseCommandLineOptions(argc, argv, "llvm MC markup parser\n");
+
+ ToolName = argv[0];
+
+ // If no input files specified, read from stdin.
+ if (InputFilenames.size() == 0)
+ InputFilenames.push_back("-");
+
+ std::for_each(InputFilenames.begin(), InputFilenames.end(),
+ parseMCMarkup);
+ return 0;
+}
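
The tool added above parses the MC markup syntax that llvm-mc -mdis now emits: tags of the form <name:body> (optionally with modifiers after the name), which may nest, with "<<" and ">>" as literal escapes. As a rough, LLVM-free illustration of just those rules, here is a small program that strips the tags from one marked-up line; the sample operand text is made up and not output produced by this patch.

    #include <iostream>
    #include <string>

    // Strip "<tag:body>" markers, keeping the bodies and honoring the "<<" and
    // ">>" escapes. Nested tags are handled by skipping each "<name:" opener
    // and dropping each unescaped '>'.
    static std::string stripMarkup(const std::string &In) {
      std::string Out;
      for (size_t i = 0; i < In.size(); ++i) {
        char C = In[i];
        if ((C == '<' || C == '>') && i + 1 < In.size() && In[i + 1] == C) {
          Out += C;                        // "<<" -> '<', ">>" -> '>'
          ++i;
          continue;
        }
        if (C == '<') {                    // tag open: drop "<name:" (and mods)
          size_t Colon = In.find(':', i);
          if (Colon == std::string::npos) { Out += C; continue; }
          i = Colon;
          continue;
        }
        if (C == '>')                      // tag close: drop it
          continue;
        Out += C;
      }
      return Out;
    }

    int main() {
      // Illustrative input: "movq <reg:%rax>, <mem:8(<reg:%rsp>)>"
      std::cout << stripMarkup("movq <reg:%rax>, <mem:8(<reg:%rsp>)>") << "\n";
      return 0;                            // prints: movq %rax, 8(%rsp)
    }
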
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index 9afbd4db90c3..0543e83f9cb4 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -110,6 +110,9 @@ namespace {
cl::opt<bool> SizeSort("size-sort", cl::desc("Sort symbols by size"));
+ cl::opt<bool> WithoutAliases("without-aliases", cl::Hidden,
+ cl::desc("Exclude aliases from output"));
+
bool PrintAddress = true;
bool MultipleFiles = false;
@@ -256,7 +259,6 @@ static void DumpSymbolNameForGlobalValue(GlobalValue &GV) {
if (GV.hasPrivateLinkage() ||
GV.hasLinkerPrivateLinkage() ||
GV.hasLinkerPrivateWeakLinkage() ||
- GV.hasLinkerPrivateWeakDefAutoLinkage() ||
GV.hasAvailableExternallyLinkage())
return;
char TypeChar = TypeCharForSymbol(GV);
@@ -276,8 +278,9 @@ static void DumpSymbolNamesFromModule(Module *M) {
std::for_each (M->begin(), M->end(), DumpSymbolNameForGlobalValue);
std::for_each (M->global_begin(), M->global_end(),
DumpSymbolNameForGlobalValue);
- std::for_each (M->alias_begin(), M->alias_end(),
- DumpSymbolNameForGlobalValue);
+ if (!WithoutAliases)
+ std::for_each (M->alias_begin(), M->alias_end(),
+ DumpSymbolNameForGlobalValue);
SortAndPrintSymbolList();
}
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index b431c7638d75..13ea4e32958a 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -94,6 +94,12 @@ static cl::alias
SectionHeadersShorter("h", cl::desc("Alias for --section-headers"),
cl::aliasopt(SectionHeaders));
+static cl::list<std::string>
+MAttrs("mattr",
+ cl::CommaSeparated,
+ cl::desc("Target specific attributes"),
+ cl::value_desc("a1,+a2,-a3,..."));
+
static StringRef ToolName;
static bool error(error_code ec) {
@@ -169,6 +175,15 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
if (!TheTarget)
return;
+ // Package up features to be passed to target/subtarget
+ std::string FeaturesStr;
+ if (MAttrs.size()) {
+ SubtargetFeatures Features;
+ for (unsigned i = 0; i != MAttrs.size(); ++i)
+ Features.AddFeature(MAttrs[i]);
+ FeaturesStr = Features.getString();
+ }
+
error_code ec;
for (section_iterator i = Obj->begin_sections(),
e = Obj->end_sections();
@@ -233,7 +248,7 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
}
OwningPtr<const MCSubtargetInfo> STI(
- TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+ TheTarget->createMCSubtargetInfo(TripleName, "", FeaturesStr));
if (!STI) {
errs() << "error: no subtarget info for target " << TripleName << "\n";
diff --git a/tools/llvm-ranlib/CMakeLists.txt b/tools/llvm-ranlib/CMakeLists.txt
index 3116d2e4ff79..2d7defee11f3 100644
--- a/tools/llvm-ranlib/CMakeLists.txt
+++ b/tools/llvm-ranlib/CMakeLists.txt
@@ -1,5 +1,4 @@
set(LLVM_LINK_COMPONENTS archive)
-set(LLVM_REQUIRES_EH 1)
add_llvm_tool(llvm-ranlib
llvm-ranlib.cpp
diff --git a/tools/llvm-ranlib/Makefile b/tools/llvm-ranlib/Makefile
index 36195f4399ec..cca95013f486 100644
--- a/tools/llvm-ranlib/Makefile
+++ b/tools/llvm-ranlib/Makefile
@@ -10,7 +10,6 @@
LEVEL := ../..
TOOLNAME := llvm-ranlib
LINK_COMPONENTS := archive
-REQUIRES_EH := 1
# This tool has no plugins, optimize startup time.
TOOL_NO_EXPORTS := 1
diff --git a/tools/llvm-ranlib/llvm-ranlib.cpp b/tools/llvm-ranlib/llvm-ranlib.cpp
index 4006765a9c2b..d2f5f0fff910 100644
--- a/tools/llvm-ranlib/llvm-ranlib.cpp
+++ b/tools/llvm-ranlib/llvm-ranlib.cpp
@@ -61,41 +61,38 @@ int main(int argc, char **argv) {
int exitCode = 0;
- // Make sure we don't exit with "unhandled exception".
- try {
-
- // Check the path name of the archive
- sys::Path ArchivePath;
- if (!ArchivePath.set(ArchiveName))
- throw std::string("Archive name invalid: ") + ArchiveName;
+ // Check the path name of the archive
+ sys::Path ArchivePath;
+ if (!ArchivePath.set(ArchiveName)) {
+ errs() << argv[0] << ": " << "Archive name invalid: " << ArchiveName <<
+ "\n";
+ return 1;
+ }
- // Make sure it exists, we don't create empty archives
- bool Exists;
- if (llvm::sys::fs::exists(ArchivePath.str(), Exists) || !Exists)
- throw std::string("Archive file does not exist");
+ // Make sure it exists, we don't create empty archives
+ bool Exists;
+ if (llvm::sys::fs::exists(ArchivePath.str(), Exists) || !Exists) {
+    errs() << argv[0] << ": " << "Archive file does not exist: " <<
+ ArchivePath.str() << "\n";
+ return 1;
+ }
- std::string err_msg;
- std::auto_ptr<Archive>
- AutoArchive(Archive::OpenAndLoad(ArchivePath, Context, &err_msg));
- Archive* TheArchive = AutoArchive.get();
- if (!TheArchive)
- throw err_msg;
+ std::string err_msg;
+ std::auto_ptr<Archive>
+ AutoArchive(Archive::OpenAndLoad(ArchivePath, Context, &err_msg));
+ Archive* TheArchive = AutoArchive.get();
+ if (!TheArchive) {
+ errs() << argv[0] << ": " << err_msg << "\n";
+ return 1;
+ }
- if (TheArchive->writeToDisk(true, false, &err_msg ))
- throw err_msg;
+ if (TheArchive->writeToDisk(true, false, &err_msg )) {
+ errs() << argv[0] << ": " << err_msg << "\n";
+ return 1;
+ }
- if (Verbose)
- printSymbolTable(TheArchive);
+ if (Verbose)
+ printSymbolTable(TheArchive);
- } catch (const char* msg) {
- errs() << argv[0] << ": " << msg << "\n\n";
- exitCode = 1;
- } catch (const std::string& msg) {
- errs() << argv[0] << ": " << msg << "\n";
- exitCode = 2;
- } catch (...) {
- errs() << argv[0] << ": An unexpected unknown exception occurred.\n";
- exitCode = 3;
- }
return exitCode;
}
diff --git a/tools/llvm-rtdyld/llvm-rtdyld.cpp b/tools/llvm-rtdyld/llvm-rtdyld.cpp
index 95de8d8a4d53..7b5bd0388d88 100644
--- a/tools/llvm-rtdyld/llvm-rtdyld.cpp
+++ b/tools/llvm-rtdyld/llvm-rtdyld.cpp
@@ -14,6 +14,8 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/OwningPtr.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
+#include "llvm/ExecutionEngine/ObjectImage.h"
+#include "llvm/ExecutionEngine/ObjectBuffer.h"
#include "llvm/Object/MachOObject.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ManagedStatic.h"
@@ -120,12 +122,14 @@ static int executeInput() {
for(unsigned i = 0, e = InputFileList.size(); i != e; ++i) {
// Load the input memory buffer.
OwningPtr<MemoryBuffer> InputBuffer;
+ OwningPtr<ObjectImage> LoadedObject;
if (error_code ec = MemoryBuffer::getFileOrSTDIN(InputFileList[i],
InputBuffer))
return Error("unable to read input: '" + ec.message() + "'");
- // Load the object file into it.
- if (Dyld.loadObject(InputBuffer.take())) {
+ // Load the object file
+ LoadedObject.reset(Dyld.loadObject(new ObjectBuffer(InputBuffer.take())));
+ if (!LoadedObject) {
return Error(Dyld.getErrorString());
}
}
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 31252dd7f777..8473d94731a5 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -126,6 +126,10 @@ public:
/// C'tor
Modifier(BasicBlock *Block, PieceTable *PT, Random *R):
BB(Block),PT(PT),Ran(R),Context(BB->getContext()) {}
+
+ /// virtual D'tor to silence warnings.
+ virtual ~Modifier() {}
+
/// Add a new instruction.
virtual void Act() = 0;
/// Add N new instructions,
diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp
index b80bc34a231d..b1c4f437ffbb 100644
--- a/tools/lto/LTOCodeGenerator.cpp
+++ b/tools/lto/LTOCodeGenerator.cpp
@@ -15,6 +15,7 @@
#include "LTOCodeGenerator.h"
#include "LTOModule.h"
#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Linker.h"
#include "llvm/LLVMContext.h"
@@ -29,7 +30,6 @@
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/Transforms/IPO.h"
@@ -163,13 +163,16 @@ bool LTOCodeGenerator::compile_to_file(const char** name, std::string& errMsg) {
// generate object file
bool genResult = false;
tool_output_file objFile(uniqueObjPath.c_str(), errMsg);
- if (!errMsg.empty())
+ if (!errMsg.empty()) {
+ uniqueObjPath.eraseFromDisk();
return true;
+ }
genResult = this->generateObjectFile(objFile.os(), errMsg);
objFile.os().close();
if (objFile.os().has_error()) {
objFile.os().clear_error();
+ uniqueObjPath.eraseFromDisk();
return true;
}
@@ -196,6 +199,7 @@ const void* LTOCodeGenerator::compile(size_t* length, std::string& errMsg) {
OwningPtr<MemoryBuffer> BuffPtr;
if (error_code ec = MemoryBuffer::getFile(name, BuffPtr, -1, false)) {
errMsg = ec.message();
+ sys::Path(_nativeObjectPath).eraseFromDisk();
return NULL;
}
_nativeObjectFile = BuffPtr.take();
@@ -214,12 +218,13 @@ bool LTOCodeGenerator::determineTarget(std::string& errMsg) {
if (_target != NULL)
return false;
- std::string Triple = _linker.getModule()->getTargetTriple();
- if (Triple.empty())
- Triple = sys::getDefaultTargetTriple();
+ std::string TripleStr = _linker.getModule()->getTargetTriple();
+ if (TripleStr.empty())
+ TripleStr = sys::getDefaultTargetTriple();
+ llvm::Triple Triple(TripleStr);
// create target machine from info for merged modules
- const Target *march = TargetRegistry::lookupTarget(Triple, errMsg);
+ const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
if (march == NULL)
return true;
@@ -240,11 +245,18 @@ bool LTOCodeGenerator::determineTarget(std::string& errMsg) {
// construct LTOModule, hand over ownership of module and target
SubtargetFeatures Features;
- Features.getDefaultSubtargetFeatures(llvm::Triple(Triple));
+ Features.getDefaultSubtargetFeatures(Triple);
std::string FeatureStr = Features.getString();
+ // Set a default CPU for Darwin triples.
+ if (_mCpu.empty() && Triple.isOSDarwin()) {
+ if (Triple.getArch() == llvm::Triple::x86_64)
+ _mCpu = "core2";
+ else if (Triple.getArch() == llvm::Triple::x86)
+ _mCpu = "yonah";
+ }
TargetOptions Options;
LTOModule::getTargetOptions(Options);
- _target = march->createTargetMachine(Triple, _mCpu, FeatureStr, Options,
+ _target = march->createTargetMachine(TripleStr, _mCpu, FeatureStr, Options,
RelocModel, CodeModel::Default,
CodeGenOpt::Aggressive);
return false;
@@ -289,7 +301,7 @@ void LTOCodeGenerator::applyScopeRestrictions() {
// mark which symbols can not be internalized
MCContext Context(*_target->getMCAsmInfo(), *_target->getRegisterInfo(),NULL);
- Mangler mangler(Context, *_target->getTargetData());
+ Mangler mangler(Context, *_target->getDataLayout());
std::vector<const char*> mustPreserveList;
SmallPtrSet<GlobalValue*, 8> asmUsed;
@@ -357,8 +369,10 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
// Start off with a verification pass.
passes.add(createVerifierPass());
- // Add an appropriate TargetData instance for this module...
- passes.add(new TargetData(*_target->getTargetData()));
+ // Add an appropriate DataLayout instance for this module...
+ passes.add(new DataLayout(*_target->getDataLayout()));
+ passes.add(new TargetTransformInfo(_target->getScalarTargetTransformInfo(),
+ _target->getVectorTargetTransformInfo()));
// Enabling internalize here would use its AllButMain variant. It
// keeps only main if it exists and does nothing for libraries. Instead
@@ -372,7 +386,7 @@ bool LTOCodeGenerator::generateObjectFile(raw_ostream &out,
FunctionPassManager *codeGenPasses = new FunctionPassManager(mergedModule);
- codeGenPasses->add(new TargetData(*_target->getTargetData()));
+ codeGenPasses->add(new DataLayout(*_target->getDataLayout()));
formatted_raw_ostream Out(out);
diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp
index c5b3d10db71b..ffdcbe644c81 100644
--- a/tools/lto/LTOModule.cpp
+++ b/tools/lto/LTOModule.cpp
@@ -150,15 +150,20 @@ UseInitArray("use-init-array",
cl::desc("Use .init_array instead of .ctors."),
cl::init(false));
+static cl::opt<unsigned>
+SSPBufferSize("stack-protector-buffer-size", cl::init(8),
+ cl::desc("Lower bound for a buffer to be considered for "
+ "stack protection"));
+
LTOModule::LTOModule(llvm::Module *m, llvm::TargetMachine *t)
: _module(m), _target(t),
_context(*_target->getMCAsmInfo(), *_target->getRegisterInfo(), NULL),
- _mangler(_context, *_target->getTargetData()) {}
+ _mangler(_context, *_target->getDataLayout()) {}
/// isBitcodeFile - Returns 'true' if the file (or memory contents) is LLVM
/// bitcode.
bool LTOModule::isBitcodeFile(const void *mem, size_t length) {
- return llvm::sys::IdentifyFileType((char*)mem, length)
+ return llvm::sys::IdentifyFileType((const char*)mem, length)
== llvm::sys::Bitcode_FileType;
}
@@ -252,6 +257,7 @@ void LTOModule::getTargetOptions(TargetOptions &Options) {
Options.PositionIndependentExecutable = EnablePIE;
Options.EnableSegmentedStacks = SegmentedStacks;
Options.UseInitArray = UseInitArray;
+ Options.SSPBufferSize = SSPBufferSize;
}
LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
@@ -272,23 +278,31 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
return NULL;
}
- std::string Triple = m->getTargetTriple();
- if (Triple.empty())
- Triple = sys::getDefaultTargetTriple();
+ std::string TripleStr = m->getTargetTriple();
+ if (TripleStr.empty())
+ TripleStr = sys::getDefaultTargetTriple();
+ llvm::Triple Triple(TripleStr);
// find machine architecture for this module
- const Target *march = TargetRegistry::lookupTarget(Triple, errMsg);
+ const Target *march = TargetRegistry::lookupTarget(TripleStr, errMsg);
if (!march)
return NULL;
// construct LTOModule, hand over ownership of module and target
SubtargetFeatures Features;
- Features.getDefaultSubtargetFeatures(llvm::Triple(Triple));
+ Features.getDefaultSubtargetFeatures(Triple);
std::string FeatureStr = Features.getString();
+ // Set a default CPU for Darwin triples.
std::string CPU;
+ if (Triple.isOSDarwin()) {
+ if (Triple.getArch() == llvm::Triple::x86_64)
+ CPU = "core2";
+ else if (Triple.getArch() == llvm::Triple::x86)
+ CPU = "yonah";
+ }
TargetOptions Options;
getTargetOptions(Options);
- TargetMachine *target = march->createTargetMachine(Triple, CPU, FeatureStr,
+ TargetMachine *target = march->createTargetMachine(TripleStr, CPU, FeatureStr,
Options);
LTOModule *Ret = new LTOModule(m.take(), target);
if (Ret->parseSymbols(errMsg)) {
@@ -301,7 +315,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer,
/// makeBuffer - Create a MemoryBuffer from a memory range.
MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) {
- const char *startPtr = (char*)mem;
+ const char *startPtr = (const char*)mem;
return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false);
}
@@ -487,8 +501,7 @@ void LTOModule::addDefinedSymbol(GlobalValue *def, bool isFunction) {
// set definition part
if (def->hasWeakLinkage() || def->hasLinkOnceLinkage() ||
- def->hasLinkerPrivateWeakLinkage() ||
- def->hasLinkerPrivateWeakDefAutoLinkage())
+ def->hasLinkerPrivateWeakLinkage())
attr |= LTO_SYMBOL_DEFINITION_WEAK;
else if (def->hasCommonLinkage())
attr |= LTO_SYMBOL_DEFINITION_TENTATIVE;
@@ -504,7 +517,7 @@ void LTOModule::addDefinedSymbol(GlobalValue *def, bool isFunction) {
def->hasLinkOnceLinkage() || def->hasCommonLinkage() ||
def->hasLinkerPrivateWeakLinkage())
attr |= LTO_SYMBOL_SCOPE_DEFAULT;
- else if (def->hasLinkerPrivateWeakDefAutoLinkage())
+ else if (def->hasLinkOnceODRAutoHideLinkage())
attr |= LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN;
else
attr |= LTO_SYMBOL_SCOPE_INTERNAL;
diff --git a/tools/lto/Makefile b/tools/lto/Makefile
index 153fa031378d..3610fed03bac 100644
--- a/tools/lto/Makefile
+++ b/tools/lto/Makefile
@@ -49,4 +49,11 @@ ifeq ($(HOST_OS),Darwin)
-Wl,-install_name \
-Wl,"@executable_path/../lib/lib$(LIBRARYNAME)$(SHLIBEXT)"
endif
+
+ # If we're doing an Apple-style build, add the LTO object path.
+ ifeq ($(RC_BUILDIT),YES)
+ TempFile := $(shell mkdir -p ${OBJROOT}/dSYMs ; mktemp ${OBJROOT}/dSYMs/llvm-lto.XXXXXX)
+ LLVMLibsOptions := $(LLVMLibsOptions) \
+ -Wl,-object_path_lto -Wl,$(TempFile)
+ endif
endif
diff --git a/tools/lto/lto.exports b/tools/lto/lto.exports
index b900bfb594b1..4940bb147efa 100644
--- a/tools/lto/lto.exports
+++ b/tools/lto/lto.exports
@@ -30,3 +30,4 @@ lto_codegen_compile_to_file
LLVMCreateDisasm
LLVMDisasmDispose
LLVMDisasmInstruction
+LLVMSetDisasmOptions
diff --git a/tools/opt/CMakeLists.txt b/tools/opt/CMakeLists.txt
index 7daf22aa9e3e..32de6d406088 100644
--- a/tools/opt/CMakeLists.txt
+++ b/tools/opt/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize)
+set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize)
add_llvm_tool(opt
AnalysisWrappers.cpp
diff --git a/tools/opt/LLVMBuild.txt b/tools/opt/LLVMBuild.txt
index 4de99f51c885..b174431e042a 100644
--- a/tools/opt/LLVMBuild.txt
+++ b/tools/opt/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Tool
name = opt
parent = Tools
-required_libraries = AsmParser BitReader BitWriter IPO Instrumentation Scalar
+required_libraries = AsmParser BitReader BitWriter IPO Instrumentation Scalar all-targets
diff --git a/tools/opt/Makefile b/tools/opt/Makefile
index 16d116da5dbd..ee7e1cf796a0 100644
--- a/tools/opt/Makefile
+++ b/tools/opt/Makefile
@@ -9,6 +9,6 @@
LEVEL := ../..
TOOLNAME := opt
-LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize
+LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize all-targets
include $(LEVEL)/Makefile.common
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 4ada7d1e76d4..bac0d4694799 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -13,17 +13,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/LLVMContext.h"
+#include "llvm/DataLayout.h"
#include "llvm/DebugInfo.h"
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/CallGraphSCCPass.h"
+#include "llvm/CodeGen/CommandFlags.h"
#include "llvm/Bitcode/ReaderWriter.h"
#include "llvm/Assembly/PrintModulePass.h"
#include "llvm/Analysis/Verifier.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/RegionPass.h"
#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Target/TargetData.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/ADT/StringSet.h"
@@ -36,7 +37,10 @@
#include "llvm/Support/PluginLoader.h"
#include "llvm/Support/PrettyStackTrace.h"
#include "llvm/Support/SystemUtils.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/MC/SubtargetFeature.h"
#include "llvm/LinkAllPasses.h"
#include "llvm/LinkAllVMCore.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
@@ -478,6 +482,75 @@ static void AddStandardLinkPasses(PassManagerBase &PM) {
/*RunInliner=*/ !DisableInline);
}
+//===----------------------------------------------------------------------===//
+// CodeGen-related helper functions.
+//
+static TargetOptions GetTargetOptions() {
+ TargetOptions Options;
+ Options.LessPreciseFPMADOption = EnableFPMAD;
+ Options.NoFramePointerElim = DisableFPElim;
+ Options.NoFramePointerElimNonLeaf = DisableFPElimNonLeaf;
+ Options.AllowFPOpFusion = FuseFPOps;
+ Options.UnsafeFPMath = EnableUnsafeFPMath;
+ Options.NoInfsFPMath = EnableNoInfsFPMath;
+ Options.NoNaNsFPMath = EnableNoNaNsFPMath;
+ Options.HonorSignDependentRoundingFPMathOption =
+ EnableHonorSignDependentRoundingFPMath;
+ Options.UseSoftFloat = GenerateSoftFloatCalls;
+ if (FloatABIForCalls != FloatABI::Default)
+ Options.FloatABIType = FloatABIForCalls;
+ Options.NoZerosInBSS = DontPlaceZerosInBSS;
+ Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
+ Options.DisableTailCalls = DisableTailCalls;
+ Options.StackAlignmentOverride = OverrideStackAlignment;
+ Options.RealignStack = EnableRealignStack;
+ Options.TrapFuncName = TrapFuncName;
+ Options.PositionIndependentExecutable = EnablePIE;
+ Options.EnableSegmentedStacks = SegmentedStacks;
+ Options.UseInitArray = UseInitArray;
+ Options.SSPBufferSize = SSPBufferSize;
+ return Options;
+}
+
+CodeGenOpt::Level GetCodeGenOptLevel() {
+ if (OptLevelO1)
+ return CodeGenOpt::Less;
+ if (OptLevelO2)
+ return CodeGenOpt::Default;
+ if (OptLevelO3)
+ return CodeGenOpt::Aggressive;
+ return CodeGenOpt::None;
+}
+
+// Returns the TargetMachine instance or zero if no triple is provided.
+static TargetMachine* GetTargetMachine(std::string TripleStr) {
+ if (TripleStr.empty())
+ return 0;
+
+ // Get the target specific parser.
+ std::string Error;
+ Triple TheTriple(Triple::normalize(TargetTriple));
+
+ const Target *TheTarget = TargetRegistry::lookupTarget(MArch, TheTriple,
+ Error);
+ if (!TheTarget) {
+ return 0;
+ }
+
+ // Package up features to be passed to target/subtarget
+ std::string FeaturesStr;
+ if (MAttrs.size()) {
+ SubtargetFeatures Features;
+ for (unsigned i = 0; i != MAttrs.size(); ++i)
+ Features.AddFeature(MAttrs[i]);
+ FeaturesStr = Features.getString();
+ }
+
+ return TheTarget->createTargetMachine(TheTriple.getTriple(),
+ MCPU, FeaturesStr, GetTargetOptions(),
+ RelocModel, CMModel,
+ GetCodeGenOptLevel());
+}
//===----------------------------------------------------------------------===//
// main for opt
@@ -492,6 +565,9 @@ int main(int argc, char **argv) {
llvm_shutdown_obj Y; // Call llvm_shutdown() on exit.
LLVMContext &Context = getGlobalContext();
+ InitializeAllTargets();
+ InitializeAllTargetMCs();
+
// Initialize passes
PassRegistry &Registry = *PassRegistry::getPassRegistry();
initializeCore(Registry);
@@ -513,10 +589,6 @@ int main(int argc, char **argv) {
return 1;
}
- // Allocate a full target machine description only if necessary.
- // FIXME: The choice of target should be controllable on the command line.
- std::auto_ptr<TargetMachine> target;
-
SMDiagnostic Err;
// Load the input module...
@@ -572,22 +644,28 @@ int main(int argc, char **argv) {
TLI->disableAllFunctions();
Passes.add(TLI);
- // Add an appropriate TargetData instance for this module.
- TargetData *TD = 0;
+ // Add an appropriate DataLayout instance for this module.
+ DataLayout *TD = 0;
const std::string &ModuleDataLayout = M.get()->getDataLayout();
if (!ModuleDataLayout.empty())
- TD = new TargetData(ModuleDataLayout);
+ TD = new DataLayout(ModuleDataLayout);
else if (!DefaultDataLayout.empty())
- TD = new TargetData(DefaultDataLayout);
+ TD = new DataLayout(DefaultDataLayout);
if (TD)
Passes.add(TD);
+ std::auto_ptr<TargetMachine> TM(GetTargetMachine(TargetTriple));
+ if (TM.get()) {
+ Passes.add(new TargetTransformInfo(TM->getScalarTargetTransformInfo(),
+ TM->getVectorTargetTransformInfo()));
+ }
+
OwningPtr<FunctionPassManager> FPasses;
if (OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || OptLevelO3) {
FPasses.reset(new FunctionPassManager(M.get()));
if (TD)
- FPasses->add(new TargetData(*TD));
+ FPasses->add(new DataLayout(*TD));
}
if (PrintBreakpoints) {
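
opt now initializes the targets and builds a TargetMachine so the new TargetTransformInfo pass can expose target costs to the optimizer. The sketch below shows the lookup-and-create sequence GetTargetMachine performs, using the calls visible in the hunk; the default TargetOptions, relocation and code models stand in for the command-line-driven values opt actually passes.

    #include "llvm/ADT/Triple.h"
    #include "llvm/Support/TargetRegistry.h"
    #include "llvm/Support/TargetSelect.h"
    #include "llvm/Target/TargetMachine.h"
    #include "llvm/Target/TargetOptions.h"
    #include <string>
    using namespace llvm;

    // Create a TargetMachine for TripleStr, or return 0 if the target is
    // unknown. CPU and Features may be empty, as in opt's defaults.
    static TargetMachine *createTM(const std::string &TripleStr,
                                   const std::string &CPU,
                                   const std::string &Features) {
      InitializeAllTargets();
      InitializeAllTargetMCs();

      Triple TheTriple(Triple::normalize(TripleStr));
      std::string Error;
      const Target *TheTarget =
          TargetRegistry::lookupTarget(/*ArchName=*/"", TheTriple, Error);
      if (!TheTarget)
        return 0;

      return TheTarget->createTargetMachine(TheTriple.getTriple(), CPU, Features,
                                            TargetOptions(), Reloc::Default,
                                            CodeModel::Default,
                                            CodeGenOpt::Default);
    }
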
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 00b62feaeb15..117b8204b9ed 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -635,6 +635,12 @@ TEST(APFloatTest, exactInverse) {
EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(0.5)));
EXPECT_TRUE(APFloat(2.0f).getExactInverse(&inv));
EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(0.5f)));
+ EXPECT_TRUE(APFloat(APFloat::IEEEquad, "2.0").getExactInverse(&inv));
+ EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(APFloat::IEEEquad, "0.5")));
+ EXPECT_TRUE(APFloat(APFloat::PPCDoubleDouble, "2.0").getExactInverse(&inv));
+ EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(APFloat::PPCDoubleDouble, "0.5")));
+ EXPECT_TRUE(APFloat(APFloat::x87DoubleExtended, "2.0").getExactInverse(&inv));
+ EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(APFloat::x87DoubleExtended, "0.5")));
// FLT_MIN
EXPECT_TRUE(APFloat(1.17549435e-38f).getExactInverse(&inv));
@@ -689,6 +695,23 @@ TEST(APFloatTest, roundToIntegral) {
P = R;
P.roundToIntegral(APFloat::rmNearestTiesToEven);
EXPECT_EQ(R.convertToDouble(), P.convertToDouble());
+
+ P = APFloat::getZero(APFloat::IEEEdouble);
+ P.roundToIntegral(APFloat::rmTowardZero);
+ EXPECT_EQ(0.0, P.convertToDouble());
+ P = APFloat::getZero(APFloat::IEEEdouble, true);
+ P.roundToIntegral(APFloat::rmTowardZero);
+ EXPECT_EQ(-0.0, P.convertToDouble());
+ P = APFloat::getNaN(APFloat::IEEEdouble);
+ P.roundToIntegral(APFloat::rmTowardZero);
+ EXPECT_TRUE(IsNAN(P.convertToDouble()));
+ P = APFloat::getInf(APFloat::IEEEdouble);
+ P.roundToIntegral(APFloat::rmTowardZero);
+ EXPECT_TRUE(IsInf(P.convertToDouble()) && P.convertToDouble() > 0.0);
+ P = APFloat::getInf(APFloat::IEEEdouble, true);
+ P.roundToIntegral(APFloat::rmTowardZero);
+ EXPECT_TRUE(IsInf(P.convertToDouble()) && P.convertToDouble() < 0.0);
+
}
TEST(APFloatTest, getLargest) {
@@ -720,4 +743,40 @@ TEST(APFloatTest, convert) {
EXPECT_EQ(4294967295.0, test.convertToDouble());
EXPECT_FALSE(losesInfo);
}
+
+TEST(APFloatTest, PPCDoubleDouble) {
+ APFloat test(APFloat::PPCDoubleDouble, "1.0");
+ EXPECT_EQ(0x3ff0000000000000ull, test.bitcastToAPInt().getRawData()[0]);
+ EXPECT_EQ(0x0000000000000000ull, test.bitcastToAPInt().getRawData()[1]);
+
+ test.divide(APFloat(APFloat::PPCDoubleDouble, "3.0"), APFloat::rmNearestTiesToEven);
+ EXPECT_EQ(0x3fd5555555555555ull, test.bitcastToAPInt().getRawData()[0]);
+ EXPECT_EQ(0x3c75555555555556ull, test.bitcastToAPInt().getRawData()[1]);
+
+ // LDBL_MAX
+ test = APFloat(APFloat::PPCDoubleDouble, "1.79769313486231580793728971405301e+308");
+ EXPECT_EQ(0x7fefffffffffffffull, test.bitcastToAPInt().getRawData()[0]);
+ EXPECT_EQ(0x7c8ffffffffffffeull, test.bitcastToAPInt().getRawData()[1]);
+
+ // LDBL_MIN
+ test = APFloat(APFloat::PPCDoubleDouble, "2.00416836000897277799610805135016e-292");
+ EXPECT_EQ(0x0360000000000000ull, test.bitcastToAPInt().getRawData()[0]);
+ EXPECT_EQ(0x0000000000000000ull, test.bitcastToAPInt().getRawData()[1]);
+
+ test = APFloat(APFloat::PPCDoubleDouble, "1.0");
+ test.add(APFloat(APFloat::PPCDoubleDouble, "0x1p-105"), APFloat::rmNearestTiesToEven);
+ EXPECT_EQ(0x3ff0000000000000ull, test.bitcastToAPInt().getRawData()[0]);
+ EXPECT_EQ(0x3960000000000000ull, test.bitcastToAPInt().getRawData()[1]);
+
+ test = APFloat(APFloat::PPCDoubleDouble, "1.0");
+ test.add(APFloat(APFloat::PPCDoubleDouble, "0x1p-106"), APFloat::rmNearestTiesToEven);
+ EXPECT_EQ(0x3ff0000000000000ull, test.bitcastToAPInt().getRawData()[0]);
+#if 0 // XFAIL
+ // This is what we would expect with a true double-double implementation
+ EXPECT_EQ(0x3950000000000000ull, test.bitcastToAPInt().getRawData()[1]);
+#else
+ // This is what we get with our 106-bit mantissa approximation
+ EXPECT_EQ(0x0000000000000000ull, test.bitcastToAPInt().getRawData()[1]);
+#endif
+}
}
diff --git a/unittests/ADT/BitVectorTest.cpp b/unittests/ADT/BitVectorTest.cpp
index d836036aeaea..dc298a83d571 100644
--- a/unittests/ADT/BitVectorTest.cpp
+++ b/unittests/ADT/BitVectorTest.cpp
@@ -281,5 +281,57 @@ TYPED_TEST(BitVectorTest, BinOps) {
EXPECT_FALSE(A.anyCommon(B));
EXPECT_FALSE(B.anyCommon(A));
}
+
+TYPED_TEST(BitVectorTest, RangeOps) {
+ TypeParam A;
+ A.resize(256);
+ A.reset();
+ A.set(1, 255);
+
+ EXPECT_FALSE(A.test(0));
+ EXPECT_TRUE( A.test(1));
+ EXPECT_TRUE( A.test(23));
+ EXPECT_TRUE( A.test(254));
+ EXPECT_FALSE(A.test(255));
+
+ TypeParam B;
+ B.resize(256);
+ B.set();
+ B.reset(1, 255);
+
+ EXPECT_TRUE( B.test(0));
+ EXPECT_FALSE(B.test(1));
+ EXPECT_FALSE(B.test(23));
+ EXPECT_FALSE(B.test(254));
+ EXPECT_TRUE( B.test(255));
+
+ TypeParam C;
+ C.resize(3);
+ C.reset();
+ C.set(0, 1);
+
+ EXPECT_TRUE(C.test(0));
+ EXPECT_FALSE( C.test(1));
+ EXPECT_FALSE( C.test(2));
+
+ TypeParam D;
+ D.resize(3);
+ D.set();
+ D.reset(0, 1);
+
+ EXPECT_FALSE(D.test(0));
+ EXPECT_TRUE( D.test(1));
+ EXPECT_TRUE( D.test(2));
+
+ TypeParam E;
+ E.resize(128);
+ E.reset();
+ E.set(1, 33);
+
+ EXPECT_FALSE(E.test(0));
+ EXPECT_TRUE( E.test(1));
+ EXPECT_TRUE( E.test(32));
+ EXPECT_FALSE(E.test(33));
+}
}
#endif
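
The RangeOps test above pins down the semantics of BitVector's range set/reset: the interval is half-open, so set(1, 255) touches bits 1 through 254 and leaves bits 0 and 255 untouched. A tiny usage sketch under that reading:

    #include "llvm/ADT/BitVector.h"
    #include <cassert>
    using namespace llvm;

    int main() {
      BitVector V(256);       // 256 bits, all initially clear
      V.set(1, 255);          // half-open range: sets bits [1, 255)
      assert(!V.test(0) && V.test(1) && V.test(254) && !V.test(255));

      V.set();                // set every bit
      V.reset(1, 255);        // clear bits [1, 255)
      assert(V.test(0) && !V.test(1) && !V.test(254) && V.test(255));
      return 0;
    }
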
diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
index d272b09088f5..94f7fda2a9e3 100644
--- a/unittests/ADT/CMakeLists.txt
+++ b/unittests/ADT/CMakeLists.txt
@@ -2,7 +2,7 @@ set(LLVM_LINK_COMPONENTS
Support
)
-add_llvm_unittest(ADTTests
+set(ADTSources
APFloatTest.cpp
APIntTest.cpp
BitVectorTest.cpp
@@ -13,6 +13,7 @@ add_llvm_unittest(ADTTests
FoldingSet.cpp
HashingTest.cpp
ilistTest.cpp
+ ImmutableMapTest.cpp
ImmutableSetTest.cpp
IntEqClassesTest.cpp
IntervalMapTest.cpp
@@ -31,3 +32,16 @@ add_llvm_unittest(ADTTests
TwineTest.cpp
VariadicFunctionTest.cpp
)
+
+# These files cannot be compiled with MSVC9 due to a compiler bug.
+if(MSVC AND MSVC_VERSION LESS 1600)
+ set(LLVM_OPTIONAL_SOURCES
+ DenseMapTest.cpp
+ SmallVectorTest.cpp
+ )
+ list(REMOVE_ITEM ADTSources ${LLVM_OPTIONAL_SOURCES})
+endif()
+
+add_llvm_unittest(ADTTests
+ ${ADTSources}
+ )
diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index 75e7006434a0..15eb6988f669 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp
@@ -330,4 +330,37 @@ TEST(DenseMapCustomTest, FindAsTest) {
EXPECT_TRUE(map.find_as("d") == map.end());
}
+struct ContiguousDenseMapInfo {
+ static inline unsigned getEmptyKey() { return ~0; }
+ static inline unsigned getTombstoneKey() { return ~0U - 1; }
+ static unsigned getHashValue(const unsigned& Val) { return Val; }
+ static bool isEqual(const unsigned& LHS, const unsigned& RHS) {
+ return LHS == RHS;
+ }
+};
+
+// Test that filling a small dense map up to its inline capacity (including
+// tombstones) still grows it so that an empty bucket remains.
+TEST(DenseMapCustomTest, SmallDenseMapGrowTest) {
+ SmallDenseMap<unsigned, unsigned, 32, ContiguousDenseMapInfo> map;
+ // Add some number of elements, then delete a few to leave us some tombstones.
+  // If we just filled the map with 32 elements we'd grow because of not enough
+  // tombstones, which would mask the issue here.
+ for (unsigned i = 0; i < 20; ++i)
+ map[i] = i + 1;
+ for (unsigned i = 0; i < 10; ++i)
+ map.erase(i);
+ for (unsigned i = 20; i < 32; ++i)
+ map[i] = i + 1;
+
+ // Size tests
+ EXPECT_EQ(22u, map.size());
+
+ // Try to find an element which doesn't exist. There was a bug in
+ // SmallDenseMap which led to a map with num elements == small capacity not
+ // having an empty bucket any more. Finding an element not in the map would
+ // therefore never terminate.
+ EXPECT_TRUE(map.find(32) == map.end());
+}
+
}
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 7a35f521a196..ada5f6db83af 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "gtest/gtest.h"
-#include <llvm/ADT/DenseSet.h>
+#include "llvm/ADT/DenseSet.h"
using namespace llvm;
diff --git a/unittests/ADT/ImmutableMapTest.cpp b/unittests/ADT/ImmutableMapTest.cpp
new file mode 100644
index 000000000000..774581ca4eeb
--- /dev/null
+++ b/unittests/ADT/ImmutableMapTest.cpp
@@ -0,0 +1,50 @@
+//===----------- ImmutableMapTest.cpp - ImmutableMap unit tests ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/ADT/ImmutableMap.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(ImmutableMapTest, EmptyIntMapTest) {
+ ImmutableMap<int, int>::Factory f;
+
+ EXPECT_TRUE(f.getEmptyMap() == f.getEmptyMap());
+ EXPECT_FALSE(f.getEmptyMap() != f.getEmptyMap());
+ EXPECT_TRUE(f.getEmptyMap().isEmpty());
+
+ ImmutableMap<int, int> S = f.getEmptyMap();
+ EXPECT_EQ(0u, S.getHeight());
+ EXPECT_TRUE(S.begin() == S.end());
+ EXPECT_FALSE(S.begin() != S.end());
+}
+
+TEST(ImmutableMapTest, MultiElemIntMapTest) {
+ ImmutableMap<int, int>::Factory f;
+ ImmutableMap<int, int> S = f.getEmptyMap();
+
+ ImmutableMap<int, int> S2 = f.add(f.add(f.add(S, 3, 10), 4, 11), 5, 12);
+
+ EXPECT_TRUE(S.isEmpty());
+ EXPECT_FALSE(S2.isEmpty());
+
+ EXPECT_EQ(0, S.lookup(3));
+ EXPECT_EQ(0, S.lookup(9));
+
+ EXPECT_EQ(10, *S2.lookup(3));
+ EXPECT_EQ(11, *S2.lookup(4));
+ EXPECT_EQ(12, *S2.lookup(5));
+
+ EXPECT_EQ(5, S2.getMaxElement()->first);
+ EXPECT_EQ(3U, S2.getHeight());
+}
+
+}
diff --git a/unittests/ADT/StringRefTest.cpp b/unittests/ADT/StringRefTest.cpp
index 315eacbaa400..ead372f365d6 100644
--- a/unittests/ADT/StringRefTest.cpp
+++ b/unittests/ADT/StringRefTest.cpp
@@ -456,4 +456,27 @@ TEST(StringRefTest, getAsInteger) {
}
}
+
+static const char* BadStrings[] = {
+ "18446744073709551617" // value just over max
+ , "123456789012345678901" // value way too large
+ , "4t23v" // illegal decimal characters
+ , "0x123W56" // illegal hex characters
+ , "0b2" // illegal bin characters
+ , "08" // illegal oct characters
+ , "0o8" // illegal oct characters
+ , "-123" // negative unsigned value
+};
+
+
+TEST(StringRefTest, getAsUnsignedIntegerBadStrings) {
+ unsigned long long U64;
+ for (size_t i = 0; i < array_lengthof(BadStrings); ++i) {
+ bool IsBadNumber = StringRef(BadStrings[i]).getAsInteger(0, U64);
+ ASSERT_TRUE(IsBadNumber);
+ }
+}
+
+
+
} // end anonymous namespace
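
The new StringRefTest cases rely on getAsInteger returning true on failure and auto-detecting the radix when 0 is passed (the bad-string list covers hex, binary and octal prefixes as well as overflow). A short sketch of that convention; the literals are illustrative:

    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    int main() {
      unsigned long long V;
      // Radix 0 auto-detects 0x/0b/0(o) prefixes; returns true on failure.
      if (StringRef("0x123W56").getAsInteger(0, V))
        errs() << "rejected: illegal hex digits\n";
      if (!StringRef("18446744073709551615").getAsInteger(0, V))
        outs() << "parsed: " << V << "\n";
      return 0;
    }
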
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index 967437ca05a2..7c3ab9738940 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -105,6 +105,18 @@ TEST(TripleTest, ParsedIDs) {
EXPECT_EQ(Triple::Linux, T.getOS());
EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
+ T = Triple("powerpc-ibm-aix");
+ EXPECT_EQ(Triple::ppc, T.getArch());
+ EXPECT_EQ(Triple::IBM, T.getVendor());
+ EXPECT_EQ(Triple::AIX, T.getOS());
+ EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
+
+ T = Triple("powerpc64-ibm-aix");
+ EXPECT_EQ(Triple::ppc64, T.getArch());
+ EXPECT_EQ(Triple::IBM, T.getVendor());
+ EXPECT_EQ(Triple::AIX, T.getOS());
+ EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment());
+
T = Triple("powerpc-dunno-notsure");
EXPECT_EQ(Triple::ppc, T.getArch());
EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index ea5aeb38b01e..c30492a5f006 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -7,14 +7,14 @@
//
//===----------------------------------------------------------------------===//
-#include <llvm/Analysis/ScalarEvolutionExpressions.h>
-#include <llvm/Analysis/LoopInfo.h>
-#include <llvm/GlobalVariable.h>
-#include <llvm/Constants.h>
-#include <llvm/LLVMContext.h>
-#include <llvm/Module.h>
-#include <llvm/PassManager.h>
-#include <llvm/ADT/SmallVector.h>
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Constants.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
#include "gtest/gtest.h"
namespace llvm {
diff --git a/unittests/ExecutionEngine/CMakeLists.txt b/unittests/ExecutionEngine/CMakeLists.txt
index 5fffadd4ca0c..ed7f10a23c8a 100644
--- a/unittests/ExecutionEngine/CMakeLists.txt
+++ b/unittests/ExecutionEngine/CMakeLists.txt
@@ -7,3 +7,4 @@ add_llvm_unittest(ExecutionEngineTests
)
add_subdirectory(JIT)
+add_subdirectory(MCJIT)
diff --git a/unittests/ExecutionEngine/JIT/CMakeLists.txt b/unittests/ExecutionEngine/JIT/CMakeLists.txt
index d43d72de4095..11cf784e1e59 100644
--- a/unittests/ExecutionEngine/JIT/CMakeLists.txt
+++ b/unittests/ExecutionEngine/JIT/CMakeLists.txt
@@ -14,8 +14,6 @@ set(LLVM_OPTIONAL_SOURCES
)
if( LLVM_USE_INTEL_JITEVENTS )
- include_directories( ${LLVM_INTEL_JITEVENTS_INCDIR} )
- link_directories( ${LLVM_INTEL_JITEVENTS_LIBDIR} )
set(ProfileTestSources
IntelJITEventListenerTest.cpp
)
diff --git a/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp b/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp
index 8ed7a15be37c..d3f66a27e942 100644
--- a/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp
+++ b/unittests/ExecutionEngine/JIT/IntelJITEventListenerTest.cpp
@@ -11,7 +11,10 @@
using namespace llvm;
-#include "llvm/ExecutionEngine/IntelJITEventsWrapper.h"
+// Because we want to keep the implementation details of the Intel API used to
+// communicate with Amplifier out of the public header files, the header below
+// is included from the source tree instead.
+#include "../../../lib/ExecutionEngine/IntelJITEvents/IntelJITEventsWrapper.h"
#include <map>
#include <list>
@@ -80,7 +83,7 @@ public:
EXPECT_TRUE(0 != MockWrapper);
Listener.reset(JITEventListener::createIntelJITEventListener(
- MockWrapper.get()));
+ MockWrapper.take()));
EXPECT_TRUE(0 != Listener);
EE->RegisterJITEventListener(Listener.get());
}
diff --git a/unittests/ExecutionEngine/JIT/JITTest.cpp b/unittests/ExecutionEngine/JIT/JITTest.cpp
index 89f7e8e4229f..59604dfbf5cf 100644
--- a/unittests/ExecutionEngine/JIT/JITTest.cpp
+++ b/unittests/ExecutionEngine/JIT/JITTest.cpp
@@ -224,6 +224,9 @@ class JITTest : public testing::Test {
OwningPtr<ExecutionEngine> TheJIT;
};
+// Tests on ARM and PowerPC disabled as we're running the old jit
+#if !defined(__arm__) && !defined(__powerpc__)
+
// Regression test for a bug. The JIT used to allocate globals inside the same
// memory block used for the function, and when the function code was freed,
// the global was left in the same place. This test allocates a function
@@ -292,12 +295,14 @@ TEST(JIT, GlobalInFunction) {
EXPECT_EQ(3, *GPtr);
}
+#endif // !defined(__arm__) && !defined(__powerpc__)
+
int PlusOne(int arg) {
return arg + 1;
}
-// ARM tests disabled pending fix for PR10783.
-#if !defined(__arm__)
+// ARM and PowerPC tests disabled pending fix for PR10783.
+#if !defined(__arm__) && !defined(__powerpc__)
TEST_F(JITTest, FarCallToKnownFunction) {
// x86-64 can only make direct calls to functions within 32 bits of
// the current PC. To call anything farther away, we have to load
@@ -475,7 +480,7 @@ TEST_F(JITTest, ModuleDeletion) {
EXPECT_EQ(RJMM->startExceptionTableCalls.size(),
NumTablesDeallocated);
}
-#endif // !defined(__arm__)
+#endif // !defined(__arm__) && !defined(__powerpc__)
// ARM, MIPS and PPC still emit stubs for calls since the target may be
// too far away to call directly. This #if can probably be removed when
@@ -521,6 +526,9 @@ TEST_F(JITTest, NoStubs) {
}
#endif // !ARM && !PPC
+// Tests on ARM and PowerPC disabled as we're running the old jit
+#if !defined(__arm__) && !defined(__powerpc__)
+
TEST_F(JITTest, FunctionPointersOutliveTheirCreator) {
TheJIT->DisableLazyCompilation(true);
LoadAssembly("define i8()* @get_foo_addr() { "
@@ -555,10 +563,13 @@ TEST_F(JITTest, FunctionPointersOutliveTheirCreator) {
#endif
}
-// ARM does not have an implementation
+#endif //!defined(__arm__) && !defined(__powerpc__)
+
+// Tests on ARM and PowerPC disabled as we're running the old jit
+// In addition, ARM does not have an implementation
// of replaceMachineCodeForFunction(), so recompileAndRelinkFunction
// doesn't work.
-#if !defined(__arm__)
+#if !defined(__arm__) && !defined(__powerpc__)
TEST_F(JITTest, FunctionIsRecompiledAndRelinked) {
Function *F = Function::Create(TypeBuilder<int(void), false>::get(Context),
GlobalValue::ExternalLinkage, "test", M);
@@ -589,16 +600,19 @@ TEST_F(JITTest, FunctionIsRecompiledAndRelinked) {
EXPECT_EQ(2, OrigFPtr())
<< "The old pointer's target should now jump to the new version";
}
-#endif // !defined(__arm__)
+#endif // !defined(__arm__) && !defined(__powerpc__)
} // anonymous namespace
// This variable is intentionally defined differently in the statically-compiled
// program from the IR input to the JIT to assert that the JIT doesn't use its
// definition.
extern "C" int32_t JITTest_AvailableExternallyGlobal;
-int32_t JITTest_AvailableExternallyGlobal = 42;
+int32_t JITTest_AvailableExternallyGlobal LLVM_ATTRIBUTE_USED = 42;
namespace {
+// Tests on ARM and PowerPC disabled as we're running the old jit
+#if !defined(__arm__) && !defined(__powerpc__)
+
TEST_F(JITTest, AvailableExternallyGlobalIsntEmitted) {
TheJIT->DisableLazyCompilation(true);
LoadAssembly("@JITTest_AvailableExternallyGlobal = "
@@ -615,18 +629,19 @@ TEST_F(JITTest, AvailableExternallyGlobalIsntEmitted) {
EXPECT_EQ(42, loader()) << "func should return 42 from the external global,"
<< " not 7 from the IR version.";
}
-
+#endif //!defined(__arm__) && !defined(__powerpc__)
} // anonymous namespace
// This function is intentionally defined differently in the statically-compiled
// program from the IR input to the JIT to assert that the JIT doesn't use its
// definition.
+extern "C" int32_t JITTest_AvailableExternallyFunction() LLVM_ATTRIBUTE_USED;
extern "C" int32_t JITTest_AvailableExternallyFunction() {
return 42;
}
namespace {
-// ARM tests disabled pending fix for PR10783.
-#if !defined(__arm__)
+// ARM and PowerPC tests disabled pending fix for PR10783.
+#if !defined(__arm__) && !defined(__powerpc__)
TEST_F(JITTest, AvailableExternallyFunctionIsntCompiled) {
TheJIT->DisableLazyCompilation(true);
LoadAssembly("define available_externally i32 "
@@ -782,7 +797,7 @@ TEST(LazyLoadedJITTest, EagerCompiledRecursionThroughGhost) {
(intptr_t)TheJIT->getPointerToFunction(recur1IR));
EXPECT_EQ(3, recur1(4));
}
-#endif // !defined(__arm__)
+#endif // !defined(__arm__) && !defined(__powerpc__)
// This code is copied from JITEventListenerTest, but it only runs once for all
// the tests in this directory. Everything seems fine, but that's strange
diff --git a/unittests/ExecutionEngine/JIT/Makefile b/unittests/ExecutionEngine/JIT/Makefile
index b535a6b29605..9e0bb9ea5930 100644
--- a/unittests/ExecutionEngine/JIT/Makefile
+++ b/unittests/ExecutionEngine/JIT/Makefile
@@ -35,8 +35,15 @@ ifeq ($(USE_OPROFILE), 1)
LINK_COMPONENTS += oprofilejit
endif
+EXPORTED_SYMBOL_FILE = $(PROJ_OBJ_DIR)/JITTests.exports
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
# Permit these tests to use the JIT's symbolic lookup.
LD.Flags += $(RDYNAMIC)
+
+# Symbol exports are necessary (at least for now) when building with LTO.
+$(LLVMUnitTestExe): $(NativeExportsFile)
+$(PROJ_OBJ_DIR)/JITTests.exports: $(PROJ_SRC_DIR)/JITTests.def $(PROJ_OBJ_DIR)/.dir
+ tail -n +2 $< > $@
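+# ('tail -n +2' copies everything from the second line onward, stripping the
+# leading EXPORTS keyword of the .def file so that only symbol names remain.)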
+
diff --git a/unittests/ExecutionEngine/JIT/MultiJITTest.cpp b/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
index 5b99d5b676e2..4a22e2f641e7 100644
--- a/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
+++ b/unittests/ExecutionEngine/JIT/MultiJITTest.cpp
@@ -65,8 +65,8 @@ void createModule2(LLVMContext &Context2, Module *&M2, Function *&FooF2) {
FooF2 = M2->getFunction("foo2");
}
-// ARM tests disabled pending fix for PR10783.
-#if !defined(__arm__)
+// ARM and PowerPC tests disabled pending fix for PR10783.
+#if !defined(__arm__) && !defined(__powerpc__)
TEST(MultiJitTest, EagerMode) {
LLVMContext Context1;
@@ -176,6 +176,6 @@ TEST(MultiJitTest, JitPool) {
#endif
EXPECT_TRUE(sa == fa);
}
-#endif // !defined(__arm__)
+#endif // !defined(__arm__) && !defined(__powerpc__)
} // anonymous namespace
diff --git a/unittests/ExecutionEngine/MCJIT/CMakeLists.txt b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
new file mode 100644
index 000000000000..3e9c5b631e45
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_LINK_COMPONENTS
+ asmparser
+ bitreader
+ bitwriter
+ mcjit
+ jit
+ nativecodegen
+ )
+
+set(MCJITTestsSources
+ MCJITTest.cpp
+ SectionMemoryManager.cpp
+ )
+
+if(MSVC)
+ list(APPEND MCJITTestsSources MCJITTests.def)
+endif()
+
+add_llvm_unittest(MCJITTests
+ ${MCJITTestsSources}
+ )
+
+if(MINGW OR CYGWIN)
+ set_property(TARGET MCJITTests PROPERTY LINK_FLAGS -Wl,--export-all-symbols)
+endif()
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
new file mode 100644
index 000000000000..6b79a683bce0
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTest.cpp
@@ -0,0 +1,231 @@
+//===- MCJITTest.cpp - Unit tests for the MCJIT ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This test suite verifies basic MCJIT functionality such as making function
+// calls, using global variables, and compiling multiple modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/MCJIT.h"
+#include "MCJITTestBase.h"
+#include "SectionMemoryManager.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class MCJITTest : public testing::Test, public MCJITTestBase {
+protected:
+
+ virtual void SetUp() {
+ M.reset(createEmptyModule("<main>"));
+ }
+};
+
+namespace {
+
+// FIXME: In order to JIT an empty module, there needs to be
+// an interface to ExecutionEngine that forces compilation but
+// does not require retrieval of a pointer to a function/global.
+/*
+TEST_F(MCJITTest, empty_module) {
+ createJIT(M.take());
+ //EXPECT_NE(0, TheJIT->getObjectImage())
+ // << "Unable to generate executable loaded object image";
+}
+*/
+
+TEST_F(MCJITTest, global_variable) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ int initialValue = 5;
+ GlobalValue *Global = insertGlobalInt32(M.get(), "test_global", initialValue);
+ createJIT(M.take());
+ void *globalPtr = TheJIT->getPointerToGlobal(Global);
+ static_cast<SectionMemoryManager*>(MM)->invalidateInstructionCache();
+ EXPECT_TRUE(0 != globalPtr)
+ << "Unable to get pointer to global value from JIT";
+
+ EXPECT_EQ(initialValue, *(int32_t*)globalPtr)
+ << "Unexpected initial value of global";
+}
+
+TEST_F(MCJITTest, add_function) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ Function *F = insertAddFunction(M.get());
+ createJIT(M.take());
+ void *addPtr = TheJIT->getPointerToFunction(F);
+ static_cast<SectionMemoryManager*>(MM)->invalidateInstructionCache();
+ EXPECT_TRUE(0 != addPtr)
+ << "Unable to get pointer to function from JIT";
+
+ int (*AddPtrTy)(int, int) = (int(*)(int, int))(intptr_t)addPtr;
+ EXPECT_EQ(0, AddPtrTy(0, 0));
+ EXPECT_EQ(3, AddPtrTy(1, 2));
+ EXPECT_EQ(-5, AddPtrTy(-2, -3));
+}
+
+TEST_F(MCJITTest, run_main) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ int rc = 6;
+ Function *Main = insertMainFunction(M.get(), 6);
+ createJIT(M.take());
+ void *vPtr = TheJIT->getPointerToFunction(Main);
+ static_cast<SectionMemoryManager*>(MM)->invalidateInstructionCache();
+ EXPECT_TRUE(0 != vPtr)
+ << "Unable to get pointer to main() from JIT";
+
+ int (*FuncPtr)(void) = (int(*)(void))(intptr_t)vPtr;
+ int returnCode = FuncPtr();
+ EXPECT_EQ(returnCode, rc);
+}
+
+TEST_F(MCJITTest, return_global) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ int32_t initialNum = 7;
+ GlobalVariable *GV = insertGlobalInt32(M.get(), "myglob", initialNum);
+
+ Function *ReturnGlobal = startFunction<int32_t(void)>(M.get(),
+ "ReturnGlobal");
+ Value *ReadGlobal = Builder.CreateLoad(GV);
+ endFunctionWithRet(ReturnGlobal, ReadGlobal);
+
+ createJIT(M.take());
+ void *rgvPtr = TheJIT->getPointerToFunction(ReturnGlobal);
+ static_cast<SectionMemoryManager*>(MM)->invalidateInstructionCache();
+ EXPECT_TRUE(0 != rgvPtr);
+
+ int32_t(*FuncPtr)(void) = (int32_t(*)(void))(intptr_t)rgvPtr;
+ EXPECT_EQ(initialNum, FuncPtr())
+ << "Invalid value for global returned from JITted function";
+}
+
+// FIXME: This case fails due to a bug with getPointerToGlobal().
+// The bug is due to MCJIT not having an implementation of getPointerToGlobal()
+// which results in falling back on the ExecutionEngine implementation that
+// allocates a new memory block for the global instead of using the same
+// global variable that is emitted by MCJIT. Hence, the pointer (gvPtr below)
+// has the correct initial value, but updates to the real global (accessed by
+// JITted code) are not propagated. Instead, getPointerToGlobal() should return
+// a pointer into the loaded ObjectImage to reference the emitted global.
+/*
+TEST_F(MCJITTest, increment_global) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ int32_t initialNum = 5;
+ Function *IncrementGlobal = startFunction<int32_t(void)>(M.get(), "IncrementGlobal");
+ GlobalVariable *GV = insertGlobalInt32(M.get(), "my_global", initialNum);
+ Value *DerefGV = Builder.CreateLoad(GV);
+ Value *AddResult = Builder.CreateAdd(DerefGV,
+ ConstantInt::get(Context, APInt(32, 1)));
+ Builder.CreateStore(AddResult, GV);
+ endFunctionWithRet(IncrementGlobal, AddResult);
+
+ createJIT(M.take());
+ void *gvPtr = TheJIT->getPointerToGlobal(GV);
+ EXPECT_EQ(initialNum, *(int32_t*)gvPtr);
+
+ void *vPtr = TheJIT->getPointerToFunction(IncrementGlobal);
+ EXPECT_TRUE(0 != vPtr)
+ << "Unable to get pointer to main() from JIT";
+
+ int32_t(*FuncPtr)(void) = (int32_t(*)(void))(intptr_t)vPtr;
+
+ for(int i = 1; i < 3; ++i) {
+ int32_t result = FuncPtr();
+ EXPECT_EQ(initialNum + i, result); // OK
+ EXPECT_EQ(initialNum + i, *(int32_t*)gvPtr); // FAILS
+ }
+}
+*/
+
+TEST_F(MCJITTest, multiple_functions) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ unsigned int numLevels = 23;
+ int32_t innerRetVal= 5;
+
+ Function *Inner = startFunction<int32_t(void)>(M.get(), "Inner");
+ endFunctionWithRet(Inner, ConstantInt::get(Context, APInt(32, innerRetVal)));
+
+ Function *Outer;
+ for (unsigned int i = 0; i < numLevels; ++i) {
+ std::stringstream funcName;
+ funcName << "level_" << i;
+ Outer = startFunction<int32_t(void)>(M.get(), funcName.str());
+ Value *innerResult = Builder.CreateCall(Inner);
+ endFunctionWithRet(Outer, innerResult);
+
+ Inner = Outer;
+ }
+
+ createJIT(M.take());
+ void *vPtr = TheJIT->getPointerToFunction(Outer);
+ static_cast<SectionMemoryManager*>(MM)->invalidateInstructionCache();
+ EXPECT_TRUE(0 != vPtr)
+ << "Unable to get pointer to outer function from JIT";
+
+ int32_t(*FuncPtr)(void) = (int32_t(*)(void))(intptr_t)vPtr;
+ EXPECT_EQ(innerRetVal, FuncPtr())
+ << "Incorrect result returned from function";
+}
+
+// FIXME: ExecutionEngine has no support for empty modules
+/*
+TEST_F(MCJITTest, multiple_empty_modules) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ createJIT(M.take());
+ // JIT-compile
+ EXPECT_NE(0, TheJIT->getObjectImage())
+ << "Unable to generate executable loaded object image";
+
+ TheJIT->addModule(createEmptyModule("<other module>"));
+ TheJIT->addModule(createEmptyModule("<other other module>"));
+
+ // JIT again
+ EXPECT_NE(0, TheJIT->getObjectImage())
+ << "Unable to generate executable loaded object image";
+}
+*/
+
+// FIXME: MCJIT must support multiple modules
+/*
+TEST_F(MCJITTest, multiple_modules) {
+ SKIP_UNSUPPORTED_PLATFORM;
+
+ Function *Callee = insertAddFunction(M.get());
+ createJIT(M.take());
+
+ // caller function is defined in a different module
+ M.reset(createEmptyModule("<caller module>"));
+
+ Function *CalleeRef = insertExternalReferenceToFunction(M.get(), Callee);
+ Function *Caller = insertSimpleCallFunction(M.get(), CalleeRef);
+
+ TheJIT->addModule(M.take());
+
+ // get a function pointer in a module that was not used in EE construction
+ void *vPtr = TheJIT->getPointerToFunction(Caller);
+ EXPECT_NE(0, vPtr)
+ << "Unable to get pointer to caller function from JIT";
+
+ int(*FuncPtr)(int, int) = (int(*)(int, int))(intptr_t)vPtr;
+ EXPECT_EQ(0, FuncPtr(0, 0));
+ EXPECT_EQ(30, FuncPtr(10, 20));
+ EXPECT_EQ(-30, FuncPtr(-10, -20));
+
+ // ensure caller is destroyed before callee (free use before def)
+ M.reset();
+}
+*/
+
+}
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
new file mode 100644
index 000000000000..9b4a4ac3cf00
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTestBase.h
@@ -0,0 +1,245 @@
+//===- MCJITTestBase.h - Common base class for MCJIT Unit tests ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements common functionality required by the MCJIT unit tests,
+// as well as logic to skip tests on unsupported architectures and operating
+// systems.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef MCJIT_TEST_BASE_H
+#define MCJIT_TEST_BASE_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Config/config.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/TypeBuilder.h"
+
+#include "SectionMemoryManager.h"
+
+// Used to skip tests on unsupported architectures and operating systems.
+// To skip a test, add this macro at the top of a test-case in a suite that
+// inherits from MCJITTestBase. See MCJITTest.cpp for examples.
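+// A minimal usage sketch (mirroring the tests in MCJITTest.cpp):
+//   TEST_F(MCJITTest, some_case) {
+//     SKIP_UNSUPPORTED_PLATFORM;
+//     ... body runs only on supported architecture/OS combinations ...
+//   }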
+#define SKIP_UNSUPPORTED_PLATFORM \
+ do \
+ if (!ArchSupportsMCJIT() || !OSSupportsMCJIT()) \
+ return; \
+ while(0);
+
+namespace llvm {
+
+class MCJITTestBase {
+protected:
+
+ MCJITTestBase()
+ : OptLevel(CodeGenOpt::None)
+ , RelocModel(Reloc::Default)
+ , CodeModel(CodeModel::Default)
+ , MArch("")
+ , Builder(Context)
+ , MM(new SectionMemoryManager)
+ , HostTriple(LLVM_HOSTTRIPLE)
+ {
+ InitializeNativeTarget();
+ InitializeNativeTargetAsmPrinter();
+
+#ifdef LLVM_ON_WIN32
+ // On Windows, generate ELF objects by specifying "-elf" in triple
+ HostTriple += "-elf";
+#endif // LLVM_ON_WIN32
+ HostTriple = Triple::normalize(HostTriple);
+
+ // The architectures below are known to be compatible with MCJIT as they
+ // are copied from test/ExecutionEngine/MCJIT/lit.local.cfg and should be
+ // kept in sync.
+ SupportedArchs.push_back(Triple::arm);
+ SupportedArchs.push_back(Triple::mips);
+ SupportedArchs.push_back(Triple::x86);
+ SupportedArchs.push_back(Triple::x86_64);
+
+ // The operating systems below are known to be incompatible with MCJIT as
+ // they are copied from the test/ExecutionEngine/MCJIT/lit.local.cfg and
+ // should be kept in sync.
+ UnsupportedOSs.push_back(Triple::Cygwin);
+ UnsupportedOSs.push_back(Triple::Darwin);
+ }
+
+ /// Returns true if the host architecture is known to support MCJIT
+ bool ArchSupportsMCJIT() {
+ Triple Host(HostTriple);
+ if (std::find(SupportedArchs.begin(), SupportedArchs.end(), Host.getArch())
+ == SupportedArchs.end()) {
+ return false;
+ }
+ return true;
+ }
+
+ /// Returns true if the host OS is known to support MCJIT
+ bool OSSupportsMCJIT() {
+ Triple Host(HostTriple);
+ if (std::find(UnsupportedOSs.begin(), UnsupportedOSs.end(), Host.getOS())
+ == UnsupportedOSs.end()) {
+ return true;
+ }
+ return false;
+ }
+
+ Module *createEmptyModule(StringRef Name) {
+ Module * M = new Module(Name, Context);
+ M->setTargetTriple(Triple::normalize(HostTriple));
+ return M;
+ }
+
+ template<typename FuncType>
+ Function *startFunction(Module *M, StringRef Name) {
+ Function *Result = Function::Create(
+ TypeBuilder<FuncType, false>::get(Context),
+ GlobalValue::ExternalLinkage, Name, M);
+
+ BasicBlock *BB = BasicBlock::Create(Context, Name, Result);
+ Builder.SetInsertPoint(BB);
+
+ return Result;
+ }
+
+ void endFunctionWithRet(Function *Func, Value *RetValue) {
+ Builder.CreateRet(RetValue);
+ }
+
+ // Inserts a simple function that invokes Callee and takes the same arguments:
+ // int Caller(...) { return Callee(...); }
+ template<typename Signature>
+ Function *insertSimpleCallFunction(Module *M, Function *Callee) {
+ Function *Result = startFunction<Signature>(M, "caller");
+
+ SmallVector<Value*, 1> CallArgs;
+
+ Function::arg_iterator arg_iter = Result->arg_begin();
+ for(;arg_iter != Result->arg_end(); ++arg_iter)
+ CallArgs.push_back(arg_iter);
+
+ Value *ReturnCode = Builder.CreateCall(Callee, CallArgs);
+ Builder.CreateRet(ReturnCode);
+ return Result;
+ }
+
+ // Inserts a function named 'main' that returns an int32_t:
+ // int32_t main() { return X; }
+ // where X is given by returnCode
+ Function *insertMainFunction(Module *M, uint32_t returnCode) {
+ Function *Result = startFunction<int32_t(void)>(M, "main");
+
+ Value *ReturnVal = ConstantInt::get(Context, APInt(32, returnCode));
+ endFunctionWithRet(Result, ReturnVal);
+
+ return Result;
+ }
+
+ // Inserts a function
+ // int32_t add(int32_t a, int32_t b) { return a + b; }
+ // in the current module and returns a pointer to it.
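+ // The IR built by this helper is roughly the following (a sketch; the real
+ // value names are auto-numbered):
+ //   define i32 @add(i32 %a, i32 %b) {
+ //   add:
+ //     %sum = add i32 %a, %b
+ //     ret i32 %sum
+ //   }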
+ Function *insertAddFunction(Module *M, StringRef Name = "add") {
+ Function *Result = startFunction<int32_t(int32_t, int32_t)>(M, Name);
+
+ Function::arg_iterator args = Result->arg_begin();
+ Value *Arg1 = args;
+ Value *Arg2 = ++args;
+ Value *AddResult = Builder.CreateAdd(Arg1, Arg2);
+
+ endFunctionWithRet(Result, AddResult);
+
+ return Result;
+ }
+
+ // Inserts a declaration of a function defined elsewhere
+ Function *insertExternalReferenceToFunction(Module *M, StringRef Name,
+ FunctionType *FuncTy) {
+ Function *Result = Function::Create(FuncTy,
+ GlobalValue::ExternalLinkage,
+ Name, M);
+ return Result;
+ }
+
+ // Inserts a declaration of a function defined elsewhere
+ Function *insertExternalReferenceToFunction(Module *M, Function *Func) {
+ Function *Result = Function::Create(Func->getFunctionType(),
+ GlobalValue::AvailableExternallyLinkage,
+ Func->getName(), M);
+ return Result;
+ }
+
+ // Inserts a global variable of type int32
+ GlobalVariable *insertGlobalInt32(Module *M,
+ StringRef name,
+ int32_t InitialValue) {
+ Type *GlobalTy = TypeBuilder<types::i<32>, true>::get(Context);
+ Constant *IV = ConstantInt::get(Context, APInt(32, InitialValue));
+ GlobalVariable *Global = new GlobalVariable(*M,
+ GlobalTy,
+ false,
+ GlobalValue::ExternalLinkage,
+ IV,
+ name);
+ return Global;
+ }
+
+ void createJIT(Module *M) {
+
+ // The EngineBuilder constructor requires a Module in order to construct
+ // an ExecutionEngine (i.e. MCJIT).
+ assert(M != 0 && "a non-null Module must be provided to create MCJIT");
+
+ EngineBuilder EB(M);
+ std::string Error;
+ TheJIT.reset(EB.setEngineKind(EngineKind::JIT)
+ .setUseMCJIT(true) /* can this be folded into the EngineKind enum? */
+ .setJITMemoryManager(MM)
+ .setErrorStr(&Error)
+ .setOptLevel(CodeGenOpt::None)
+ .setAllocateGVsWithCode(false) /*does this do anything?*/
+ .setCodeModel(CodeModel::JITDefault)
+ .setRelocationModel(Reloc::Default)
+ .setMArch(MArch)
+ .setMCPU(sys::getHostCPUName())
+ //.setMAttrs(MAttrs)
+ .create());
+ // At this point, we cannot modify the module any more.
+ assert(TheJIT.get() != NULL && "error creating MCJIT with EngineBuilder");
+ }
+
+ LLVMContext Context;
+ CodeGenOpt::Level OptLevel;
+ Reloc::Model RelocModel;
+ CodeModel::Model CodeModel;
+ StringRef MArch;
+ SmallVector<std::string, 1> MAttrs;
+ OwningPtr<TargetMachine> TM;
+ OwningPtr<ExecutionEngine> TheJIT;
+ IRBuilder<> Builder;
+ JITMemoryManager *MM;
+
+ std::string HostTriple;
+ SmallVector<Triple::ArchType, 4> SupportedArchs;
+ SmallVector<Triple::OSType, 4> UnsupportedOSs;
+
+ OwningPtr<Module> M;
+};
+
+} // namespace llvm
+
+#endif // MCJIT_TEST_BASE_H
diff --git a/unittests/ExecutionEngine/MCJIT/MCJITTests.def b/unittests/ExecutionEngine/MCJIT/MCJITTests.def
new file mode 100644
index 000000000000..aabd2247c049
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/MCJITTests.def
@@ -0,0 +1 @@
+EXPORTS
diff --git a/unittests/ExecutionEngine/MCJIT/Makefile b/unittests/ExecutionEngine/MCJIT/Makefile
new file mode 100644
index 000000000000..454f83099d4b
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/Makefile
@@ -0,0 +1,18 @@
+##===- unittests/ExecutionEngine/MCJIT/Makefile ------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+TESTNAME = MCJIT
+LINK_COMPONENTS := core jit mcjit native support
+
+include $(LEVEL)/Makefile.config
+include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
+
+# Permit these tests to use the MCJIT's symbolic lookup.
+LD.Flags += $(RDYNAMIC)
diff --git a/unittests/ExecutionEngine/MCJIT/SectionMemoryManager.cpp b/unittests/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
new file mode 100644
index 000000000000..d6baf3c9bb8e
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/SectionMemoryManager.cpp
@@ -0,0 +1,143 @@
+//===-- SectionMemoryManager.cpp - The memory manager for MCJIT -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the implementation of the section-based memory manager
+// used by MCJIT.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Config/config.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/MathExtras.h"
+
+#include "SectionMemoryManager.h"
+
+#ifdef __linux__
+// These includes are used by SectionMemoryManager::getPointerToNamedFunction()
+// for Glibc trickery. See the comments in that function for more information.
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+#include <fcntl.h>
+#include <unistd.h>
+#endif
+
+namespace llvm {
+
+uint8_t *SectionMemoryManager::allocateDataSection(uintptr_t Size,
+ unsigned Alignment,
+ unsigned SectionID) {
+ if (!Alignment)
+ Alignment = 16;
+ // Ensure that enough memory is requested to allow aligning.
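+ // (E.g. Size = 10 and Alignment = 16 give NumElementsAligned = 1 + 25/16 = 2,
+ // so calloc reserves 32 zeroed bytes; rounding Addr up to the next 16-byte
+ // boundary consumes at most 15 of them, leaving at least Size usable bytes.)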
+ size_t NumElementsAligned = 1 + (Size + Alignment - 1)/Alignment;
+ uint8_t *Addr = (uint8_t*)calloc(NumElementsAligned, Alignment);
+
+ // Honour the alignment requirement.
+ uint8_t *AlignedAddr = (uint8_t*)RoundUpToAlignment((uint64_t)Addr, Alignment);
+
+ // Store the original address from calloc so we can free it later.
+ AllocatedDataMem.push_back(sys::MemoryBlock(Addr, NumElementsAligned*Alignment));
+ return AlignedAddr;
+}
+
+uint8_t *SectionMemoryManager::allocateCodeSection(uintptr_t Size,
+ unsigned Alignment,
+ unsigned SectionID) {
+ if (!Alignment)
+ Alignment = 16;
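+ // NeedAllocate leaves room for Size bytes plus worst-case alignment padding;
+ // e.g. Size = 100 and Alignment = 16 give 16 * ((115 / 16) + 1) = 128 bytes.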
+ unsigned NeedAllocate = Alignment * ((Size + Alignment - 1)/Alignment + 1);
+ uintptr_t Addr = 0;
+ // Look in the list of free code memory regions and use a block there if one
+ // is available.
+ for (int i = 0, e = FreeCodeMem.size(); i != e; ++i) {
+ sys::MemoryBlock &MB = FreeCodeMem[i];
+ if (MB.size() >= NeedAllocate) {
+ Addr = (uintptr_t)MB.base();
+ uintptr_t EndOfBlock = Addr + MB.size();
+ // Align the address.
+ Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+ // Store the remaining part of the free memory block.
+ FreeCodeMem[i] = sys::MemoryBlock((void*)(Addr + Size),
+ EndOfBlock - Addr - Size);
+ return (uint8_t*)Addr;
+ }
+ }
+
+ // No pre-allocated free block was large enough. Allocate a new memory region.
+ sys::MemoryBlock MB = sys::Memory::AllocateRWX(NeedAllocate, 0, 0);
+
+ AllocatedCodeMem.push_back(MB);
+ Addr = (uintptr_t)MB.base();
+ uintptr_t EndOfBlock = Addr + MB.size();
+ // Align the address.
+ Addr = (Addr + Alignment - 1) & ~(uintptr_t)(Alignment - 1);
+ // AllocateRWX may allocate much more memory than we need. In this case,
+ // we store the unused memory as a free memory block.
+ unsigned FreeSize = EndOfBlock-Addr-Size;
+ if (FreeSize > 16)
+ FreeCodeMem.push_back(sys::MemoryBlock((void*)(Addr + Size), FreeSize));
+
+ // Return aligned address
+ return (uint8_t*)Addr;
+}
+
+void SectionMemoryManager::invalidateInstructionCache() {
+ for (int i = 0, e = AllocatedCodeMem.size(); i != e; ++i)
+ sys::Memory::InvalidateInstructionCache(AllocatedCodeMem[i].base(),
+ AllocatedCodeMem[i].size());
+}
+
+void *SectionMemoryManager::getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure) {
+#if defined(__linux__)
+ //===--------------------------------------------------------------------===//
+ // Function stubs that are invoked instead of certain library calls
+ //
+ // Force the following functions to be linked in to anything that uses the
+ // JIT. This is a hack designed to work around the all-too-clever Glibc
+ // strategy of making these functions work differently when inlined vs. when
+ // not inlined, and hiding their real definitions in a separate archive file
+ // that the dynamic linker can't see. For more info, search for
+ // 'libc_nonshared.a' on Google, or read http://llvm.org/PR274.
+ if (Name == "stat") return (void*)(intptr_t)&stat;
+ if (Name == "fstat") return (void*)(intptr_t)&fstat;
+ if (Name == "lstat") return (void*)(intptr_t)&lstat;
+ if (Name == "stat64") return (void*)(intptr_t)&stat64;
+ if (Name == "fstat64") return (void*)(intptr_t)&fstat64;
+ if (Name == "lstat64") return (void*)(intptr_t)&lstat64;
+ if (Name == "atexit") return (void*)(intptr_t)&atexit;
+ if (Name == "mknod") return (void*)(intptr_t)&mknod;
+#endif // __linux__
+
+ const char *NameStr = Name.c_str();
+ void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr);
+ if (Ptr) return Ptr;
+
+ // If it wasn't found and if it starts with an underscore ('_') character,
+ // try again without the underscore.
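+ // (Presumably to cover targets such as Darwin, where C symbols carry a
+ // leading underscore in object files.)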
+ if (NameStr[0] == '_') {
+ Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1);
+ if (Ptr) return Ptr;
+ }
+
+ if (AbortOnFailure)
+ report_fatal_error("Program used external function '" + Name +
+ "' which could not be resolved!");
+ return 0;
+}
+
+SectionMemoryManager::~SectionMemoryManager() {
+ for (unsigned i = 0, e = AllocatedCodeMem.size(); i != e; ++i)
+ sys::Memory::ReleaseRWX(AllocatedCodeMem[i]);
+ for (unsigned i = 0, e = AllocatedDataMem.size(); i != e; ++i)
+ free(AllocatedDataMem[i].base());
+}
+
+} // namespace llvm
diff --git a/unittests/ExecutionEngine/MCJIT/SectionMemoryManager.h b/unittests/ExecutionEngine/MCJIT/SectionMemoryManager.h
new file mode 100644
index 000000000000..e44217c90638
--- /dev/null
+++ b/unittests/ExecutionEngine/MCJIT/SectionMemoryManager.h
@@ -0,0 +1,118 @@
+//===-- SectionMemoryManager.h - Memory allocator for MCJIT -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of a section-based memory manager used by
+// the MCJIT execution engine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTION_ENGINE_SECTION_MEMORY_MANAGER_H
+#define LLVM_EXECUTION_ENGINE_SECTION_MEMORY_MANAGER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ExecutionEngine/JITMemoryManager.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Memory.h"
+
+namespace llvm {
+
+// Section-based memory manager for MCJIT
+class SectionMemoryManager : public JITMemoryManager {
+
+public:
+
+ SectionMemoryManager() { }
+ ~SectionMemoryManager();
+
+ virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment,
+ unsigned SectionID);
+
+ virtual void *getPointerToNamedFunction(const std::string &Name,
+ bool AbortOnFailure = true);
+
+ // Invalidate instruction cache for code sections. Some platforms with
+ // separate data and instruction caches require an explicit cache flush;
+ // otherwise JIT code manipulations (like resolved relocations) will get to
+ // the data cache but not to the instruction cache.
+ virtual void invalidateInstructionCache();
+
+private:
+
+ SmallVector<sys::MemoryBlock, 16> AllocatedDataMem;
+ SmallVector<sys::MemoryBlock, 16> AllocatedCodeMem;
+ SmallVector<sys::MemoryBlock, 16> FreeCodeMem;
+
+public:
+
+ ///
+ /// Functions below are not used by MCJIT, but must be implemented because
+ /// they are declared as pure virtuals in the base class.
+ ///
+
+ virtual void setMemoryWritable() {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void setMemoryExecutable() {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void setPoisonMemory(bool poison) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void AllocateGOT() {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual uint8_t *getGOTBase() const {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual uint8_t *startFunctionBody(const Function *F,
+ uintptr_t &ActualSize){
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize,
+ unsigned Alignment) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual void endFunctionBody(const Function *F, uint8_t *FunctionStart,
+ uint8_t *FunctionEnd) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual void deallocateFunctionBody(void *Body) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual uint8_t *startExceptionTable(const Function *F,
+ uintptr_t &ActualSize) {
+ llvm_unreachable("Unexpected call!");
+ return 0;
+ }
+ virtual void endExceptionTable(const Function *F, uint8_t *TableStart,
+ uint8_t *TableEnd, uint8_t *FrameRegister) {
+ llvm_unreachable("Unexpected call!");
+ }
+ virtual void deallocateExceptionTable(void *ET) {
+ llvm_unreachable("Unexpected call!");
+ }
+};
+
+}
+
+#endif // LLVM_EXECUTION_ENGINE_SECTION_MEMORY_MANAGER_H
diff --git a/unittests/ExecutionEngine/Makefile b/unittests/ExecutionEngine/Makefile
index 63508d2399b5..ca1195631a22 100644
--- a/unittests/ExecutionEngine/Makefile
+++ b/unittests/ExecutionEngine/Makefile
@@ -10,7 +10,7 @@
LEVEL = ../..
TESTNAME = ExecutionEngine
LINK_COMPONENTS :=interpreter
-PARALLEL_DIRS = JIT
+PARALLEL_DIRS = JIT MCJIT
include $(LEVEL)/Makefile.config
include $(LLVM_SRC_ROOT)/unittests/Makefile.unittest
diff --git a/unittests/Support/AlignOfTest.cpp b/unittests/Support/AlignOfTest.cpp
index 6f576681a3e2..f01e6609390c 100644
--- a/unittests/Support/AlignOfTest.cpp
+++ b/unittests/Support/AlignOfTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/Support/AlignOfTest.cpp - Alignment utility tests ----===//
+//=== - llvm/unittest/Support/AlignOfTest.cpp - Alignment utility tests ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -23,31 +23,25 @@ namespace {
#endif
// Define some fixed alignment types to use in these tests.
-#if __cplusplus == 201103L || __has_feature(cxx_alignas)
-typedef char alignas(1) A1;
-typedef char alignas(2) A2;
-typedef char alignas(4) A4;
-typedef char alignas(8) A8;
-#elif defined(__clang__) || defined(__GNUC__)
-typedef char A1 __attribute__((aligned(1)));
-typedef char A2 __attribute__((aligned(2)));
-typedef char A4 __attribute__((aligned(4)));
-typedef char A8 __attribute__((aligned(8)));
+#if __has_feature(cxx_alignas)
+struct alignas(1) A1 { };
+struct alignas(2) A2 { };
+struct alignas(4) A4 { };
+struct alignas(8) A8 { };
+#elif defined(__GNUC__)
+struct A1 { } __attribute__((aligned(1)));
+struct A2 { } __attribute__((aligned(2)));
+struct A4 { } __attribute__((aligned(4)));
+struct A8 { } __attribute__((aligned(8)));
#elif defined(_MSC_VER)
-typedef __declspec(align(1)) char A1;
-typedef __declspec(align(2)) char A2;
-typedef __declspec(align(4)) char A4;
-typedef __declspec(align(8)) char A8;
+__declspec(align(1)) struct A1 { };
+__declspec(align(2)) struct A2 { };
+__declspec(align(4)) struct A4 { };
+__declspec(align(8)) struct A8 { };
#else
# error No supported align as directive.
#endif
-// Wrap the forced aligned types in structs to hack around compiler bugs.
-struct SA1 { A1 a; };
-struct SA2 { A2 a; };
-struct SA4 { A4 a; };
-struct SA8 { A8 a; };
-
struct S1 {};
struct S2 { char a; };
struct S3 { int x; };
@@ -72,6 +66,17 @@ struct V6 : S1 { virtual ~V6(); };
struct V7 : virtual V2, virtual V6 { virtual ~V7(); };
struct V8 : V5, virtual V6, V7 { double zz; virtual ~V8(); };
+double S6::f() { return 0.0; }
+float D2::g() { return 0.0f; }
+V1::~V1() {}
+V2::~V2() {}
+V3::~V3() {}
+V4::~V4() {}
+V5::~V5() {}
+V6::~V6() {}
+V7::~V7() {}
+V8::~V8() {}
+
// Ensure alignment is a compile-time constant.
char LLVM_ATTRIBUTE_UNUSED test_arr1
[AlignOf<char>::Alignment > 0]
@@ -90,11 +95,7 @@ char LLVM_ATTRIBUTE_UNUSED test_arr2
[AlignOf<A1>::Alignment > 0]
[AlignOf<A2>::Alignment > 0]
[AlignOf<A4>::Alignment > 0]
- [AlignOf<A8>::Alignment > 0]
- [AlignOf<SA1>::Alignment > 0]
- [AlignOf<SA2>::Alignment > 0]
- [AlignOf<SA4>::Alignment > 0]
- [AlignOf<SA8>::Alignment > 0];
+ [AlignOf<A8>::Alignment > 0];
char LLVM_ATTRIBUTE_UNUSED test_arr3
[AlignOf<S1>::Alignment > 0]
[AlignOf<S2>::Alignment > 0]
@@ -123,20 +124,10 @@ char LLVM_ATTRIBUTE_UNUSED test_arr5
[AlignOf<V8>::Alignment > 0];
TEST(AlignOfTest, BasicAlignmentInvariants) {
- // For a very strange reason, many compilers do not support this. Both Clang
- // and GCC fail to align these properly.
- EXPECT_EQ(1u, alignOf<A1>());
-#if 0
- EXPECT_EQ(2u, alignOf<A2>());
- EXPECT_EQ(4u, alignOf<A4>());
- EXPECT_EQ(8u, alignOf<A8>());
-#endif
-
- // But once wrapped in structs, the alignment is correctly managed.
- EXPECT_LE(1u, alignOf<SA1>());
- EXPECT_LE(2u, alignOf<SA2>());
- EXPECT_LE(4u, alignOf<SA4>());
- EXPECT_LE(8u, alignOf<SA8>());
+ EXPECT_LE(1u, alignOf<A1>());
+ EXPECT_LE(2u, alignOf<A2>());
+ EXPECT_LE(4u, alignOf<A4>());
+ EXPECT_LE(8u, alignOf<A8>());
EXPECT_EQ(1u, alignOf<char>());
EXPECT_LE(alignOf<char>(), alignOf<short>());
@@ -174,42 +165,38 @@ TEST(AlignOfTest, BasicAlignmentInvariants) {
}
TEST(AlignOfTest, BasicAlignedArray) {
- // Note: this code exclusively uses the struct-wrapped arbitrarily aligned
- // types because of the bugs mentioned above where GCC and Clang both
- // disregard the arbitrary alignment specifier until the type is used to
- // declare a member of a struct.
- EXPECT_LE(1u, alignOf<AlignedCharArrayUnion<SA1> >());
- EXPECT_LE(2u, alignOf<AlignedCharArrayUnion<SA2> >());
- EXPECT_LE(4u, alignOf<AlignedCharArrayUnion<SA4> >());
- EXPECT_LE(8u, alignOf<AlignedCharArrayUnion<SA8> >());
+ EXPECT_LE(1u, alignOf<AlignedCharArrayUnion<A1> >());
+ EXPECT_LE(2u, alignOf<AlignedCharArrayUnion<A2> >());
+ EXPECT_LE(4u, alignOf<AlignedCharArrayUnion<A4> >());
+ EXPECT_LE(8u, alignOf<AlignedCharArrayUnion<A8> >());
- EXPECT_LE(1u, sizeof(AlignedCharArrayUnion<SA1>));
- EXPECT_LE(2u, sizeof(AlignedCharArrayUnion<SA2>));
- EXPECT_LE(4u, sizeof(AlignedCharArrayUnion<SA4>));
- EXPECT_LE(8u, sizeof(AlignedCharArrayUnion<SA8>));
+ EXPECT_LE(1u, sizeof(AlignedCharArrayUnion<A1>));
+ EXPECT_LE(2u, sizeof(AlignedCharArrayUnion<A2>));
+ EXPECT_LE(4u, sizeof(AlignedCharArrayUnion<A4>));
+ EXPECT_LE(8u, sizeof(AlignedCharArrayUnion<A8>));
- EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<SA1> >()));
- EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<SA1, SA2> >()));
- EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<SA1, SA2, SA4> >()));
- EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<SA1, SA2, SA4, SA8> >()));
+ EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<A1> >()));
+ EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<A1, A2> >()));
+ EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<A1, A2, A4> >()));
+ EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<A1, A2, A4, A8> >()));
- EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<SA1>));
- EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<SA1, SA2>));
- EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<SA1, SA2, SA4>));
- EXPECT_EQ(8u, sizeof(AlignedCharArrayUnion<SA1, SA2, SA4, SA8>));
+ EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<A1>));
+ EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<A1, A2>));
+ EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<A1, A2, A4>));
+ EXPECT_EQ(8u, sizeof(AlignedCharArrayUnion<A1, A2, A4, A8>));
- EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<SA1[1]> >()));
- EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<SA1[2], SA2[1]> >()));
- EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<SA1[42], SA2[55],
- SA4[13]> >()));
- EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<SA1[2], SA2[1],
- SA4, SA8> >()));
+ EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<A1[1]> >()));
+ EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<A1[2], A2[1]> >()));
+ EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<A1[42], A2[55],
+ A4[13]> >()));
+ EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<A1[2], A2[1],
+ A4, A8> >()));
- EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<SA1[1]>));
- EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<SA1[2], SA2[1]>));
- EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<SA1[3], SA2[2], SA4>));
- EXPECT_EQ(16u, sizeof(AlignedCharArrayUnion<SA1, SA2[3],
- SA4[3], SA8>));
+ EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<A1[1]>));
+ EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<A1[2], A2[1]>));
+ EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<A1[3], A2[2], A4>));
+ EXPECT_EQ(16u, sizeof(AlignedCharArrayUnion<A1, A2[3],
+ A4[3], A8>));
// For other tests we simply assert that the alignment of the union matches
// that of the fundamental type and hope that we have any weird type
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 3b9bf8437031..09a0ea50d748 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -17,11 +17,14 @@ add_llvm_unittest(SupportTests
LeakDetectorTest.cpp
ManagedStatic.cpp
MathExtrasTest.cpp
+ MemoryBufferTest.cpp
+ MemoryTest.cpp
Path.cpp
- raw_ostream_test.cpp
RegexTest.cpp
SwapByteOrderTest.cpp
TimeValue.cpp
ValueHandleTest.cpp
YAMLParserTest.cpp
+ formatted_raw_ostream_test.cpp
+ raw_ostream_test.cpp
)
diff --git a/unittests/Support/Casting.cpp b/unittests/Support/Casting.cpp
index ca0b40b1f55b..ad564aa366df 100644
--- a/unittests/Support/Casting.cpp
+++ b/unittests/Support/Casting.cpp
@@ -95,8 +95,9 @@ TEST(CastingTest, cast) {
EXPECT_NE(&F5, null_foo);
const foo *F6 = cast<foo>(B4);
EXPECT_NE(F6, null_foo);
- foo *F7 = cast<foo>(fub());
- EXPECT_EQ(F7, null_foo);
+ // Can't pass null pointer to cast<>.
+ // foo *F7 = cast<foo>(fub());
+ // EXPECT_EQ(F7, null_foo);
foo *F8 = B1.baz();
EXPECT_NE(F8, null_foo);
}
@@ -121,7 +122,8 @@ TEST(CastingTest, dyn_cast) {
EXPECT_NE(F2, null_foo);
const foo *F3 = dyn_cast<foo>(B4);
EXPECT_NE(F3, null_foo);
- // foo *F4 = dyn_cast<foo>(fub()); // not permittible
+ // Can't pass null pointer to dyn_cast<>.
+ // foo *F4 = dyn_cast<foo>(fub());
// EXPECT_EQ(F4, null_foo);
foo *F5 = B1.daz();
EXPECT_NE(F5, null_foo);
@@ -151,3 +153,54 @@ const bar *B2 = &B;
} // anonymous namespace
bar *llvm::fub() { return 0; }
+
+namespace {
+namespace inferred_upcasting {
+// This test case verifies correct behavior of inferred upcasts when the
+// types are statically known to be OK to upcast. This is the case when,
+// for example, Derived inherits from Base, and we do `isa<Base>(Derived)`.
+
+// Note: This test will actually fail to compile without inferred
+// upcasting.
+
+class Base {
+public:
+ // No classof. We are testing that the upcast is inferred.
+ Base() {}
+};
+
+class Derived : public Base {
+public:
+ Derived() {}
+};
+
+// Even with no explicit classof() in Base, we should still be able to cast
+// Derived to its base class.
+TEST(CastingTest, UpcastIsInferred) {
+ Derived D;
+ EXPECT_TRUE(isa<Base>(D));
+ Base *BP = dyn_cast<Base>(&D);
+ EXPECT_TRUE(BP != NULL);
+}
+
+
+// This test verifies that the inferred upcast takes precedence over an
+// explicitly written one. This is important because it verifies that the
+// dynamic check gets optimized away.
+class UseInferredUpcast {
+public:
+ int Dummy;
+ static bool classof(const UseInferredUpcast *) {
+ return false;
+ }
+};
+
+TEST(CastingTest, InferredUpcastTakesPrecedence) {
+ UseInferredUpcast UIU;
+ // Since the explicit classof() returns false, this will fail if the
+ // explicit one is used.
+ EXPECT_TRUE(isa<UseInferredUpcast>(&UIU));
+}
+
+} // end namespace inferred_upcasting
+} // end anonymous namespace
diff --git a/unittests/Support/DataExtractorTest.cpp b/unittests/Support/DataExtractorTest.cpp
index 9813e465f7ed..ec8bd3d18c8a 100644
--- a/unittests/Support/DataExtractorTest.cpp
+++ b/unittests/Support/DataExtractorTest.cpp
@@ -16,6 +16,7 @@ namespace {
const char numberData[] = "\x80\x90\xFF\xFF\x80\x00\x00\x00";
const char stringData[] = "hellohello\0hello";
const char leb128data[] = "\xA6\x49";
+const char bigleb128data[] = "\xAA\xA9\xFF\xAA\xFF\xAA\xFF\x4A";
TEST(DataExtractorTest, OffsetOverflow) {
DataExtractor DE(StringRef(numberData, sizeof(numberData)-1), false, 8);
@@ -106,6 +107,14 @@ TEST(DataExtractorTest, LEB128) {
offset = 0;
EXPECT_EQ(-7002LL, DE.getSLEB128(&offset));
EXPECT_EQ(2U, offset);
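+ // (Arithmetic check for the two-byte sequence above: 0xA6 0x49 decodes as
+ // 0x26 | (0x49 << 7) = 9382; with the sign bit (bit 6 of the last byte) set,
+ // the signed value is 9382 - 16384 = -7002.)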
+
+ DataExtractor BDE(StringRef(bigleb128data, sizeof(bigleb128data)-1), false,8);
+ offset = 0;
+ EXPECT_EQ(42218325750568106ULL, BDE.getULEB128(&offset));
+ EXPECT_EQ(8U, offset);
+ offset = 0;
+ EXPECT_EQ(-29839268287359830LL, BDE.getSLEB128(&offset));
+ EXPECT_EQ(8U, offset);
}
}
diff --git a/unittests/Support/MemoryBufferTest.cpp b/unittests/Support/MemoryBufferTest.cpp
new file mode 100644
index 000000000000..6c78cd80e8b5
--- /dev/null
+++ b/unittests/Support/MemoryBufferTest.cpp
@@ -0,0 +1,99 @@
+//===- llvm/unittest/Support/MemoryBufferTest.cpp - MemoryBuffer tests ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements unit tests for the MemoryBuffer support class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/ADT/OwningPtr.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+class MemoryBufferTest : public testing::Test {
+protected:
+ MemoryBufferTest()
+ : data("this is some data")
+ { }
+
+ virtual void SetUp() { }
+
+ typedef OwningPtr<MemoryBuffer> OwningBuffer;
+
+ std::string data;
+};
+
+namespace {
+
+TEST_F(MemoryBufferTest, get) {
+ // Default name and null-terminator flag
+ OwningBuffer MB1(MemoryBuffer::getMemBuffer(data));
+ EXPECT_TRUE(0 != MB1.get());
+
+ // RequiresNullTerminator = false
+ OwningBuffer MB2(MemoryBuffer::getMemBuffer(data, "one", false));
+ EXPECT_TRUE(0 != MB2.get());
+
+ // RequiresNullTerminator = true
+ OwningBuffer MB3(MemoryBuffer::getMemBuffer(data, "two", true));
+ EXPECT_TRUE(0 != MB3.get());
+
+ // verify all 3 buffers point to the same address
+ EXPECT_EQ(MB1->getBufferStart(), MB2->getBufferStart());
+ EXPECT_EQ(MB2->getBufferStart(), MB3->getBufferStart());
+
+ // verify the original data is unmodified after deleting the buffers
+ MB1.reset();
+ MB2.reset();
+ MB3.reset();
+ EXPECT_EQ("this is some data", data);
+}
+
+TEST_F(MemoryBufferTest, copy) {
+ // copy with no name
+ OwningBuffer MBC1(MemoryBuffer::getMemBufferCopy(data));
+ EXPECT_TRUE(0 != MBC1.get());
+
+ // copy with a name
+ OwningBuffer MBC2(MemoryBuffer::getMemBufferCopy(data, "copy"));
+ EXPECT_TRUE(0 != MBC2.get());
+
+ // verify the two copies do not point to the same place
+ EXPECT_NE(MBC1->getBufferStart(), MBC2->getBufferStart());
+}
+
+TEST_F(MemoryBufferTest, make_new) {
+ // 0-sized buffer
+ OwningBuffer Zero(MemoryBuffer::getNewUninitMemBuffer(0));
+ EXPECT_TRUE(0 != Zero.get());
+
+ // uninitialized buffer with no name
+ OwningBuffer One(MemoryBuffer::getNewUninitMemBuffer(321));
+ EXPECT_TRUE(0 != One.get());
+
+ // uninitialized buffer with name
+ OwningBuffer Two(MemoryBuffer::getNewUninitMemBuffer(123, "bla"));
+ EXPECT_TRUE(0 != Two.get());
+
+ // 0-initialized buffer with no name
+ OwningBuffer Three(MemoryBuffer::getNewMemBuffer(321, data));
+ EXPECT_TRUE(0 != Three.get());
+ for (size_t i = 0; i < 321; ++i)
+ EXPECT_EQ(0, Three->getBufferStart()[i]);
+
+ // 0-initialized buffer with name
+ OwningBuffer Four(MemoryBuffer::getNewMemBuffer(123, "zeros"));
+ EXPECT_TRUE(0 != Four.get());
+ for (size_t i = 0; i < 123; ++i)
+ EXPECT_EQ(0, Four->getBufferStart()[i]);
+}
+
+}
diff --git a/unittests/Support/MemoryTest.cpp b/unittests/Support/MemoryTest.cpp
new file mode 100644
index 000000000000..21cb27eaf0ef
--- /dev/null
+++ b/unittests/Support/MemoryTest.cpp
@@ -0,0 +1,356 @@
+//===- llvm/unittest/Support/MemoryTest.cpp - Mapped memory tests ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Process.h"
+
+#include "gtest/gtest.h"
+#include <cstdlib>
+
+using namespace llvm;
+using namespace sys;
+
+namespace {
+
+class MappedMemoryTest : public ::testing::TestWithParam<unsigned> {
+public:
+ MappedMemoryTest() {
+ Flags = GetParam();
+ PageSize = sys::Process::GetPageSize();
+ }
+
+protected:
+ // Adds RW flags to permit testing of the resulting memory
+ unsigned getTestableEquivalent(unsigned RequestedFlags) {
+ switch (RequestedFlags) {
+ case Memory::MF_READ:
+ case Memory::MF_WRITE:
+ case Memory::MF_READ|Memory::MF_WRITE:
+ return Memory::MF_READ|Memory::MF_WRITE;
+ case Memory::MF_READ|Memory::MF_EXEC:
+ case Memory::MF_READ|Memory::MF_WRITE|Memory::MF_EXEC:
+ case Memory::MF_EXEC:
+ return Memory::MF_READ|Memory::MF_WRITE|Memory::MF_EXEC;
+ }
+ // Default in case values are added to the enum, as required by some compilers
+ return Memory::MF_READ|Memory::MF_WRITE;
+ }
+
+ // Returns true if the memory blocks overlap
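+ // (For instance, blocks at [0x1000, 0x1010) and [0x100C, 0x104C) overlap,
+ // since 0x1000 + 0x10 > 0x100C.)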
+ bool doesOverlap(MemoryBlock M1, MemoryBlock M2) {
+ if (M1.base() == M2.base())
+ return true;
+
+ if (M1.base() > M2.base())
+ return (unsigned char *)M2.base() + M2.size() > M1.base();
+
+ return (unsigned char *)M1.base() + M1.size() > M2.base();
+ }
+
+ unsigned Flags;
+ size_t PageSize;
+};
+
+TEST_P(MappedMemoryTest, AllocAndRelease) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(sizeof(int), M1.size());
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+}
+
+TEST_P(MappedMemoryTest, MultipleAllocAndRelease) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ MemoryBlock M4 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ EXPECT_NE((void*)0, M4.base());
+ EXPECT_LE(16U, M4.size());
+ EXPECT_FALSE(Memory::releaseMappedMemory(M4));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, BasicWrite) {
+ // This test applies only to writeable combinations
+ if (Flags && !(Flags & Memory::MF_WRITE))
+ return;
+
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(sizeof(int), M1.size());
+
+ int *a = (int*)M1.base();
+ *a = 1;
+ EXPECT_EQ(1, *a);
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+}
+
+TEST_P(MappedMemoryTest, MultipleWrite) {
+ // This test applies only to writeable combinations
+ if (Flags && !(Flags & Memory::MF_WRITE))
+ return;
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(1U * sizeof(int), M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(8U * sizeof(int), M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(4U * sizeof(int), M3.size());
+
+ int *x = (int*)M1.base();
+ *x = 1;
+
+ int *y = (int*)M2.base();
+ for (int i = 0; i < 8; i++) {
+ y[i] = i;
+ }
+
+ int *z = (int*)M3.base();
+ *z = 42;
+
+ EXPECT_EQ(1, *x);
+ EXPECT_EQ(7, y[7]);
+ EXPECT_EQ(42, *z);
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+
+ MemoryBlock M4 = Memory::allocateMappedMemory(64 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ EXPECT_NE((void*)0, M4.base());
+ EXPECT_LE(64U * sizeof(int), M4.size());
+ x = (int*)M4.base();
+ *x = 4;
+ EXPECT_EQ(4, *x);
+ EXPECT_FALSE(Memory::releaseMappedMemory(M4));
+
+ // Verify that M2 remains unaffected by other activity
+ for (int i = 0; i < 8; i++) {
+ EXPECT_EQ(i, y[i]);
+ }
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, EnabledWrite) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(2 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(2U * sizeof(int), M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(8U * sizeof(int), M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(4U * sizeof(int), M3.size());
+
+ EXPECT_FALSE(Memory::protectMappedMemory(M1, getTestableEquivalent(Flags)));
+ EXPECT_FALSE(Memory::protectMappedMemory(M2, getTestableEquivalent(Flags)));
+ EXPECT_FALSE(Memory::protectMappedMemory(M3, getTestableEquivalent(Flags)));
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ int *x = (int*)M1.base();
+ *x = 1;
+ int *y = (int*)M2.base();
+ for (unsigned int i = 0; i < 8; i++) {
+ y[i] = i;
+ }
+ int *z = (int*)M3.base();
+ *z = 42;
+
+ EXPECT_EQ(1, *x);
+ EXPECT_EQ(7, y[7]);
+ EXPECT_EQ(42, *z);
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_EQ(6, y[6]);
+
+ MemoryBlock M4 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ EXPECT_NE((void*)0, M4.base());
+ EXPECT_LE(16U, M4.size());
+ EXPECT_EQ(error_code::success(), Memory::protectMappedMemory(M4, getTestableEquivalent(Flags)));
+ x = (int*)M4.base();
+ *x = 4;
+ EXPECT_EQ(4, *x);
+ EXPECT_FALSE(Memory::releaseMappedMemory(M4));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, SuccessiveNear) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &M1, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &M2, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, DuplicateNear) {
+ error_code EC;
+ MemoryBlock Near((void*)(3*PageSize), 16);
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, ZeroNear) {
+ error_code EC;
+ MemoryBlock Near(0, 0);
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, ZeroSizeNear) {
+ error_code EC;
+ MemoryBlock Near((void*)(4*PageSize), 0);
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, UnalignedNear) {
+ error_code EC;
+ MemoryBlock Near((void*)(2*PageSize+5), 0);
+ MemoryBlock M1 = Memory::allocateMappedMemory(15, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(sizeof(int), M1.size());
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+}
+
+// Note that a write-only mapping (Memory::MF_WRITE alone) is not supported on
+// all operating systems and architectures and may imply MF_READ|MF_WRITE.
+unsigned MemoryFlags[] = {
+ Memory::MF_READ,
+ Memory::MF_WRITE,
+ Memory::MF_READ|Memory::MF_WRITE,
+ Memory::MF_EXEC,
+ Memory::MF_READ|Memory::MF_EXEC,
+ Memory::MF_READ|Memory::MF_WRITE|Memory::MF_EXEC
+ };
+
+INSTANTIATE_TEST_CASE_P(AllocationTests,
+ MappedMemoryTest,
+ ::testing::ValuesIn(MemoryFlags));
+
+} // anonymous namespace
diff --git a/unittests/Support/formatted_raw_ostream_test.cpp b/unittests/Support/formatted_raw_ostream_test.cpp
new file mode 100644
index 000000000000..4725cedc2119
--- /dev/null
+++ b/unittests/Support/formatted_raw_ostream_test.cpp
@@ -0,0 +1,33 @@
+//===- llvm/unittest/Support/formatted_raw_ostream_test.cpp ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(formatted_raw_ostreamTest, Test_Tell) {
+ // Check offset when underlying stream has buffer contents.
+ SmallString<128> A;
+ raw_svector_ostream B(A);
+ formatted_raw_ostream C(B);
+ char tmp[100] = "";
+
+ for (unsigned i = 0; i != 3; ++i) {
+ C.write(tmp, 100);
+
+ EXPECT_EQ(100*(i+1), (unsigned) C.tell());
+ }
+}
+
+}
diff --git a/unittests/Transforms/Utils/CMakeLists.txt b/unittests/Transforms/Utils/CMakeLists.txt
index 365bfbb0bf03..730d83b838fb 100644
--- a/unittests/Transforms/Utils/CMakeLists.txt
+++ b/unittests/Transforms/Utils/CMakeLists.txt
@@ -4,5 +4,6 @@ set(LLVM_LINK_COMPONENTS
add_llvm_unittest(UtilsTests
Cloning.cpp
+ IntegerDivision.cpp
Local.cpp
)
diff --git a/unittests/Transforms/Utils/IntegerDivision.cpp b/unittests/Transforms/Utils/IntegerDivision.cpp
new file mode 100644
index 000000000000..a3211391d689
--- /dev/null
+++ b/unittests/Transforms/Utils/IntegerDivision.cpp
@@ -0,0 +1,142 @@
+//===- IntegerDivision.cpp - Unit tests for the integer division code -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Module.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(IntegerDivision, SDiv) {
+ LLVMContext &C(getGlobalContext());
+ Module M("test division", C);
+ IRBuilder<> Builder(C);
+
+ SmallVector<Type*, 2> ArgTys(2, Builder.getInt32Ty());
+ Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
+ ArgTys, false),
+ GlobalValue::ExternalLinkage, "F", &M);
+ assert(F->getArgumentList().size() == 2);
+
+ BasicBlock *BB = BasicBlock::Create(C, "", F);
+ Builder.SetInsertPoint(BB);
+
+ Function::arg_iterator AI = F->arg_begin();
+ Value *A = AI++;
+ Value *B = AI++;
+
+ Value *Div = Builder.CreateSDiv(A, B);
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::SDiv);
+
+ Value *Ret = Builder.CreateRet(Div);
+
+ expandDivision(cast<BinaryOperator>(Div));
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::AShr);
+
+ Instruction* Quotient = dyn_cast<Instruction>(cast<User>(Ret)->getOperand(0));
+ EXPECT_TRUE(Quotient && Quotient->getOpcode() == Instruction::Sub);
+}
+
+TEST(IntegerDivision, UDiv) {
+ LLVMContext &C(getGlobalContext());
+ Module M("test division", C);
+ IRBuilder<> Builder(C);
+
+ SmallVector<Type*, 2> ArgTys(2, Builder.getInt32Ty());
+ Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
+ ArgTys, false),
+ GlobalValue::ExternalLinkage, "F", &M);
+ assert(F->getArgumentList().size() == 2);
+
+ BasicBlock *BB = BasicBlock::Create(C, "", F);
+ Builder.SetInsertPoint(BB);
+
+ Function::arg_iterator AI = F->arg_begin();
+ Value *A = AI++;
+ Value *B = AI++;
+
+ Value *Div = Builder.CreateUDiv(A, B);
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::UDiv);
+
+ Value *Ret = Builder.CreateRet(Div);
+
+ expandDivision(cast<BinaryOperator>(Div));
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::ICmp);
+
+ Instruction* Quotient = dyn_cast<Instruction>(cast<User>(Ret)->getOperand(0));
+ EXPECT_TRUE(Quotient && Quotient->getOpcode() == Instruction::PHI);
+}
+
+TEST(IntegerDivision, SRem) {
+ LLVMContext &C(getGlobalContext());
+ Module M("test remainder", C);
+ IRBuilder<> Builder(C);
+
+ SmallVector<Type*, 2> ArgTys(2, Builder.getInt32Ty());
+ Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
+ ArgTys, false),
+ GlobalValue::ExternalLinkage, "F", &M);
+ assert(F->getArgumentList().size() == 2);
+
+ BasicBlock *BB = BasicBlock::Create(C, "", F);
+ Builder.SetInsertPoint(BB);
+
+ Function::arg_iterator AI = F->arg_begin();
+ Value *A = AI++;
+ Value *B = AI++;
+
+ Value *Rem = Builder.CreateSRem(A, B);
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::SRem);
+
+ Value *Ret = Builder.CreateRet(Rem);
+
+ expandRemainder(cast<BinaryOperator>(Rem));
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::AShr);
+
+ Instruction* Remainder = dyn_cast<Instruction>(cast<User>(Ret)->getOperand(0));
+ EXPECT_TRUE(Remainder && Remainder->getOpcode() == Instruction::Sub);
+}
+
+TEST(IntegerDivision, URem) {
+ LLVMContext &C(getGlobalContext());
+ Module M("test remainder", C);
+ IRBuilder<> Builder(C);
+
+ SmallVector<Type*, 2> ArgTys(2, Builder.getInt32Ty());
+ Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
+ ArgTys, false),
+ GlobalValue::ExternalLinkage, "F", &M);
+ assert(F->getArgumentList().size() == 2);
+
+ BasicBlock *BB = BasicBlock::Create(C, "", F);
+ Builder.SetInsertPoint(BB);
+
+ Function::arg_iterator AI = F->arg_begin();
+ Value *A = AI++;
+ Value *B = AI++;
+
+ Value *Rem = Builder.CreateURem(A, B);
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::URem);
+
+ Value *Ret = Builder.CreateRet(Rem);
+
+ expandRemainder(cast<BinaryOperator>(Rem));
+ EXPECT_TRUE(BB->front().getOpcode() == Instruction::ICmp);
+
+ Instruction* Remainder = dyn_cast<Instruction>(cast<User>(Ret)->getOperand(0));
+ EXPECT_TRUE(Remainder && Remainder->getOpcode() == Instruction::Sub);
+}
+
+}
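The tests above rely on expandDivision/expandRemainder rewriting a single sdiv/udiv/srem/urem in place into shift, compare, and subtract IR, which is why they inspect the opcode of the block's new front instruction afterwards. A rough sketch of driving the same entry points over a whole function, reusing only headers the test already includes; the helper name is invented and the implementation of this era handles 32-bit operands only, so this is illustrative rather than a drop-in utility:

#include "llvm/BasicBlock.h"
#include "llvm/Function.h"
#include "llvm/Instructions.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
#include <vector>

using namespace llvm;

// Expand every integer divide/remainder in F into inline IR.
static void expandAllDivRem(Function &F) {
  std::vector<BinaryOperator*> Worklist;
  for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&*I))
        switch (BO->getOpcode()) {
        case Instruction::SDiv: case Instruction::UDiv:
        case Instruction::SRem: case Instruction::URem:
          Worklist.push_back(BO);  // collect first; expansion mutates the block
          break;
        default:
          break;
        }

  for (unsigned i = 0, e = Worklist.size(); i != e; ++i) {
    unsigned Opc = Worklist[i]->getOpcode();
    if (Opc == Instruction::SDiv || Opc == Instruction::UDiv)
      expandDivision(Worklist[i]);
    else
      expandRemainder(Worklist[i]);
  }
}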
diff --git a/unittests/VMCore/IRBuilderTest.cpp b/unittests/VMCore/IRBuilderTest.cpp
index b6a3795fd0f4..9f26936df475 100644
--- a/unittests/VMCore/IRBuilderTest.cpp
+++ b/unittests/VMCore/IRBuilderTest.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/BasicBlock.h"
+#include "llvm/DataLayout.h"
#include "llvm/Function.h"
#include "llvm/IRBuilder.h"
#include "llvm/IntrinsicInst.h"
@@ -96,4 +97,15 @@ TEST_F(IRBuilderTest, CreateCondBr) {
EXPECT_EQ(Weights, TI->getMetadata(LLVMContext::MD_prof));
}
+TEST_F(IRBuilderTest, GetIntTy) {
+ IRBuilder<> Builder(BB);
+ IntegerType *Ty1 = Builder.getInt1Ty();
+ EXPECT_EQ(Ty1, IntegerType::get(getGlobalContext(), 1));
+
+ DataLayout* DL = new DataLayout(M.get());
+ IntegerType *IntPtrTy = Builder.getIntPtrTy(DL);
+ unsigned IntPtrBitSize = DL->getPointerSizeInBits(0);
+ EXPECT_EQ(IntPtrTy, IntegerType::get(getGlobalContext(), IntPtrBitSize));
+}
+
}
diff --git a/unittests/VMCore/InstructionsTest.cpp b/unittests/VMCore/InstructionsTest.cpp
index 72cdc8b99420..a3b13ce92d15 100644
--- a/unittests/VMCore/InstructionsTest.cpp
+++ b/unittests/VMCore/InstructionsTest.cpp
@@ -9,6 +9,7 @@
#include "llvm/BasicBlock.h"
#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
#include "llvm/DerivedTypes.h"
#include "llvm/IRBuilder.h"
#include "llvm/Instructions.h"
@@ -17,7 +18,6 @@
#include "llvm/Operator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Target/TargetData.h"
#include "gtest/gtest.h"
namespace llvm {
@@ -183,7 +183,7 @@ TEST(InstructionsTest, VectorGep) {
EXPECT_NE(S3, Gep3);
int64_t Offset;
- TargetData TD("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3"
+ DataLayout TD("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3"
"2:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80"
":128:128-n8:16:32:64-S128");
// Make sure we don't crash
@@ -243,5 +243,42 @@ TEST(InstructionsTest, FPMathOperator) {
delete I;
}
+
+TEST(InstructionsTest, isEliminableCastPair) {
+ LLVMContext &C(getGlobalContext());
+
+ Type* Int32Ty = Type::getInt32Ty(C);
+ Type* Int64Ty = Type::getInt64Ty(C);
+ Type* Int64PtrTy = Type::getInt64PtrTy(C);
+
+ // Source and destination pointers have same size -> bitcast.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
+ CastInst::IntToPtr,
+ Int64PtrTy, Int64Ty, Int64PtrTy,
+ Int32Ty, 0, Int32Ty),
+ CastInst::BitCast);
+
+ // Source and destination pointers have different sizes -> fail.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::PtrToInt,
+ CastInst::IntToPtr,
+ Int64PtrTy, Int64Ty, Int64PtrTy,
+ Int32Ty, 0, Int64Ty),
+ 0U);
+
+ // Middle pointer big enough -> bitcast.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt,
+ Int64Ty, Int64PtrTy, Int64Ty,
+ 0, Int64Ty, 0),
+ CastInst::BitCast);
+
+ // Middle pointer too small -> fail.
+ EXPECT_EQ(CastInst::isEliminableCastPair(CastInst::IntToPtr,
+ CastInst::PtrToInt,
+ Int64Ty, Int64PtrTy, Int64Ty,
+ 0, Int32Ty, 0),
+ 0U);
+}
+
} // end anonymous namespace
} // end namespace llvm
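For context on the new test: CastInst::isEliminableCastPair answers whether two adjacent casts can be folded into a single cast, returning the opcode to use or 0 when the pair must be kept; the extra IntPtrTy arguments describe the pointer-sized integer type for the ptrtoint/inttoptr cases, with 0 meaning unknown, which conservatively blocks those folds. A hedged sketch of the usual calling pattern; the foldCastPair wrapper is invented for illustration:

#include "llvm/Instructions.h"

using namespace llvm;

// Given  %m = cast1 %x to MidTy  and  %r = cast2 %m to DstTy, try to
// replace the pair with one cast inserted before the second one.
static Value *foldCastPair(CastInst *First, CastInst *Second, Type *IntPtrTy) {
  unsigned NewOpc = CastInst::isEliminableCastPair(
      First->getOpcode(), Second->getOpcode(),
      First->getSrcTy(), First->getDestTy(), Second->getDestTy(),
      IntPtrTy, IntPtrTy, IntPtrTy);
  if (!NewOpc)
    return 0;                                // not eliminable; keep both casts
  return CastInst::Create(Instruction::CastOps(NewOpc),
                          First->getOperand(0), Second->getDestTy(),
                          "", Second);       // the single replacement cast
}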
diff --git a/unittests/VMCore/PassManagerTest.cpp b/unittests/VMCore/PassManagerTest.cpp
index 60d33c19c337..9c070c84bbc7 100644
--- a/unittests/VMCore/PassManagerTest.cpp
+++ b/unittests/VMCore/PassManagerTest.cpp
@@ -14,7 +14,7 @@
#include "llvm/Pass.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/CallGraphSCCPass.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/DataLayout.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Constants.h"
@@ -94,7 +94,7 @@ namespace llvm {
initializeModuleNDMPass(*PassRegistry::getPassRegistry());
}
virtual bool runOnModule(Module &M) {
- EXPECT_TRUE(getAnalysisIfAvailable<TargetData>());
+ EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
run++;
return false;
}
@@ -167,7 +167,7 @@ namespace llvm {
initializeCGPassPass(*PassRegistry::getPassRegistry());
}
virtual bool runOnSCC(CallGraphSCC &SCMM) {
- EXPECT_TRUE(getAnalysisIfAvailable<TargetData>());
+ EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
run();
return false;
}
@@ -177,7 +177,7 @@ namespace llvm {
public:
virtual bool runOnFunction(Function &F) {
// FIXME: PR4112
- // EXPECT_TRUE(getAnalysisIfAvailable<TargetData>());
+ // EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
run();
return false;
}
@@ -204,7 +204,7 @@ namespace llvm {
return false;
}
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
- EXPECT_TRUE(getAnalysisIfAvailable<TargetData>());
+ EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
run();
return false;
}
@@ -241,7 +241,7 @@ namespace llvm {
return false;
}
virtual bool runOnBasicBlock(BasicBlock &BB) {
- EXPECT_TRUE(getAnalysisIfAvailable<TargetData>());
+ EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
run();
return false;
}
@@ -266,7 +266,7 @@ namespace llvm {
initializeFPassPass(*PassRegistry::getPassRegistry());
}
virtual bool runOnModule(Module &M) {
- EXPECT_TRUE(getAnalysisIfAvailable<TargetData>());
+ EXPECT_TRUE(getAnalysisIfAvailable<DataLayout>());
for (Module::iterator I=M.begin(),E=M.end(); I != E; ++I) {
Function &F = *I;
{
@@ -292,7 +292,7 @@ namespace llvm {
mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
PassManager Passes;
- Passes.add(new TargetData(&M));
+ Passes.add(new DataLayout(&M));
Passes.add(mNDM2);
Passes.add(mNDM);
Passes.add(mNDNM);
@@ -316,7 +316,7 @@ namespace llvm {
mNDM->run = mNDNM->run = mDNM->run = mNDM2->run = 0;
PassManager Passes;
- Passes.add(new TargetData(&M));
+ Passes.add(new DataLayout(&M));
Passes.add(mNDM);
Passes.add(mNDNM);
Passes.add(mNDM2);// invalidates mNDM needed by mDNM
@@ -338,7 +338,7 @@ namespace llvm {
OwningPtr<Module> M(makeLLVMModule());
T *P = new T();
PassManager Passes;
- Passes.add(new TargetData(M.get()));
+ Passes.add(new DataLayout(M.get()));
Passes.add(P);
Passes.run(*M);
T::finishedOK(run);
@@ -349,7 +349,7 @@ namespace llvm {
Module *M = makeLLVMModule();
T *P = new T();
PassManager Passes;
- Passes.add(new TargetData(M));
+ Passes.add(new DataLayout(M));
Passes.add(P);
Passes.run(*M);
T::finishedOK(run, N);
@@ -387,7 +387,7 @@ namespace llvm {
SCOPED_TRACE("Running OnTheFlyTest");
struct OnTheFlyTest *O = new OnTheFlyTest();
PassManager Passes;
- Passes.add(new TargetData(M));
+ Passes.add(new DataLayout(M));
Passes.add(O);
Passes.run(*M);
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 33f04ce64779..e79162867eba 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -470,7 +470,7 @@ static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB) {
continue;
}
- // If C is not a horizontal whitespace, skip it.
+ // If the current char is not horizontal whitespace, copy it to the output as is.
if (*Ptr != ' ' && *Ptr != '\t') {
NewFile.push_back(*Ptr);
continue;
@@ -537,11 +537,11 @@ static bool ReadCheckFile(SourceMgr &SM,
Buffer = Buffer.substr(CheckPrefix.size()+1);
} else if (Buffer.size() > CheckPrefix.size()+6 &&
memcmp(Buffer.data()+CheckPrefix.size(), "-NEXT:", 6) == 0) {
- Buffer = Buffer.substr(CheckPrefix.size()+7);
+ Buffer = Buffer.substr(CheckPrefix.size()+6);
IsCheckNext = true;
} else if (Buffer.size() > CheckPrefix.size()+5 &&
memcmp(Buffer.data()+CheckPrefix.size(), "-NOT:", 5) == 0) {
- Buffer = Buffer.substr(CheckPrefix.size()+6);
+ Buffer = Buffer.substr(CheckPrefix.size()+5);
IsCheckNot = true;
} else {
Buffer = Buffer.substr(1);
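The two offset corrections above follow the pattern already used for the bare prefix: that branch skips CheckPrefix.size()+1 characters (the prefix plus the ':'), so the "-NEXT:" and "-NOT:" branches should skip exactly 6 and 5 extra characters, not 7 and 6. A tiny worked example with the default prefix (illustrative only):

#include "llvm/ADT/StringRef.h"

static llvm::StringRef patternAfterCheckNext() {
  llvm::StringRef Buffer("CHECK-NEXT:foo bar");  // CheckPrefix == "CHECK", 5 chars
  // "-NEXT:" is 6 more chars; the old "+7" ate the 'f' and yielded "oo bar".
  return Buffer.substr(5 + 6);                   // == "foo bar"
}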
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 026d47f4bd77..ee83311c583b 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -77,7 +77,7 @@
//
// Some targets need a custom way to parse operands, some specific instructions
// can contain arguments that can represent processor flags and other kinds of
-// identifiers that need to be mapped to specific valeus in the final encoded
+// identifiers that need to be mapped to specific values in the final encoded
// instructions. The target specific custom operand parsing works in the
// following way:
//
@@ -199,7 +199,7 @@ public:
return Kind >= UserClass0;
}
- /// isRelatedTo - Check whether this class is "related" to \arg RHS. Classes
+ /// isRelatedTo - Check whether this class is "related" to \p RHS. Classes
/// are related if they are in the same class hierarchy.
bool isRelatedTo(const ClassInfo &RHS) const {
// Tokens are only related to tokens.
@@ -238,7 +238,7 @@ public:
return Root == RHSRoot;
}
- /// isSubsetOf - Test whether this class is a subset of \arg RHS;
+ /// isSubsetOf - Test whether this class is a subset of \p RHS.
bool isSubsetOf(const ClassInfo &RHS) const {
// This is a subset of RHS if it is the same class...
if (this == &RHS)
@@ -279,6 +279,15 @@ public:
}
};
+namespace {
+/// Sort ClassInfo pointers independently of pointer value.
+struct LessClassInfoPtr {
+ bool operator()(const ClassInfo *LHS, const ClassInfo *RHS) const {
+ return *LHS < *RHS;
+ }
+};
+}
+
/// MatchableInfo - Helper class for storing the necessary information for an
/// instruction or alias which is capable of being matched.
struct MatchableInfo {
@@ -416,7 +425,7 @@ struct MatchableInfo {
SmallVector<SubtargetFeatureInfo*, 4> RequiredFeatures;
/// ConversionFnKind - The enum value which is passed to the generated
- /// ConvertToMCInst to convert parsed operands into an MCInst for this
+ /// convertToMCInst to convert parsed operands into an MCInst for this
/// function.
std::string ConversionFnKind;
@@ -488,11 +497,20 @@ struct MatchableInfo {
return false;
}
+ // Give matches that require more features higher precedence. This is useful
+ // because we cannot define AssemblerPredicates with the negation of
+ // processor features. For example, ARM v6 "nop" may be either a HINT or
+ // MOV. With v6, we want to match HINT. The assembler has no way to
+ // predicate MOV under "NoV6", but HINT will always match first because it
+ // requires V6 while MOV does not.
+ if (RequiredFeatures.size() != RHS.RequiredFeatures.size())
+ return RequiredFeatures.size() > RHS.RequiredFeatures.size();
+
return false;
}
/// couldMatchAmbiguouslyWith - Check whether this matchable could
- /// ambiguously match the same set of operands as \arg RHS (without being a
+ /// ambiguously match the same set of operands as \p RHS (without being a
/// strictly superior match).
bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) {
// The primary comparator is the instruction mnemonic.
@@ -590,7 +608,8 @@ public:
std::vector<OperandMatchEntry> OperandMatchInfo;
/// Map of Register records to their class information.
- std::map<Record*, ClassInfo*> RegisterClasses;
+ typedef std::map<Record*, ClassInfo*, LessRecordByID> RegisterClassesTy;
+ RegisterClassesTy RegisterClasses;
/// Map of Predicate records to their subtarget information.
std::map<Record*, SubtargetFeatureInfo*> SubtargetFeatures;
@@ -666,22 +685,22 @@ void MatchableInfo::dump() {
}
static std::pair<StringRef, StringRef>
-parseTwoOperandConstraint(StringRef S, SMLoc Loc) {
+parseTwoOperandConstraint(StringRef S, ArrayRef<SMLoc> Loc) {
// Split via the '='.
std::pair<StringRef, StringRef> Ops = S.split('=');
if (Ops.second == "")
- throw TGError(Loc, "missing '=' in two-operand alias constraint");
+ PrintFatalError(Loc, "missing '=' in two-operand alias constraint");
// Trim whitespace and the leading '$' on the operand names.
size_t start = Ops.first.find_first_of('$');
if (start == std::string::npos)
- throw TGError(Loc, "expected '$' prefix on asm operand name");
+ PrintFatalError(Loc, "expected '$' prefix on asm operand name");
Ops.first = Ops.first.slice(start + 1, std::string::npos);
size_t end = Ops.first.find_last_of(" \t");
Ops.first = Ops.first.slice(0, end);
// Now the second operand.
start = Ops.second.find_first_of('$');
if (start == std::string::npos)
- throw TGError(Loc, "expected '$' prefix on asm operand name");
+ PrintFatalError(Loc, "expected '$' prefix on asm operand name");
Ops.second = Ops.second.slice(start + 1, std::string::npos);
end = Ops.second.find_last_of(" \t");
Ops.first = Ops.first.slice(0, end);
@@ -697,11 +716,11 @@ void MatchableInfo::formTwoOperandAlias(StringRef Constraint) {
int SrcAsmOperand = findAsmOperandNamed(Ops.first);
int DstAsmOperand = findAsmOperandNamed(Ops.second);
if (SrcAsmOperand == -1)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"unknown source two-operand alias operand '" +
Ops.first.str() + "'.");
if (DstAsmOperand == -1)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"unknown destination two-operand alias operand '" +
Ops.second.str() + "'.");
@@ -833,15 +852,15 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info) {
// The first token of the instruction is the mnemonic, which must be a
// simple string, not a $foo variable or a singleton register.
if (AsmOperands.empty())
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"Instruction '" + TheDef->getName() + "' has no tokens");
Mnemonic = AsmOperands[0].Token;
if (Mnemonic.empty())
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"Missing instruction mnemonic");
// FIXME : Check and raise an error if it is a register.
if (Mnemonic[0] == '$')
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"Invalid instruction mnemonic '" + Mnemonic.str() + "'!");
// Remove the first operand, it is tracked in the mnemonic field.
@@ -851,12 +870,12 @@ void MatchableInfo::tokenizeAsmString(const AsmMatcherInfo &Info) {
bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
// Reject matchables with no .s string.
if (AsmString.empty())
- throw TGError(TheDef->getLoc(), "instruction with empty asm string");
+ PrintFatalError(TheDef->getLoc(), "instruction with empty asm string");
// Reject any matchables with a newline in them, they should be marked
// isCodeGenOnly if they are pseudo instructions.
if (AsmString.find('\n') != std::string::npos)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"multiline instruction is not valid for the asmparser, "
"mark it isCodeGenOnly");
@@ -864,7 +883,7 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
// has one line.
if (!CommentDelimiter.empty() &&
StringRef(AsmString).find(CommentDelimiter) != StringRef::npos)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"asmstring for instruction has comment character in it, "
"mark it isCodeGenOnly");
@@ -878,7 +897,7 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
for (unsigned i = 0, e = AsmOperands.size(); i != e; ++i) {
StringRef Tok = AsmOperands[i].Token;
if (Tok[0] == '$' && Tok.find(':') != StringRef::npos)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"matchable with operand modifier '" + Tok.str() +
"' not supported by asm matcher. Mark isCodeGenOnly!");
@@ -886,7 +905,7 @@ bool MatchableInfo::validate(StringRef CommentDelimiter, bool Hack) const {
// We reject aliases and ignore instructions for now.
if (Tok[0] == '$' && !OperandNames.insert(Tok).second) {
if (!Hack)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"ERROR: matchable with tied operand '" + Tok.str() +
"' can never be matched!");
// FIXME: Should reject these. The ARM backend hits this with $lane in a
@@ -974,7 +993,7 @@ AsmMatcherInfo::getOperandClass(const CGIOperandList::OperandInfo &OI,
int SubOpIdx) {
Record *Rec = OI.Rec;
if (SubOpIdx != -1)
- Rec = dynamic_cast<DefInit*>(OI.MIOperandInfo->getArg(SubOpIdx))->getDef();
+ Rec = cast<DefInit>(OI.MIOperandInfo->getArg(SubOpIdx))->getDef();
return getOperandClass(Rec, SubOpIdx);
}
@@ -985,10 +1004,10 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
// use it, else just fall back to the underlying register class.
const RecordVal *R = Rec->getValue("ParserMatchClass");
if (R == 0 || R->getValue() == 0)
- throw "Record `" + Rec->getName() +
- "' does not have a ParserMatchClass!\n";
+ PrintFatalError("Record `" + Rec->getName() +
+ "' does not have a ParserMatchClass!\n");
- if (DefInit *DI= dynamic_cast<DefInit*>(R->getValue())) {
+ if (DefInit *DI= dyn_cast<DefInit>(R->getValue())) {
Record *MatchClass = DI->getDef();
if (ClassInfo *CI = AsmOperandClasses[MatchClass])
return CI;
@@ -997,26 +1016,28 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
// No custom match class. Just use the register class.
Record *ClassRec = Rec->getValueAsDef("RegClass");
if (!ClassRec)
- throw TGError(Rec->getLoc(), "RegisterOperand `" + Rec->getName() +
+ PrintFatalError(Rec->getLoc(), "RegisterOperand `" + Rec->getName() +
"' has no associated register class!\n");
if (ClassInfo *CI = RegisterClassClasses[ClassRec])
return CI;
- throw TGError(Rec->getLoc(), "register class has no class info!");
+ PrintFatalError(Rec->getLoc(), "register class has no class info!");
}
if (Rec->isSubClassOf("RegisterClass")) {
if (ClassInfo *CI = RegisterClassClasses[Rec])
return CI;
- throw TGError(Rec->getLoc(), "register class has no class info!");
+ PrintFatalError(Rec->getLoc(), "register class has no class info!");
}
- assert(Rec->isSubClassOf("Operand") && "Unexpected operand!");
+ if (!Rec->isSubClassOf("Operand"))
+ PrintFatalError(Rec->getLoc(), "Operand `" + Rec->getName() +
+ "' does not derive from class Operand!\n");
Record *MatchClass = Rec->getValueAsDef("ParserMatchClass");
if (ClassInfo *CI = AsmOperandClasses[MatchClass])
return CI;
- throw TGError(Rec->getLoc(), "operand has no match class!");
+ PrintFatalError(Rec->getLoc(), "operand has no match class!");
}
void AsmMatcherInfo::
@@ -1164,7 +1185,7 @@ void AsmMatcherInfo::buildOperandClasses() {
ListInit *Supers = (*it)->getValueAsListInit("SuperClasses");
for (unsigned i = 0, e = Supers->getSize(); i != e; ++i) {
- DefInit *DI = dynamic_cast<DefInit*>(Supers->getElement(i));
+ DefInit *DI = dyn_cast<DefInit>(Supers->getElement(i));
if (!DI) {
PrintError((*it)->getLoc(), "Invalid super class reference!");
continue;
@@ -1182,33 +1203,31 @@ void AsmMatcherInfo::buildOperandClasses() {
// Get or construct the predicate method name.
Init *PMName = (*it)->getValueInit("PredicateMethod");
- if (StringInit *SI = dynamic_cast<StringInit*>(PMName)) {
+ if (StringInit *SI = dyn_cast<StringInit>(PMName)) {
CI->PredicateMethod = SI->getValue();
} else {
- assert(dynamic_cast<UnsetInit*>(PMName) &&
- "Unexpected PredicateMethod field!");
+ assert(isa<UnsetInit>(PMName) && "Unexpected PredicateMethod field!");
CI->PredicateMethod = "is" + CI->ClassName;
}
// Get or construct the render method name.
Init *RMName = (*it)->getValueInit("RenderMethod");
- if (StringInit *SI = dynamic_cast<StringInit*>(RMName)) {
+ if (StringInit *SI = dyn_cast<StringInit>(RMName)) {
CI->RenderMethod = SI->getValue();
} else {
- assert(dynamic_cast<UnsetInit*>(RMName) &&
- "Unexpected RenderMethod field!");
+ assert(isa<UnsetInit>(RMName) && "Unexpected RenderMethod field!");
CI->RenderMethod = "add" + CI->ClassName + "Operands";
}
// Get the parse method name or leave it as empty.
Init *PRMName = (*it)->getValueInit("ParserMethod");
- if (StringInit *SI = dynamic_cast<StringInit*>(PRMName))
+ if (StringInit *SI = dyn_cast<StringInit>(PRMName))
CI->ParserMethod = SI->getValue();
// Get the diagnostic type or leave it as empty.
// Get the parse method name or leave it as empty.
Init *DiagnosticType = (*it)->getValueInit("DiagnosticType");
- if (StringInit *SI = dynamic_cast<StringInit*>(DiagnosticType))
+ if (StringInit *SI = dyn_cast<StringInit>(DiagnosticType))
CI->DiagnosticType = SI->getValue();
AsmOperandClasses[*it] = CI;
@@ -1228,7 +1247,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
/// Map containing a mask with all operands indices that can be found for
/// that class inside a instruction.
- std::map<ClassInfo*, unsigned> OpClassMask;
+ typedef std::map<ClassInfo*, unsigned, LessClassInfoPtr> OpClassMaskTy;
+ OpClassMaskTy OpClassMask;
for (std::vector<MatchableInfo*>::const_iterator it =
Matchables.begin(), ie = Matchables.end();
@@ -1247,7 +1267,7 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
}
// Generate operand match info for each mnemonic/operand class pair.
- for (std::map<ClassInfo*, unsigned>::iterator iit = OpClassMask.begin(),
+ for (OpClassMaskTy::iterator iit = OpClassMask.begin(),
iie = OpClassMask.end(); iit != iie; ++iit) {
unsigned OpMask = iit->second;
ClassInfo *CI = iit->first;
@@ -1267,7 +1287,7 @@ void AsmMatcherInfo::buildInfo() {
continue;
if (Pred->getName().empty())
- throw TGError(Pred->getLoc(), "Predicate has no name!");
+ PrintFatalError(Pred->getLoc(), "Predicate has no name!");
unsigned FeatureNo = SubtargetFeatures.size();
SubtargetFeatures[Pred] = new SubtargetFeatureInfo(Pred, FeatureNo);
@@ -1448,7 +1468,7 @@ void AsmMatcherInfo::buildInfo() {
ClassInfo *FromClass = getTokenClass(Rec->getValueAsString("FromToken"));
ClassInfo *ToClass = getTokenClass(Rec->getValueAsString("ToToken"));
if (FromClass == ToClass)
- throw TGError(Rec->getLoc(),
+ PrintFatalError(Rec->getLoc(),
"error: Destination value identical to source value.");
FromClass->SuperClasses.push_back(ToClass);
}
@@ -1470,7 +1490,7 @@ buildInstructionOperandReference(MatchableInfo *II,
// Map this token to an operand.
unsigned Idx;
if (!Operands.hasOperandNamed(OperandName, Idx))
- throw TGError(II->TheDef->getLoc(), "error: unable to find operand: '" +
+ PrintFatalError(II->TheDef->getLoc(), "error: unable to find operand: '" +
OperandName.str() + "'");
// If the instruction operand has multiple suboperands, but the parser
@@ -1541,7 +1561,7 @@ void AsmMatcherInfo::buildAliasOperandReference(MatchableInfo *II,
return;
}
- throw TGError(II->TheDef->getLoc(), "error: unable to find operand: '" +
+ PrintFatalError(II->TheDef->getLoc(), "error: unable to find operand: '" +
OperandName.str() + "'");
}
@@ -1563,7 +1583,7 @@ void MatchableInfo::buildInstructionResultOperands() {
// Find out what operand from the asmparser this MCInst operand comes from.
int SrcOperand = findAsmOperandNamed(OpInfo.Name);
if (OpInfo.Name.empty() || SrcOperand == -1)
- throw TGError(TheDef->getLoc(), "Instruction '" +
+ PrintFatalError(TheDef->getLoc(), "Instruction '" +
TheDef->getName() + "' has operand '" + OpInfo.Name +
"' that doesn't appear in asm string!");
@@ -1615,7 +1635,7 @@ void MatchableInfo::buildAliasResultOperands() {
StringRef Name = CGA.ResultOperands[AliasOpNo].getName();
int SrcOperand = findAsmOperand(Name, SubIdx);
if (SrcOperand == -1)
- throw TGError(TheDef->getLoc(), "Instruction '" +
+ PrintFatalError(TheDef->getLoc(), "Instruction '" +
TheDef->getName() + "' has operand '" + OpName +
"' that doesn't appear in asm string!");
unsigned NumOperands = (SubIdx == -1 ? OpInfo->MINumOperands : 1);
@@ -1638,35 +1658,85 @@ void MatchableInfo::buildAliasResultOperands() {
}
}
-static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
- std::vector<MatchableInfo*> &Infos,
- raw_ostream &OS) {
- // Write the convert function to a separate stream, so we can drop it after
- // the enum.
- std::string ConvertFnBody;
- raw_string_ostream CvtOS(ConvertFnBody);
+static unsigned getConverterOperandID(const std::string &Name,
+ SetVector<std::string> &Table,
+ bool &IsNew) {
+ IsNew = Table.insert(Name);
- // Function we have already generated.
- std::set<std::string> GeneratedFns;
+ unsigned ID = IsNew ? Table.size() - 1 :
+ std::find(Table.begin(), Table.end(), Name) - Table.begin();
- // Start the unified conversion function.
- CvtOS << "bool " << Target.getName() << ClassName << "::\n";
- CvtOS << "ConvertToMCInst(unsigned Kind, MCInst &Inst, "
- << "unsigned Opcode,\n"
- << " const SmallVectorImpl<MCParsedAsmOperand*"
- << "> &Operands) {\n";
- CvtOS << " Inst.setOpcode(Opcode);\n";
- CvtOS << " switch (Kind) {\n";
- CvtOS << " default:\n";
+ assert(ID < Table.size());
+
+ return ID;
+}
- // Start the enum, which we will generate inline.
- OS << "// Unified function for converting operands to MCInst instances.\n\n";
- OS << "enum ConversionKind {\n";
+static void emitConvertFuncs(CodeGenTarget &Target, StringRef ClassName,
+ std::vector<MatchableInfo*> &Infos,
+ raw_ostream &OS) {
+ SetVector<std::string> OperandConversionKinds;
+ SetVector<std::string> InstructionConversionKinds;
+ std::vector<std::vector<uint8_t> > ConversionTable;
+ size_t MaxRowLength = 2; // minimum is custom converter plus terminator.
// TargetOperandClass - This is the target's operand class, like X86Operand.
std::string TargetOperandClass = Target.getName() + "Operand";
+ // Write the convert function to a separate stream, so we can drop it after
+ // the enum. We'll build up the conversion handlers for the individual
+ // operand types opportunistically as we encounter them.
+ std::string ConvertFnBody;
+ raw_string_ostream CvtOS(ConvertFnBody);
+ // Start the unified conversion function.
+ CvtOS << "void " << Target.getName() << ClassName << "::\n"
+ << "convertToMCInst(unsigned Kind, MCInst &Inst, "
+ << "unsigned Opcode,\n"
+ << " const SmallVectorImpl<MCParsedAsmOperand*"
+ << "> &Operands) {\n"
+ << " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
+ << " const uint8_t *Converter = ConversionTable[Kind];\n"
+ << " Inst.setOpcode(Opcode);\n"
+ << " for (const uint8_t *p = Converter; *p; p+= 2) {\n"
+ << " switch (*p) {\n"
+ << " default: llvm_unreachable(\"invalid conversion entry!\");\n"
+ << " case CVT_Reg:\n"
+ << " static_cast<" << TargetOperandClass
+ << "*>(Operands[*(p + 1)])->addRegOperands(Inst, 1);\n"
+ << " break;\n"
+ << " case CVT_Tied:\n"
+ << " Inst.addOperand(Inst.getOperand(*(p + 1)));\n"
+ << " break;\n";
+
+ std::string OperandFnBody;
+ raw_string_ostream OpOS(OperandFnBody);
+ // Start the operand number lookup function.
+ OpOS << "void " << Target.getName() << ClassName << "::\n"
+ << "convertToMapAndConstraints(unsigned Kind,\n";
+ OpOS.indent(27);
+ OpOS << "const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {\n"
+ << " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
+ << " unsigned NumMCOperands = 0;\n"
+ << " const uint8_t *Converter = ConversionTable[Kind];\n"
+ << " for (const uint8_t *p = Converter; *p; p+= 2) {\n"
+ << " switch (*p) {\n"
+ << " default: llvm_unreachable(\"invalid conversion entry!\");\n"
+ << " case CVT_Reg:\n"
+ << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n"
+ << " Operands[*(p + 1)]->setConstraint(\"m\");\n"
+ << " ++NumMCOperands;\n"
+ << " break;\n"
+ << " case CVT_Tied:\n"
+ << " ++NumMCOperands;\n"
+ << " break;\n";
+
+ // Pre-populate the operand conversion kinds with the standard always
+ // available entries.
+ OperandConversionKinds.insert("CVT_Done");
+ OperandConversionKinds.insert("CVT_Reg");
+ OperandConversionKinds.insert("CVT_Tied");
+ enum { CVT_Done, CVT_Reg, CVT_Tied };
+
for (std::vector<MatchableInfo*>::const_iterator it = Infos.begin(),
ie = Infos.end(); it != ie; ++it) {
MatchableInfo &II = **it;
@@ -1679,24 +1749,35 @@ static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
II.ConversionFnKind = Signature;
// Check if we have already generated this signature.
- if (!GeneratedFns.insert(Signature).second)
+ if (!InstructionConversionKinds.insert(Signature))
continue;
- // If not, emit it now. Add to the enum list.
- OS << " " << Signature << ",\n";
+ // Remember this converter for the kind enum.
+ unsigned KindID = OperandConversionKinds.size();
+ OperandConversionKinds.insert("CVT_" + AsmMatchConverter);
+
+ // Add the converter row for this instruction.
+ ConversionTable.push_back(std::vector<uint8_t>());
+ ConversionTable.back().push_back(KindID);
+ ConversionTable.back().push_back(CVT_Done);
+
+ // Add the handler to the conversion driver function.
+ CvtOS << " case CVT_" << AsmMatchConverter << ":\n"
+ << " " << AsmMatchConverter << "(Inst, Operands);\n"
+ << " break;\n";
- CvtOS << " case " << Signature << ":\n";
- CvtOS << " return " << AsmMatchConverter
- << "(Inst, Opcode, Operands);\n";
+ // FIXME: Handle the operand number lookup for custom match functions.
continue;
}
// Build the conversion function signature.
std::string Signature = "Convert";
- std::string CaseBody;
- raw_string_ostream CaseOS(CaseBody);
+
+ std::vector<uint8_t> ConversionRow;
// Compute the convert enum and the case body.
+ MaxRowLength = std::max(MaxRowLength, II.ResOperands.size()*2 + 1);
+
for (unsigned i = 0, e = II.ResOperands.size(); i != e; ++i) {
const MatchableInfo::ResOperand &OpInfo = II.ResOperands[i];
@@ -1709,74 +1790,180 @@ static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
// Registers are always converted the same, don't duplicate the
// conversion function based on them.
Signature += "__";
- if (Op.Class->isRegisterClass())
- Signature += "Reg";
- else
- Signature += Op.Class->ClassName;
+ std::string Class;
+ Class = Op.Class->isRegisterClass() ? "Reg" : Op.Class->ClassName;
+ Signature += Class;
Signature += utostr(OpInfo.MINumOperands);
Signature += "_" + itostr(OpInfo.AsmOperandNum);
- CaseOS << " ((" << TargetOperandClass << "*)Operands["
- << (OpInfo.AsmOperandNum+1) << "])->" << Op.Class->RenderMethod
- << "(Inst, " << OpInfo.MINumOperands << ");\n";
+ // Add the conversion kind, if necessary, and get the associated ID
+ // (the index of its entry in the vector).
+ std::string Name = "CVT_" + (Op.Class->isRegisterClass() ? "Reg" :
+ Op.Class->RenderMethod);
+
+ bool IsNewConverter = false;
+ unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
+ IsNewConverter);
+
+ // Add the operand entry to the instruction kind conversion row.
+ ConversionRow.push_back(ID);
+ ConversionRow.push_back(OpInfo.AsmOperandNum + 1);
+
+ if (!IsNewConverter)
+ break;
+
+ // This is a new operand kind. Add a handler for it to the
+ // converter driver.
+ CvtOS << " case " << Name << ":\n"
+ << " static_cast<" << TargetOperandClass
+ << "*>(Operands[*(p + 1)])->"
+ << Op.Class->RenderMethod << "(Inst, " << OpInfo.MINumOperands
+ << ");\n"
+ << " break;\n";
+
+ // Add a handler for the operand number lookup.
+ OpOS << " case " << Name << ":\n"
+ << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n"
+ << " Operands[*(p + 1)]->setConstraint(\"m\");\n"
+ << " NumMCOperands += " << OpInfo.MINumOperands << ";\n"
+ << " break;\n";
break;
}
-
case MatchableInfo::ResOperand::TiedOperand: {
// If this operand is tied to a previous one, just copy the MCInst
// operand from the earlier one. We can only tie single MCOperand values.
//assert(OpInfo.MINumOperands == 1 && "Not a singular MCOperand");
unsigned TiedOp = OpInfo.TiedOperandNum;
assert(i > TiedOp && "Tied operand precedes its target!");
- CaseOS << " Inst.addOperand(Inst.getOperand(" << TiedOp << "));\n";
Signature += "__Tie" + utostr(TiedOp);
+ ConversionRow.push_back(CVT_Tied);
+ ConversionRow.push_back(TiedOp);
+ // FIXME: Handle the operand number lookup for tied operands.
break;
}
case MatchableInfo::ResOperand::ImmOperand: {
int64_t Val = OpInfo.ImmVal;
- CaseOS << " Inst.addOperand(MCOperand::CreateImm(" << Val << "));\n";
- Signature += "__imm" + itostr(Val);
+ std::string Ty = "imm_" + itostr(Val);
+ Signature += "__" + Ty;
+
+ std::string Name = "CVT_" + Ty;
+ bool IsNewConverter = false;
+ unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
+ IsNewConverter);
+ // Add the operand entry to the instruction kind conversion row.
+ ConversionRow.push_back(ID);
+ ConversionRow.push_back(0);
+
+ if (!IsNewConverter)
+ break;
+
+ CvtOS << " case " << Name << ":\n"
+ << " Inst.addOperand(MCOperand::CreateImm(" << Val << "));\n"
+ << " break;\n";
+
+ OpOS << " case " << Name << ":\n"
+ << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n"
+ << " Operands[*(p + 1)]->setConstraint(\"\");\n"
+ << " ++NumMCOperands;\n"
+ << " break;\n";
break;
}
case MatchableInfo::ResOperand::RegOperand: {
+ std::string Reg, Name;
if (OpInfo.Register == 0) {
- CaseOS << " Inst.addOperand(MCOperand::CreateReg(0));\n";
- Signature += "__reg0";
+ Name = "reg0";
+ Reg = "0";
} else {
- std::string N = getQualifiedName(OpInfo.Register);
- CaseOS << " Inst.addOperand(MCOperand::CreateReg(" << N << "));\n";
- Signature += "__reg" + OpInfo.Register->getName();
+ Reg = getQualifiedName(OpInfo.Register);
+ Name = "reg" + OpInfo.Register->getName();
}
+ Signature += "__" + Name;
+ Name = "CVT_" + Name;
+ bool IsNewConverter = false;
+ unsigned ID = getConverterOperandID(Name, OperandConversionKinds,
+ IsNewConverter);
+ // Add the operand entry to the instruction kind conversion row.
+ ConversionRow.push_back(ID);
+ ConversionRow.push_back(0);
+
+ if (!IsNewConverter)
+ break;
+ CvtOS << " case " << Name << ":\n"
+ << " Inst.addOperand(MCOperand::CreateReg(" << Reg << "));\n"
+ << " break;\n";
+
+ OpOS << " case " << Name << ":\n"
+ << " Operands[*(p + 1)]->setMCOperandNum(NumMCOperands);\n"
+ << " Operands[*(p + 1)]->setConstraint(\"m\");\n"
+ << " ++NumMCOperands;\n"
+ << " break;\n";
}
}
}
+ // If there were no operands, add to the signature to that effect
+ if (Signature == "Convert")
+ Signature += "_NoOperands";
+
II.ConversionFnKind = Signature;
- // Check if we have already generated this signature.
- if (!GeneratedFns.insert(Signature).second)
+ // Save the signature. If we already have it, don't add a new row
+ // to the table.
+ if (!InstructionConversionKinds.insert(Signature))
continue;
- // If not, emit it now. Add to the enum list.
- OS << " " << Signature << ",\n";
-
- CvtOS << " case " << Signature << ":\n";
- CvtOS << CaseOS.str();
- CvtOS << " return true;\n";
+ // Add the row to the table.
+ ConversionTable.push_back(ConversionRow);
}
- // Finish the convert function.
+ // Finish up the converter driver function.
+ CvtOS << " }\n }\n}\n\n";
+
+ // Finish up the operand number lookup function.
+ OpOS << " }\n }\n}\n\n";
- CvtOS << " }\n";
- CvtOS << " return false;\n";
- CvtOS << "}\n\n";
+ OS << "namespace {\n";
+
+ // Output the operand conversion kind enum.
+ OS << "enum OperatorConversionKind {\n";
+ for (unsigned i = 0, e = OperandConversionKinds.size(); i != e; ++i)
+ OS << " " << OperandConversionKinds[i] << ",\n";
+ OS << " CVT_NUM_CONVERTERS\n";
+ OS << "};\n\n";
+
+ // Output the instruction conversion kind enum.
+ OS << "enum InstructionConversionKind {\n";
+ for (SetVector<std::string>::const_iterator
+ i = InstructionConversionKinds.begin(),
+ e = InstructionConversionKinds.end(); i != e; ++i)
+ OS << " " << *i << ",\n";
+ OS << " CVT_NUM_SIGNATURES\n";
+ OS << "};\n\n";
+
+
+ OS << "} // end anonymous namespace\n\n";
- // Finish the enum, and drop the convert function after it.
+ // Output the conversion table.
+ OS << "static const uint8_t ConversionTable[CVT_NUM_SIGNATURES]["
+ << MaxRowLength << "] = {\n";
+
+ for (unsigned Row = 0, ERow = ConversionTable.size(); Row != ERow; ++Row) {
+ assert(ConversionTable[Row].size() % 2 == 0 && "bad conversion row!");
+ OS << " // " << InstructionConversionKinds[Row] << "\n";
+ OS << " { ";
+ for (unsigned i = 0, e = ConversionTable[Row].size(); i != e; i += 2)
+ OS << OperandConversionKinds[ConversionTable[Row][i]] << ", "
+ << (unsigned)(ConversionTable[Row][i + 1]) << ", ";
+ OS << "CVT_Done },\n";
+ }
- OS << " NumConversionVariants\n";
OS << "};\n\n";
+ // Spit out the conversion driver function.
OS << CvtOS.str();
+
+ // Spit out the operand number lookup function.
+ OS << OpOS.str();
}
/// emitMatchClassEnumeration - Emit the enumeration for match class kinds.
@@ -1853,7 +2040,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
OS << " MatchClassKind OpKind;\n";
OS << " switch (Operand.getReg()) {\n";
OS << " default: OpKind = InvalidMatchClass; break;\n";
- for (std::map<Record*, ClassInfo*>::iterator
+ for (AsmMatcherInfo::RegisterClassesTy::iterator
it = Info.RegisterClasses.begin(), ie = Info.RegisterClasses.end();
it != ie; ++it)
OS << " case " << Info.Target.getName() << "::"
@@ -1874,7 +2061,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
static void emitIsSubclass(CodeGenTarget &Target,
std::vector<ClassInfo*> &Infos,
raw_ostream &OS) {
- OS << "/// isSubclass - Compute whether \\arg A is a subclass of \\arg B.\n";
+ OS << "/// isSubclass - Compute whether \\p A is a subclass of \\p B.\n";
OS << "static bool isSubclass(MatchClassKind A, MatchClassKind B) {\n";
OS << " if (A == B)\n";
OS << " return true;\n\n";
@@ -2083,7 +2270,7 @@ static std::string GetAliasRequiredFeatures(Record *R,
SubtargetFeatureInfo *F = Info.getSubtargetFeature(ReqFeatures[i]);
if (F == 0)
- throw TGError(R->getLoc(), "Predicate '" + ReqFeatures[i]->getName() +
+ PrintFatalError(R->getLoc(), "Predicate '" + ReqFeatures[i]->getName() +
"' is not marked as an AssemblerPredicate!");
if (NumFeatures)
@@ -2146,14 +2333,14 @@ static bool emitMnemonicAliases(raw_ostream &OS, const AsmMatcherInfo &Info) {
// We can't have two aliases from the same mnemonic with no predicate.
PrintError(ToVec[AliasWithNoPredicate]->getLoc(),
"two MnemonicAliases with the same 'from' mnemonic!");
- throw TGError(R->getLoc(), "this is the other MnemonicAlias.");
+ PrintFatalError(R->getLoc(), "this is the other MnemonicAlias.");
}
AliasWithNoPredicate = i;
continue;
}
if (R->getValueAsString("ToMnemonic") == I->first)
- throw TGError(R->getLoc(), "MnemonicAlias to the same string");
+ PrintFatalError(R->getLoc(), "MnemonicAlias to the same string");
if (!MatchCode.empty())
MatchCode += "else ";
@@ -2189,17 +2376,27 @@ static const char *getMinimalTypeForRange(uint64_t Range) {
}
static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
- const AsmMatcherInfo &Info, StringRef ClassName) {
+ const AsmMatcherInfo &Info, StringRef ClassName,
+ StringToOffsetTable &StringTable,
+ unsigned MaxMnemonicIndex) {
+ unsigned MaxMask = 0;
+ for (std::vector<OperandMatchEntry>::const_iterator it =
+ Info.OperandMatchInfo.begin(), ie = Info.OperandMatchInfo.end();
+ it != ie; ++it) {
+ MaxMask |= it->OperandMask;
+ }
+
// Emit the static custom operand parsing table.
OS << "namespace {\n";
OS << " struct OperandMatchEntry {\n";
- OS << " static const char *const MnemonicTable;\n";
- OS << " uint32_t OperandMask;\n";
- OS << " uint32_t Mnemonic;\n";
OS << " " << getMinimalTypeForRange(1ULL << Info.SubtargetFeatures.size())
<< " RequiredFeatures;\n";
+ OS << " " << getMinimalTypeForRange(MaxMnemonicIndex)
+ << " Mnemonic;\n";
OS << " " << getMinimalTypeForRange(Info.Classes.size())
- << " Class;\n\n";
+ << " Class;\n";
+ OS << " " << getMinimalTypeForRange(MaxMask)
+ << " OperandMask;\n\n";
OS << " StringRef getMnemonic() const {\n";
OS << " return StringRef(MnemonicTable + Mnemonic + 1,\n";
OS << " MnemonicTable[Mnemonic]);\n";
@@ -2222,8 +2419,6 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
OS << "} // end anonymous namespace.\n\n";
- StringToOffsetTable StringTable;
-
OS << "static const OperandMatchEntry OperandMatchTable["
<< Info.OperandMatchInfo.size() << "] = {\n";
@@ -2234,8 +2429,25 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
const OperandMatchEntry &OMI = *it;
const MatchableInfo &II = *OMI.MI;
- OS << " { " << OMI.OperandMask;
+ OS << " { ";
+ // Write the required features mask.
+ if (!II.RequiredFeatures.empty()) {
+ for (unsigned i = 0, e = II.RequiredFeatures.size(); i != e; ++i) {
+ if (i) OS << "|";
+ OS << II.RequiredFeatures[i]->getEnumName();
+ }
+ } else
+ OS << "0";
+
+ // Store a pascal-style length byte in the mnemonic.
+ std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
+ OS << ", " << StringTable.GetOrAddStringOffset(LenMnemonic, false)
+ << " /* " << II.Mnemonic << " */, ";
+
+ OS << OMI.CI->Name;
+
+ OS << ", " << OMI.OperandMask;
OS << " /* ";
bool printComma = false;
for (int i = 0, e = 31; i !=e; ++i)
@@ -2247,30 +2459,10 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
}
OS << " */";
- // Store a pascal-style length byte in the mnemonic.
- std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
- OS << ", " << StringTable.GetOrAddStringOffset(LenMnemonic, false)
- << " /* " << II.Mnemonic << " */, ";
-
- // Write the required features mask.
- if (!II.RequiredFeatures.empty()) {
- for (unsigned i = 0, e = II.RequiredFeatures.size(); i != e; ++i) {
- if (i) OS << "|";
- OS << II.RequiredFeatures[i]->getEnumName();
- }
- } else
- OS << "0";
-
- OS << ", " << OMI.CI->Name;
-
OS << " },\n";
}
OS << "};\n\n";
- OS << "const char *const OperandMatchEntry::MnemonicTable =\n";
- StringTable.EmitString(OS);
- OS << ";\n\n";
-
// Emit the operand class switch to call the correct custom parser for
// the found operand class.
OS << Target.getName() << ClassName << "::OperandMatchResultTy "
@@ -2407,14 +2599,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " // This should be included into the middle of the declaration of\n";
OS << " // your subclasses implementation of MCTargetAsmParser.\n";
OS << " unsigned ComputeAvailableFeatures(uint64_t FeatureBits) const;\n";
- OS << " bool ConvertToMCInst(unsigned Kind, MCInst &Inst, "
+ OS << " void convertToMCInst(unsigned Kind, MCInst &Inst, "
<< "unsigned Opcode,\n"
<< " const SmallVectorImpl<MCParsedAsmOperand*> "
<< "&Operands);\n";
- OS << " bool MnemonicIsValid(StringRef Mnemonic);\n";
+ OS << " void convertToMapAndConstraints(unsigned Kind,\n ";
+ OS << " const SmallVectorImpl<MCParsedAsmOperand*> &Operands);\n";
+ OS << " bool mnemonicIsValid(StringRef Mnemonic);\n";
OS << " unsigned MatchInstructionImpl(\n";
- OS << " const SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n";
- OS << " MCInst &Inst, unsigned &ErrorInfo, unsigned VariantID = 0);\n";
+ OS.indent(27);
+ OS << "const SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n"
+ << " MCInst &Inst,\n"
+ << " unsigned &ErrorInfo,"
+ << " bool matchingInlineAsm,\n"
+ << " unsigned VariantID = 0);\n";
if (Info.OperandMatchInfo.size()) {
OS << "\n enum OperandMatchResultTy {\n";
@@ -2447,7 +2645,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
emitSubtargetFeatureFlagEnumeration(Info, OS);
// Emit the function to match a register name to number.
- emitMatchRegisterName(Target, AsmParser, OS);
+ // This should be omitted for Mips target
+ if (AsmParser->getValueAsBit("ShouldEmitMatchRegisterName"))
+ emitMatchRegisterName(Target, AsmParser, OS);
OS << "#endif // GET_REGISTER_MATCHER\n\n";
@@ -2465,8 +2665,10 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// Generate the function that remaps for mnemonic aliases.
bool HasMnemonicAliases = emitMnemonicAliases(OS, Info);
- // Generate the unified function to convert operands into an MCInst.
- emitConvertToMCInst(Target, ClassName, Info.Matchables, OS);
+ // Generate the convertToMCInst function to convert operands into an MCInst.
+ // Also, generate the convertToMapAndConstraints function for MS-style inline
+ // assembly. The latter doesn't actually generate a MCInst.
+ emitConvertFuncs(Target, ClassName, Info.Matchables, OS);
// Emit the enumeration for classes which participate in matching.
emitMatchClassEnumeration(Target, Info.Classes, OS);
@@ -2484,11 +2686,25 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
emitComputeAvailableFeatures(Info, OS);
+ StringToOffsetTable StringTable;
+
size_t MaxNumOperands = 0;
+ unsigned MaxMnemonicIndex = 0;
for (std::vector<MatchableInfo*>::const_iterator it =
Info.Matchables.begin(), ie = Info.Matchables.end();
- it != ie; ++it)
- MaxNumOperands = std::max(MaxNumOperands, (*it)->AsmOperands.size());
+ it != ie; ++it) {
+ MatchableInfo &II = **it;
+ MaxNumOperands = std::max(MaxNumOperands, II.AsmOperands.size());
+
+ // Store a pascal-style length byte in the mnemonic.
+ std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
+ MaxMnemonicIndex = std::max(MaxMnemonicIndex,
+ StringTable.GetOrAddStringOffset(LenMnemonic, false));
+ }
+
+ OS << "static const char *const MnemonicTable =\n";
+ StringTable.EmitString(OS);
+ OS << ";\n\n";
// Emit the static match table; unused classes get initialized to 0 which is
// guaranteed to be InvalidMatchClass.
@@ -2502,8 +2718,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// following the mnemonic.
OS << "namespace {\n";
OS << " struct MatchEntry {\n";
- OS << " static const char *const MnemonicTable;\n";
- OS << " uint32_t Mnemonic;\n";
+ OS << " " << getMinimalTypeForRange(MaxMnemonicIndex)
+ << " Mnemonic;\n";
OS << " uint16_t Opcode;\n";
OS << " " << getMinimalTypeForRange(Info.Matchables.size())
<< " ConvertFn;\n";
@@ -2533,8 +2749,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "} // end anonymous namespace.\n\n";
- StringToOffsetTable StringTable;
-
OS << "static const MatchEntry MatchTable["
<< Info.Matchables.size() << "] = {\n";
@@ -2573,13 +2787,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "};\n\n";
- OS << "const char *const MatchEntry::MnemonicTable =\n";
- StringTable.EmitString(OS);
- OS << ";\n\n";
-
// A method to determine if a mnemonic is in the list.
OS << "bool " << Target.getName() << ClassName << "::\n"
- << "MnemonicIsValid(StringRef Mnemonic) {\n";
+ << "mnemonicIsValid(StringRef Mnemonic) {\n";
OS << " // Search the table.\n";
OS << " std::pair<const MatchEntry*, const MatchEntry*> MnemonicRange =\n";
OS << " std::equal_range(MatchTable, MatchTable+"
@@ -2592,8 +2802,14 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
<< Target.getName() << ClassName << "::\n"
<< "MatchInstructionImpl(const SmallVectorImpl<MCParsedAsmOperand*>"
<< " &Operands,\n";
- OS << " MCInst &Inst, unsigned &ErrorInfo, ";
- OS << "unsigned VariantID) {\n";
+ OS << " MCInst &Inst,\n"
+ << "unsigned &ErrorInfo, bool matchingInlineAsm, unsigned VariantID) {\n";
+
+ OS << " // Eliminate obvious mismatches.\n";
+ OS << " if (Operands.size() > " << (MaxNumOperands+1) << ") {\n";
+ OS << " ErrorInfo = " << (MaxNumOperands+1) << ";\n";
+ OS << " return Match_InvalidOperand;\n";
+ OS << " }\n\n";
// Emit code to get the available features.
OS << " // Get the current feature set.\n";
@@ -2611,12 +2827,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
}
// Emit code to compute the class list for this operand vector.
- OS << " // Eliminate obvious mismatches.\n";
- OS << " if (Operands.size() > " << (MaxNumOperands+1) << ") {\n";
- OS << " ErrorInfo = " << (MaxNumOperands+1) << ";\n";
- OS << " return Match_InvalidOperand;\n";
- OS << " }\n\n";
-
OS << " // Some state to try to produce better error messages.\n";
OS << " bool HadMatchOtherThanFeatures = false;\n";
OS << " bool HadMatchOtherThanPredicate = false;\n";
@@ -2681,17 +2891,20 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " HadMatchOtherThanFeatures = true;\n";
OS << " unsigned NewMissingFeatures = it->RequiredFeatures & "
"~AvailableFeatures;\n";
- OS << " if (CountPopulation_32(NewMissingFeatures) <= "
- "CountPopulation_32(MissingFeatures))\n";
+ OS << " if (CountPopulation_32(NewMissingFeatures) <=\n"
+ " CountPopulation_32(MissingFeatures))\n";
OS << " MissingFeatures = NewMissingFeatures;\n";
OS << " continue;\n";
OS << " }\n";
OS << "\n";
+ OS << " if (matchingInlineAsm) {\n";
+ OS << " Inst.setOpcode(it->Opcode);\n";
+ OS << " convertToMapAndConstraints(it->ConvertFn, Operands);\n";
+ OS << " return Match_Success;\n";
+ OS << " }\n\n";
OS << " // We have selected a definite instruction, convert the parsed\n"
<< " // operands into the appropriate MCInst.\n";
- OS << " if (!ConvertToMCInst(it->ConvertFn, Inst,\n"
- << " it->Opcode, Operands))\n";
- OS << " return Match_ConversionFail;\n";
+ OS << " convertToMCInst(it->ConvertFn, Inst, it->Opcode, Operands);\n";
OS << "\n";
// Verify the instruction with the target-specific match predicate function.
@@ -2716,15 +2929,16 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << " }\n\n";
OS << " // Okay, we had no match. Try to return a useful error code.\n";
- OS << " if (HadMatchOtherThanPredicate || !HadMatchOtherThanFeatures)";
- OS << " return RetCode;\n";
+ OS << " if (HadMatchOtherThanPredicate || !HadMatchOtherThanFeatures)\n";
+ OS << " return RetCode;\n\n";
OS << " // Missing feature matches return which features were missing\n";
OS << " ErrorInfo = MissingFeatures;\n";
OS << " return Match_MissingFeature;\n";
OS << "}\n\n";
if (Info.OperandMatchInfo.size())
- emitCustomOperandParsing(OS, Target, Info, ClassName);
+ emitCustomOperandParsing(OS, Target, Info, ClassName, StringTable,
+ MaxMnemonicIndex);
OS << "#endif // GET_MATCHER_IMPLEMENTATION\n\n";
}
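A reader's summary of the data layout introduced above: every instruction signature now owns one row of (operand-conversion-kind, parsed-operand-index) byte pairs terminated by CVT_Done, and the single convertToMCInst/convertToMapAndConstraints drivers walk a row two bytes at a time, switching on the kind. A toy, self-contained analogue of the generated shape; the kinds, table contents, and printf stand-ins are invented and are not the emitted code:

#include <cstdint>
#include <cstdio>

enum OperandConversionKind { CVT_Done, CVT_Reg, CVT_Tied, CVT_imm_0 };
enum InstructionConversionKind { Convert__Reg1_0, Convert__imm_0__Tie0,
                                 CVT_NUM_SIGNATURES };

// One row per signature: (kind, operand index) pairs, CVT_Done-terminated.
static const uint8_t ConversionTable[CVT_NUM_SIGNATURES][5] = {
  { CVT_Reg,   1, CVT_Done },               // Convert__Reg1_0
  { CVT_imm_0, 0, CVT_Tied, 0, CVT_Done },  // Convert__imm_0__Tie0
};

static void convertToMCInst(unsigned Kind) {
  const uint8_t *Converter = ConversionTable[Kind];
  for (const uint8_t *p = Converter; *p; p += 2) {
    switch (*p) {
    case CVT_Reg:
      std::printf("add register from parsed operand %u\n", (unsigned)p[1]);
      break;
    case CVT_Tied:
      std::printf("re-add MCInst operand %u\n", (unsigned)p[1]);
      break;
    case CVT_imm_0:
      std::printf("add immediate 0\n");
      break;
    }
  }
}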
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 57979b3e6dbd..a4114d9815b6 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -313,7 +313,9 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
/// OpcodeInfo - This encodes the index of the string to use for the first
/// chunk of the output as well as indices used for operand printing.
- std::vector<unsigned> OpcodeInfo;
+ /// To reduce the number of unhandled cases, we expand the size from 32-bit
+ /// to 32+16 = 48-bit.
+ std::vector<uint64_t> OpcodeInfo;
// Add all strings to the string table upfront so it can generate an optimized
// representation.
@@ -362,7 +364,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
// To reduce code size, we compactify common instructions into a few bits
// in the opcode-indexed table.
- unsigned BitsLeft = 32-AsmStrBits;
+ unsigned BitsLeft = 64-AsmStrBits;
std::vector<std::vector<std::string> > TableDrivenOperandPrinters;
@@ -388,10 +390,11 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
}
// Otherwise, we can include this in the initial lookup table. Add it in.
- BitsLeft -= NumBits;
for (unsigned i = 0, e = InstIdxs.size(); i != e; ++i)
- if (InstIdxs[i] != ~0U)
- OpcodeInfo[i] |= InstIdxs[i] << (BitsLeft+AsmStrBits);
+ if (InstIdxs[i] != ~0U) {
+ OpcodeInfo[i] |= (uint64_t)InstIdxs[i] << (64-BitsLeft);
+ }
+ BitsLeft -= NumBits;
// Remove the info about this operand.
for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
@@ -410,16 +413,32 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
}
-
- O<<" static const unsigned OpInfo[] = {\n";
+ // We always emit at least one 32-bit table. A second table is emitted if
+ // more bits are needed.
+ O<<" static const uint32_t OpInfo[] = {\n";
for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
- O << " " << OpcodeInfo[i] << "U,\t// "
+ O << " " << (OpcodeInfo[i] & 0xffffffff) << "U,\t// "
<< NumberedInstructions[i]->TheDef->getName() << "\n";
}
// Add a dummy entry so the array init doesn't end with a comma.
O << " 0U\n";
O << " };\n\n";
+ if (BitsLeft < 32) {
+ // Add a second OpInfo table only when it is necessary.
+ // Adjust the type of the second table based on the number of bits needed.
+ O << " static const uint"
+ << ((BitsLeft < 16) ? "32" : (BitsLeft < 24) ? "16" : "8")
+ << "_t OpInfo2[] = {\n";
+ for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
+ O << " " << (OpcodeInfo[i] >> 32) << "U,\t// "
+ << NumberedInstructions[i]->TheDef->getName() << "\n";
+ }
+ // Add a dummy entry so the array init doesn't end with a comma.
+ O << " 0U\n";
+ O << " };\n\n";
+ }
+
// Emit the string itself.
O << " const char AsmStrs[] = {\n";
StringTable.emit(O, printChar);
@@ -427,13 +446,22 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
O << " O << \"\\t\";\n\n";
- O << " // Emit the opcode for the instruction.\n"
- << " unsigned Bits = OpInfo[MI->getOpcode()];\n"
- << " assert(Bits != 0 && \"Cannot print this instruction.\");\n"
+ O << " // Emit the opcode for the instruction.\n";
+ if (BitsLeft < 32) {
+ // If we have two tables then we need to perform two lookups and combine
+ // the results into a single 64-bit value.
+ O << " uint64_t Bits1 = OpInfo[MI->getOpcode()];\n"
+ << " uint64_t Bits2 = OpInfo2[MI->getOpcode()];\n"
+ << " uint64_t Bits = (Bits2 << 32) | Bits1;\n";
+ } else {
+ // If only one table is used we just need to perform a single lookup.
+ O << " uint32_t Bits = OpInfo[MI->getOpcode()];\n";
+ }
+ O << " assert(Bits != 0 && \"Cannot print this instruction.\");\n"
<< " O << AsmStrs+(Bits & " << (1 << AsmStrBits)-1 << ")-1;\n\n";
// Output the table driven operand information.
- BitsLeft = 32-AsmStrBits;
+ BitsLeft = 64-AsmStrBits;
for (unsigned i = 0, e = TableDrivenOperandPrinters.size(); i != e; ++i) {
std::vector<std::string> &Commands = TableDrivenOperandPrinters[i];
@@ -443,14 +471,13 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
assert(NumBits <= BitsLeft && "consistency error");
// Emit code to extract this field from Bits.
- BitsLeft -= NumBits;
-
O << "\n // Fragment " << i << " encoded into " << NumBits
<< " bits for " << Commands.size() << " unique commands.\n";
if (Commands.size() == 2) {
// Emit two possibilities with if/else.
- O << " if ((Bits >> " << (BitsLeft+AsmStrBits) << ") & "
+ O << " if ((Bits >> "
+ << (64-BitsLeft) << ") & "
<< ((1 << NumBits)-1) << ") {\n"
<< Commands[1]
<< " } else {\n"
@@ -460,7 +487,8 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
// Emit a single possibility.
O << Commands[0] << "\n\n";
} else {
- O << " switch ((Bits >> " << (BitsLeft+AsmStrBits) << ") & "
+ O << " switch ((Bits >> "
+ << (64-BitsLeft) << ") & "
<< ((1 << NumBits)-1) << ") {\n"
<< " default: // unreachable.\n";
@@ -472,6 +500,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
}
O << " }\n\n";
}
+ BitsLeft -= NumBits;
}
// Okay, delete instructions with no operand info left.
@@ -537,9 +566,9 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
std::vector<std::string> AltNames =
Reg.TheDef->getValueAsListOfStrings("AltNames");
if (AltNames.size() <= Idx)
- throw TGError(Reg.TheDef->getLoc(),
- (Twine("Register definition missing alt name for '") +
- AltName + "'.").str());
+ PrintFatalError(Reg.TheDef->getLoc(),
+ (Twine("Register definition missing alt name for '") +
+ AltName + "'.").str());
AsmName = AltNames[Idx];
}
}
@@ -551,7 +580,7 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
StringTable.emit(O, printChar);
O << " };\n\n";
- O << " static const unsigned RegAsmOffset" << AltName << "[] = {";
+ O << " static const uint32_t RegAsmOffset" << AltName << "[] = {";
for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
if ((i % 14) == 0)
O << "\n ";
@@ -590,7 +619,7 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
emitRegisterNameString(O, "", Registers);
if (hasAltNames) {
- O << " const unsigned *RegAsmOffset;\n"
+ O << " const uint32_t *RegAsmOffset;\n"
<< " const char *AsmStrs;\n"
<< " switch(AltIdx) {\n"
<< " default: llvm_unreachable(\"Invalid register alt name index!\");\n";
@@ -763,7 +792,7 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
if (!R->getValueAsBit("EmitAlias"))
continue; // We were told not to emit the alias, but to emit the aliasee.
const DagInit *DI = R->getValueAsDag("ResultInst");
- const DefInit *Op = dynamic_cast<const DefInit*>(DI->getOperator());
+ const DefInit *Op = cast<DefInit>(DI->getOperator());
AliasMap[getQualifiedName(Op->getDef())].push_back(Alias);
}
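
Editorial note: the effect of the two-table scheme above is easiest to see from the shape of the code the emitter now generates. The sketch below is illustrative only and not part of the patch; the table contents and the 16-bit width chosen for OpInfo2 are hypothetical.

    #include <cassert>
    #include <cstdint>

    // Hypothetical one-entry tables standing in for the generated OpInfo arrays.
    static const uint32_t OpInfo[]  = { 7U };
    static const uint16_t OpInfo2[] = { 3U };

    // Recombine the two lookups exactly as the generated printInstruction() now does.
    static uint64_t lookupBits(unsigned Opcode) {
      uint64_t Bits1 = OpInfo[Opcode];
      uint64_t Bits2 = OpInfo2[Opcode];
      uint64_t Bits = (Bits2 << 32) | Bits1;
      assert(Bits != 0 && "Cannot print this instruction.");
      return Bits;
    }

    // A fragment of NumBits bits packed at position (64 - BitsLeft) is recovered
    // with the same shift/mask expression the emitter now prints.
    static unsigned extractFragment(uint64_t Bits, unsigned BitsLeft, unsigned NumBits) {
      return (Bits >> (64 - BitsLeft)) & ((1u << NumBits) - 1);
    }
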
diff --git a/utils/TableGen/AsmWriterInst.cpp b/utils/TableGen/AsmWriterInst.cpp
index 350a2ccfcc23..fe1f75636198 100644
--- a/utils/TableGen/AsmWriterInst.cpp
+++ b/utils/TableGen/AsmWriterInst.cpp
@@ -14,6 +14,7 @@
#include "AsmWriterInst.h"
#include "CodeGenTarget.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;
@@ -123,8 +124,8 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI,
!= std::string::npos) {
AddLiteralString(std::string(1, AsmString[DollarPos+1]));
} else {
- throw "Non-supported escaped character found in instruction '" +
- CGI.TheDef->getName() + "'!";
+ PrintFatalError("Non-supported escaped character found in instruction '" +
+ CGI.TheDef->getName() + "'!");
}
LastEmitted = DollarPos+2;
continue;
@@ -162,15 +163,15 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI,
// brace.
if (hasCurlyBraces) {
if (VarEnd >= AsmString.size())
- throw "Reached end of string before terminating curly brace in '"
- + CGI.TheDef->getName() + "'";
+ PrintFatalError("Reached end of string before terminating curly brace in '"
+ + CGI.TheDef->getName() + "'");
// Look for a modifier string.
if (AsmString[VarEnd] == ':') {
++VarEnd;
if (VarEnd >= AsmString.size())
- throw "Reached end of string before terminating curly brace in '"
- + CGI.TheDef->getName() + "'";
+ PrintFatalError("Reached end of string before terminating curly brace in '"
+ + CGI.TheDef->getName() + "'");
unsigned ModifierStart = VarEnd;
while (VarEnd < AsmString.size() && isIdentChar(AsmString[VarEnd]))
@@ -178,17 +179,17 @@ AsmWriterInst::AsmWriterInst(const CodeGenInstruction &CGI,
Modifier = std::string(AsmString.begin()+ModifierStart,
AsmString.begin()+VarEnd);
if (Modifier.empty())
- throw "Bad operand modifier name in '"+ CGI.TheDef->getName() + "'";
+ PrintFatalError("Bad operand modifier name in '"+ CGI.TheDef->getName() + "'");
}
if (AsmString[VarEnd] != '}')
- throw "Variable name beginning with '{' did not end with '}' in '"
- + CGI.TheDef->getName() + "'";
+ PrintFatalError("Variable name beginning with '{' did not end with '}' in '"
+ + CGI.TheDef->getName() + "'");
++VarEnd;
}
if (VarName.empty() && Modifier.empty())
- throw "Stray '$' in '" + CGI.TheDef->getName() +
- "' asm string, maybe you want $$?";
+ PrintFatalError("Stray '$' in '" + CGI.TheDef->getName() +
+ "' asm string, maybe you want $$?");
if (VarName.empty()) {
// Just a modifier, pass this into PrintSpecial.
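
Editorial note: every conversion in this file follows the same shape, replacing a thrown std::string with a located diagnostic. A minimal sketch, illustrative only; checkAsmString is a made-up helper, not part of the patch.

    #include "llvm/TableGen/Error.h"
    #include "llvm/TableGen/Record.h"
    using namespace llvm;

    // Report at the record's source location and terminate, instead of throwing
    // a string that nothing above the backend reliably catches.
    static void checkAsmString(const Record &R, const std::string &AsmString) {
      if (AsmString.empty())
        PrintFatalError(R.getLoc(), "Instruction '" + R.getName() +
                        "' has an empty asm string!");
    }
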
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 0e14cbae38ad..d0416c908131 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -1,5 +1,3 @@
-set(LLVM_REQUIRES_EH 1)
-set(LLVM_REQUIRES_RTTI 1)
set(LLVM_LINK_COMPONENTS Support)
add_tablegen(llvm-tblgen LLVM
@@ -10,6 +8,7 @@ add_tablegen(llvm-tblgen LLVM
CodeEmitterGen.cpp
CodeGenDAGPatterns.cpp
CodeGenInstruction.cpp
+ CodeGenMapTable.cpp
CodeGenRegisters.cpp
CodeGenSchedule.cpp
CodeGenTarget.cpp
diff --git a/utils/TableGen/CallingConvEmitter.cpp b/utils/TableGen/CallingConvEmitter.cpp
index e9c4bd30f914..94f3c6518ca0 100644
--- a/utils/TableGen/CallingConvEmitter.cpp
+++ b/utils/TableGen/CallingConvEmitter.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "CodeGenTarget.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <cassert>
@@ -93,7 +94,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
O << Action->getValueAsString("Predicate");
} else {
Action->dump();
- throw "Unknown CCPredicateAction!";
+ PrintFatalError("Unknown CCPredicateAction!");
}
O << ") {\n";
@@ -131,7 +132,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
ListInit *ShadowRegList = Action->getValueAsListInit("ShadowRegList");
if (ShadowRegList->getSize() >0 &&
ShadowRegList->getSize() != RegList->getSize())
- throw "Invalid length of list of shadowed registers";
+ PrintFatalError("Invalid length of list of shadowed registers");
if (RegList->getSize() == 1) {
O << IndentStr << "if (unsigned Reg = State.AllocateReg(";
@@ -177,12 +178,12 @@ void CallingConvEmitter::EmitAction(Record *Action,
if (Size)
O << Size << ", ";
else
- O << "\n" << IndentStr << " State.getTarget().getTargetData()"
+ O << "\n" << IndentStr << " State.getTarget().getDataLayout()"
"->getTypeAllocSize(EVT(LocVT).getTypeForEVT(State.getContext())), ";
if (Align)
O << Align;
else
- O << "\n" << IndentStr << " State.getTarget().getTargetData()"
+ O << "\n" << IndentStr << " State.getTarget().getDataLayout()"
"->getABITypeAlignment(EVT(LocVT).getTypeForEVT(State.getContext()))";
if (Action->isSubClassOf("CCAssignToStackWithShadow"))
O << ", " << getQualifiedName(Action->getValueAsDef("ShadowReg"));
@@ -221,7 +222,7 @@ void CallingConvEmitter::EmitAction(Record *Action,
O << IndentStr << IndentStr << "return false;\n";
} else {
Action->dump();
- throw "Unknown CCAction!";
+ PrintFatalError("Unknown CCAction!");
}
}
}
diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index 31a39b1f0426..3e4f626d4862 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp
@@ -91,11 +91,11 @@ void CodeEmitterGen::reverseBits(std::vector<Record*> &Insts) {
// return the variable bit position. Otherwise return -1.
int CodeEmitterGen::getVariableBit(const std::string &VarName,
BitsInit *BI, int bit) {
- if (VarBitInit *VBI = dynamic_cast<VarBitInit*>(BI->getBit(bit))) {
- if (VarInit *VI = dynamic_cast<VarInit*>(VBI->getVariable()))
+ if (VarBitInit *VBI = dyn_cast<VarBitInit>(BI->getBit(bit))) {
+ if (VarInit *VI = dyn_cast<VarInit>(VBI->getBitVar()))
if (VI->getName() == VarName)
return VBI->getBitNum();
- } else if (VarInit *VI = dynamic_cast<VarInit*>(BI->getBit(bit))) {
+ } else if (VarInit *VI = dyn_cast<VarInit>(BI->getBit(bit))) {
if (VI->getName() == VarName)
return 0;
}
@@ -134,10 +134,13 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName,
assert(!CGI.Operands.isFlatOperandNotEmitted(OpIdx) &&
"Explicitly used operand also marked as not emitted!");
} else {
+ unsigned NumberOps = CGI.Operands.size();
/// If this operand is not supposed to be emitted by the
/// generated emitter, skip it.
- while (CGI.Operands.isFlatOperandNotEmitted(NumberedOp))
+ while (NumberedOp < NumberOps &&
+ CGI.Operands.isFlatOperandNotEmitted(NumberedOp))
++NumberedOp;
+
OpIdx = NumberedOp++;
}
@@ -269,7 +272,7 @@ void CodeEmitterGen::run(raw_ostream &o) {
// Start by filling in fixed values.
uint64_t Value = 0;
for (unsigned i = 0, e = BI->getNumBits(); i != e; ++i) {
- if (BitInit *B = dynamic_cast<BitInit*>(BI->getBit(e-i-1)))
+ if (BitInit *B = dyn_cast<BitInit>(BI->getBit(e-i-1)))
Value |= (uint64_t)B->getValue() << (e-i-1);
}
o << " UINT64_C(" << Value << ")," << '\t' << "// " << R->getName() << "\n";
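
Editorial note: the dynamic_cast rewrites in this and the following files lean on LLVM-style RTTI over the Init hierarchy. A small sketch of the three idioms involved; the helper names are made up for illustration and are not part of the patch.

    #include "llvm/TableGen/Record.h"
    #include <string>
    using namespace llvm;

    // dyn_cast<>: checked downcast that yields a null pointer on failure.
    static bool isNamedDef(Init *V, const std::string &Name) {
      if (DefInit *DI = dyn_cast<DefInit>(V))
        return DI->getDef()->getName() == Name;
      return false;
    }

    // isa<> tests the dynamic type; cast<> asserts instead of checking, so it is
    // only used once the type is already known.
    static Record *getDefOrNull(Init *V) {
      return isa<DefInit>(V) ? cast<DefInit>(V)->getDef() : 0;
    }
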
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 34f8a34e7af7..d5b581b5981a 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -79,14 +79,19 @@ bool EEVT::TypeSet::FillWithPossibleTypes(TreePattern &TP,
const std::vector<MVT::SimpleValueType> &LegalTypes =
TP.getDAGPatterns().getTargetInfo().getLegalValueTypes();
+ if (TP.hasError())
+ return false;
+
for (unsigned i = 0, e = LegalTypes.size(); i != e; ++i)
if (Pred == 0 || Pred(LegalTypes[i]))
TypeVec.push_back(LegalTypes[i]);
// If we have nothing that matches the predicate, bail out.
- if (TypeVec.empty())
+ if (TypeVec.empty()) {
TP.error("Type inference contradiction found, no " +
std::string(PredicateName) + " types found");
+ return false;
+ }
// No need to sort with one element.
if (TypeVec.size() == 1) return true;
@@ -146,9 +151,9 @@ std::string EEVT::TypeSet::getName() const {
/// MergeInTypeInfo - This merges in type information from the specified
/// argument. If 'this' changes, it returns true. If the two types are
-/// contradictory (e.g. merge f32 into i32) then this throws an exception.
+/// contradictory (e.g. merge f32 into i32) then this flags an error.
bool EEVT::TypeSet::MergeInTypeInfo(const EEVT::TypeSet &InVT, TreePattern &TP){
- if (InVT.isCompletelyUnknown() || *this == InVT)
+ if (InVT.isCompletelyUnknown() || *this == InVT || TP.hasError())
return false;
if (isCompletelyUnknown()) {
@@ -224,11 +229,13 @@ bool EEVT::TypeSet::MergeInTypeInfo(const EEVT::TypeSet &InVT, TreePattern &TP){
// FIXME: Really want an SMLoc here!
TP.error("Type inference contradiction found, merging '" +
InVT.getName() + "' into '" + InputSet.getName() + "'");
- return true; // unreachable
+ return false;
}
/// EnforceInteger - Remove all non-integer types from this set.
bool EEVT::TypeSet::EnforceInteger(TreePattern &TP) {
+ if (TP.hasError())
+ return false;
// If we know nothing, then get the full set.
if (TypeVec.empty())
return FillWithPossibleTypes(TP, isInteger, "integer");
@@ -242,14 +249,18 @@ bool EEVT::TypeSet::EnforceInteger(TreePattern &TP) {
if (!isInteger(TypeVec[i]))
TypeVec.erase(TypeVec.begin()+i--);
- if (TypeVec.empty())
+ if (TypeVec.empty()) {
TP.error("Type inference contradiction found, '" +
InputSet.getName() + "' needs to be integer");
+ return false;
+ }
return true;
}
/// EnforceFloatingPoint - Remove all integer types from this set.
bool EEVT::TypeSet::EnforceFloatingPoint(TreePattern &TP) {
+ if (TP.hasError())
+ return false;
// If we know nothing, then get the full set.
if (TypeVec.empty())
return FillWithPossibleTypes(TP, isFloatingPoint, "floating point");
@@ -264,14 +275,19 @@ bool EEVT::TypeSet::EnforceFloatingPoint(TreePattern &TP) {
if (!isFloatingPoint(TypeVec[i]))
TypeVec.erase(TypeVec.begin()+i--);
- if (TypeVec.empty())
+ if (TypeVec.empty()) {
TP.error("Type inference contradiction found, '" +
InputSet.getName() + "' needs to be floating point");
+ return false;
+ }
return true;
}
/// EnforceScalar - Remove all vector types from this.
bool EEVT::TypeSet::EnforceScalar(TreePattern &TP) {
+ if (TP.hasError())
+ return false;
+
// If we know nothing, then get the full set.
if (TypeVec.empty())
return FillWithPossibleTypes(TP, isScalar, "scalar");
@@ -286,14 +302,19 @@ bool EEVT::TypeSet::EnforceScalar(TreePattern &TP) {
if (!isScalar(TypeVec[i]))
TypeVec.erase(TypeVec.begin()+i--);
- if (TypeVec.empty())
+ if (TypeVec.empty()) {
TP.error("Type inference contradiction found, '" +
InputSet.getName() + "' needs to be scalar");
+ return false;
+ }
return true;
}
/// EnforceVector - Remove all vector types from this.
bool EEVT::TypeSet::EnforceVector(TreePattern &TP) {
+ if (TP.hasError())
+ return false;
+
// If we know nothing, then get the full set.
if (TypeVec.empty())
return FillWithPossibleTypes(TP, isVector, "vector");
@@ -308,9 +329,11 @@ bool EEVT::TypeSet::EnforceVector(TreePattern &TP) {
MadeChange = true;
}
- if (TypeVec.empty())
+ if (TypeVec.empty()) {
TP.error("Type inference contradiction found, '" +
InputSet.getName() + "' needs to be a vector");
+ return false;
+ }
return MadeChange;
}
@@ -319,6 +342,9 @@ bool EEVT::TypeSet::EnforceVector(TreePattern &TP) {
/// EnforceSmallerThan - 'this' must be a smaller VT than Other. Update
/// this and other based on this information.
bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
+ if (TP.hasError())
+ return false;
+
// Both operands must be integer or FP, but we don't care which.
bool MadeChange = false;
@@ -365,19 +391,22 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
if (hasVectorTypes() && Other.hasVectorTypes()) {
if (Type.getSizeInBits() >= OtherType.getSizeInBits())
if (Type.getVectorElementType().getSizeInBits()
- >= OtherType.getVectorElementType().getSizeInBits())
+ >= OtherType.getVectorElementType().getSizeInBits()) {
TP.error("Type inference contradiction found, '" +
getName() + "' element type not smaller than '" +
Other.getName() +"'!");
+ return false;
+ }
}
else
// For scalar types, the bitsize of this type must be larger
// than that of the other.
- if (Type.getSizeInBits() >= OtherType.getSizeInBits())
+ if (Type.getSizeInBits() >= OtherType.getSizeInBits()) {
TP.error("Type inference contradiction found, '" +
getName() + "' is not smaller than '" +
Other.getName() +"'!");
-
+ return false;
+ }
}
@@ -437,9 +466,11 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
// If this is the only type in the large set, the constraint can never be
// satisfied.
if ((Other.hasIntegerTypes() && OtherIntSize == 0)
- || (Other.hasFloatingPointTypes() && OtherFPSize == 0))
+ || (Other.hasFloatingPointTypes() && OtherFPSize == 0)) {
TP.error("Type inference contradiction found, '" +
Other.getName() + "' has nothing larger than '" + getName() +"'!");
+ return false;
+ }
// Okay, find the largest type in the Other set and remove it from the
// current set.
@@ -493,9 +524,11 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
// If this is the only type in the small set, the constraint can never be
// satisfied.
if ((hasIntegerTypes() && IntSize == 0)
- || (hasFloatingPointTypes() && FPSize == 0))
+ || (hasFloatingPointTypes() && FPSize == 0)) {
TP.error("Type inference contradiction found, '" +
getName() + "' has nothing smaller than '" + Other.getName()+"'!");
+ return false;
+ }
return MadeChange;
}
@@ -504,6 +537,9 @@ bool EEVT::TypeSet::EnforceSmallerThan(EEVT::TypeSet &Other, TreePattern &TP) {
/// whose element is specified by VTOperand.
bool EEVT::TypeSet::EnforceVectorEltTypeIs(EEVT::TypeSet &VTOperand,
TreePattern &TP) {
+ if (TP.hasError())
+ return false;
+
// "This" must be a vector and "VTOperand" must be a scalar.
bool MadeChange = false;
MadeChange |= EnforceVector(TP);
@@ -535,9 +571,11 @@ bool EEVT::TypeSet::EnforceVectorEltTypeIs(EEVT::TypeSet &VTOperand,
}
}
- if (TypeVec.empty()) // FIXME: Really want an SMLoc here!
+ if (TypeVec.empty()) { // FIXME: Really want an SMLoc here!
TP.error("Type inference contradiction found, forcing '" +
InputSet.getName() + "' to have a vector element");
+ return false;
+ }
return MadeChange;
}
@@ -574,10 +612,6 @@ bool EEVT::TypeSet::EnforceVectorSubVectorTypeIs(EEVT::TypeSet &VTOperand,
//===----------------------------------------------------------------------===//
// Helpers for working with extended types.
-bool RecordPtrCmp::operator()(const Record *LHS, const Record *RHS) const {
- return LHS->getID() < RHS->getID();
-}
-
/// Dependent variable map for CodeGenDAGPattern variant generation
typedef std::map<std::string, int> DepVarMap;
@@ -586,7 +620,7 @@ typedef DepVarMap::const_iterator DepVarMap_citer;
static void FindDepVarsOf(TreePatternNode *N, DepVarMap &DepMap) {
if (N->isLeaf()) {
- if (dynamic_cast<DefInit*>(N->getLeafValue()) != NULL)
+ if (isa<DefInit>(N->getLeafValue()))
DepMap[N->getName()]++;
} else {
for (size_t i = 0, e = N->getNumChildren(); i != e; ++i)
@@ -695,7 +729,7 @@ static unsigned getPatternSize(const TreePatternNode *P,
unsigned Size = 3; // The node itself.
// If the root node is a ConstantSDNode, increases its size.
// e.g. (set R32:$dst, 0).
- if (P->isLeaf() && dynamic_cast<IntInit*>(P->getLeafValue()))
+ if (P->isLeaf() && isa<IntInit>(P->getLeafValue()))
Size += 2;
// FIXME: This is a hack to statically increase the priority of patterns
@@ -719,7 +753,7 @@ static unsigned getPatternSize(const TreePatternNode *P,
Child->getType(0) != MVT::Other)
Size += getPatternSize(Child, CGP);
else if (Child->isLeaf()) {
- if (dynamic_cast<IntInit*>(Child->getLeafValue()))
+ if (isa<IntInit>(Child->getLeafValue()))
Size += 5; // Matches a ConstantSDNode (+3) and a specific value (+2).
else if (Child->getComplexPatternInfo(CGP))
Size += getPatternSize(Child, CGP);
@@ -745,7 +779,7 @@ getPatternComplexity(const CodeGenDAGPatterns &CGP) const {
std::string PatternToMatch::getPredicateCheck() const {
std::string PredicateCheck;
for (unsigned i = 0, e = Predicates->getSize(); i != e; ++i) {
- if (DefInit *Pred = dynamic_cast<DefInit*>(Predicates->getElement(i))) {
+ if (DefInit *Pred = dyn_cast<DefInit>(Predicates->getElement(i))) {
Record *Def = Pred->getDef();
if (!Def->isSubClassOf("Predicate")) {
#ifndef NDEBUG
@@ -773,7 +807,7 @@ SDTypeConstraint::SDTypeConstraint(Record *R) {
ConstraintType = SDTCisVT;
x.SDTCisVT_Info.VT = getValueType(R->getValueAsDef("VT"));
if (x.SDTCisVT_Info.VT == MVT::isVoid)
- throw TGError(R->getLoc(), "Cannot use 'Void' as type to SDTCisVT");
+ PrintFatalError(R->getLoc(), "Cannot use 'Void' as type to SDTCisVT");
} else if (R->isSubClassOf("SDTCisPtrTy")) {
ConstraintType = SDTCisPtrTy;
@@ -833,11 +867,13 @@ static TreePatternNode *getOperandNum(unsigned OpNo, TreePatternNode *N,
/// ApplyTypeConstraint - Given a node in a pattern, apply this type
/// constraint to the nodes operands. This returns true if it makes a
-/// change, false otherwise. If a type contradiction is found, throw an
-/// exception.
+/// change, false otherwise. If a type contradiction is found, flag an error.
bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
const SDNodeInfo &NodeInfo,
TreePattern &TP) const {
+ if (TP.hasError())
+ return false;
+
unsigned ResNo = 0; // The result number being referenced.
TreePatternNode *NodeToApply = getOperandNum(OperandNo, N, NodeInfo, ResNo);
@@ -868,10 +904,12 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
// The NodeToApply must be a leaf node that is a VT. OtherOperandNum must
// have an integer type that is smaller than the VT.
if (!NodeToApply->isLeaf() ||
- !dynamic_cast<DefInit*>(NodeToApply->getLeafValue()) ||
+ !isa<DefInit>(NodeToApply->getLeafValue()) ||
!static_cast<DefInit*>(NodeToApply->getLeafValue())->getDef()
- ->isSubClassOf("ValueType"))
+ ->isSubClassOf("ValueType")) {
TP.error(N->getOperator()->getName() + " expects a VT operand!");
+ return false;
+ }
MVT::SimpleValueType VT =
getValueType(static_cast<DefInit*>(NodeToApply->getLeafValue())->getDef());
@@ -1025,8 +1063,9 @@ static unsigned GetNumNodeResults(Record *Operator, CodeGenDAGPatterns &CDP) {
// Get the result tree.
DagInit *Tree = Operator->getValueAsDag("Fragment");
Record *Op = 0;
- if (Tree && dynamic_cast<DefInit*>(Tree->getOperator()))
- Op = dynamic_cast<DefInit*>(Tree->getOperator())->getDef();
+ if (Tree)
+ if (DefInit *DI = dyn_cast<DefInit>(Tree->getOperator()))
+ Op = DI->getDef();
assert(Op && "Invalid Fragment");
return GetNumNodeResults(Op, CDP);
}
@@ -1100,8 +1139,8 @@ bool TreePatternNode::isIsomorphicTo(const TreePatternNode *N,
return false;
if (isLeaf()) {
- if (DefInit *DI = dynamic_cast<DefInit*>(getLeafValue())) {
- if (DefInit *NDI = dynamic_cast<DefInit*>(N->getLeafValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(getLeafValue())) {
+ if (DefInit *NDI = dyn_cast<DefInit>(N->getLeafValue())) {
return ((DI->getDef() == NDI->getDef())
&& (DepVars.find(getName()) == DepVars.end()
|| getName() == N->getName()));
@@ -1158,8 +1197,8 @@ SubstituteFormalArguments(std::map<std::string, TreePatternNode*> &ArgMap) {
TreePatternNode *Child = getChild(i);
if (Child->isLeaf()) {
Init *Val = Child->getLeafValue();
- if (dynamic_cast<DefInit*>(Val) &&
- static_cast<DefInit*>(Val)->getDef()->getName() == "node") {
+ if (isa<DefInit>(Val) &&
+ cast<DefInit>(Val)->getDef()->getName() == "node") {
// We found a use of a formal argument, replace it with its value.
TreePatternNode *NewChild = ArgMap[Child->getName()];
assert(NewChild && "Couldn't find formal argument!");
@@ -1179,7 +1218,11 @@ SubstituteFormalArguments(std::map<std::string, TreePatternNode*> &ArgMap) {
/// fragments, inline them into place, giving us a pattern without any
/// PatFrag references.
TreePatternNode *TreePatternNode::InlinePatternFragments(TreePattern &TP) {
- if (isLeaf()) return this; // nothing to do.
+ if (TP.hasError())
+ return 0;
+
+ if (isLeaf())
+ return this; // nothing to do.
Record *Op = getOperator();
if (!Op->isSubClassOf("PatFrag")) {
@@ -1202,9 +1245,11 @@ TreePatternNode *TreePatternNode::InlinePatternFragments(TreePattern &TP) {
TreePattern *Frag = TP.getDAGPatterns().getPatternFragment(Op);
// Verify that we are passing the right number of operands.
- if (Frag->getNumArgs() != Children.size())
+ if (Frag->getNumArgs() != Children.size()) {
TP.error("'" + Op->getName() + "' fragment requires " +
utostr(Frag->getNumArgs()) + " operands!");
+ return 0;
+ }
TreePatternNode *FragTree = Frag->getOnlyTree()->clone();
@@ -1320,8 +1365,7 @@ getIntrinsicInfo(const CodeGenDAGPatterns &CDP) const {
getOperator() != CDP.get_intrinsic_wo_chain_sdnode())
return 0;
- unsigned IID =
- dynamic_cast<IntInit*>(getChild(0)->getLeafValue())->getValue();
+ unsigned IID = cast<IntInit>(getChild(0)->getLeafValue())->getValue();
return &CDP.getIntrinsicInfo(IID);
}
@@ -1331,7 +1375,7 @@ const ComplexPattern *
TreePatternNode::getComplexPatternInfo(const CodeGenDAGPatterns &CGP) const {
if (!isLeaf()) return 0;
- DefInit *DI = dynamic_cast<DefInit*>(getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(getLeafValue());
if (DI && DI->getDef()->isSubClassOf("ComplexPattern"))
return &CGP.getComplexPattern(DI->getDef());
return 0;
@@ -1379,12 +1423,14 @@ TreePatternNode::isCommutativeIntrinsic(const CodeGenDAGPatterns &CDP) const {
/// ApplyTypeConstraints - Apply all of the type constraints relevant to
/// this node and its children in the tree. This returns true if it makes a
-/// change, false otherwise. If a type contradiction is found, throw an
-/// exception.
+/// change, false otherwise. If a type contradiction is found, flag an error.
bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
+ if (TP.hasError())
+ return false;
+
CodeGenDAGPatterns &CDP = TP.getDAGPatterns();
if (isLeaf()) {
- if (DefInit *DI = dynamic_cast<DefInit*>(getLeafValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(getLeafValue())) {
// If it's a regclass or something else known, include the type.
bool MadeChange = false;
for (unsigned i = 0, e = Types.size(); i != e; ++i)
@@ -1393,7 +1439,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
return MadeChange;
}
- if (IntInit *II = dynamic_cast<IntInit*>(getLeafValue())) {
+ if (IntInit *II = dyn_cast<IntInit>(getLeafValue())) {
assert(Types.size() == 1 && "Invalid IntInit");
// Int inits are always integers. :)
@@ -1410,21 +1456,15 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// Make sure that the value is representable for this type.
if (Size >= 32) return MadeChange;
- int Val = (II->getValue() << (32-Size)) >> (32-Size);
- if (Val == II->getValue()) return MadeChange;
-
- // If sign-extended doesn't fit, does it fit as unsigned?
- unsigned ValueMask;
- unsigned UnsignedVal;
- ValueMask = unsigned(~uint32_t(0UL) >> (32-Size));
- UnsignedVal = unsigned(II->getValue());
-
- if ((ValueMask & UnsignedVal) == UnsignedVal)
+ // Check that the value doesn't use more bits than we have. It must either
+ // be a sign- or zero-extended equivalent of the original.
+ int64_t SignBitAndAbove = II->getValue() >> (Size - 1);
+ if (SignBitAndAbove == -1 || SignBitAndAbove == 0 || SignBitAndAbove == 1)
return MadeChange;
- TP.error("Integer value '" + itostr(II->getValue())+
+ TP.error("Integer value '" + itostr(II->getValue()) +
"' is out of range for type '" + getEnumName(getType(0)) + "'!");
- return MadeChange;
+ return false;
}
return false;
}
@@ -1487,10 +1527,12 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
for (unsigned i = 0, e = NumRetVTs; i != e; ++i)
MadeChange |= UpdateNodeType(i, Int->IS.RetVTs[i], TP);
- if (getNumChildren() != NumParamVTs + 1)
+ if (getNumChildren() != NumParamVTs + 1) {
TP.error("Intrinsic '" + Int->Name + "' expects " +
utostr(NumParamVTs) + " operands, not " +
utostr(getNumChildren() - 1) + " operands!");
+ return false;
+ }
// Apply type info to the intrinsic ID.
MadeChange |= getChild(0)->UpdateNodeType(0, MVT::iPTR, TP);
@@ -1510,9 +1552,11 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// Check that the number of operands is sane. Negative operands -> varargs.
if (NI.getNumOperands() >= 0 &&
- getNumChildren() != (unsigned)NI.getNumOperands())
+ getNumChildren() != (unsigned)NI.getNumOperands()) {
TP.error(getOperator()->getName() + " node requires exactly " +
itostr(NI.getNumOperands()) + " operands!");
+ return false;
+ }
bool MadeChange = NI.ApplyTypeConstraints(this, TP);
for (unsigned i = 0, e = getNumChildren(); i != e; ++i)
@@ -1541,7 +1585,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
const CodeGenRegisterClass &RC =
CDP.getTargetInfo().getRegisterClass(RegClass);
MadeChange |= UpdateNodeType(ResNo, RC.getValueTypes(), TP);
- } else if (ResultNode->getName() == "unknown") {
+ } else if (ResultNode->isSubClassOf("unknown_class")) {
// Nothing to do.
} else {
assert(ResultNode->isSubClassOf("RegisterClass") &&
@@ -1581,15 +1625,16 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
// If the instruction expects a predicate or optional def operand, we
// codegen this by setting the operand to its default value if it has a
// non-empty DefaultOps field.
- if ((OperandNode->isSubClassOf("PredicateOperand") ||
- OperandNode->isSubClassOf("OptionalDefOperand")) &&
+ if (OperandNode->isSubClassOf("OperandWithDefaultOps") &&
!CDP.getDefaultOperand(OperandNode).DefaultOps.empty())
continue;
// Verify that we didn't run out of provided operands.
- if (ChildNo >= getNumChildren())
+ if (ChildNo >= getNumChildren()) {
TP.error("Instruction '" + getOperator()->getName() +
"' expects more operands than were provided.");
+ return false;
+ }
MVT::SimpleValueType VT;
TreePatternNode *Child = getChild(ChildNo++);
@@ -1609,7 +1654,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
MadeChange |= Child->UpdateNodeType(ChildResNo, VT, TP);
} else if (OperandNode->isSubClassOf("PointerLikeRegClass")) {
MadeChange |= Child->UpdateNodeType(ChildResNo, MVT::iPTR, TP);
- } else if (OperandNode->getName() == "unknown") {
+ } else if (OperandNode->isSubClassOf("unknown_class")) {
// Nothing to do.
} else
llvm_unreachable("Unknown operand type!");
@@ -1617,9 +1662,11 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
MadeChange |= Child->ApplyTypeConstraints(TP, NotRegisters);
}
- if (ChildNo != getNumChildren())
+ if (ChildNo != getNumChildren()) {
TP.error("Instruction '" + getOperator()->getName() +
"' was provided too many operands!");
+ return false;
+ }
return MadeChange;
}
@@ -1627,9 +1674,11 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
assert(getOperator()->isSubClassOf("SDNodeXForm") && "Unknown node type!");
// Node transforms always take one operand.
- if (getNumChildren() != 1)
+ if (getNumChildren() != 1) {
TP.error("Node transform '" + getOperator()->getName() +
"' requires one operand!");
+ return false;
+ }
bool MadeChange = getChild(0)->ApplyTypeConstraints(TP, NotRegisters);
@@ -1652,7 +1701,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
static bool OnlyOnRHSOfCommutative(TreePatternNode *N) {
if (!N->isLeaf() && N->getOperator()->getName() == "imm")
return true;
- if (N->isLeaf() && dynamic_cast<IntInit*>(N->getLeafValue()))
+ if (N->isLeaf() && isa<IntInit>(N->getLeafValue()))
return true;
return false;
}
@@ -1703,27 +1752,30 @@ bool TreePatternNode::canPatternMatch(std::string &Reason,
//
TreePattern::TreePattern(Record *TheRec, ListInit *RawPat, bool isInput,
- CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp){
- isInputPattern = isInput;
+ CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp),
+ isInputPattern(isInput), HasError(false) {
for (unsigned i = 0, e = RawPat->getSize(); i != e; ++i)
Trees.push_back(ParseTreePattern(RawPat->getElement(i), ""));
}
TreePattern::TreePattern(Record *TheRec, DagInit *Pat, bool isInput,
- CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp){
- isInputPattern = isInput;
+ CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp),
+ isInputPattern(isInput), HasError(false) {
Trees.push_back(ParseTreePattern(Pat, ""));
}
TreePattern::TreePattern(Record *TheRec, TreePatternNode *Pat, bool isInput,
- CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp){
- isInputPattern = isInput;
+ CodeGenDAGPatterns &cdp) : TheRecord(TheRec), CDP(cdp),
+ isInputPattern(isInput), HasError(false) {
Trees.push_back(Pat);
}
-void TreePattern::error(const std::string &Msg) const {
+void TreePattern::error(const std::string &Msg) {
+ if (HasError)
+ return;
dump();
- throw TGError(TheRecord->getLoc(), "In " + TheRecord->getName() + ": " + Msg);
+ PrintError(TheRecord->getLoc(), "In " + TheRecord->getName() + ": " + Msg);
+ HasError = true;
}
void TreePattern::ComputeNamedNodes() {
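
Editorial note: the error() rewrite just above turns type-inference contradictions from thrown exceptions into a sticky per-pattern flag. The class below is only a sketch of that latch behaviour, not the real TreePattern; it shows what the hasError()/resetError() callers elsewhere in this file rely on.

    #include <string>

    class TreePatternSketch {
      bool HasError;
    public:
      TreePatternSketch() : HasError(false) {}
      bool hasError() const { return HasError; }
      void resetError() { HasError = false; }
      void error(const std::string &Msg) {
        if (HasError)
          return;                 // only the first contradiction is reported
        // The real code prints Msg via PrintError() at the record's location.
        (void)Msg;
        HasError = true;
      }
    };
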
@@ -1741,7 +1793,7 @@ void TreePattern::ComputeNamedNodes(TreePatternNode *N) {
TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
- if (DefInit *DI = dynamic_cast<DefInit*>(TheInit)) {
+ if (DefInit *DI = dyn_cast<DefInit>(TheInit)) {
Record *R = DI->getDef();
// Direct reference to a leaf DagNode or PatFrag? Turn it into a
@@ -1765,26 +1817,26 @@ TreePatternNode *TreePattern::ParseTreePattern(Init *TheInit, StringRef OpName){
return Res;
}
- if (IntInit *II = dynamic_cast<IntInit*>(TheInit)) {
+ if (IntInit *II = dyn_cast<IntInit>(TheInit)) {
if (!OpName.empty())
error("Constant int argument should not have a name!");
return new TreePatternNode(II, 1);
}
- if (BitsInit *BI = dynamic_cast<BitsInit*>(TheInit)) {
+ if (BitsInit *BI = dyn_cast<BitsInit>(TheInit)) {
// Turn this into an IntInit.
Init *II = BI->convertInitializerTo(IntRecTy::get());
- if (II == 0 || !dynamic_cast<IntInit*>(II))
+ if (II == 0 || !isa<IntInit>(II))
error("Bits value must be constants!");
return ParseTreePattern(II, OpName);
}
- DagInit *Dag = dynamic_cast<DagInit*>(TheInit);
+ DagInit *Dag = dyn_cast<DagInit>(TheInit);
if (!Dag) {
TheInit->dump();
error("Pattern has unexpected init kind!");
}
- DefInit *OpDef = dynamic_cast<DefInit*>(Dag->getOperator());
+ DefInit *OpDef = dyn_cast<DefInit>(Dag->getOperator());
if (!OpDef) error("Pattern has unexpected operator type!");
Record *Operator = OpDef->getDef();
@@ -1912,7 +1964,7 @@ static bool SimplifyTree(TreePatternNode *&N) {
/// InferAllTypes - Infer/propagate as many types throughout the expression
/// patterns as possible. Return true if all types are inferred, false
-/// otherwise. Throw an exception if a type contradiction is found.
+/// otherwise. Flags an error if a type contradiction is found.
bool TreePattern::
InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
if (NamedNodes.empty())
@@ -1949,7 +2001,7 @@ InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> > *InNamedTypes) {
// us to match things like:
// def : Pat<(v1i64 (bitconvert(v2i32 DPR:$src))), (v1i64 DPR:$src)>;
if (Nodes[i] == Trees[0] && Nodes[i]->isLeaf()) {
- DefInit *DI = dynamic_cast<DefInit*>(Nodes[i]->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(Nodes[i]->getLeafValue());
if (DI && (DI->getDef()->isSubClassOf("RegisterClass") ||
DI->getDef()->isSubClassOf("RegisterOperand")))
continue;
@@ -2033,6 +2085,9 @@ CodeGenDAGPatterns::CodeGenDAGPatterns(RecordKeeper &R) :
// stores, and side effects in many cases by examining an
// instruction's pattern.
InferInstructionFlags();
+
+ // Verify that instruction flags match the patterns.
+ VerifyInstructionFlags();
}
CodeGenDAGPatterns::~CodeGenDAGPatterns() {
@@ -2111,7 +2166,7 @@ void CodeGenDAGPatterns::ParsePatternFragments() {
// Parse the operands list.
DagInit *OpsList = Fragments[i]->getValueAsDag("Operands");
- DefInit *OpsOp = dynamic_cast<DefInit*>(OpsList->getOperator());
+ DefInit *OpsOp = dyn_cast<DefInit>(OpsList->getOperator());
// Special cases: ops == outs == ins. Different names are used to
// improve readability.
if (!OpsOp ||
@@ -2123,9 +2178,8 @@ void CodeGenDAGPatterns::ParsePatternFragments() {
// Copy over the arguments.
Args.clear();
for (unsigned j = 0, e = OpsList->getNumArgs(); j != e; ++j) {
- if (!dynamic_cast<DefInit*>(OpsList->getArg(j)) ||
- static_cast<DefInit*>(OpsList->getArg(j))->
- getDef()->getName() != "node")
+ if (!isa<DefInit>(OpsList->getArg(j)) ||
+ cast<DefInit>(OpsList->getArg(j))->getDef()->getName() != "node")
P->error("Operands list should all be 'node' values.");
if (OpsList->getArgName(j).empty())
P->error("Operands list should have names for each operand!");
@@ -2161,14 +2215,8 @@ void CodeGenDAGPatterns::ParsePatternFragments() {
// Infer as many types as possible. Don't worry about it if we don't infer
// all of them, some may depend on the inputs of the pattern.
- try {
- ThePat->InferAllTypes();
- } catch (...) {
- // If this pattern fragment is not supported by this target (no types can
- // satisfy its constraints), just ignore it. If the bogus pattern is
- // actually used by instructions, the type consistency error will be
- // reported there.
- }
+ ThePat->InferAllTypes();
+ ThePat->resetError();
// If debugging, print out the pattern fragment result.
DEBUG(ThePat->dump());
@@ -2176,53 +2224,46 @@ void CodeGenDAGPatterns::ParsePatternFragments() {
}
void CodeGenDAGPatterns::ParseDefaultOperands() {
- std::vector<Record*> DefaultOps[2];
- DefaultOps[0] = Records.getAllDerivedDefinitions("PredicateOperand");
- DefaultOps[1] = Records.getAllDerivedDefinitions("OptionalDefOperand");
+ std::vector<Record*> DefaultOps;
+ DefaultOps = Records.getAllDerivedDefinitions("OperandWithDefaultOps");
// Find some SDNode.
assert(!SDNodes.empty() && "No SDNodes parsed?");
Init *SomeSDNode = DefInit::get(SDNodes.begin()->first);
- for (unsigned iter = 0; iter != 2; ++iter) {
- for (unsigned i = 0, e = DefaultOps[iter].size(); i != e; ++i) {
- DagInit *DefaultInfo = DefaultOps[iter][i]->getValueAsDag("DefaultOps");
-
- // Clone the DefaultInfo dag node, changing the operator from 'ops' to
- // SomeSDnode so that we can parse this.
- std::vector<std::pair<Init*, std::string> > Ops;
- for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op)
- Ops.push_back(std::make_pair(DefaultInfo->getArg(op),
- DefaultInfo->getArgName(op)));
- DagInit *DI = DagInit::get(SomeSDNode, "", Ops);
-
- // Create a TreePattern to parse this.
- TreePattern P(DefaultOps[iter][i], DI, false, *this);
- assert(P.getNumTrees() == 1 && "This ctor can only produce one tree!");
-
- // Copy the operands over into a DAGDefaultOperand.
- DAGDefaultOperand DefaultOpInfo;
-
- TreePatternNode *T = P.getTree(0);
- for (unsigned op = 0, e = T->getNumChildren(); op != e; ++op) {
- TreePatternNode *TPN = T->getChild(op);
- while (TPN->ApplyTypeConstraints(P, false))
- /* Resolve all types */;
-
- if (TPN->ContainsUnresolvedType()) {
- if (iter == 0)
- throw "Value #" + utostr(i) + " of PredicateOperand '" +
- DefaultOps[iter][i]->getName() +"' doesn't have a concrete type!";
- else
- throw "Value #" + utostr(i) + " of OptionalDefOperand '" +
- DefaultOps[iter][i]->getName() +"' doesn't have a concrete type!";
- }
- DefaultOpInfo.DefaultOps.push_back(TPN);
+ for (unsigned i = 0, e = DefaultOps.size(); i != e; ++i) {
+ DagInit *DefaultInfo = DefaultOps[i]->getValueAsDag("DefaultOps");
+
+ // Clone the DefaultInfo dag node, changing the operator from 'ops' to
+ // SomeSDnode so that we can parse this.
+ std::vector<std::pair<Init*, std::string> > Ops;
+ for (unsigned op = 0, e = DefaultInfo->getNumArgs(); op != e; ++op)
+ Ops.push_back(std::make_pair(DefaultInfo->getArg(op),
+ DefaultInfo->getArgName(op)));
+ DagInit *DI = DagInit::get(SomeSDNode, "", Ops);
+
+ // Create a TreePattern to parse this.
+ TreePattern P(DefaultOps[i], DI, false, *this);
+ assert(P.getNumTrees() == 1 && "This ctor can only produce one tree!");
+
+ // Copy the operands over into a DAGDefaultOperand.
+ DAGDefaultOperand DefaultOpInfo;
+
+ TreePatternNode *T = P.getTree(0);
+ for (unsigned op = 0, e = T->getNumChildren(); op != e; ++op) {
+ TreePatternNode *TPN = T->getChild(op);
+ while (TPN->ApplyTypeConstraints(P, false))
+ /* Resolve all types */;
+
+ if (TPN->ContainsUnresolvedType()) {
+ PrintFatalError("Value #" + utostr(i) + " of OperandWithDefaultOps '" +
+ DefaultOps[i]->getName() +"' doesn't have a concrete type!");
}
-
- // Insert it into the DefaultOperands map so we can find it later.
- DefaultOperands[DefaultOps[iter][i]] = DefaultOpInfo;
+ DefaultOpInfo.DefaultOps.push_back(TPN);
}
+
+ // Insert it into the DefaultOperands map so we can find it later.
+ DefaultOperands[DefaultOps[i]] = DefaultOpInfo;
}
}
@@ -2233,7 +2274,7 @@ static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
// No name -> not interesting.
if (Pat->getName().empty()) {
if (Pat->isLeaf()) {
- DefInit *DI = dynamic_cast<DefInit*>(Pat->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue());
if (DI && (DI->getDef()->isSubClassOf("RegisterClass") ||
DI->getDef()->isSubClassOf("RegisterOperand")))
I->error("Input " + DI->getDef()->getName() + " must be named!");
@@ -2243,7 +2284,7 @@ static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
Record *Rec;
if (Pat->isLeaf()) {
- DefInit *DI = dynamic_cast<DefInit*>(Pat->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(Pat->getLeafValue());
if (!DI) I->error("Input $" + Pat->getName() + " must be an identifier!");
Rec = DI->getDef();
} else {
@@ -2261,7 +2302,7 @@ static bool HandleUse(TreePattern *I, TreePatternNode *Pat,
}
Record *SlotRec;
if (Slot->isLeaf()) {
- SlotRec = dynamic_cast<DefInit*>(Slot->getLeafValue())->getDef();
+ SlotRec = cast<DefInit>(Slot->getLeafValue())->getDef();
} else {
assert(Slot->getNumChildren() == 0 && "can't be a use with children!");
SlotRec = Slot->getOperator();
@@ -2296,7 +2337,7 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
if (!Dest->isLeaf())
I->error("implicitly defined value should be a register!");
- DefInit *Val = dynamic_cast<DefInit*>(Dest->getLeafValue());
+ DefInit *Val = dyn_cast<DefInit>(Dest->getLeafValue());
if (!Val || !Val->getDef()->isSubClassOf("Register"))
I->error("implicitly defined value should be a register!");
InstImpResults.push_back(Val->getDef());
@@ -2337,7 +2378,7 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
if (!Dest->isLeaf())
I->error("set destination should be a register!");
- DefInit *Val = dynamic_cast<DefInit*>(Dest->getLeafValue());
+ DefInit *Val = dyn_cast<DefInit>(Dest->getLeafValue());
if (!Val)
I->error("set destination should be a register!");
@@ -2367,43 +2408,36 @@ FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
class InstAnalyzer {
const CodeGenDAGPatterns &CDP;
- bool &mayStore;
- bool &mayLoad;
- bool &IsBitcast;
- bool &HasSideEffects;
- bool &IsVariadic;
public:
- InstAnalyzer(const CodeGenDAGPatterns &cdp,
- bool &maystore, bool &mayload, bool &isbc, bool &hse, bool &isv)
- : CDP(cdp), mayStore(maystore), mayLoad(mayload), IsBitcast(isbc),
- HasSideEffects(hse), IsVariadic(isv) {
- }
+ bool hasSideEffects;
+ bool mayStore;
+ bool mayLoad;
+ bool isBitcast;
+ bool isVariadic;
- /// Analyze - Analyze the specified instruction, returning true if the
- /// instruction had a pattern.
- bool Analyze(Record *InstRecord) {
- const TreePattern *Pattern = CDP.getInstruction(InstRecord).getPattern();
- if (Pattern == 0) {
- HasSideEffects = 1;
- return false; // No pattern.
- }
+ InstAnalyzer(const CodeGenDAGPatterns &cdp)
+ : CDP(cdp), hasSideEffects(false), mayStore(false), mayLoad(false),
+ isBitcast(false), isVariadic(false) {}
- // FIXME: Assume only the first tree is the pattern. The others are clobber
- // nodes.
- AnalyzeNode(Pattern->getTree(0));
- return true;
+ void Analyze(const TreePattern *Pat) {
+ // Assume only the first tree is the pattern. The others are clobber nodes.
+ AnalyzeNode(Pat->getTree(0));
+ }
+
+ void Analyze(const PatternToMatch *Pat) {
+ AnalyzeNode(Pat->getSrcPattern());
}
private:
bool IsNodeBitcast(const TreePatternNode *N) const {
- if (HasSideEffects || mayLoad || mayStore || IsVariadic)
+ if (hasSideEffects || mayLoad || mayStore || isVariadic)
return false;
if (N->getNumChildren() != 2)
return false;
const TreePatternNode *N0 = N->getChild(0);
- if (!N0->isLeaf() || !dynamic_cast<DefInit*>(N0->getLeafValue()))
+ if (!N0->isLeaf() || !isa<DefInit>(N0->getLeafValue()))
return false;
const TreePatternNode *N1 = N->getChild(1);
@@ -2418,16 +2452,17 @@ private:
return OpInfo.getEnumName() == "ISD::BITCAST";
}
+public:
void AnalyzeNode(const TreePatternNode *N) {
if (N->isLeaf()) {
- if (DefInit *DI = dynamic_cast<DefInit*>(N->getLeafValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(N->getLeafValue())) {
Record *LeafRec = DI->getDef();
// Handle ComplexPattern leaves.
if (LeafRec->isSubClassOf("ComplexPattern")) {
const ComplexPattern &CP = CDP.getComplexPattern(LeafRec);
if (CP.hasProperty(SDNPMayStore)) mayStore = true;
if (CP.hasProperty(SDNPMayLoad)) mayLoad = true;
- if (CP.hasProperty(SDNPSideEffect)) HasSideEffects = true;
+ if (CP.hasProperty(SDNPSideEffect)) hasSideEffects = true;
}
}
return;
@@ -2439,7 +2474,7 @@ private:
// Ignore set nodes, which are not SDNodes.
if (N->getOperator()->getName() == "set") {
- IsBitcast = IsNodeBitcast(N);
+ isBitcast = IsNodeBitcast(N);
return;
}
@@ -2449,8 +2484,8 @@ private:
// Notice properties of the node.
if (OpInfo.hasProperty(SDNPMayStore)) mayStore = true;
if (OpInfo.hasProperty(SDNPMayLoad)) mayLoad = true;
- if (OpInfo.hasProperty(SDNPSideEffect)) HasSideEffects = true;
- if (OpInfo.hasProperty(SDNPVariadic)) IsVariadic = true;
+ if (OpInfo.hasProperty(SDNPSideEffect)) hasSideEffects = true;
+ if (OpInfo.hasProperty(SDNPVariadic)) isVariadic = true;
if (const CodeGenIntrinsic *IntInfo = N->getIntrinsicInfo(CDP)) {
// If this is an intrinsic, analyze it.
@@ -2462,68 +2497,70 @@ private:
if (IntInfo->ModRef >= CodeGenIntrinsic::ReadWriteMem)
// WriteMem intrinsics can have other strange effects.
- HasSideEffects = true;
+ hasSideEffects = true;
}
}
};
-static void InferFromPattern(const CodeGenInstruction &Inst,
- bool &MayStore, bool &MayLoad,
- bool &IsBitcast,
- bool &HasSideEffects, bool &IsVariadic,
- const CodeGenDAGPatterns &CDP) {
- MayStore = MayLoad = IsBitcast = HasSideEffects = IsVariadic = false;
-
- bool HadPattern =
- InstAnalyzer(CDP, MayStore, MayLoad, IsBitcast, HasSideEffects, IsVariadic)
- .Analyze(Inst.TheDef);
-
- // InstAnalyzer only correctly analyzes mayStore/mayLoad so far.
- if (Inst.mayStore) { // If the .td file explicitly sets mayStore, use it.
- // If we decided that this is a store from the pattern, then the .td file
- // entry is redundant.
- if (MayStore)
- PrintWarning(Inst.TheDef->getLoc(),
- "mayStore flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- MayStore = true;
+static bool InferFromPattern(CodeGenInstruction &InstInfo,
+ const InstAnalyzer &PatInfo,
+ Record *PatDef) {
+ bool Error = false;
+
+ // Remember where InstInfo got its flags.
+ if (InstInfo.hasUndefFlags())
+ InstInfo.InferredFrom = PatDef;
+
+ // Check explicitly set flags for consistency.
+ if (InstInfo.hasSideEffects != PatInfo.hasSideEffects &&
+ !InstInfo.hasSideEffects_Unset) {
+ // Allow explicitly setting hasSideEffects = 1 on instructions, even when
+ // the pattern has no side effects. That could be useful for div/rem
+ // instructions that may trap.
+ if (!InstInfo.hasSideEffects) {
+ Error = true;
+ PrintError(PatDef->getLoc(), "Pattern doesn't match hasSideEffects = " +
+ Twine(InstInfo.hasSideEffects));
+ }
}
- if (Inst.mayLoad) { // If the .td file explicitly sets mayLoad, use it.
- // If we decided that this is a load from the pattern, then the .td file
- // entry is redundant.
- if (MayLoad)
- PrintWarning(Inst.TheDef->getLoc(),
- "mayLoad flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- MayLoad = true;
+ if (InstInfo.mayStore != PatInfo.mayStore && !InstInfo.mayStore_Unset) {
+ Error = true;
+ PrintError(PatDef->getLoc(), "Pattern doesn't match mayStore = " +
+ Twine(InstInfo.mayStore));
}
- if (Inst.neverHasSideEffects) {
- if (HadPattern)
- PrintWarning(Inst.TheDef->getLoc(),
- "neverHasSideEffects flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- HasSideEffects = false;
+ if (InstInfo.mayLoad != PatInfo.mayLoad && !InstInfo.mayLoad_Unset) {
+ // Allow explicitly setting mayLoad = 1, even when the pattern has no loads.
+    // Some targets translate immediates to loads.
+ if (!InstInfo.mayLoad) {
+ Error = true;
+ PrintError(PatDef->getLoc(), "Pattern doesn't match mayLoad = " +
+ Twine(InstInfo.mayLoad));
+ }
}
- if (Inst.hasSideEffects) {
- if (HasSideEffects)
- PrintWarning(Inst.TheDef->getLoc(),
- "hasSideEffects flag explicitly set on "
- "instruction, but flag already inferred from pattern.");
- HasSideEffects = true;
- }
+ // Transfer inferred flags.
+ InstInfo.hasSideEffects |= PatInfo.hasSideEffects;
+ InstInfo.mayStore |= PatInfo.mayStore;
+ InstInfo.mayLoad |= PatInfo.mayLoad;
+
+ // These flags are silently added without any verification.
+ InstInfo.isBitcast |= PatInfo.isBitcast;
+
+ // Don't infer isVariadic. This flag means something different on SDNodes and
+ // instructions. For example, a CALL SDNode is variadic because it has the
+ // call arguments as operands, but a CALL instruction is not variadic - it
+ // has argument registers as implicit, not explicit uses.
- if (Inst.Operands.isVariadic)
- IsVariadic = true; // Can warn if we want.
+ return Error;
}
/// hasNullFragReference - Return true if the DAG has any reference to the
/// null_frag operator.
static bool hasNullFragReference(DagInit *DI) {
- DefInit *OpDef = dynamic_cast<DefInit*>(DI->getOperator());
+ DefInit *OpDef = dyn_cast<DefInit>(DI->getOperator());
if (!OpDef) return false;
Record *Operator = OpDef->getDef();
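
Editorial note: the per-flag consistency rule that InferFromPattern() applies above can be condensed into one predicate. The helper below is a hypothetical simplification, not code from the patch: unset flags are inferred, explicit flags must agree with the pattern, and hasSideEffects/mayLoad are additionally allowed to be stricter than the pattern.

    static bool flagIsConsistent(bool isUnset, bool explicitValue,
                                 bool impliedByPattern, bool mayBeStricter) {
      if (isUnset)
        return true;                     // no explicit value: inferred from the pattern
      if (explicitValue == impliedByPattern)
        return true;                     // explicit value matches the pattern
      // e.g. hasSideEffects = 1 on a side-effect-free pattern is tolerated,
      // but hasSideEffects = 0 on a pattern with side effects is an error.
      return mayBeStricter && explicitValue;
    }
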
@@ -2531,7 +2568,7 @@ static bool hasNullFragReference(DagInit *DI) {
if (Operator->getName() == "null_frag") return true;
// If any of the arguments reference the null fragment, return true.
for (unsigned i = 0, e = DI->getNumArgs(); i != e; ++i) {
- DagInit *Arg = dynamic_cast<DagInit*>(DI->getArg(i));
+ DagInit *Arg = dyn_cast<DagInit>(DI->getArg(i));
if (Arg && hasNullFragReference(Arg))
return true;
}
@@ -2543,7 +2580,7 @@ static bool hasNullFragReference(DagInit *DI) {
/// the null_frag operator.
static bool hasNullFragReference(ListInit *LI) {
for (unsigned i = 0, e = LI->getSize(); i != e; ++i) {
- DagInit *DI = dynamic_cast<DagInit*>(LI->getElement(i));
+ DagInit *DI = dyn_cast<DagInit>(LI->getElement(i));
assert(DI && "non-dag in an instruction Pattern list?!");
if (hasNullFragReference(DI))
return true;
@@ -2551,6 +2588,17 @@ static bool hasNullFragReference(ListInit *LI) {
return false;
}
+/// Get all the instructions in a tree.
+static void
+getInstructionsInTree(TreePatternNode *Tree, SmallVectorImpl<Record*> &Instrs) {
+ if (Tree->isLeaf())
+ return;
+ if (Tree->getOperator()->isSubClassOf("Instruction"))
+ Instrs.push_back(Tree->getOperator());
+ for (unsigned i = 0, e = Tree->getNumChildren(); i != e; ++i)
+ getInstructionsInTree(Tree->getChild(i), Instrs);
+}
+
/// ParseInstructions - Parse all of the instructions, inlining and resolving
/// any fragments involved. This populates the Instructions list with fully
/// resolved instructions.
@@ -2560,7 +2608,7 @@ void CodeGenDAGPatterns::ParseInstructions() {
for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
ListInit *LI = 0;
- if (dynamic_cast<ListInit*>(Instrs[i]->getValueInit("Pattern")))
+ if (isa<ListInit>(Instrs[i]->getValueInit("Pattern")))
LI = Instrs[i]->getValueAsListInit("Pattern");
// If there is no pattern, only collect minimal information about the
@@ -2655,7 +2703,7 @@ void CodeGenDAGPatterns::ParseInstructions() {
if (i == 0)
Res0Node = RNode;
- Record *R = dynamic_cast<DefInit*>(RNode->getLeafValue())->getDef();
+ Record *R = cast<DefInit>(RNode->getLeafValue())->getDef();
if (R == 0)
I->error("Operand $" + OpName + " should be a set destination: all "
"outputs must occur before inputs in operand list!");
@@ -2683,11 +2731,9 @@ void CodeGenDAGPatterns::ParseInstructions() {
I->error("Operand #" + utostr(i) + " in operands list has no name!");
if (!InstInputsCheck.count(OpName)) {
- // If this is an predicate operand or optional def operand with an
- // DefaultOps set filled in, we can ignore this. When we codegen it,
- // we will do so as always executed.
- if (Op.Rec->isSubClassOf("PredicateOperand") ||
- Op.Rec->isSubClassOf("OptionalDefOperand")) {
+ // If this is an operand with a DefaultOps set filled in, we can ignore
+ // this. When we codegen it, we will do so as always executed.
+ if (Op.Rec->isSubClassOf("OperandWithDefaultOps")) {
// Does it have a non-empty DefaultOps field? If so, ignore this
// operand.
if (!getDefaultOperand(Op.Rec).DefaultOps.empty())
@@ -2699,8 +2745,7 @@ void CodeGenDAGPatterns::ParseInstructions() {
TreePatternNode *InVal = InstInputsCheck[OpName];
InstInputsCheck.erase(OpName); // It occurred, remove from map.
- if (InVal->isLeaf() &&
- dynamic_cast<DefInit*>(InVal->getLeafValue())) {
+ if (InVal->isLeaf() && isa<DefInit>(InVal->getLeafValue())) {
Record *InRec = static_cast<DefInit*>(InVal->getLeafValue())->getDef();
if (Op.Rec != InRec && !InRec->isSubClassOf("ComplexPattern"))
I->error("Operand $" + OpName + "'s register class disagrees"
@@ -2754,11 +2799,11 @@ void CodeGenDAGPatterns::ParseInstructions() {
}
// If we can, convert the instructions to be patterns that are matched!
- for (std::map<Record*, DAGInstruction, RecordPtrCmp>::iterator II =
+ for (std::map<Record*, DAGInstruction, LessRecordByID>::iterator II =
Instructions.begin(),
E = Instructions.end(); II != E; ++II) {
DAGInstruction &TheInst = II->second;
- const TreePattern *I = TheInst.getPattern();
+ TreePattern *I = TheInst.getPattern();
if (I == 0) continue; // No pattern.
// FIXME: Assume only the first tree is the pattern. The others are clobber
@@ -2789,7 +2834,7 @@ typedef std::pair<const TreePatternNode*, unsigned> NameRecord;
static void FindNames(const TreePatternNode *P,
std::map<std::string, NameRecord> &Names,
- const TreePattern *PatternTop) {
+ TreePattern *PatternTop) {
if (!P->getName().empty()) {
NameRecord &Rec = Names[P->getName()];
// If this is the first instance of the name, remember the node.
@@ -2806,12 +2851,15 @@ static void FindNames(const TreePatternNode *P,
}
}
-void CodeGenDAGPatterns::AddPatternToMatch(const TreePattern *Pattern,
+void CodeGenDAGPatterns::AddPatternToMatch(TreePattern *Pattern,
const PatternToMatch &PTM) {
// Do some sanity checking on the pattern we're about to match.
std::string Reason;
- if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this))
- Pattern->error("Pattern can never match: " + Reason);
+ if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) {
+ PrintWarning(Pattern->getRecord()->getLoc(),
+ Twine("Pattern can never match: ") + Reason);
+ return;
+ }
// If the source pattern's root is a complex pattern, that complex pattern
// must specify the nodes it can potentially match.
@@ -2852,25 +2900,156 @@ void CodeGenDAGPatterns::AddPatternToMatch(const TreePattern *Pattern,
void CodeGenDAGPatterns::InferInstructionFlags() {
const std::vector<const CodeGenInstruction*> &Instructions =
Target.getInstructionsByEnumValue();
+
+ // First try to infer flags from the primary instruction pattern, if any.
+ SmallVector<CodeGenInstruction*, 8> Revisit;
+ unsigned Errors = 0;
for (unsigned i = 0, e = Instructions.size(); i != e; ++i) {
CodeGenInstruction &InstInfo =
const_cast<CodeGenInstruction &>(*Instructions[i]);
- // Determine properties of the instruction from its pattern.
- bool MayStore, MayLoad, IsBitcast, HasSideEffects, IsVariadic;
- InferFromPattern(InstInfo, MayStore, MayLoad, IsBitcast,
- HasSideEffects, IsVariadic, *this);
- InstInfo.mayStore = MayStore;
- InstInfo.mayLoad = MayLoad;
- InstInfo.isBitcast = IsBitcast;
- InstInfo.hasSideEffects = HasSideEffects;
- InstInfo.Operands.isVariadic = IsVariadic;
- // Sanity checks.
- if (InstInfo.isReMaterializable && InstInfo.hasSideEffects)
- throw TGError(InstInfo.TheDef->getLoc(), "The instruction " +
- InstInfo.TheDef->getName() +
- " is rematerializable AND has unmodeled side effects?");
+ // Treat neverHasSideEffects = 1 as the equivalent of hasSideEffects = 0.
+ // This flag is obsolete and will be removed.
+ if (InstInfo.neverHasSideEffects) {
+ assert(!InstInfo.hasSideEffects);
+ InstInfo.hasSideEffects_Unset = false;
+ }
+
+ // Get the primary instruction pattern.
+ const TreePattern *Pattern = getInstruction(InstInfo.TheDef).getPattern();
+ if (!Pattern) {
+ if (InstInfo.hasUndefFlags())
+ Revisit.push_back(&InstInfo);
+ continue;
+ }
+ InstAnalyzer PatInfo(*this);
+ PatInfo.Analyze(Pattern);
+ Errors += InferFromPattern(InstInfo, PatInfo, InstInfo.TheDef);
+ }
+
+ // Second, look for single-instruction patterns defined outside the
+ // instruction.
+ for (ptm_iterator I = ptm_begin(), E = ptm_end(); I != E; ++I) {
+ const PatternToMatch &PTM = *I;
+
+ // We can only infer from single-instruction patterns, otherwise we won't
+ // know which instruction should get the flags.
+ SmallVector<Record*, 8> PatInstrs;
+ getInstructionsInTree(PTM.getDstPattern(), PatInstrs);
+ if (PatInstrs.size() != 1)
+ continue;
+
+ // Get the single instruction.
+ CodeGenInstruction &InstInfo = Target.getInstruction(PatInstrs.front());
+
+ // Only infer properties from the first pattern. We'll verify the others.
+ if (InstInfo.InferredFrom)
+ continue;
+
+ InstAnalyzer PatInfo(*this);
+ PatInfo.Analyze(&PTM);
+ Errors += InferFromPattern(InstInfo, PatInfo, PTM.getSrcRecord());
+ }
+
+ if (Errors)
+ PrintFatalError("pattern conflicts");
+
+ // Revisit instructions with undefined flags and no pattern.
+ if (Target.guessInstructionProperties()) {
+ for (unsigned i = 0, e = Revisit.size(); i != e; ++i) {
+ CodeGenInstruction &InstInfo = *Revisit[i];
+ if (InstInfo.InferredFrom)
+ continue;
+ // The mayLoad and mayStore flags default to false.
+ // Conservatively assume hasSideEffects if it wasn't explicit.
+ if (InstInfo.hasSideEffects_Unset)
+ InstInfo.hasSideEffects = true;
+ }
+ return;
}
+
+ // Complain about any flags that are still undefined.
+ for (unsigned i = 0, e = Revisit.size(); i != e; ++i) {
+ CodeGenInstruction &InstInfo = *Revisit[i];
+ if (InstInfo.InferredFrom)
+ continue;
+ if (InstInfo.hasSideEffects_Unset)
+ PrintError(InstInfo.TheDef->getLoc(),
+ "Can't infer hasSideEffects from patterns");
+ if (InstInfo.mayStore_Unset)
+ PrintError(InstInfo.TheDef->getLoc(),
+ "Can't infer mayStore from patterns");
+ if (InstInfo.mayLoad_Unset)
+ PrintError(InstInfo.TheDef->getLoc(),
+ "Can't infer mayLoad from patterns");
+ }
+}
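+
+// As an illustration (all record and class names below are hypothetical, not
+// taken from any in-tree target): an instruction whose flags are left unset,
+// e.g.
+//
+//   def LDW : MyInst<(outs GPR:$dst), (ins addr:$src), "ldw $dst, $src",
+//                    [(set GPR:$dst, (load addr:$src))]>;
+//
+// would now have its flags inferred from the pattern, here mayLoad = 1,
+// mayStore = 0 and hasSideEffects = 0, with InferredFrom recording the record
+// that supplied the pattern.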
+
+
+/// Verify instruction flags against pattern node properties.
+void CodeGenDAGPatterns::VerifyInstructionFlags() {
+ unsigned Errors = 0;
+ for (ptm_iterator I = ptm_begin(), E = ptm_end(); I != E; ++I) {
+ const PatternToMatch &PTM = *I;
+ SmallVector<Record*, 8> Instrs;
+ getInstructionsInTree(PTM.getDstPattern(), Instrs);
+ if (Instrs.empty())
+ continue;
+
+ // Count the number of instructions with each flag set.
+ unsigned NumSideEffects = 0;
+ unsigned NumStores = 0;
+ unsigned NumLoads = 0;
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ const CodeGenInstruction &InstInfo = Target.getInstruction(Instrs[i]);
+ NumSideEffects += InstInfo.hasSideEffects;
+ NumStores += InstInfo.mayStore;
+ NumLoads += InstInfo.mayLoad;
+ }
+
+ // Analyze the source pattern.
+ InstAnalyzer PatInfo(*this);
+ PatInfo.Analyze(&PTM);
+
+ // Collect error messages.
+ SmallVector<std::string, 4> Msgs;
+
+ // Check for missing flags in the output.
+ // Permit extra flags for now at least.
+ if (PatInfo.hasSideEffects && !NumSideEffects)
+ Msgs.push_back("pattern has side effects, but hasSideEffects isn't set");
+
+ // Don't verify store flags on instructions with side effects. At least for
+ // intrinsics, side effects imply mayStore.
+ if (!PatInfo.hasSideEffects && PatInfo.mayStore && !NumStores)
+ Msgs.push_back("pattern may store, but mayStore isn't set");
+
+ // Similarly, mayStore implies mayLoad on intrinsics.
+ if (!PatInfo.mayStore && PatInfo.mayLoad && !NumLoads)
+ Msgs.push_back("pattern may load, but mayLoad isn't set");
+
+ // Print error messages.
+ if (Msgs.empty())
+ continue;
+ ++Errors;
+
+ for (unsigned i = 0, e = Msgs.size(); i != e; ++i)
+ PrintError(PTM.getSrcRecord()->getLoc(), Twine(Msgs[i]) + " on the " +
+ (Instrs.size() == 1 ?
+ "instruction" : "output instructions"));
+ // Provide the location of the relevant instruction definitions.
+ for (unsigned i = 0, e = Instrs.size(); i != e; ++i) {
+ if (Instrs[i] != PTM.getSrcRecord())
+ PrintError(Instrs[i]->getLoc(), "defined here");
+ const CodeGenInstruction &InstInfo = Target.getInstruction(Instrs[i]);
+ if (InstInfo.InferredFrom &&
+ InstInfo.InferredFrom != InstInfo.TheDef &&
+ InstInfo.InferredFrom != PTM.getSrcRecord())
+ PrintError(InstInfo.InferredFrom->getLoc(), "inferred from pattern");
+ }
+ }
+ if (Errors)
+ PrintFatalError("Errors in DAG patterns");
}
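+
+// For illustration (hypothetical names): a stand-alone pattern such as
+//
+//   def : Pat<(store GPR:$val, addr:$dst), (STW GPR:$val, addr:$dst)>;
+//
+// would now be rejected with "pattern may store, but mayStore isn't set" if
+// the STW instruction it selects does not carry the mayStore flag.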
/// Given a pattern result with an unresolved type, see if we can find one
@@ -3230,7 +3409,7 @@ static void GenerateVariantsOf(TreePatternNode *N,
for (unsigned i = 0, e = N->getNumChildren(); i != e; ++i) {
TreePatternNode *Child = N->getChild(i);
if (Child->isLeaf())
- if (DefInit *DI = dynamic_cast<DefInit*>(Child->getLeafValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(Child->getLeafValue())) {
Record *RR = DI->getDef();
if (RR->isSubClassOf("Register"))
continue;
@@ -3330,4 +3509,3 @@ void CodeGenDAGPatterns::GenerateVariants() {
DEBUG(errs() << "\n");
}
}
-
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 5a2d40aa7c86..9be763f2ff11 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -105,7 +105,7 @@ namespace EEVT {
/// MergeInTypeInfo - This merges in type information from the specified
/// argument. If 'this' changes, it returns true. If the two types are
- /// contradictory (e.g. merge f32 into i32) then this throws an exception.
+ /// contradictory (e.g. merge f32 into i32) then this flags an error.
bool MergeInTypeInfo(const EEVT::TypeSet &InVT, TreePattern &TP);
bool MergeInTypeInfo(MVT::SimpleValueType InVT, TreePattern &TP) {
@@ -187,8 +187,8 @@ struct SDTypeConstraint {
/// ApplyTypeConstraint - Given a node in a pattern, apply this type
/// constraint to the nodes operands. This returns true if it makes a
- /// change, false otherwise. If a type contradiction is found, throw an
- /// exception.
+ /// change, false otherwise. If a type contradiction is found, an error
+ /// is flagged.
bool ApplyTypeConstraint(TreePatternNode *N, const SDNodeInfo &NodeInfo,
TreePattern &TP) const;
};
@@ -232,7 +232,7 @@ public:
/// ApplyTypeConstraints - Given a node in a pattern, apply the type
/// constraints for this node to the operands of the node. This returns
/// true if it makes a change, false otherwise. If a type contradiction is
- /// found, throw an exception.
+ /// found, an error is flagged.
bool ApplyTypeConstraints(TreePatternNode *N, TreePattern &TP) const {
bool MadeChange = false;
for (unsigned i = 0, e = TypeConstraints.size(); i != e; ++i)
@@ -446,13 +446,12 @@ public: // Higher level manipulation routines.
/// ApplyTypeConstraints - Apply all of the type constraints relevant to
/// this node and its children in the tree. This returns true if it makes a
- /// change, false otherwise. If a type contradiction is found, throw an
- /// exception.
+ /// change, false otherwise. If a type contradiction is found, flag an error.
bool ApplyTypeConstraints(TreePattern &TP, bool NotRegisters);
/// UpdateNodeType - Set the node type of N to VT if VT contains
- /// information. If N already contains a conflicting type, then throw an
- /// exception. This returns true if any information was updated.
+ /// information. If N already contains a conflicting type, then flag an
+ /// error. This returns true if any information was updated.
///
bool UpdateNodeType(unsigned ResNo, const EEVT::TypeSet &InTy,
TreePattern &TP) {
@@ -514,6 +513,10 @@ class TreePattern {
/// isInputPattern - True if this is an input pattern, something to match.
/// False if this is an output pattern, something to emit.
bool isInputPattern;
+
+ /// hasError - True if the currently processed nodes have unresolvable types
+ /// or other non-fatal errors.
+ bool HasError;
public:
/// TreePattern constructor - Parse the specified DagInits into the
@@ -565,13 +568,19 @@ public:
/// InferAllTypes - Infer/propagate as many types throughout the expression
/// patterns as possible. Return true if all types are inferred, false
- /// otherwise. Throw an exception if a type contradiction is found.
+ /// otherwise. Bail out if a type contradiction is found.
bool InferAllTypes(const StringMap<SmallVector<TreePatternNode*,1> >
*NamedTypes=0);
- /// error - Throw an exception, prefixing it with information about this
- /// pattern.
- void error(const std::string &Msg) const;
+ /// error - If this is the first error in the current resolution step,
+ /// print it and set the error flag. Otherwise, continue silently.
+ void error(const std::string &Msg);
+ bool hasError() const {
+ return HasError;
+ }
+ void resetError() {
+ HasError = false;
+ }
void print(raw_ostream &OS) const;
void dump() const;
@@ -582,8 +591,8 @@ private:
void ComputeNamedNodes(TreePatternNode *N);
};
-/// DAGDefaultOperand - One of these is created for each PredicateOperand
-/// or OptionalDefOperand that has a set ExecuteAlways / DefaultOps field.
+/// DAGDefaultOperand - One of these is created for each OperandWithDefaultOps
+/// that has a set ExecuteAlways / DefaultOps field.
struct DAGDefaultOperand {
std::vector<TreePatternNode*> DefaultOps;
};
@@ -602,7 +611,7 @@ public:
: Pattern(TP), Results(results), Operands(operands),
ImpResults(impresults), ResultPattern(0) {}
- const TreePattern *getPattern() const { return Pattern; }
+ TreePattern *getPattern() const { return Pattern; }
unsigned getNumResults() const { return Results.size(); }
unsigned getNumOperands() const { return Operands.size(); }
unsigned getNumImpResults() const { return ImpResults.size(); }
@@ -661,23 +670,18 @@ public:
unsigned getPatternComplexity(const CodeGenDAGPatterns &CGP) const;
};
-// Deterministic comparison of Record*.
-struct RecordPtrCmp {
- bool operator()(const Record *LHS, const Record *RHS) const;
-};
-
class CodeGenDAGPatterns {
RecordKeeper &Records;
CodeGenTarget Target;
std::vector<CodeGenIntrinsic> Intrinsics;
std::vector<CodeGenIntrinsic> TgtIntrinsics;
- std::map<Record*, SDNodeInfo, RecordPtrCmp> SDNodes;
- std::map<Record*, std::pair<Record*, std::string>, RecordPtrCmp> SDNodeXForms;
- std::map<Record*, ComplexPattern, RecordPtrCmp> ComplexPatterns;
- std::map<Record*, TreePattern*, RecordPtrCmp> PatternFragments;
- std::map<Record*, DAGDefaultOperand, RecordPtrCmp> DefaultOperands;
- std::map<Record*, DAGInstruction, RecordPtrCmp> Instructions;
+ std::map<Record*, SDNodeInfo, LessRecordByID> SDNodes;
+ std::map<Record*, std::pair<Record*, std::string>, LessRecordByID> SDNodeXForms;
+ std::map<Record*, ComplexPattern, LessRecordByID> ComplexPatterns;
+ std::map<Record*, TreePattern*, LessRecordByID> PatternFragments;
+ std::map<Record*, DAGDefaultOperand, LessRecordByID> DefaultOperands;
+ std::map<Record*, DAGInstruction, LessRecordByID> Instructions;
// Specific SDNode definitions:
Record *intrinsic_void_sdnode;
@@ -708,7 +712,7 @@ public:
return SDNodeXForms.find(R)->second;
}
- typedef std::map<Record*, NodeXForm, RecordPtrCmp>::const_iterator
+ typedef std::map<Record*, NodeXForm, LessRecordByID>::const_iterator
nx_iterator;
nx_iterator nx_begin() const { return SDNodeXForms.begin(); }
nx_iterator nx_end() const { return SDNodeXForms.end(); }
@@ -758,7 +762,7 @@ public:
return PatternFragments.find(R)->second;
}
- typedef std::map<Record*, TreePattern*, RecordPtrCmp>::const_iterator
+ typedef std::map<Record*, TreePattern*, LessRecordByID>::const_iterator
pf_iterator;
pf_iterator pf_begin() const { return PatternFragments.begin(); }
pf_iterator pf_end() const { return PatternFragments.end(); }
@@ -797,8 +801,9 @@ private:
void ParsePatterns();
void InferInstructionFlags();
void GenerateVariants();
+ void VerifyInstructionFlags();
- void AddPatternToMatch(const TreePattern *Pattern, const PatternToMatch &PTM);
+ void AddPatternToMatch(TreePattern *Pattern, const PatternToMatch &PTM);
void FindPatternInputsAndOutputs(TreePattern *I, TreePatternNode *Pat,
std::map<std::string,
TreePatternNode*> &InstInputs,
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index 12e153a66514..0a8684d3da5a 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -32,20 +32,20 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
DagInit *OutDI = R->getValueAsDag("OutOperandList");
- if (DefInit *Init = dynamic_cast<DefInit*>(OutDI->getOperator())) {
+ if (DefInit *Init = dyn_cast<DefInit>(OutDI->getOperator())) {
if (Init->getDef()->getName() != "outs")
- throw R->getName() + ": invalid def name for output list: use 'outs'";
+ PrintFatalError(R->getName() + ": invalid def name for output list: use 'outs'");
} else
- throw R->getName() + ": invalid output list: use 'outs'";
+ PrintFatalError(R->getName() + ": invalid output list: use 'outs'");
NumDefs = OutDI->getNumArgs();
DagInit *InDI = R->getValueAsDag("InOperandList");
- if (DefInit *Init = dynamic_cast<DefInit*>(InDI->getOperator())) {
+ if (DefInit *Init = dyn_cast<DefInit>(InDI->getOperator())) {
if (Init->getDef()->getName() != "ins")
- throw R->getName() + ": invalid def name for input list: use 'ins'";
+ PrintFatalError(R->getName() + ": invalid def name for input list: use 'ins'");
} else
- throw R->getName() + ": invalid input list: use 'ins'";
+ PrintFatalError(R->getName() + ": invalid input list: use 'ins'");
unsigned MIOperandNo = 0;
std::set<std::string> OperandNames;
@@ -60,9 +60,9 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
ArgName = InDI->getArgName(i-NumDefs);
}
- DefInit *Arg = dynamic_cast<DefInit*>(ArgInit);
+ DefInit *Arg = dyn_cast<DefInit>(ArgInit);
if (!Arg)
- throw "Illegal operand for the '" + R->getName() + "' instruction!";
+ PrintFatalError("Illegal operand for the '" + R->getName() + "' instruction!");
Record *Rec = Arg->getDef();
std::string PrintMethod = "printOperand";
@@ -80,11 +80,10 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
MIOpInfo = Rec->getValueAsDag("MIOperandInfo");
// Verify that MIOpInfo has an 'ops' root value.
- if (!dynamic_cast<DefInit*>(MIOpInfo->getOperator()) ||
- dynamic_cast<DefInit*>(MIOpInfo->getOperator())
- ->getDef()->getName() != "ops")
- throw "Bad value for MIOperandInfo in operand '" + Rec->getName() +
- "'\n";
+ if (!isa<DefInit>(MIOpInfo->getOperator()) ||
+ cast<DefInit>(MIOpInfo->getOperator())->getDef()->getName() != "ops")
+ PrintFatalError("Bad value for MIOperandInfo in operand '" + Rec->getName() +
+ "'\n");
// If we have MIOpInfo, then we have #operands equal to number of entries
// in MIOperandInfo.
@@ -101,17 +100,17 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
} else if (Rec->isSubClassOf("RegisterClass")) {
OperandType = "OPERAND_REGISTER";
} else if (!Rec->isSubClassOf("PointerLikeRegClass") &&
- Rec->getName() != "unknown")
- throw "Unknown operand class '" + Rec->getName() +
- "' in '" + R->getName() + "' instruction!";
+ !Rec->isSubClassOf("unknown_class"))
+ PrintFatalError("Unknown operand class '" + Rec->getName() +
+ "' in '" + R->getName() + "' instruction!");
// Check that the operand has a name and that it's unique.
if (ArgName.empty())
- throw "In instruction '" + R->getName() + "', operand #" + utostr(i) +
- " has no name!";
+ PrintFatalError("In instruction '" + R->getName() + "', operand #" + utostr(i) +
+ " has no name!");
if (!OperandNames.insert(ArgName).second)
- throw "In instruction '" + R->getName() + "', operand #" + utostr(i) +
- " has the same name as a previous operand!";
+ PrintFatalError("In instruction '" + R->getName() + "', operand #" + utostr(i) +
+ " has the same name as a previous operand!");
OperandList.push_back(OperandInfo(Rec, ArgName, PrintMethod, EncoderMethod,
OperandType, MIOperandNo, NumOps,
@@ -129,13 +128,13 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
/// getOperandNamed - Return the index of the operand with the specified
/// non-empty name. If the instruction does not have an operand with the
-/// specified name, throw an exception.
+/// specified name, abort.
///
unsigned CGIOperandList::getOperandNamed(StringRef Name) const {
unsigned OpIdx;
if (hasOperandNamed(Name, OpIdx)) return OpIdx;
- throw "'" + TheDef->getName() + "' does not have an operand named '$" +
- Name.str() + "'!";
+ PrintFatalError("'" + TheDef->getName() + "' does not have an operand named '$" +
+ Name.str() + "'!");
}
/// hasOperandNamed - Query whether the instruction has an operand of the
@@ -154,7 +153,7 @@ bool CGIOperandList::hasOperandNamed(StringRef Name, unsigned &OpIdx) const {
std::pair<unsigned,unsigned>
CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
if (Op.empty() || Op[0] != '$')
- throw TheDef->getName() + ": Illegal operand name: '" + Op + "'";
+ PrintFatalError(TheDef->getName() + ": Illegal operand name: '" + Op + "'");
std::string OpName = Op.substr(1);
std::string SubOpName;
@@ -164,7 +163,7 @@ CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
if (DotIdx != std::string::npos) {
SubOpName = OpName.substr(DotIdx+1);
if (SubOpName.empty())
- throw TheDef->getName() + ": illegal empty suboperand name in '" +Op +"'";
+ PrintFatalError(TheDef->getName() + ": illegal empty suboperand name in '" +Op +"'");
OpName = OpName.substr(0, DotIdx);
}
@@ -174,8 +173,8 @@ CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
// If one was needed, throw.
if (OperandList[OpIdx].MINumOperands > 1 && !AllowWholeOp &&
SubOpName.empty())
- throw TheDef->getName() + ": Illegal to refer to"
- " whole operand part of complex operand '" + Op + "'";
+ PrintFatalError(TheDef->getName() + ": Illegal to refer to"
+ " whole operand part of complex operand '" + Op + "'");
// Otherwise, return the operand.
return std::make_pair(OpIdx, 0U);
@@ -184,7 +183,7 @@ CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
// Find the suboperand number involved.
DagInit *MIOpInfo = OperandList[OpIdx].MIOperandInfo;
if (MIOpInfo == 0)
- throw TheDef->getName() + ": unknown suboperand name in '" + Op + "'";
+ PrintFatalError(TheDef->getName() + ": unknown suboperand name in '" + Op + "'");
// Find the operand with the right name.
for (unsigned i = 0, e = MIOpInfo->getNumArgs(); i != e; ++i)
@@ -192,7 +191,7 @@ CGIOperandList::ParseOperandName(const std::string &Op, bool AllowWholeOp) {
return std::make_pair(OpIdx, i);
// Otherwise, didn't find it!
- throw TheDef->getName() + ": unknown suboperand name in '" + Op + "'";
+ PrintFatalError(TheDef->getName() + ": unknown suboperand name in '" + Op + "'");
}
static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
@@ -204,13 +203,13 @@ static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
std::string Name = CStr.substr(wpos+1);
wpos = Name.find_first_not_of(" \t");
if (wpos == std::string::npos)
- throw "Illegal format for @earlyclobber constraint: '" + CStr + "'";
+ PrintFatalError("Illegal format for @earlyclobber constraint: '" + CStr + "'");
Name = Name.substr(wpos);
std::pair<unsigned,unsigned> Op = Ops.ParseOperandName(Name, false);
// Build the string for the operand
if (!Ops[Op.first].Constraints[Op.second].isNone())
- throw "Operand '" + Name + "' cannot have multiple constraints!";
+ PrintFatalError("Operand '" + Name + "' cannot have multiple constraints!");
Ops[Op.first].Constraints[Op.second] =
CGIOperandList::ConstraintInfo::getEarlyClobber();
return;
@@ -225,25 +224,27 @@ static void ParseConstraint(const std::string &CStr, CGIOperandList &Ops) {
// TIED_TO: $src1 = $dst
wpos = Name.find_first_of(" \t");
if (wpos == std::string::npos)
- throw "Illegal format for tied-to constraint: '" + CStr + "'";
+ PrintFatalError("Illegal format for tied-to constraint: '" + CStr + "'");
std::string DestOpName = Name.substr(0, wpos);
std::pair<unsigned,unsigned> DestOp = Ops.ParseOperandName(DestOpName, false);
Name = CStr.substr(pos+1);
wpos = Name.find_first_not_of(" \t");
if (wpos == std::string::npos)
- throw "Illegal format for tied-to constraint: '" + CStr + "'";
-
- std::pair<unsigned,unsigned> SrcOp =
- Ops.ParseOperandName(Name.substr(wpos), false);
- if (SrcOp > DestOp)
- throw "Illegal tied-to operand constraint '" + CStr + "'";
+ PrintFatalError("Illegal format for tied-to constraint: '" + CStr + "'");
+ std::string SrcOpName = Name.substr(wpos);
+ std::pair<unsigned,unsigned> SrcOp = Ops.ParseOperandName(SrcOpName, false);
+ if (SrcOp > DestOp) {
+ std::swap(SrcOp, DestOp);
+ std::swap(SrcOpName, DestOpName);
+ }
unsigned FlatOpNo = Ops.getFlattenedOperandNumber(SrcOp);
if (!Ops[DestOp.first].Constraints[DestOp.second].isNone())
- throw "Operand '" + DestOpName + "' cannot have multiple constraints!";
+ PrintFatalError("Operand '" + DestOpName +
+ "' cannot have multiple constraints!");
Ops[DestOp.first].Constraints[DestOp.second] =
CGIOperandList::ConstraintInfo::getTied(FlatOpNo);
}
@@ -287,7 +288,8 @@ void CGIOperandList::ProcessDisableEncoding(std::string DisableEncoding) {
// CodeGenInstruction Implementation
//===----------------------------------------------------------------------===//
-CodeGenInstruction::CodeGenInstruction(Record *R) : TheDef(R), Operands(R) {
+CodeGenInstruction::CodeGenInstruction(Record *R)
+ : TheDef(R), Operands(R), InferredFrom(0) {
Namespace = R->getValueAsString("Namespace");
AsmString = R->getValueAsString("AsmString");
@@ -301,8 +303,6 @@ CodeGenInstruction::CodeGenInstruction(Record *R) : TheDef(R), Operands(R) {
isBarrier = R->getValueAsBit("isBarrier");
isCall = R->getValueAsBit("isCall");
canFoldAsLoad = R->getValueAsBit("canFoldAsLoad");
- mayLoad = R->getValueAsBit("mayLoad");
- mayStore = R->getValueAsBit("mayStore");
isPredicable = Operands.isPredicable || R->getValueAsBit("isPredicable");
isConvertibleToThreeAddress = R->getValueAsBit("isConvertibleToThreeAddress");
isCommutable = R->getValueAsBit("isCommutable");
@@ -313,8 +313,13 @@ CodeGenInstruction::CodeGenInstruction(Record *R) : TheDef(R), Operands(R) {
hasPostISelHook = R->getValueAsBit("hasPostISelHook");
hasCtrlDep = R->getValueAsBit("hasCtrlDep");
isNotDuplicable = R->getValueAsBit("isNotDuplicable");
- hasSideEffects = R->getValueAsBit("hasSideEffects");
+
+ mayLoad = R->getValueAsBitOrUnset("mayLoad", mayLoad_Unset);
+ mayStore = R->getValueAsBitOrUnset("mayStore", mayStore_Unset);
+ hasSideEffects = R->getValueAsBitOrUnset("hasSideEffects",
+ hasSideEffects_Unset);
neverHasSideEffects = R->getValueAsBit("neverHasSideEffects");
+
isAsCheapAsAMove = R->getValueAsBit("isAsCheapAsAMove");
hasExtraSrcRegAllocReq = R->getValueAsBit("hasExtraSrcRegAllocReq");
hasExtraDefRegAllocReq = R->getValueAsBit("hasExtraDefRegAllocReq");
@@ -324,7 +329,7 @@ CodeGenInstruction::CodeGenInstruction(Record *R) : TheDef(R), Operands(R) {
ImplicitUses = R->getValueAsListOfDefs("Uses");
if (neverHasSideEffects + hasSideEffects > 1)
- throw R->getName() + ": multiple conflicting side-effect flags set!";
+ PrintFatalError(R->getName() + ": multiple conflicting side-effect flags set!");
// Parse Constraints.
ParseConstraints(R->getValueAsString("Constraints"), Operands);
@@ -409,16 +414,16 @@ FlattenAsmStringVariants(StringRef Cur, unsigned Variant) {
/// successful match, with ResOp set to the result operand to be used.
bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
Record *InstOpRec, bool hasSubOps,
- SMLoc Loc, CodeGenTarget &T,
+ ArrayRef<SMLoc> Loc, CodeGenTarget &T,
ResultOperand &ResOp) {
Init *Arg = Result->getArg(AliasOpNo);
- DefInit *ADI = dynamic_cast<DefInit*>(Arg);
+ DefInit *ADI = dyn_cast<DefInit>(Arg);
if (ADI && ADI->getDef() == InstOpRec) {
// If the operand is a record, it must have a name, and the record type
// must match up with the instruction's argument type.
if (Result->getArgName(AliasOpNo).empty())
- throw TGError(Loc, "result argument #" + utostr(AliasOpNo) +
+ PrintFatalError(Loc, "result argument #" + utostr(AliasOpNo) +
" must have a name!");
ResOp = ResultOperand(Result->getArgName(AliasOpNo), ADI->getDef());
return true;
@@ -442,7 +447,7 @@ bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
DagInit *DI = InstOpRec->getValueAsDag("MIOperandInfo");
// The operand info should only have a single (register) entry. We
// want the register class of it.
- InstOpRec = dynamic_cast<DefInit*>(DI->getArg(0))->getDef();
+ InstOpRec = cast<DefInit>(DI->getArg(0))->getDef();
}
if (InstOpRec->isSubClassOf("RegisterOperand"))
@@ -453,13 +458,13 @@ bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
if (!T.getRegisterClass(InstOpRec)
.contains(T.getRegBank().getReg(ADI->getDef())))
- throw TGError(Loc, "fixed register " + ADI->getDef()->getName() +
- " is not a member of the " + InstOpRec->getName() +
- " register class!");
+ PrintFatalError(Loc, "fixed register " + ADI->getDef()->getName() +
+ " is not a member of the " + InstOpRec->getName() +
+ " register class!");
if (!Result->getArgName(AliasOpNo).empty())
- throw TGError(Loc, "result fixed register argument must "
- "not have a name!");
+ PrintFatalError(Loc, "result fixed register argument must "
+ "not have a name!");
ResOp = ResultOperand(ADI->getDef());
return true;
@@ -482,13 +487,13 @@ bool CodeGenInstAlias::tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
}
// Literal integers.
- if (IntInit *II = dynamic_cast<IntInit*>(Arg)) {
+ if (IntInit *II = dyn_cast<IntInit>(Arg)) {
if (hasSubOps || !InstOpRec->isSubClassOf("Operand"))
return false;
// Integer arguments can't have names.
if (!Result->getArgName(AliasOpNo).empty())
- throw TGError(Loc, "result argument #" + utostr(AliasOpNo) +
- " must not have a name!");
+ PrintFatalError(Loc, "result argument #" + utostr(AliasOpNo) +
+ " must not have a name!");
ResOp = ResultOperand(II->getValue());
return true;
}
@@ -514,9 +519,10 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
Result = R->getValueAsDag("ResultInst");
// Verify that the root of the result is an instruction.
- DefInit *DI = dynamic_cast<DefInit*>(Result->getOperator());
+ DefInit *DI = dyn_cast<DefInit>(Result->getOperator());
if (DI == 0 || !DI->getDef()->isSubClassOf("Instruction"))
- throw TGError(R->getLoc(), "result of inst alias should be an instruction");
+ PrintFatalError(R->getLoc(),
+ "result of inst alias should be an instruction");
ResultInst = &T.getInstruction(DI->getDef());
@@ -524,7 +530,7 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
// the same class.
StringMap<Record*> NameClass;
for (unsigned i = 0, e = Result->getNumArgs(); i != e; ++i) {
- DefInit *ADI = dynamic_cast<DefInit*>(Result->getArg(i));
+ DefInit *ADI = dyn_cast<DefInit>(Result->getArg(i));
if (!ADI || Result->getArgName(i).empty())
continue;
// Verify we don't have something like: (someinst GR16:$foo, GR32:$foo)
@@ -532,9 +538,9 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
// same type.
Record *&Entry = NameClass[Result->getArgName(i)];
if (Entry && Entry != ADI->getDef())
- throw TGError(R->getLoc(), "result value $" + Result->getArgName(i) +
- " is both " + Entry->getName() + " and " +
- ADI->getDef()->getName() + "!");
+ PrintFatalError(R->getLoc(), "result value $" + Result->getArgName(i) +
+ " is both " + Entry->getName() + " and " +
+ ADI->getDef()->getName() + "!");
Entry = ADI->getDef();
}
@@ -550,7 +556,7 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
continue;
if (AliasOpNo >= Result->getNumArgs())
- throw TGError(R->getLoc(), "not enough arguments for instruction!");
+ PrintFatalError(R->getLoc(), "not enough arguments for instruction!");
Record *InstOpRec = ResultInst->Operands[i].Rec;
unsigned NumSubOps = ResultInst->Operands[i].MINumOperands;
@@ -571,7 +577,7 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
} else {
DagInit *MIOI = ResultInst->Operands[i].MIOperandInfo;
for (unsigned SubOp = 0; SubOp != NumSubOps; ++SubOp) {
- Record *SubRec = dynamic_cast<DefInit*>(MIOI->getArg(SubOp))->getDef();
+ Record *SubRec = cast<DefInit>(MIOI->getArg(SubOp))->getDef();
// Take care to instantiate each of the suboperands with the correct
// nomenclature: $foo.bar
@@ -591,26 +597,26 @@ CodeGenInstAlias::CodeGenInstAlias(Record *R, CodeGenTarget &T) : TheDef(R) {
DagInit *MIOI = ResultInst->Operands[i].MIOperandInfo;
for (unsigned SubOp = 0; SubOp != NumSubOps; ++SubOp) {
if (AliasOpNo >= Result->getNumArgs())
- throw TGError(R->getLoc(), "not enough arguments for instruction!");
- Record *SubRec = dynamic_cast<DefInit*>(MIOI->getArg(SubOp))->getDef();
+ PrintFatalError(R->getLoc(), "not enough arguments for instruction!");
+ Record *SubRec = cast<DefInit>(MIOI->getArg(SubOp))->getDef();
if (tryAliasOpMatch(Result, AliasOpNo, SubRec, false,
R->getLoc(), T, ResOp)) {
ResultOperands.push_back(ResOp);
ResultInstOperandIndex.push_back(std::make_pair(i, SubOp));
++AliasOpNo;
} else {
- throw TGError(R->getLoc(), "result argument #" + utostr(AliasOpNo) +
+ PrintFatalError(R->getLoc(), "result argument #" + utostr(AliasOpNo) +
" does not match instruction operand class " +
(SubOp == 0 ? InstOpRec->getName() :SubRec->getName()));
}
}
continue;
}
- throw TGError(R->getLoc(), "result argument #" + utostr(AliasOpNo) +
- " does not match instruction operand class " +
- InstOpRec->getName());
+ PrintFatalError(R->getLoc(), "result argument #" + utostr(AliasOpNo) +
+ " does not match instruction operand class " +
+ InstOpRec->getName());
}
if (AliasOpNo != Result->getNumArgs())
- throw TGError(R->getLoc(), "too many operands for instruction!");
+ PrintFatalError(R->getLoc(), "too many operands for instruction!");
}
diff --git a/utils/TableGen/CodeGenInstruction.h b/utils/TableGen/CodeGenInstruction.h
index 95b572d2d08c..55d44399dff9 100644
--- a/utils/TableGen/CodeGenInstruction.h
+++ b/utils/TableGen/CodeGenInstruction.h
@@ -152,7 +152,7 @@ namespace llvm {
/// getOperandNamed - Return the index of the operand with the specified
/// non-empty name. If the instruction does not have an operand with the
- /// specified name, throw an exception.
+ /// specified name, abort.
unsigned getOperandNamed(StringRef Name) const;
/// hasOperandNamed - Query whether the instruction has an operand of the
@@ -162,9 +162,8 @@ namespace llvm {
/// ParseOperandName - Parse an operand name like "$foo" or "$foo.bar",
/// where $foo is a whole operand and $foo.bar refers to a suboperand.
- /// This throws an exception if the name is invalid. If AllowWholeOp is
- /// true, references to operands with suboperands are allowed, otherwise
- /// not.
+ /// This aborts if the name is invalid. If AllowWholeOp is true, references
+ /// to operands with suboperands are allowed, otherwise not.
std::pair<unsigned,unsigned> ParseOperandName(const std::string &Op,
bool AllowWholeOp = true);
@@ -226,7 +225,10 @@ namespace llvm {
bool isBarrier;
bool isCall;
bool canFoldAsLoad;
- bool mayLoad, mayStore;
+ bool mayLoad;
+ bool mayLoad_Unset;
+ bool mayStore;
+ bool mayStore_Unset;
bool isPredicable;
bool isConvertibleToThreeAddress;
bool isCommutable;
@@ -238,6 +240,7 @@ namespace llvm {
bool hasCtrlDep;
bool isNotDuplicable;
bool hasSideEffects;
+ bool hasSideEffects_Unset;
bool neverHasSideEffects;
bool isAsCheapAsAMove;
bool hasExtraSrcRegAllocReq;
@@ -245,6 +248,14 @@ namespace llvm {
bool isCodeGenOnly;
bool isPseudo;
+ /// Are there any undefined flags?
+ bool hasUndefFlags() const {
+ return mayLoad_Unset || mayStore_Unset || hasSideEffects_Unset;
+ }
+
+ // The record used to infer instruction flags, or NULL if no flag values
+ // have been inferred.
+ Record *InferredFrom;
CodeGenInstruction(Record *R);
@@ -319,7 +330,7 @@ namespace llvm {
CodeGenInstAlias(Record *R, CodeGenTarget &T);
bool tryAliasOpMatch(DagInit *Result, unsigned AliasOpNo,
- Record *InstOpRec, bool hasSubOps, SMLoc Loc,
+ Record *InstOpRec, bool hasSubOps, ArrayRef<SMLoc> Loc,
CodeGenTarget &T, ResultOperand &ResOp);
};
}
diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp
new file mode 100644
index 000000000000..1653d67da97a
--- /dev/null
+++ b/utils/TableGen/CodeGenMapTable.cpp
@@ -0,0 +1,606 @@
+//===- CodeGenMapTable.cpp - Instruction Mapping Table Generator ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// CodeGenMapTable provides the functionality for TableGen to create relation
+// mappings between instructions. Relation models are defined using
+// InstrMapping as a base class. This file implements the functionality which
+// parses these definitions and generates relation maps using the information
+// specified there. These maps are emitted as tables in the XXXGenInstrInfo.inc
+// file along with the functions to query them.
+//
+// A relationship model to relate non-predicate instructions with their
+// predicated true/false forms can be defined as follows:
+//
+// def getPredOpcode : InstrMapping {
+// let FilterClass = "PredRel";
+// let RowFields = ["BaseOpcode"];
+// let ColFields = ["PredSense"];
+// let KeyCol = ["none"];
+// let ValueCols = [["true"], ["false"]]; }
+//
+// CodeGenMapTable parses this map and generates a table in XXXGenInstrInfo.inc
+// file that contains the instructions modeling this relationship. This table
+// is defined in the function
+// "int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense)"
+// that can be used to retrieve the predicated form of the instruction by
+// passing its opcode value and the predicate sense (true/false) of the desired
+// instruction as arguments.
+//
+// Short description of the algorithm:
+//
+// 1) Iterate through all the records that derive from "InstrMapping" class.
+// 2) For each record, filter out instructions based on the FilterClass value.
+// 3) Iterate through this set of instructions and insert them into
+// RowInstrMap map based on their RowFields values. RowInstrMap is keyed by the
+// vector of RowFields values and contains vectors of Records (instructions) as
+// values. RowFields is a list of fields that are required to have the same
+// values for all the instructions appearing in the same row of the relation
+// table. All the instructions in a given row of the relation table have some
+// sort of relationship with the key instruction defined by the corresponding
+// relationship model.
+//
+// Ex: RowInstrMap(RowVal1, RowVal2, ...) -> [Instr1, Instr2, Instr3, ... ]
+// Here Instr1, Instr2, Instr3 have same values (RowVal1, RowVal2) for
+// RowFields. These groups of instructions are later matched against ValueCols
+// to determine the column they belong to, if any.
+//
+// While building the RowInstrMap map, collect all the key instructions in
+// KeyInstrVec. These are the instructions having the same values as KeyCol
+// for all the fields listed in ColFields.
+//
+// For Example:
+//
+// Relate non-predicate instructions with their predicated true/false forms.
+//
+// def getPredOpcode : InstrMapping {
+// let FilterClass = "PredRel";
+// let RowFields = ["BaseOpcode"];
+// let ColFields = ["PredSense"];
+// let KeyCol = ["none"];
+// let ValueCols = [["true"], ["false"]]; }
+//
+// Here, only instructions that have "none" as PredSense will be selected as key
+// instructions.
+//
+// 4) For each key instruction, get the group of instructions that share the
+// same key-value as the key instruction from RowInstrMap. Iterate over the list
+// of columns in ValueCols (it is defined as a list<list<string> >. Therefore,
+// it can specify multi-column relationships). For each column, find the
+// instruction from the group that matches all the values for the column.
+// Multiple matches are not allowed.
+//
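+// As a rough sketch (the opcode names are hypothetical), the getPredOpcode map
+// above would be emitted into XXXGenInstrInfo.inc along these lines:
+//
+//   static const uint16_t getPredOpcodeTable[][3] = {
+//     { XXX::ADD, XXX::ADD_t, XXX::ADD_f },
+//     { XXX::SUB, XXX::SUB_t, XXX::SUB_f },
+//   };
+//
+//   int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense);
+//
+// The emitted function does a binary search on the first (key) column for
+// Opcode and returns the entry from the column selected by inPredSense, or -1
+// if the instruction has no relation of this kind.
+//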
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenTarget.h"
+#include "llvm/Support/Format.h"
+#include "llvm/TableGen/Error.h"
+using namespace llvm;
+typedef std::map<std::string, std::vector<Record*> > InstrRelMapTy;
+
+typedef std::map<std::vector<Init*>, std::vector<Record*> > RowInstrMapTy;
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// This class is used to represent InstrMapping class defined in Target.td file.
+class InstrMap {
+private:
+ std::string Name;
+ std::string FilterClass;
+ ListInit *RowFields;
+ ListInit *ColFields;
+ ListInit *KeyCol;
+ std::vector<ListInit*> ValueCols;
+
+public:
+ InstrMap(Record* MapRec) {
+ Name = MapRec->getName();
+
+ // FilterClass - It's used to reduce the search space only to the
+ // instructions that define the kind of relationship modeled by
+ // this InstrMapping object/record.
+ const RecordVal *Filter = MapRec->getValue("FilterClass");
+ FilterClass = Filter->getValue()->getAsUnquotedString();
+
+ // List of fields/attributes that need to be same across all the
+ // instructions in a row of the relation table.
+ RowFields = MapRec->getValueAsListInit("RowFields");
+
+ // List of fields/attributes that are constant across all the instructions
+ // in a column of the relation table. Ex: ColFields = 'predSense'
+ ColFields = MapRec->getValueAsListInit("ColFields");
+
+ // Values for the fields/attributes listed in 'ColFields'.
+ // Ex: KeyCol = 'noPred' -- key instruction is non predicated
+ KeyCol = MapRec->getValueAsListInit("KeyCol");
+
+ // List of values for the fields/attributes listed in 'ColFields', one for
+ // each column in the relation table.
+ //
+ // Ex: ValueCols = [['true'],['false']] -- it results in two columns in the
+ // table. The first column requires all the instructions to have predSense
+ // set to 'true' and the second column requires it to be 'false'.
+ ListInit *ColValList = MapRec->getValueAsListInit("ValueCols");
+
+ // Each instruction map must specify at least one column for it to be valid.
+ if (ColValList->getSize() == 0)
+ PrintFatalError(MapRec->getLoc(), "InstrMapping record `" +
+ MapRec->getName() + "' has empty `ValueCols' field!");
+
+ for (unsigned i = 0, e = ColValList->getSize(); i < e; i++) {
+ ListInit *ColI = dyn_cast<ListInit>(ColValList->getElement(i));
+
+ // Make sure that all the sub-lists in 'ValueCols' have same number of
+ // elements as the fields in 'ColFields'.
+ if (ColI->getSize() != ColFields->getSize())
+ PrintFatalError(MapRec->getLoc(), "Record `" + MapRec->getName() +
+ "', field `ValueCols' entries don't match with " +
+ "the entries in 'ColFields'!");
+ ValueCols.push_back(ColI);
+ }
+ }
+
+ std::string getName() const {
+ return Name;
+ }
+
+ std::string getFilterClass() {
+ return FilterClass;
+ }
+
+ ListInit *getRowFields() const {
+ return RowFields;
+ }
+
+ ListInit *getColFields() const {
+ return ColFields;
+ }
+
+ ListInit *getKeyCol() const {
+ return KeyCol;
+ }
+
+ const std::vector<ListInit*> &getValueCols() const {
+ return ValueCols;
+ }
+};
+} // End anonymous namespace.
+
+
+//===----------------------------------------------------------------------===//
+// class MapTableEmitter : It builds the instruction relation maps using
+// the information provided in InstrMapping records. It outputs these
+// relationship maps as tables into XXXGenInstrInfo.inc file along with the
+// functions to query them.
+
+namespace {
+class MapTableEmitter {
+private:
+// std::string TargetName;
+ const CodeGenTarget &Target;
+ // InstrMapDesc - InstrMapping record to be processed.
+ InstrMap InstrMapDesc;
+
+ // InstrDefs - list of instructions filtered using FilterClass defined
+ // in InstrMapDesc.
+ std::vector<Record*> InstrDefs;
+
+ // RowInstrMap - maps RowFields values to the instructions. It's keyed by the
+ // values of the row fields and contains vectors of records as values.
+ RowInstrMapTy RowInstrMap;
+
+ // KeyInstrVec - list of key instructions.
+ std::vector<Record*> KeyInstrVec;
+ DenseMap<Record*, std::vector<Record*> > MapTable;
+
+public:
+ MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec):
+ Target(Target), InstrMapDesc(IMRec) {
+ const std::string FilterClass = InstrMapDesc.getFilterClass();
+ InstrDefs = Records.getAllDerivedDefinitions(FilterClass);
+ }
+
+ void buildRowInstrMap();
+
+ // Returns true if an instruction is a key instruction, i.e., its ColFields
+ // have the same values as KeyCol.
+ bool isKeyColInstr(Record* CurInstr);
+
+ // Find column instruction corresponding to a key instruction based on the
+ // constraints for that column.
+ Record *getInstrForColumn(Record *KeyInstr, ListInit *CurValueCol);
+
+ // Find column instructions for each key instruction based
+ // on ValueCols and store them into MapTable.
+ void buildMapTable();
+
+ void emitBinSearch(raw_ostream &OS, unsigned TableSize);
+ void emitTablesWithFunc(raw_ostream &OS);
+ unsigned emitBinSearchTable(raw_ostream &OS);
+
+ // Lookup functions to query binary search tables.
+ void emitMapFuncBody(raw_ostream &OS, unsigned TableSize);
+
+};
+} // End anonymous namespace.
+
+
+//===----------------------------------------------------------------------===//
+// Process all the instructions that model this relation (already present in
+// InstrDefs) and insert them into RowInstrMap which is keyed by the values of
+// the fields listed as RowFields. It stores vectors of records as values.
+// All the related instructions have the same values for the RowFields thus are
+// part of the same key-value pair.
+//===----------------------------------------------------------------------===//
+
+void MapTableEmitter::buildRowInstrMap() {
+ for (unsigned i = 0, e = InstrDefs.size(); i < e; i++) {
+ std::vector<Record*> InstrList;
+ Record *CurInstr = InstrDefs[i];
+ std::vector<Init*> KeyValue;
+ ListInit *RowFields = InstrMapDesc.getRowFields();
+ for (unsigned j = 0, endRF = RowFields->getSize(); j < endRF; j++) {
+ Init *RowFieldsJ = RowFields->getElement(j);
+ Init *CurInstrVal = CurInstr->getValue(RowFieldsJ)->getValue();
+ KeyValue.push_back(CurInstrVal);
+ }
+
+ // Collect key instructions into KeyInstrVec. Later, these instructions are
+ // processed to assign column position to the instructions sharing
+ // their KeyValue in RowInstrMap.
+ if (isKeyColInstr(CurInstr))
+ KeyInstrVec.push_back(CurInstr);
+
+ RowInstrMap[KeyValue].push_back(CurInstr);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Return true if an instruction is a KeyCol instruction.
+//===----------------------------------------------------------------------===//
+
+bool MapTableEmitter::isKeyColInstr(Record* CurInstr) {
+ ListInit *ColFields = InstrMapDesc.getColFields();
+ ListInit *KeyCol = InstrMapDesc.getKeyCol();
+
+ // Check if the instruction is a KeyCol instruction.
+ bool MatchFound = true;
+ for (unsigned j = 0, endCF = ColFields->getSize();
+ (j < endCF) && MatchFound; j++) {
+ RecordVal *ColFieldName = CurInstr->getValue(ColFields->getElement(j));
+ std::string CurInstrVal = ColFieldName->getValue()->getAsUnquotedString();
+ std::string KeyColValue = KeyCol->getElement(j)->getAsUnquotedString();
+ MatchFound = (CurInstrVal == KeyColValue);
+ }
+ return MatchFound;
+}
+
+//===----------------------------------------------------------------------===//
+// Build a map to link key instructions with the column instructions arranged
+// according to their column positions.
+//===----------------------------------------------------------------------===//
+
+void MapTableEmitter::buildMapTable() {
+ // Find column instructions for a given key based on the ColField
+ // constraints.
+ const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+ unsigned NumOfCols = ValueCols.size();
+ for (unsigned j = 0, endKI = KeyInstrVec.size(); j < endKI; j++) {
+ Record *CurKeyInstr = KeyInstrVec[j];
+ std::vector<Record*> ColInstrVec(NumOfCols);
+
+ // Find the column instruction based on the constraints for the column.
+ for (unsigned ColIdx = 0; ColIdx < NumOfCols; ColIdx++) {
+ ListInit *CurValueCol = ValueCols[ColIdx];
+ Record *ColInstr = getInstrForColumn(CurKeyInstr, CurValueCol);
+ ColInstrVec[ColIdx] = ColInstr;
+ }
+ MapTable[CurKeyInstr] = ColInstrVec;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Find column instruction based on the constraints for that column.
+//===----------------------------------------------------------------------===//
+
+Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr,
+ ListInit *CurValueCol) {
+ ListInit *RowFields = InstrMapDesc.getRowFields();
+ std::vector<Init*> KeyValue;
+
+ // Construct KeyValue using KeyInstr's values for RowFields.
+ for (unsigned j = 0, endRF = RowFields->getSize(); j < endRF; j++) {
+ Init *RowFieldsJ = RowFields->getElement(j);
+ Init *KeyInstrVal = KeyInstr->getValue(RowFieldsJ)->getValue();
+ KeyValue.push_back(KeyInstrVal);
+ }
+
+ // Get all the instructions that share the same KeyValue as the KeyInstr
+ // in RowInstrMap. We search through these instructions to find a match
+ // for the current column, i.e., the instruction which has the same values
+ // as CurValueCol for all the fields in ColFields.
+ const std::vector<Record*> &RelatedInstrVec = RowInstrMap[KeyValue];
+
+ ListInit *ColFields = InstrMapDesc.getColFields();
+ Record *MatchInstr = NULL;
+
+ for (unsigned i = 0, e = RelatedInstrVec.size(); i < e; i++) {
+ bool MatchFound = true;
+ Record *CurInstr = RelatedInstrVec[i];
+ for (unsigned j = 0, endCF = ColFields->getSize();
+ (j < endCF) && MatchFound; j++) {
+ Init *ColFieldJ = ColFields->getElement(j);
+ Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue();
+ std::string CurInstrVal = CurInstrInit->getAsUnquotedString();
+ Init *ColFieldJValue = CurValueCol->getElement(j);
+ MatchFound = (CurInstrVal == ColFieldJValue->getAsUnquotedString());
+ }
+
+ if (MatchFound) {
+ if (MatchInstr) // Already had a match
+ // Error if multiple matches are found for a column.
+ PrintFatalError("Multiple matches found for `" + KeyInstr->getName() +
+ "', for the relation `" + InstrMapDesc.getName() + "'");
+ MatchInstr = CurInstr;
+ }
+ }
+ return MatchInstr;
+}
+
+//===----------------------------------------------------------------------===//
+// Emit one table per relation. Only instructions with a valid relation of a
+// given type are included in the table sorted by their enum values (opcodes).
+// Binary search is used for locating instructions in the table.
+//===----------------------------------------------------------------------===//
+
+unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
+
+ const std::vector<const CodeGenInstruction*> &NumberedInstructions =
+ Target.getInstructionsByEnumValue();
+ std::string TargetName = Target.getName();
+ const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+ unsigned NumCol = ValueCols.size();
+ unsigned TotalNumInstr = NumberedInstructions.size();
+ unsigned TableSize = 0;
+
+ OS << "static const uint16_t "<<InstrMapDesc.getName();
+ // The number of columns in the table is NumCol+1 because the key
+ // instructions are emitted as the first column.
+ OS << "Table[]["<< NumCol+1 << "] = {\n";
+ for (unsigned i = 0; i < TotalNumInstr; i++) {
+ Record *CurInstr = NumberedInstructions[i]->TheDef;
+ std::vector<Record*> ColInstrs = MapTable[CurInstr];
+ std::string OutStr("");
+ unsigned RelExists = 0;
+ if (ColInstrs.size()) {
+ for (unsigned j = 0; j < NumCol; j++) {
+ if (ColInstrs[j] != NULL) {
+ RelExists = 1;
+ OutStr += ", ";
+ OutStr += TargetName;
+ OutStr += "::";
+ OutStr += ColInstrs[j]->getName();
+ } else { OutStr += ", -1";}
+ }
+
+ if (RelExists) {
+ OS << " { " << TargetName << "::" << CurInstr->getName();
+ OS << OutStr <<" },\n";
+ TableSize++;
+ }
+ }
+ }
+ if (!TableSize) {
+ OS << " { " << TargetName << "::" << "INSTRUCTION_LIST_END, ";
+ OS << TargetName << "::" << "INSTRUCTION_LIST_END }";
+ }
+ OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n";
+ return TableSize;
+}
+
+//===----------------------------------------------------------------------===//
+// Emit binary search algorithm as part of the functions used to query
+// relation tables.
+//===----------------------------------------------------------------------===//
+
+void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) {
+ OS << " unsigned mid;\n";
+ OS << " unsigned start = 0;\n";
+ OS << " unsigned end = " << TableSize << ";\n";
+ OS << " while (start < end) {\n";
+ OS << " mid = start + (end - start)/2;\n";
+ OS << " if (Opcode == " << InstrMapDesc.getName() << "Table[mid][0]) {\n";
+ OS << " break;\n";
+ OS << " }\n";
+ OS << " if (Opcode < " << InstrMapDesc.getName() << "Table[mid][0])\n";
+ OS << " end = mid;\n";
+ OS << " else\n";
+ OS << " start = mid + 1;\n";
+ OS << " }\n";
+ OS << " if (start == end)\n";
+ OS << " return -1; // Instruction doesn't exist in this table.\n\n";
+}
+
+//===----------------------------------------------------------------------===//
+// Emit functions to query relation tables.
+//===----------------------------------------------------------------------===//
+
+void MapTableEmitter::emitMapFuncBody(raw_ostream &OS,
+ unsigned TableSize) {
+
+ ListInit *ColFields = InstrMapDesc.getColFields();
+ const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+
+ // Emit binary search algorithm to locate instructions in the
+ // relation table. If found, return opcode value from the appropriate column
+ // of the table.
+ emitBinSearch(OS, TableSize);
+
+ if (ValueCols.size() > 1) {
+ for (unsigned i = 0, e = ValueCols.size(); i < e; i++) {
+ ListInit *ColumnI = ValueCols[i];
+ for (unsigned j = 0, ColSize = ColumnI->getSize(); j < ColSize; j++) {
+ std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
+ OS << " if (in" << ColName;
+ OS << " == ";
+ OS << ColName << "_" << ColumnI->getElement(j)->getAsUnquotedString();
+ if (j < ColumnI->getSize() - 1) OS << " && ";
+ else OS << ")\n";
+ }
+ OS << " return " << InstrMapDesc.getName();
+ OS << "Table[mid]["<<i+1<<"];\n";
+ }
+ OS << " return -1;";
+ }
+ else
+ OS << " return " << InstrMapDesc.getName() << "Table[mid][1];\n";
+
+ OS <<"}\n\n";
+}
+
+//===----------------------------------------------------------------------===//
+// Emit relation tables and the functions to query them.
+//===----------------------------------------------------------------------===//
+
+void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) {
+
+ // Emit the function name and the input parameters: normally just the opcode
+ // value of the current instruction. However, if the table has more than one
+ // value column (the first column is reserved for the key instructions), we
+ // also pass inputs that select the desired column.
+
+ ListInit *ColFields = InstrMapDesc.getColFields();
+ const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
+ OS << "// "<< InstrMapDesc.getName() << "\n";
+ OS << "int "<< InstrMapDesc.getName() << "(uint16_t Opcode";
+ if (ValueCols.size() > 1) {
+ for (unsigned i = 0, e = ColFields->getSize(); i < e; i++) {
+ std::string ColName = ColFields->getElement(i)->getAsUnquotedString();
+ OS << ", enum " << ColName << " in" << ColName;
+ }
+ }
+ OS << ") {\n";
+
+ // Emit map table.
+ unsigned TableSize = emitBinSearchTable(OS);
+
+ // Emit rest of the function body.
+ emitMapFuncBody(OS, TableSize);
+}
+
+//===----------------------------------------------------------------------===//
+// Emit enums for the column fields across all the instruction maps.
+//===----------------------------------------------------------------------===//
+
+static void emitEnums(raw_ostream &OS, RecordKeeper &Records) {
+
+ std::vector<Record*> InstrMapVec;
+ InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
+ std::map<std::string, std::vector<Init*> > ColFieldValueMap;
+
+ // Iterate over all InstrMapping records and create a map between column
+ // fields and their possible values across all records.
+ for (unsigned i = 0, e = InstrMapVec.size(); i < e; i++) {
+ Record *CurMap = InstrMapVec[i];
+ ListInit *ColFields;
+ ColFields = CurMap->getValueAsListInit("ColFields");
+ ListInit *List = CurMap->getValueAsListInit("ValueCols");
+ std::vector<ListInit*> ValueCols;
+ unsigned ListSize = List->getSize();
+
+ for (unsigned j = 0; j < ListSize; j++) {
+ ListInit *ListJ = dyn_cast<ListInit>(List->getElement(j));
+
+ if (ListJ->getSize() != ColFields->getSize())
+ PrintFatalError("Record `" + CurMap->getName() + "', field "
+ "`ValueCols' entries don't match with the entries in 'ColFields'!");
+ ValueCols.push_back(ListJ);
+ }
+
+ for (unsigned j = 0, endCF = ColFields->getSize(); j < endCF; j++) {
+ for (unsigned k = 0; k < ListSize; k++){
+ std::string ColName = ColFields->getElement(j)->getAsUnquotedString();
+ ColFieldValueMap[ColName].push_back((ValueCols[k])->getElement(j));
+ }
+ }
+ }
+
+ for (std::map<std::string, std::vector<Init*> >::iterator
+ II = ColFieldValueMap.begin(), IE = ColFieldValueMap.end();
+ II != IE; II++) {
+ std::vector<Init*> FieldValues = (*II).second;
+
+ // Delete duplicate entries from ColFieldValueMap
+ for (unsigned i = 0; i < FieldValues.size() - 1; i++) {
+ Init *CurVal = FieldValues[i];
+ for (unsigned j = i+1; j < FieldValues.size(); j++) {
+ if (CurVal == FieldValues[j]) {
+ FieldValues.erase(FieldValues.begin()+j);
+ --j;
+ }
+ }
+ }
+
+ // Emit enumerated values for the column fields.
+ OS << "enum " << (*II).first << " {\n";
+ for (unsigned i = 0, e = FieldValues.size(); i < e; i++) {
+ OS << "\t" << (*II).first << "_" << FieldValues[i]->getAsUnquotedString();
+ if (i != FieldValues.size() - 1)
+ OS << ",\n";
+ else
+ OS << "\n};\n\n";
+ }
+ }
+}
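+
+// For the getPredOpcode example, with ColFields = ["PredSense"] and
+// ValueCols = [["true"], ["false"]], the emitted enum would look roughly like:
+//
+//   enum PredSense {
+//     PredSense_true,
+//     PredSense_false
+//   };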
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// Parse 'InstrMapping' records and use the information to form relationships
+// between instructions. These relations are emitted as tables along with the
+// functions to query them.
+//===----------------------------------------------------------------------===//
+void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
+ CodeGenTarget Target(Records);
+ std::string TargetName = Target.getName();
+ std::vector<Record*> InstrMapVec;
+ InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
+
+ if (!InstrMapVec.size())
+ return;
+
+ OS << "#ifdef GET_INSTRMAP_INFO\n";
+ OS << "#undef GET_INSTRMAP_INFO\n";
+ OS << "namespace llvm {\n\n";
+ OS << "namespace " << TargetName << " {\n\n";
+
+ // Emit column field names and their values as enums.
+ emitEnums(OS, Records);
+
+ // Iterate over all instruction mapping records and construct relationship
+ // maps based on the information specified there.
+ //
+ for (unsigned i = 0, e = InstrMapVec.size(); i < e; i++) {
+ MapTableEmitter IMap(Target, Records, InstrMapVec[i]);
+
+ // Build RowInstrMap to group instructions based on their values for
+ // RowFields. In the process, also collect key instructions into
+ // KeyInstrVec.
+ IMap.buildRowInstrMap();
+
+ // Build MapTable to map key instructions with the corresponding column
+ // instructions.
+ IMap.buildMapTable();
+
+ // Emit map tables and the functions to query them.
+ IMap.emitTablesWithFunc(OS);
+ }
+ OS << "} // End " << TargetName << " namespace\n";
+ OS << "} // End llvm namespace\n";
+ OS << "#endif // GET_INSTRMAP_INFO\n\n";
+}
+
+} // End llvm namespace
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 011f4b79386f..580e319f24ec 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -28,7 +28,7 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum)
- : TheDef(R), EnumValue(Enum) {
+ : TheDef(R), EnumValue(Enum), LaneMask(0) {
Name = R->getName();
if (R->getValue("Namespace"))
Namespace = R->getValueAsString("Namespace");
@@ -36,7 +36,7 @@ CodeGenSubRegIndex::CodeGenSubRegIndex(Record *R, unsigned Enum)
CodeGenSubRegIndex::CodeGenSubRegIndex(StringRef N, StringRef Nspace,
unsigned Enum)
- : TheDef(0), Name(N), Namespace(Nspace), EnumValue(Enum) {
+ : TheDef(0), Name(N), Namespace(Nspace), EnumValue(Enum), LaneMask(0) {
}
std::string CodeGenSubRegIndex::getQualifiedName() const {
@@ -54,19 +54,20 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
std::vector<Record*> Comps = TheDef->getValueAsListOfDefs("ComposedOf");
if (!Comps.empty()) {
if (Comps.size() != 2)
- throw TGError(TheDef->getLoc(), "ComposedOf must have exactly two entries");
+ PrintFatalError(TheDef->getLoc(),
+ "ComposedOf must have exactly two entries");
CodeGenSubRegIndex *A = RegBank.getSubRegIdx(Comps[0]);
CodeGenSubRegIndex *B = RegBank.getSubRegIdx(Comps[1]);
CodeGenSubRegIndex *X = A->addComposite(B, this);
if (X)
- throw TGError(TheDef->getLoc(), "Ambiguous ComposedOf entries");
+ PrintFatalError(TheDef->getLoc(), "Ambiguous ComposedOf entries");
}
std::vector<Record*> Parts =
TheDef->getValueAsListOfDefs("CoveringSubRegIndices");
if (!Parts.empty()) {
if (Parts.size() < 2)
- throw TGError(TheDef->getLoc(),
+ PrintFatalError(TheDef->getLoc(),
"CoveredBySubRegs must have two or more entries");
SmallVector<CodeGenSubRegIndex*, 8> IdxParts;
for (unsigned i = 0, e = Parts.size(); i != e; ++i)
@@ -75,14 +76,21 @@ void CodeGenSubRegIndex::updateComponents(CodeGenRegBank &RegBank) {
}
}
-void CodeGenSubRegIndex::cleanComposites() {
- // Clean out redundant mappings of the form this+X -> X.
- for (CompMap::iterator i = Composed.begin(), e = Composed.end(); i != e;) {
- CompMap::iterator j = i;
- ++i;
- if (j->first == j->second)
- Composed.erase(j);
- }
+unsigned CodeGenSubRegIndex::computeLaneMask() {
+ // Already computed?
+ if (LaneMask)
+ return LaneMask;
+
+ // Recursion guard, shouldn't be required.
+ LaneMask = ~0u;
+
+ // The lane mask is simply the union of all sub-indices.
+ unsigned M = 0;
+ for (CompMap::iterator I = Composed.begin(), E = Composed.end(); I != E; ++I)
+ M |= I->second->computeLaneMask();
+ assert(M && "Missing lane mask, sub-register cycle?");
+ LaneMask = M;
+ return LaneMask;
}
//===----------------------------------------------------------------------===//
@@ -105,8 +113,8 @@ void CodeGenRegister::buildObjectGraph(CodeGenRegBank &RegBank) {
std::vector<Record*> SRs = TheDef->getValueAsListOfDefs("SubRegs");
if (SRIs.size() != SRs.size())
- throw TGError(TheDef->getLoc(),
- "SubRegs and SubRegIndices must have the same size");
+ PrintFatalError(TheDef->getLoc(),
+ "SubRegs and SubRegIndices must have the same size");
for (unsigned i = 0, e = SRIs.size(); i != e; ++i) {
ExplicitSubRegIndices.push_back(RegBank.getSubRegIdx(SRIs[i]));
@@ -217,8 +225,8 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
CodeGenRegister *SR = ExplicitSubRegs[i];
CodeGenSubRegIndex *Idx = ExplicitSubRegIndices[i];
if (!SubRegs.insert(std::make_pair(Idx, SR)).second)
- throw TGError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() +
- " appears twice in Register " + getName());
+ PrintFatalError(TheDef->getLoc(), "SubRegIndex " + Idx->getName() +
+ " appears twice in Register " + getName());
// Map explicit sub-registers first, so the names take precedence.
// The inherited sub-registers are mapped below.
SubReg2Idx.insert(std::make_pair(SR, Idx));
@@ -298,11 +306,11 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
for (SubRegMap::const_iterator SI = SubRegs.begin(), SE = SubRegs.end();
SI != SE; ++SI) {
if (SI->second == this) {
- SMLoc Loc;
+ ArrayRef<SMLoc> Loc;
if (TheDef)
Loc = TheDef->getLoc();
- throw TGError(Loc, "Register " + getName() +
- " has itself as a sub-register");
+ PrintFatalError(Loc, "Register " + getName() +
+ " has itself as a sub-register");
}
// Ensure that every sub-register has a unique name.
DenseMap<const CodeGenRegister*, CodeGenSubRegIndex*>::iterator Ins =
@@ -310,10 +318,10 @@ CodeGenRegister::computeSubRegs(CodeGenRegBank &RegBank) {
if (Ins->second == SI->first)
continue;
// Trouble: Two different names for SI->second.
- SMLoc Loc;
+ ArrayRef<SMLoc> Loc;
if (TheDef)
Loc = TheDef->getLoc();
- throw TGError(Loc, "Sub-register can't have two names: " +
+ PrintFatalError(Loc, "Sub-register can't have two names: " +
SI->second->getName() + " available as " +
SI->first->getName() + " and " + Ins->second->getName());
}
@@ -460,8 +468,8 @@ void CodeGenRegister::computeSecondarySubRegs(CodeGenRegBank &RegBank) {
SE = NewSubReg->SubRegs.end(); SI != SE; ++SI) {
CodeGenSubRegIndex *SubIdx = getSubRegIndex(SI->second);
if (!SubIdx)
- throw TGError(TheDef->getLoc(), "No SubRegIndex for " +
- SI->second->getName() + " in " + getName());
+ PrintFatalError(TheDef->getLoc(), "No SubRegIndex for " +
+ SI->second->getName() + " in " + getName());
NewIdx->addComposite(SI->first, SubIdx);
}
}
@@ -585,15 +593,16 @@ struct TupleExpander : SetTheory::Expander {
unsigned Dim = Indices.size();
ListInit *SubRegs = Def->getValueAsListInit("SubRegs");
if (Dim != SubRegs->getSize())
- throw TGError(Def->getLoc(), "SubRegIndices and SubRegs size mismatch");
+ PrintFatalError(Def->getLoc(), "SubRegIndices and SubRegs size mismatch");
if (Dim < 2)
- throw TGError(Def->getLoc(), "Tuples must have at least 2 sub-registers");
+ PrintFatalError(Def->getLoc(),
+ "Tuples must have at least 2 sub-registers");
// Evaluate the sub-register lists to be zipped.
unsigned Length = ~0u;
SmallVector<SetTheory::RecSet, 4> Lists(Dim);
for (unsigned i = 0; i != Dim; ++i) {
- ST.evaluate(SubRegs->getElement(i), Lists[i]);
+ ST.evaluate(SubRegs->getElement(i), Lists[i], Def->getLoc());
Length = std::min(Length, unsigned(Lists[i].size()));
}
@@ -699,8 +708,8 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
for (unsigned i = 0, e = TypeList.size(); i != e; ++i) {
Record *Type = TypeList[i];
if (!Type->isSubClassOf("ValueType"))
- throw "RegTypes list member '" + Type->getName() +
- "' does not derive from the ValueType class!";
+ PrintFatalError("RegTypes list member '" + Type->getName() +
+ "' does not derive from the ValueType class!");
VTs.push_back(getValueType(Type));
}
assert(!VTs.empty() && "RegisterClass must contain at least one ValueType!");
@@ -721,14 +730,14 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R)
// Alternative allocation orders may be subsets.
SetTheory::RecSet Order;
for (unsigned i = 0, e = AltOrders->size(); i != e; ++i) {
- RegBank.getSets().evaluate(AltOrders->getElement(i), Order);
+ RegBank.getSets().evaluate(AltOrders->getElement(i), Order, R->getLoc());
Orders[1 + i].append(Order.begin(), Order.end());
// Verify that all altorder members are regclass members.
while (!Order.empty()) {
CodeGenRegister *Reg = RegBank.getReg(Order.back());
Order.pop_back();
if (!contains(Reg))
- throw TGError(R->getLoc(), " AltOrder register " + Reg->getName() +
+ PrintFatalError(R->getLoc(), " AltOrder register " + Reg->getName() +
" is not a class member");
}
}
@@ -986,6 +995,12 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) {
for (unsigned i = 0, e = Registers.size(); i != e; ++i)
Registers[i]->buildObjectGraph(*this);
+ // Compute register name map.
+ for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+ RegistersByName.GetOrCreateValue(
+ Registers[i]->TheDef->getValueAsString("AsmName"),
+ Registers[i]);
+
// Precompute all sub-register maps.
// This will create Composite entries for all inferred sub-register indices.
for (unsigned i = 0, e = Registers.size(); i != e; ++i)
@@ -1008,7 +1023,7 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) {
// Read in register class definitions.
std::vector<Record*> RCs = Records.getAllDerivedDefinitions("RegisterClass");
if (RCs.empty())
- throw std::string("No 'RegisterClass' subclasses defined!");
+ PrintFatalError(std::string("No 'RegisterClass' subclasses defined!"));
// Allocate user-defined register classes.
RegClasses.reserve(RCs.size());
@@ -1085,7 +1100,7 @@ CodeGenRegisterClass *CodeGenRegBank::getRegClass(Record *Def) {
if (CodeGenRegisterClass *RC = Def2RC[Def])
return RC;
- throw TGError(Def->getLoc(), "Not a known RegisterClass!");
+ PrintFatalError(Def->getLoc(), "Not a known RegisterClass!");
}
CodeGenSubRegIndex*
@@ -1164,11 +1179,35 @@ void CodeGenRegBank::computeComposites() {
}
}
}
+}
+
+// Compute lane masks. This is similar to register units, but at the
+// sub-register index level. Each bit in the lane mask is like a register unit
+// class, and two lane masks will have a bit in common if two sub-register
+// indices overlap in some register.
+//
+// Conservatively share a lane mask bit if two sub-register indices overlap in
+// some registers, but not in others. That shouldn't happen often.
+void CodeGenRegBank::computeSubRegIndexLaneMasks() {
+ // First assign individual bits to all the leaf indices.
+ unsigned Bit = 0;
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
+ CodeGenSubRegIndex *Idx = SubRegIndices[i];
+ if (Idx->getComposites().empty()) {
+ Idx->LaneMask = 1u << Bit;
+ // Share bit 31 in the unlikely case there are more than 32 leaves.
+ if (Bit < 31) ++Bit;
+ } else {
+ Idx->LaneMask = 0;
+ }
+ }
+
+ // FIXME: What if ad-hoc aliasing introduces overlaps that aren't represented
+ // by the sub-register graph? This doesn't occur in any known targets.
- // We don't care about the difference between (Idx1, Idx2) -> Idx2 and invalid
- // compositions, so remove any mappings of that form.
+ // Inherit lanes from composites.
for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
- SubRegIndices[i]->cleanComposites();
+ SubRegIndices[i]->computeLaneMask();
}
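A small standalone sketch of the lane-mask scheme described above, using illustrative sub-register index names: leaf indices each get a distinct bit, and a composite index's mask is the union of the lanes it covers, so overlapping indices share bits.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Leaf indices (no composites) each get one bit.
  std::map<std::string, unsigned> LaneMask = {
    {"ssub_0", 1u << 0}, {"ssub_1", 1u << 1},
    {"ssub_2", 1u << 2}, {"ssub_3", 1u << 3},
  };
  // Composite indices cover several leaves; their mask is the union.
  std::map<std::string, std::vector<std::string>> Composed = {
    {"dsub_0", {"ssub_0", "ssub_1"}},
    {"dsub_1", {"ssub_2", "ssub_3"}},
    {"qsub",   {"ssub_0", "ssub_1", "ssub_2", "ssub_3"}},
  };
  for (const auto &C : Composed) {
    unsigned M = 0;
    for (const std::string &Leaf : C.second)
      M |= LaneMask[Leaf];
    LaneMask[C.first] = M;
  }
  // dsub_0 (0x3) and dsub_1 (0xc) are disjoint; qsub (0xf) overlaps both.
  for (const auto &KV : LaneMask)
    std::printf("%-7s 0x%x\n", KV.first.c_str(), KV.second);
}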
namespace {
@@ -1554,6 +1593,7 @@ void CodeGenRegBank::computeRegUnitSets() {
void CodeGenRegBank::computeDerivedInfo() {
computeComposites();
+ computeSubRegIndexLaneMasks();
// Compute a weight for each register unit created during getSubRegs.
// This may create adopted register units (with unit # >= NumNativeRegUnits).
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 827063e47017..e41107415612 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -40,6 +40,7 @@ namespace llvm {
public:
const unsigned EnumValue;
+ unsigned LaneMask;
CodeGenSubRegIndex(Record *R, unsigned Enum);
CodeGenSubRegIndex(StringRef N, StringRef Nspace, unsigned Enum);
@@ -80,12 +81,12 @@ namespace llvm {
// Update the composite maps of components specified in 'ComposedOf'.
void updateComponents(CodeGenRegBank&);
- // Clean out redundant composite mappings.
- void cleanComposites();
-
// Return the map of composites.
const CompMap &getComposites() const { return Composed; }
+ // Compute LaneMask from Composed. Return LaneMask.
+ unsigned computeLaneMask();
+
private:
CompMap Composed;
};
@@ -439,6 +440,7 @@ namespace llvm {
// Registers.
std::vector<CodeGenRegister*> Registers;
+ StringMap<CodeGenRegister*> RegistersByName;
DenseMap<Record*, CodeGenRegister*> Def2Reg;
unsigned NumNativeRegUnits;
@@ -489,6 +491,9 @@ namespace llvm {
// Populate the Composite map from sub-register relationships.
void computeComposites();
+ // Compute a lane mask for each sub-register index.
+ void computeSubRegIndexLaneMasks();
+
public:
CodeGenRegBank(RecordKeeper&);
@@ -518,6 +523,9 @@ namespace llvm {
}
const std::vector<CodeGenRegister*> &getRegisters() { return Registers; }
+ const StringMap<CodeGenRegister*> &getRegistersByName() {
+ return RegistersByName;
+ }
// Find a register from its Record def.
CodeGenRegister *getReg(Record*);
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index f57fd182eafb..63cc97a8c1da 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -16,41 +16,505 @@
#include "CodeGenSchedule.h"
#include "CodeGenTarget.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/ADT/STLExtras.h"
using namespace llvm;
-// CodeGenModels ctor interprets machine model records and populates maps.
+#ifndef NDEBUG
+static void dumpIdxVec(const IdxVec &V) {
+ for (unsigned i = 0, e = V.size(); i < e; ++i) {
+ dbgs() << V[i] << ", ";
+ }
+}
+static void dumpIdxVec(const SmallVectorImpl<unsigned> &V) {
+ for (unsigned i = 0, e = V.size(); i < e; ++i) {
+ dbgs() << V[i] << ", ";
+ }
+}
+#endif
+
+// (instrs a, b, ...) Evaluate and union all arguments. Identical to AddOp.
+struct InstrsOp : public SetTheory::Operator {
+ void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
+ ArrayRef<SMLoc> Loc) {
+ ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc);
+ }
+};
+
+// (instregex "OpcPat",...) Find all instructions matching an opcode pattern.
+//
+// TODO: Since this is a prefix match, perform a binary search over the
+// instruction names using lower_bound. Note that the predefined instrs must be
+// scanned linearly first. However, this is only safe if the regex pattern has
+// no top-level bars. The DAG already has a list of patterns, so there's no
+// reason to use top-level bars, but we need a way to verify they don't exist
+// before implementing the optimization.
+struct InstRegexOp : public SetTheory::Operator {
+ const CodeGenTarget &Target;
+ InstRegexOp(const CodeGenTarget &t): Target(t) {}
+
+ void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts,
+ ArrayRef<SMLoc> Loc) {
+ SmallVector<Regex*, 4> RegexList;
+ for (DagInit::const_arg_iterator
+ AI = Expr->arg_begin(), AE = Expr->arg_end(); AI != AE; ++AI) {
+ StringInit *SI = dyn_cast<StringInit>(*AI);
+ if (!SI)
+ PrintFatalError(Loc, "instregex requires pattern string: "
+ + Expr->getAsString());
+ std::string pat = SI->getValue();
+ // Implement a python-style prefix match.
+ if (pat[0] != '^') {
+ pat.insert(0, "^(");
+ pat.insert(pat.end(), ')');
+ }
+ RegexList.push_back(new Regex(pat));
+ }
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ for (SmallVectorImpl<Regex*>::iterator
+ RI = RegexList.begin(), RE = RegexList.end(); RI != RE; ++RI) {
+ if ((*RI)->match((*I)->TheDef->getName()))
+ Elts.insert((*I)->TheDef);
+ }
+ }
+ DeleteContainerPointers(RegexList);
+ }
+};
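To see the prefix-match behaviour described above in isolation, here is a standalone sketch that substitutes std::regex for llvm::Regex; the instruction names are only examples. A pattern without a leading '^' is wrapped as "^(...)", so it is anchored at the start of the opcode name but may match just a prefix of it.

#include <cstdio>
#include <regex>
#include <string>

static bool instRegexMatches(std::string Pat, const std::string &InstName) {
  // Mirror the wrapping done by InstRegexOp: anchor at the start, allow a
  // partial (prefix) match of the rest of the name.
  if (Pat.empty() || Pat[0] != '^')
    Pat = "^(" + Pat + ")";
  return std::regex_search(InstName, std::regex(Pat));
}

int main() {
  std::printf("%d\n", instRegexMatches("ADD", "ADD32rr"));     // 1: prefix match
  std::printf("%d\n", instRegexMatches("ADD", "XADD32rr"));    // 0: not anchored here
  std::printf("%d\n", instRegexMatches("ADD.*rm", "ADD64rm")); // 1
}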
+
+/// CodeGenSchedModels ctor interprets machine model records and populates maps.
CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
const CodeGenTarget &TGT):
- Records(RK), Target(TGT), NumItineraryClasses(0), HasProcItineraries(false) {
+ Records(RK), Target(TGT), NumItineraryClasses(0) {
+
+ Sets.addFieldExpander("InstRW", "Instrs");
+
+ // Allow Set evaluation to recognize the dags used in InstRW records:
+ // (instrs Op1, Op1...)
+ Sets.addOperator("instrs", new InstrsOp);
+ Sets.addOperator("instregex", new InstRegexOp(Target));
+
+ // Instantiate a CodeGenProcModel for each SchedMachineModel with the values
+ // that are explicitly referenced in tablegen records. Resources associated
+ // with each processor will be derived later. Populate ProcModelMap with the
+ // CodeGenProcModel instances.
+ collectProcModels();
+
+ // Instantiate a CodeGenSchedRW for each SchedReadWrite record explicitly
+ // defined, and populate SchedReads and SchedWrites vectors. Implicit
+ // SchedReadWrites that represent sequences derived from expanded variants will
+ // be inferred later.
+ collectSchedRW();
+
+ // Instantiate a CodeGenSchedClass for each unique SchedRW signature directly
+ // required by an instruction definition, and populate SchedClassIdxMap. Set
+ // NumItineraryClasses to the number of explicit itinerary classes referenced
+ // by instructions. Set NumInstrSchedClasses to the number of itinerary
+ // classes plus any classes implied by instructions that derive from class
+ // Sched and provide SchedRW list. This does not infer any new classes from
+ // SchedVariant.
+ collectSchedClasses();
+
+ // Find instruction itineraries for each processor. Sort and populate
+ // CodeGenProcModel::ItinDefList. (Cycle-to-cycle itineraries). This requires
+ // all itinerary classes to be discovered.
+ collectProcItins();
+
+ // Find ItinRW records for each processor and itinerary class.
+ // (For per-operand resources mapped to itinerary classes).
+ collectProcItinRW();
+
+ // Infer new SchedClasses from SchedVariant.
+ inferSchedClasses();
+
+ // Populate each CodeGenProcModel's WriteResDefs, ReadAdvanceDefs, and
+ // ProcResourceDefs.
+ collectProcResources();
+}
+
+/// Gather all processor models.
+void CodeGenSchedModels::collectProcModels() {
+ RecVec ProcRecords = Records.getAllDerivedDefinitions("Processor");
+ std::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
+
+ // Reserve space because we can. Reallocation would be ok.
+ ProcModels.reserve(ProcRecords.size()+1);
+
+ // Use idx=0 for NoModel/NoItineraries.
+ Record *NoModelDef = Records.getDef("NoSchedModel");
+ Record *NoItinsDef = Records.getDef("NoItineraries");
+ ProcModels.push_back(CodeGenProcModel(0, "NoSchedModel",
+ NoModelDef, NoItinsDef));
+ ProcModelMap[NoModelDef] = 0;
+
+ // For each processor, find a unique machine model.
+ for (unsigned i = 0, N = ProcRecords.size(); i < N; ++i)
+ addProcModel(ProcRecords[i]);
+}
+
+/// Get a unique processor model based on the defined MachineModel and
+/// ProcessorItineraries.
+void CodeGenSchedModels::addProcModel(Record *ProcDef) {
+ Record *ModelKey = getModelOrItinDef(ProcDef);
+ if (!ProcModelMap.insert(std::make_pair(ModelKey, ProcModels.size())).second)
+ return;
+
+ std::string Name = ModelKey->getName();
+ if (ModelKey->isSubClassOf("SchedMachineModel")) {
+ Record *ItinsDef = ModelKey->getValueAsDef("Itineraries");
+ ProcModels.push_back(
+ CodeGenProcModel(ProcModels.size(), Name, ModelKey, ItinsDef));
+ }
+ else {
+ // An itinerary is defined without a machine model. Infer a new model.
+ if (!ModelKey->getValueAsListOfDefs("IID").empty())
+ Name = Name + "Model";
+ ProcModels.push_back(
+ CodeGenProcModel(ProcModels.size(), Name,
+ ProcDef->getValueAsDef("SchedModel"), ModelKey));
+ }
+ DEBUG(ProcModels.back().dump());
+}
+
+// Recursively find all reachable SchedReadWrite records.
+static void scanSchedRW(Record *RWDef, RecVec &RWDefs,
+ SmallPtrSet<Record*, 16> &RWSet) {
+ if (!RWSet.insert(RWDef))
+ return;
+ RWDefs.push_back(RWDef);
+ // Reads don't currently have sequence records, but they can be added later.
+ if (RWDef->isSubClassOf("WriteSequence")) {
+ RecVec Seq = RWDef->getValueAsListOfDefs("Writes");
+ for (RecIter I = Seq.begin(), E = Seq.end(); I != E; ++I)
+ scanSchedRW(*I, RWDefs, RWSet);
+ }
+ else if (RWDef->isSubClassOf("SchedVariant")) {
+ // Visit each variant (guarded by a different predicate).
+ RecVec Vars = RWDef->getValueAsListOfDefs("Variants");
+ for (RecIter VI = Vars.begin(), VE = Vars.end(); VI != VE; ++VI) {
+ // Visit each RW in the sequence selected by the current variant.
+ RecVec Selected = (*VI)->getValueAsListOfDefs("Selected");
+ for (RecIter I = Selected.begin(), E = Selected.end(); I != E; ++I)
+ scanSchedRW(*I, RWDefs, RWSet);
+ }
+ }
+}
+
+// Collect and sort all SchedReadWrites reachable via tablegen records.
+// More may be inferred later when inferring new SchedClasses from variants.
+void CodeGenSchedModels::collectSchedRW() {
+ // Reserve idx=0 for invalid writes/reads.
+ SchedWrites.resize(1);
+ SchedReads.resize(1);
+
+ SmallPtrSet<Record*, 16> RWSet;
+
+ // Find all SchedReadWrites referenced by instruction defs.
+ RecVec SWDefs, SRDefs;
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ Record *SchedDef = (*I)->TheDef;
+ if (!SchedDef->isSubClassOf("Sched"))
+ continue;
+ RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW");
+ for (RecIter RWI = RWs.begin(), RWE = RWs.end(); RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ scanSchedRW(*RWI, SWDefs, RWSet);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ scanSchedRW(*RWI, SRDefs, RWSet);
+ }
+ }
+ }
+ // Find all ReadWrites referenced by InstRW.
+ RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
+ for (RecIter OI = InstRWDefs.begin(), OE = InstRWDefs.end(); OI != OE; ++OI) {
+ // For all OperandReadWrites.
+ RecVec RWDefs = (*OI)->getValueAsListOfDefs("OperandReadWrites");
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end();
+ RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ scanSchedRW(*RWI, SWDefs, RWSet);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ scanSchedRW(*RWI, SRDefs, RWSet);
+ }
+ }
+ }
+ // Find all ReadWrites referenced by ItinRW.
+ RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW");
+ for (RecIter II = ItinRWDefs.begin(), IE = ItinRWDefs.end(); II != IE; ++II) {
+ // For all OperandReadWrites.
+ RecVec RWDefs = (*II)->getValueAsListOfDefs("OperandReadWrites");
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end();
+ RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ scanSchedRW(*RWI, SWDefs, RWSet);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ scanSchedRW(*RWI, SRDefs, RWSet);
+ }
+ }
+ }
+ // Find all ReadWrites referenced by SchedAlias. AliasDefs needs to be sorted
+ // for the loop below that initializes Alias vectors.
+ RecVec AliasDefs = Records.getAllDerivedDefinitions("SchedAlias");
+ std::sort(AliasDefs.begin(), AliasDefs.end(), LessRecord());
+ for (RecIter AI = AliasDefs.begin(), AE = AliasDefs.end(); AI != AE; ++AI) {
+ Record *MatchDef = (*AI)->getValueAsDef("MatchRW");
+ Record *AliasDef = (*AI)->getValueAsDef("AliasRW");
+ if (MatchDef->isSubClassOf("SchedWrite")) {
+ if (!AliasDef->isSubClassOf("SchedWrite"))
+ PrintFatalError((*AI)->getLoc(), "SchedWrite Alias must be SchedWrite");
+ scanSchedRW(AliasDef, SWDefs, RWSet);
+ }
+ else {
+ assert(MatchDef->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ if (!AliasDef->isSubClassOf("SchedRead"))
+ PrintFatalError((*AI)->getLoc(), "SchedRead Alias must be SchedRead");
+ scanSchedRW(AliasDef, SRDefs, RWSet);
+ }
+ }
+ // Sort and add the SchedReadWrites directly referenced by instructions or
+ // itinerary resources. Index reads and writes in separate domains.
+ std::sort(SWDefs.begin(), SWDefs.end(), LessRecord());
+ for (RecIter SWI = SWDefs.begin(), SWE = SWDefs.end(); SWI != SWE; ++SWI) {
+ assert(!getSchedRWIdx(*SWI, /*IsRead=*/false) && "duplicate SchedWrite");
+ SchedWrites.push_back(CodeGenSchedRW(SchedWrites.size(), *SWI));
+ }
+ std::sort(SRDefs.begin(), SRDefs.end(), LessRecord());
+ for (RecIter SRI = SRDefs.begin(), SRE = SRDefs.end(); SRI != SRE; ++SRI) {
+ assert(!getSchedRWIdx(*SRI, /*IsRead=*/true) && "duplicate SchedRead");
+ SchedReads.push_back(CodeGenSchedRW(SchedReads.size(), *SRI));
+ }
+ // Initialize WriteSequence vectors.
+ for (std::vector<CodeGenSchedRW>::iterator WI = SchedWrites.begin(),
+ WE = SchedWrites.end(); WI != WE; ++WI) {
+ if (!WI->IsSequence)
+ continue;
+ findRWs(WI->TheDef->getValueAsListOfDefs("Writes"), WI->Sequence,
+ /*IsRead=*/false);
+ }
+ // Initialize Aliases vectors.
+ for (RecIter AI = AliasDefs.begin(), AE = AliasDefs.end(); AI != AE; ++AI) {
+ Record *AliasDef = (*AI)->getValueAsDef("AliasRW");
+ getSchedRW(AliasDef).IsAlias = true;
+ Record *MatchDef = (*AI)->getValueAsDef("MatchRW");
+ CodeGenSchedRW &RW = getSchedRW(MatchDef);
+ if (RW.IsAlias)
+ PrintFatalError((*AI)->getLoc(), "Cannot Alias an Alias");
+ RW.Aliases.push_back(*AI);
+ }
+ DEBUG(
+ for (unsigned WIdx = 0, WEnd = SchedWrites.size(); WIdx != WEnd; ++WIdx) {
+ dbgs() << WIdx << ": ";
+ SchedWrites[WIdx].dump();
+ dbgs() << '\n';
+ }
+ for (unsigned RIdx = 0, REnd = SchedReads.size(); RIdx != REnd; ++RIdx) {
+ dbgs() << RIdx << ": ";
+ SchedReads[RIdx].dump();
+ dbgs() << '\n';
+ }
+ RecVec RWDefs = Records.getAllDerivedDefinitions("SchedReadWrite");
+ for (RecIter RI = RWDefs.begin(), RE = RWDefs.end();
+ RI != RE; ++RI) {
+ if (!getSchedRWIdx(*RI, (*RI)->isSubClassOf("SchedRead"))) {
+ const std::string &Name = (*RI)->getName();
+ if (Name != "NoWrite" && Name != "ReadDefault")
+ dbgs() << "Unused SchedReadWrite " << (*RI)->getName() << '\n';
+ }
+ });
+}
+
+/// Compute a SchedWrite name from a sequence of writes.
+std::string CodeGenSchedModels::genRWName(const IdxVec& Seq, bool IsRead) {
+ std::string Name("(");
+ for (IdxIter I = Seq.begin(), E = Seq.end(); I != E; ++I) {
+ if (I != Seq.begin())
+ Name += '_';
+ Name += getSchedRW(*I, IsRead).Name;
+ }
+ Name += ')';
+ return Name;
+}
+
+unsigned CodeGenSchedModels::getSchedRWIdx(Record *Def, bool IsRead,
+ unsigned After) const {
+ const std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
+ assert(After < RWVec.size() && "start position out of bounds");
+ for (std::vector<CodeGenSchedRW>::const_iterator I = RWVec.begin() + After,
+ E = RWVec.end(); I != E; ++I) {
+ if (I->TheDef == Def)
+ return I - RWVec.begin();
+ }
+ return 0;
+}
+
+bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const {
+ for (unsigned i = 0, e = SchedReads.size(); i < e; ++i) {
+ Record *ReadDef = SchedReads[i].TheDef;
+ if (!ReadDef || !ReadDef->isSubClassOf("ProcReadAdvance"))
+ continue;
+
+ RecVec ValidWrites = ReadDef->getValueAsListOfDefs("ValidWrites");
+ if (std::find(ValidWrites.begin(), ValidWrites.end(), WriteDef)
+ != ValidWrites.end()) {
+ return true;
+ }
+ }
+ return false;
+}
- // Populate SchedClassIdxMap and set NumItineraryClasses.
- CollectSchedClasses();
+namespace llvm {
+void splitSchedReadWrites(const RecVec &RWDefs,
+ RecVec &WriteDefs, RecVec &ReadDefs) {
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end(); RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ WriteDefs.push_back(*RWI);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "unknown SchedReadWrite");
+ ReadDefs.push_back(*RWI);
+ }
+ }
+}
+} // namespace llvm
+
+// Split the SchedReadWrites defs and call findRWs for each list.
+void CodeGenSchedModels::findRWs(const RecVec &RWDefs,
+ IdxVec &Writes, IdxVec &Reads) const {
+ RecVec WriteDefs;
+ RecVec ReadDefs;
+ splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs);
+ findRWs(WriteDefs, Writes, false);
+ findRWs(ReadDefs, Reads, true);
+}
+
+// Call getSchedRWIdx for all elements in a sequence of SchedRW defs.
+void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &RWs,
+ bool IsRead) const {
+ for (RecIter RI = RWDefs.begin(), RE = RWDefs.end(); RI != RE; ++RI) {
+ unsigned Idx = getSchedRWIdx(*RI, IsRead);
+ assert(Idx && "failed to collect SchedReadWrite");
+ RWs.push_back(Idx);
+ }
+}
+
+void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq,
+ bool IsRead) const {
+ const CodeGenSchedRW &SchedRW = getSchedRW(RWIdx, IsRead);
+ if (!SchedRW.IsSequence) {
+ RWSeq.push_back(RWIdx);
+ return;
+ }
+ int Repeat =
+ SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1;
+ for (int i = 0; i < Repeat; ++i) {
+ for (IdxIter I = SchedRW.Sequence.begin(), E = SchedRW.Sequence.end();
+ I != E; ++I) {
+ expandRWSequence(*I, RWSeq, IsRead);
+ }
+ }
+}
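A tiny standalone sketch of the expansion above, with made-up indices: a sequence write emits its whole sub-sequence Repeat times, nested sequences expand recursively, and plain writes expand to themselves.

#include <cstdio>
#include <vector>

struct RW {
  std::vector<unsigned> Sequence; // Empty for a plain (leaf) write.
  int Repeat = 1;
};

static void expand(const std::vector<RW> &RWs, unsigned Idx,
                   std::vector<unsigned> &Out) {
  const RW &W = RWs[Idx];
  if (W.Sequence.empty()) {
    Out.push_back(Idx);
    return;
  }
  for (int R = 0; R < W.Repeat; ++R)
    for (unsigned Sub : W.Sequence)
      expand(RWs, Sub, Out);
}

int main() {
  // Writes 1 and 2 are leaves; write 3 is the sequence [1, 2] with Repeat=2.
  std::vector<RW> RWs(4);
  RWs[3].Sequence = {1, 2};
  RWs[3].Repeat = 2;
  std::vector<unsigned> Out;
  expand(RWs, 3, Out);
  for (unsigned I : Out)
    std::printf("%u ", I); // prints: 1 2 1 2
  std::printf("\n");
}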
+
+// Expand a SchedWrite as a sequence following any aliases that coincide with
+// the given processor model.
+void CodeGenSchedModels::expandRWSeqForProc(
+ unsigned RWIdx, IdxVec &RWSeq, bool IsRead,
+ const CodeGenProcModel &ProcModel) const {
+
+ const CodeGenSchedRW &SchedWrite = getSchedRW(RWIdx, IsRead);
+ Record *AliasDef = 0;
+ for (RecIter AI = SchedWrite.Aliases.begin(), AE = SchedWrite.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW = getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ if ((*AI)->getValueInit("SchedModel")->isComplete()) {
+ Record *ModelDef = (*AI)->getValueAsDef("SchedModel");
+ if (&getProcModel(ModelDef) != &ProcModel)
+ continue;
+ }
+ if (AliasDef)
+ PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases "
+ "defined for processor " + ProcModel.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
+ AliasDef = AliasRW.TheDef;
+ }
+ if (AliasDef) {
+ expandRWSeqForProc(getSchedRWIdx(AliasDef, IsRead),
+ RWSeq, IsRead,ProcModel);
+ return;
+ }
+ if (!SchedWrite.IsSequence) {
+ RWSeq.push_back(RWIdx);
+ return;
+ }
+ int Repeat =
+ SchedWrite.TheDef ? SchedWrite.TheDef->getValueAsInt("Repeat") : 1;
+ for (int i = 0; i < Repeat; ++i) {
+ for (IdxIter I = SchedWrite.Sequence.begin(), E = SchedWrite.Sequence.end();
+ I != E; ++I) {
+ expandRWSeqForProc(*I, RWSeq, IsRead, ProcModel);
+ }
+ }
+}
+
+// Find the existing SchedWrite that models this sequence of writes.
+unsigned CodeGenSchedModels::findRWForSequence(const IdxVec &Seq,
+ bool IsRead) {
+ std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
- // Populate ProcModelMap.
- CollectProcModels();
+ for (std::vector<CodeGenSchedRW>::iterator I = RWVec.begin(), E = RWVec.end();
+ I != E; ++I) {
+ if (I->Sequence == Seq)
+ return I - RWVec.begin();
+ }
+ // Index zero reserved for invalid RW.
+ return 0;
}
-// Visit all the instruction definitions for this target to gather and enumerate
-// the itinerary classes. These are the explicitly specified SchedClasses. More
-// SchedClasses may be inferred.
-void CodeGenSchedModels::CollectSchedClasses() {
+/// Add this ReadWrite if it doesn't already exist.
+unsigned CodeGenSchedModels::findOrInsertRW(ArrayRef<unsigned> Seq,
+ bool IsRead) {
+ assert(!Seq.empty() && "cannot insert empty sequence");
+ if (Seq.size() == 1)
+ return Seq.back();
- // NoItinerary is always the first class at Index=0
+ unsigned Idx = findRWForSequence(Seq, IsRead);
+ if (Idx)
+ return Idx;
+
+ unsigned RWIdx = IsRead ? SchedReads.size() : SchedWrites.size();
+ CodeGenSchedRW SchedRW(RWIdx, IsRead, Seq, genRWName(Seq, IsRead));
+ if (IsRead)
+ SchedReads.push_back(SchedRW);
+ else
+ SchedWrites.push_back(SchedRW);
+ return RWIdx;
+}
+
+/// Visit all the instruction definitions for this target to gather and
+/// enumerate the itinerary classes. These are the explicitly specified
+/// SchedClasses. More SchedClasses may be inferred.
+void CodeGenSchedModels::collectSchedClasses() {
+
+ // NoItinerary is always the first class at Idx=0
SchedClasses.resize(1);
SchedClasses.back().Name = "NoItinerary";
+ SchedClasses.back().ProcIndices.push_back(0);
SchedClassIdxMap[SchedClasses.back().Name] = 0;
// Gather and sort all itinerary classes used by instruction descriptions.
- std::vector<Record*> ItinClassList;
+ RecVec ItinClassList;
for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
E = Target.inst_end(); I != E; ++I) {
- Record *SchedDef = (*I)->TheDef->getValueAsDef("Itinerary");
+ Record *ItinDef = (*I)->TheDef->getValueAsDef("Itinerary");
// Map a new SchedClass with no index.
- if (!SchedClassIdxMap.count(SchedDef->getName())) {
- SchedClassIdxMap[SchedDef->getName()] = 0;
- ItinClassList.push_back(SchedDef);
+ if (!SchedClassIdxMap.count(ItinDef->getName())) {
+ SchedClassIdxMap[ItinDef->getName()] = 0;
+ ItinClassList.push_back(ItinDef);
}
}
// Assign each itinerary class unique number, skipping NoItinerary==0
@@ -61,91 +525,1139 @@ void CodeGenSchedModels::CollectSchedClasses() {
SchedClassIdxMap[ItinDef->getName()] = SchedClasses.size();
SchedClasses.push_back(CodeGenSchedClass(ItinDef));
}
+ // Infer classes from SchedReadWrite resources listed for each
+ // instruction definition that inherits from class Sched.
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ if (!(*I)->TheDef->isSubClassOf("Sched"))
+ continue;
+ IdxVec Writes, Reads;
+ findRWs((*I)->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
+ // ProcIdx == 0 indicates the class applies to all processors.
+ IdxVec ProcIndices(1, 0);
+ addSchedClass(Writes, Reads, ProcIndices);
+ }
+ // Create classes for InstRW defs.
+ RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
+ std::sort(InstRWDefs.begin(), InstRWDefs.end(), LessRecord());
+ for (RecIter OI = InstRWDefs.begin(), OE = InstRWDefs.end(); OI != OE; ++OI)
+ createInstRWClass(*OI);
+
+ NumInstrSchedClasses = SchedClasses.size();
+
+ bool EnableDump = false;
+ DEBUG(EnableDump = true);
+ if (!EnableDump)
+ return;
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ Record *SchedDef = (*I)->TheDef;
+ std::string InstName = (*I)->TheDef->getName();
+ if (SchedDef->isSubClassOf("Sched")) {
+ IdxVec Writes;
+ IdxVec Reads;
+ findRWs((*I)->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
+ dbgs() << "SchedRW machine model for " << InstName;
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI)
+ dbgs() << " " << SchedWrites[*WI].Name;
+ for (IdxIter RI = Reads.begin(), RE = Reads.end(); RI != RE; ++RI)
+ dbgs() << " " << SchedReads[*RI].Name;
+ dbgs() << '\n';
+ }
+ unsigned SCIdx = InstrClassMap.lookup((*I)->TheDef);
+ if (SCIdx) {
+ const RecVec &RWDefs = SchedClasses[SCIdx].InstRWs;
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end();
+ RWI != RWE; ++RWI) {
+ const CodeGenProcModel &ProcModel =
+ getProcModel((*RWI)->getValueAsDef("SchedModel"));
+ dbgs() << "InstRW on " << ProcModel.ModelName << " for " << InstName;
+ IdxVec Writes;
+ IdxVec Reads;
+ findRWs((*RWI)->getValueAsListOfDefs("OperandReadWrites"),
+ Writes, Reads);
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI)
+ dbgs() << " " << SchedWrites[*WI].Name;
+ for (IdxIter RI = Reads.begin(), RE = Reads.end(); RI != RE; ++RI)
+ dbgs() << " " << SchedReads[*RI].Name;
+ dbgs() << '\n';
+ }
+ continue;
+ }
+ if (!SchedDef->isSubClassOf("Sched")
+ && (SchedDef->getValueAsDef("Itinerary")->getName() == "NoItinerary")) {
+ dbgs() << "No machine model for " << (*I)->TheDef->getName() << '\n';
+ }
+ }
+}
+
+unsigned CodeGenSchedModels::getSchedClassIdx(
+ const RecVec &RWDefs) const {
- // TODO: Infer classes from non-itinerary scheduler resources.
+ IdxVec Writes, Reads;
+ findRWs(RWDefs, Writes, Reads);
+ return findSchedClassIdx(Writes, Reads);
}
-// Gather all processor models.
-void CodeGenSchedModels::CollectProcModels() {
- std::vector<Record*> ProcRecords =
- Records.getAllDerivedDefinitions("Processor");
- std::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
+/// Find a SchedClass that has been inferred from a per-operand list of
+/// SchedWrites and SchedReads.
+unsigned CodeGenSchedModels::findSchedClassIdx(const IdxVec &Writes,
+ const IdxVec &Reads) const {
+ for (SchedClassIter I = schedClassBegin(), E = schedClassEnd(); I != E; ++I) {
+ // Classes with InstRWs may have the same Writes/Reads as a class originally
+ // produced by a SchedRW definition. We need to be able to recover the
+ // original class index for processors that don't match any InstRWs.
+ if (I->ItinClassDef || !I->InstRWs.empty())
+ continue;
- // Reserve space because we can. Reallocation would be ok.
- ProcModels.reserve(ProcRecords.size());
+ if (I->Writes == Writes && I->Reads == Reads) {
+ return I - schedClassBegin();
+ }
+ }
+ return 0;
+}
- // For each processor, find a unique machine model.
- for (unsigned i = 0, N = ProcRecords.size(); i < N; ++i)
- addProcModel(ProcRecords[i]);
+// Get the SchedClass index for an instruction.
+unsigned CodeGenSchedModels::getSchedClassIdx(
+ const CodeGenInstruction &Inst) const {
+
+ unsigned SCIdx = InstrClassMap.lookup(Inst.TheDef);
+ if (SCIdx)
+ return SCIdx;
+
+ // If this opcode isn't mapped by the subtarget fallback to the instruction
+ // definition's SchedRW or ItinDef values.
+ if (Inst.TheDef->isSubClassOf("Sched")) {
+ RecVec RWs = Inst.TheDef->getValueAsListOfDefs("SchedRW");
+ return getSchedClassIdx(RWs);
+ }
+ Record *ItinDef = Inst.TheDef->getValueAsDef("Itinerary");
+ assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
+ unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
+ assert(Idx <= NumItineraryClasses && "bad ItinClass index");
+ return Idx;
}
-// Get a unique processor model based on the defined MachineModel and
-// ProcessorItineraries.
-void CodeGenSchedModels::addProcModel(Record *ProcDef) {
- unsigned Idx = getProcModelIdx(ProcDef);
- if (Idx < ProcModels.size())
- return;
+std::string CodeGenSchedModels::createSchedClassName(
+ const IdxVec &OperWrites, const IdxVec &OperReads) {
+
+ std::string Name;
+ for (IdxIter WI = OperWrites.begin(), WE = OperWrites.end(); WI != WE; ++WI) {
+ if (WI != OperWrites.begin())
+ Name += '_';
+ Name += SchedWrites[*WI].Name;
+ }
+ for (IdxIter RI = OperReads.begin(), RE = OperReads.end(); RI != RE; ++RI) {
+ Name += '_';
+ Name += SchedReads[*RI].Name;
+ }
+ return Name;
+}
+
+std::string CodeGenSchedModels::createSchedClassName(const RecVec &InstDefs) {
+
+ std::string Name;
+ for (RecIter I = InstDefs.begin(), E = InstDefs.end(); I != E; ++I) {
+ if (I != InstDefs.begin())
+ Name += '_';
+ Name += (*I)->getName();
+ }
+ return Name;
+}
+
+/// Add an inferred sched class from a per-operand list of SchedWrites and
+/// SchedReads. ProcIndices contains the set of IDs of processors that may
+/// utilize this class.
+unsigned CodeGenSchedModels::addSchedClass(const IdxVec &OperWrites,
+ const IdxVec &OperReads,
+ const IdxVec &ProcIndices)
+{
+ assert(!ProcIndices.empty() && "expect at least one ProcIdx");
+
+ unsigned Idx = findSchedClassIdx(OperWrites, OperReads);
+ if (Idx) {
+ IdxVec PI;
+ std::set_union(SchedClasses[Idx].ProcIndices.begin(),
+ SchedClasses[Idx].ProcIndices.end(),
+ ProcIndices.begin(), ProcIndices.end(),
+ std::back_inserter(PI));
+ SchedClasses[Idx].ProcIndices.swap(PI);
+ return Idx;
+ }
+ Idx = SchedClasses.size();
+ SchedClasses.resize(Idx+1);
+ CodeGenSchedClass &SC = SchedClasses.back();
+ SC.Name = createSchedClassName(OperWrites, OperReads);
+ SC.Writes = OperWrites;
+ SC.Reads = OperReads;
+ SC.ProcIndices = ProcIndices;
+
+ return Idx;
+}
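A quick illustration of the ProcIndices merge performed when an existing class is reused (the index values are illustrative): both vectors are kept sorted, index 0 conventionally means the class applies to every processor, and std::set_union yields the combined, still-sorted set.

#include <algorithm>
#include <cstdio>
#include <iterator>
#include <vector>

int main() {
  std::vector<unsigned> Existing = {2, 5}; // Class already used by procs 2 and 5.
  std::vector<unsigned> Incoming = {3, 5}; // Now also required by procs 3 and 5.
  std::vector<unsigned> Merged;
  std::set_union(Existing.begin(), Existing.end(),
                 Incoming.begin(), Incoming.end(),
                 std::back_inserter(Merged));
  for (unsigned P : Merged)
    std::printf("%u ", P); // prints: 2 3 5
  std::printf("\n");
}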
+
+// Create classes for each set of opcodes that are in the same InstReadWrite
+// definition across all processors.
+void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
+ // ClassInstrs will hold an entry for each subset of Instrs in InstRWDef that
+ // intersects with an existing class via a previous InstRWDef. Instrs that do
+ // not intersect with an existing class refer back to their former class as
+ // determined from ItinDef or SchedRW.
+ SmallVector<std::pair<unsigned, SmallVector<Record *, 8> >, 4> ClassInstrs;
+ // Sort Instrs into sets.
+ const RecVec *InstDefs = Sets.expand(InstRWDef);
+ if (InstDefs->empty())
+ PrintFatalError(InstRWDef->getLoc(), "No matching instruction opcodes");
+
+ for (RecIter I = InstDefs->begin(), E = InstDefs->end(); I != E; ++I) {
+ unsigned SCIdx = 0;
+ InstClassMapTy::const_iterator Pos = InstrClassMap.find(*I);
+ if (Pos != InstrClassMap.end())
+ SCIdx = Pos->second;
+ else {
+ // This instruction has not been mapped yet. Get the original class. All
+ // instructions in the same InstRW class must be from the same original
+ // class because that is the fall-back class for other processors.
+ Record *ItinDef = (*I)->getValueAsDef("Itinerary");
+ SCIdx = SchedClassIdxMap.lookup(ItinDef->getName());
+ if (!SCIdx && (*I)->isSubClassOf("Sched"))
+ SCIdx = getSchedClassIdx((*I)->getValueAsListOfDefs("SchedRW"));
+ }
+ unsigned CIdx = 0, CEnd = ClassInstrs.size();
+ for (; CIdx != CEnd; ++CIdx) {
+ if (ClassInstrs[CIdx].first == SCIdx)
+ break;
+ }
+ if (CIdx == CEnd) {
+ ClassInstrs.resize(CEnd + 1);
+ ClassInstrs[CIdx].first = SCIdx;
+ }
+ ClassInstrs[CIdx].second.push_back(*I);
+ }
+ // For each set of Instrs, create a new class if necessary, and map or remap
+ // the Instrs to it.
+ unsigned CIdx = 0, CEnd = ClassInstrs.size();
+ for (; CIdx != CEnd; ++CIdx) {
+ unsigned OldSCIdx = ClassInstrs[CIdx].first;
+ ArrayRef<Record*> InstDefs = ClassInstrs[CIdx].second;
+ // If all the instrs in the current class are accounted for, then leave
+ // them mapped to their old class.
+ if (SchedClasses[OldSCIdx].InstRWs.size() == InstDefs.size()) {
+ assert(SchedClasses[OldSCIdx].ProcIndices[0] == 0 &&
+ "expected a generic SchedClass");
+ continue;
+ }
+ unsigned SCIdx = SchedClasses.size();
+ SchedClasses.resize(SCIdx+1);
+ CodeGenSchedClass &SC = SchedClasses.back();
+ SC.Name = createSchedClassName(InstDefs);
+ // Preserve ItinDef and Writes/Reads for processors without an InstRW entry.
+ SC.ItinClassDef = SchedClasses[OldSCIdx].ItinClassDef;
+ SC.Writes = SchedClasses[OldSCIdx].Writes;
+ SC.Reads = SchedClasses[OldSCIdx].Reads;
+ SC.ProcIndices.push_back(0);
+ // Map each Instr to this new class.
+ // Note that InstDefs may be a smaller list than InstRWDef's "Instrs".
+ Record *RWModelDef = InstRWDef->getValueAsDef("SchedModel");
+ SmallSet<unsigned, 4> RemappedClassIDs;
+ for (ArrayRef<Record*>::const_iterator
+ II = InstDefs.begin(), IE = InstDefs.end(); II != IE; ++II) {
+ unsigned OldSCIdx = InstrClassMap[*II];
+ if (OldSCIdx && RemappedClassIDs.insert(OldSCIdx)) {
+ for (RecIter RI = SchedClasses[OldSCIdx].InstRWs.begin(),
+ RE = SchedClasses[OldSCIdx].InstRWs.end(); RI != RE; ++RI) {
+ if ((*RI)->getValueAsDef("SchedModel") == RWModelDef) {
+ PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " +
+ (*II)->getName() + " also matches " +
+ (*RI)->getValue("Instrs")->getValue()->getAsString());
+ }
+ assert(*RI != InstRWDef && "SchedClass has duplicate InstRW def");
+ SC.InstRWs.push_back(*RI);
+ }
+ }
+ InstrClassMap[*II] = SCIdx;
+ }
+ SC.InstRWs.push_back(InstRWDef);
+ }
+}
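A standalone sketch of the grouping step above, using hypothetical opcode names and class indices: the opcodes matched by one InstRW definition are partitioned by the class they currently belong to, and each resulting subset can then be remapped to its own new class.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // Current class index for each opcode (as derived from ItinDef or SchedRW).
  std::map<std::string, unsigned> InstrClassMap = {
    {"ADDrr", 3}, {"SUBrr", 3}, {"MULrr", 7},
  };
  // Opcodes matched by a single InstRW definition.
  std::vector<std::string> Matched = {"ADDrr", "SUBrr", "MULrr"};
  // Partition them: old class index -> opcodes that currently share it.
  std::map<unsigned, std::vector<std::string>> ClassInstrs;
  for (const std::string &Op : Matched)
    ClassInstrs[InstrClassMap[Op]].push_back(Op);
  for (const auto &KV : ClassInstrs) {
    std::printf("old class %u:", KV.first);
    for (const std::string &Op : KV.second)
      std::printf(" %s", Op.c_str());
    std::printf("\n"); // Two groups: {ADDrr, SUBrr} and {MULrr}.
  }
}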
+
+// Gather the processor itineraries.
+void CodeGenSchedModels::collectProcItins() {
+ for (std::vector<CodeGenProcModel>::iterator PI = ProcModels.begin(),
+ PE = ProcModels.end(); PI != PE; ++PI) {
+ CodeGenProcModel &ProcModel = *PI;
+ RecVec ItinRecords = ProcModel.ItinsDef->getValueAsListOfDefs("IID");
+ // Skip empty itinerary.
+ if (ItinRecords.empty())
+ continue;
+
+ ProcModel.ItinDefList.resize(NumItineraryClasses+1);
+
+ // Insert each itinerary data record in the correct position within
+ // the processor model's ItinDefList.
+ for (unsigned i = 0, N = ItinRecords.size(); i < N; i++) {
+ Record *ItinData = ItinRecords[i];
+ Record *ItinDef = ItinData->getValueAsDef("TheClass");
+ if (!SchedClassIdxMap.count(ItinDef->getName())) {
+ DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " has unused itinerary class " << ItinDef->getName() << '\n');
+ continue;
+ }
+ assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
+ unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
+ assert(Idx <= NumItineraryClasses && "bad ItinClass index");
+ ProcModel.ItinDefList[Idx] = ItinData;
+ }
+ // Check for missing itinerary entries.
+ assert(!ProcModel.ItinDefList[0] && "NoItinerary class can't have rec");
+ DEBUG(
+ for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
+ if (!ProcModel.ItinDefList[i])
+ dbgs() << ProcModel.ItinsDef->getName()
+ << " missing itinerary for class "
+ << SchedClasses[i].Name << '\n';
+ });
+ }
+}
+
+// Gather the read/write types for each itinerary class.
+void CodeGenSchedModels::collectProcItinRW() {
+ RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW");
+ std::sort(ItinRWDefs.begin(), ItinRWDefs.end(), LessRecord());
+ for (RecIter II = ItinRWDefs.begin(), IE = ItinRWDefs.end(); II != IE; ++II) {
+ if (!(*II)->getValueInit("SchedModel")->isComplete())
+ PrintFatalError((*II)->getLoc(), "SchedModel is undefined");
+ Record *ModelDef = (*II)->getValueAsDef("SchedModel");
+ ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
+ if (I == ProcModelMap.end()) {
+ PrintFatalError((*II)->getLoc(), "Undefined SchedMachineModel "
+ + ModelDef->getName());
+ }
+ ProcModels[I->second].ItinRWDefs.push_back(*II);
+ }
+}
+
+/// Infer new classes from existing classes. In the process, this may create new
+/// SchedWrites from sequences of existing SchedWrites.
+void CodeGenSchedModels::inferSchedClasses() {
+ // Visit all existing classes and newly created classes.
+ for (unsigned Idx = 0; Idx != SchedClasses.size(); ++Idx) {
+ if (SchedClasses[Idx].ItinClassDef)
+ inferFromItinClass(SchedClasses[Idx].ItinClassDef, Idx);
+ else if (!SchedClasses[Idx].InstRWs.empty())
+ inferFromInstRWs(Idx);
+ else {
+ inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads,
+ Idx, SchedClasses[Idx].ProcIndices);
+ }
+ assert(SchedClasses.size() < (NumInstrSchedClasses*6) &&
+ "too many SchedVariants");
+ }
+}
+
+/// Infer classes from per-processor itinerary resources.
+void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef,
+ unsigned FromClassIdx) {
+ for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
+ const CodeGenProcModel &PM = ProcModels[PIdx];
+ // For all ItinRW entries.
+ bool HasMatch = false;
+ for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end();
+ II != IE; ++II) {
+ RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ if (!std::count(Matched.begin(), Matched.end(), ItinClassDef))
+ continue;
+ if (HasMatch)
+ PrintFatalError((*II)->getLoc(), "Duplicate itinerary class "
+ + ItinClassDef->getName()
+ + " in ItinResources for " + PM.ModelName);
+ HasMatch = true;
+ IdxVec Writes, Reads;
+ findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ IdxVec ProcIndices(1, PIdx);
+ inferFromRW(Writes, Reads, FromClassIdx, ProcIndices);
+ }
+ }
+}
+
+/// Infer classes from per-processor InstReadWrite definitions.
+void CodeGenSchedModels::inferFromInstRWs(unsigned SCIdx) {
+ const RecVec &RWDefs = SchedClasses[SCIdx].InstRWs;
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end(); RWI != RWE; ++RWI) {
+ const RecVec *InstDefs = Sets.expand(*RWI);
+ RecIter II = InstDefs->begin(), IE = InstDefs->end();
+ for (; II != IE; ++II) {
+ if (InstrClassMap[*II] == SCIdx)
+ break;
+ }
+ // If this class no longer has any instructions mapped to it, it has become
+ // irrelevant.
+ if (II == IE)
+ continue;
+ IdxVec Writes, Reads;
+ findRWs((*RWI)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ unsigned PIdx = getProcModel((*RWI)->getValueAsDef("SchedModel")).Index;
+ IdxVec ProcIndices(1, PIdx);
+ inferFromRW(Writes, Reads, SCIdx, ProcIndices);
+ }
+}
+
+namespace {
+// Helper for substituteVariantOperand.
+struct TransVariant {
+ Record *VarOrSeqDef; // Variant or sequence.
+ unsigned RWIdx; // Index of this variant or sequence's matched type.
+ unsigned ProcIdx; // Processor model index or zero for any.
+ unsigned TransVecIdx; // Index into PredTransitions::TransVec.
+
+ TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti):
+ VarOrSeqDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {}
+};
+
+// Associate a predicate with the SchedReadWrite that it guards.
+// RWIdx is the index of the read/write variant.
+struct PredCheck {
+ bool IsRead;
+ unsigned RWIdx;
+ Record *Predicate;
+
+ PredCheck(bool r, unsigned w, Record *p): IsRead(r), RWIdx(w), Predicate(p) {}
+};
+
+// A Predicate transition is a list of RW sequences guarded by a PredTerm.
+struct PredTransition {
+ // A predicate term is a conjunction of PredChecks.
+ SmallVector<PredCheck, 4> PredTerm;
+ SmallVector<SmallVector<unsigned,4>, 16> WriteSequences;
+ SmallVector<SmallVector<unsigned,4>, 16> ReadSequences;
+ SmallVector<unsigned, 4> ProcIndices;
+};
+
+// Encapsulate a set of partially constructed transitions.
+// The results are built by repeated calls to substituteVariants.
+class PredTransitions {
+ CodeGenSchedModels &SchedModels;
+
+public:
+ std::vector<PredTransition> TransVec;
+
+ PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {}
+
+ void substituteVariantOperand(const SmallVectorImpl<unsigned> &RWSeq,
+ bool IsRead, unsigned StartIdx);
+
+ void substituteVariants(const PredTransition &Trans);
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+
+private:
+ bool mutuallyExclusive(Record *PredDef, ArrayRef<PredCheck> Term);
+ void getIntersectingVariants(
+ const CodeGenSchedRW &SchedRW, unsigned TransIdx,
+ std::vector<TransVariant> &IntersectingVariants);
+ void pushVariant(const TransVariant &VInfo, bool IsRead);
+};
+} // anonymous
+
+// Return true if this predicate is mutually exclusive with a PredTerm. This
+// degenerates into checking if the predicate is mutually exclusive with any
+// predicate in the Term's conjunction.
+//
+// All predicates associated with a given SchedRW are considered mutually
+// exclusive. This should work even if the conditions expressed by the
+// predicates are not exclusive because the predicates for a given SchedWrite
+// are always checked in the order they are defined in the .td file. Later
+// conditions implicitly negate any prior condition.
+bool PredTransitions::mutuallyExclusive(Record *PredDef,
+ ArrayRef<PredCheck> Term) {
+
+ for (ArrayRef<PredCheck>::iterator I = Term.begin(), E = Term.end();
+ I != E; ++I) {
+ if (I->Predicate == PredDef)
+ return false;
- Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
- Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
+ const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(I->RWIdx, I->IsRead);
+ assert(SchedRW.HasVariants && "PredCheck must refer to a SchedVariant");
+ RecVec Variants = SchedRW.TheDef->getValueAsListOfDefs("Variants");
+ for (RecIter VI = Variants.begin(), VE = Variants.end(); VI != VE; ++VI) {
+ if ((*VI)->getValueAsDef("Predicate") == PredDef)
+ return true;
+ }
+ }
+ return false;
+}
- std::string ModelName = ModelDef->getName();
- const std::string &ItinName = ItinsDef->getName();
+static bool hasAliasedVariants(const CodeGenSchedRW &RW,
+ CodeGenSchedModels &SchedModels) {
+ if (RW.HasVariants)
+ return true;
- bool NoModel = ModelDef->getValueAsBit("NoModel");
- bool hasTopLevelItin = !ItinsDef->getValueAsListOfDefs("IID").empty();
- if (NoModel) {
- // If an itinerary is defined without a machine model, infer a new model.
- if (NoModel && hasTopLevelItin) {
- ModelName = ItinName + "Model";
- ModelDef = NULL;
+ for (RecIter I = RW.Aliases.begin(), E = RW.Aliases.end(); I != E; ++I) {
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*I)->getValueAsDef("AliasRW"));
+ if (AliasRW.HasVariants)
+ return true;
+ if (AliasRW.IsSequence) {
+ IdxVec ExpandedRWs;
+ SchedModels.expandRWSequence(AliasRW.Index, ExpandedRWs, AliasRW.IsRead);
+ for (IdxIter SI = ExpandedRWs.begin(), SE = ExpandedRWs.end();
+ SI != SE; ++SI) {
+ if (hasAliasedVariants(SchedModels.getSchedRW(*SI, AliasRW.IsRead),
+ SchedModels)) {
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
+
+static bool hasVariant(ArrayRef<PredTransition> Transitions,
+ CodeGenSchedModels &SchedModels) {
+ for (ArrayRef<PredTransition>::iterator
+ PTI = Transitions.begin(), PTE = Transitions.end();
+ PTI != PTE; ++PTI) {
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = PTI->WriteSequences.begin(), WSE = PTI->WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ for (SmallVectorImpl<unsigned>::const_iterator
+ WI = WSI->begin(), WE = WSI->end(); WI != WE; ++WI) {
+ if (hasAliasedVariants(SchedModels.getSchedWrite(*WI), SchedModels))
+ return true;
+ }
+ }
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ RSI = PTI->ReadSequences.begin(), RSE = PTI->ReadSequences.end();
+ RSI != RSE; ++RSI) {
+ for (SmallVectorImpl<unsigned>::const_iterator
+ RI = RSI->begin(), RE = RSI->end(); RI != RE; ++RI) {
+ if (hasAliasedVariants(SchedModels.getSchedRead(*RI), SchedModels))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// Populate IntersectingVariants with any variants or aliased sequences of the
+// given SchedRW whose processor indices and predicates are not mutually
+// exclusive with the given transition.
+void PredTransitions::getIntersectingVariants(
+ const CodeGenSchedRW &SchedRW, unsigned TransIdx,
+ std::vector<TransVariant> &IntersectingVariants) {
+
+ std::vector<TransVariant> Variants;
+ if (SchedRW.HasVariants) {
+ unsigned VarProcIdx = 0;
+ if (SchedRW.TheDef->getValueInit("SchedModel")->isComplete()) {
+ Record *ModelDef = SchedRW.TheDef->getValueAsDef("SchedModel");
+ VarProcIdx = SchedModels.getProcModel(ModelDef).Index;
+ }
+ // Push each variant. Assign TransVecIdx later.
+ const RecVec VarDefs = SchedRW.TheDef->getValueAsListOfDefs("Variants");
+ for (RecIter RI = VarDefs.begin(), RE = VarDefs.end(); RI != RE; ++RI)
+ Variants.push_back(TransVariant(*RI, SchedRW.Index, VarProcIdx, 0));
+ }
+ for (RecIter AI = SchedRW.Aliases.begin(), AE = SchedRW.Aliases.end();
+ AI != AE; ++AI) {
+ // If either the SchedAlias itself or the SchedReadWrite that it aliases
+ // to is defined within a processor model, constrain all variants to
+ // that processor.
+ unsigned AliasProcIdx = 0;
+ if ((*AI)->getValueInit("SchedModel")->isComplete()) {
+ Record *ModelDef = (*AI)->getValueAsDef("SchedModel");
+ AliasProcIdx = SchedModels.getProcModel(ModelDef).Index;
+ }
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
+
+ if (AliasRW.HasVariants) {
+ const RecVec VarDefs = AliasRW.TheDef->getValueAsListOfDefs("Variants");
+ for (RecIter RI = VarDefs.begin(), RE = VarDefs.end(); RI != RE; ++RI)
+ Variants.push_back(TransVariant(*RI, AliasRW.Index, AliasProcIdx, 0));
+ }
+ if (AliasRW.IsSequence) {
+ Variants.push_back(
+ TransVariant(AliasRW.TheDef, SchedRW.Index, AliasProcIdx, 0));
+ }
+ }
+ for (unsigned VIdx = 0, VEnd = Variants.size(); VIdx != VEnd; ++VIdx) {
+ TransVariant &Variant = Variants[VIdx];
+ // Don't expand variants if the processor models don't intersect.
+ // A zero processor index means any processor.
+ SmallVector<unsigned, 4> &ProcIndices = TransVec[TransIdx].ProcIndices;
+ if (ProcIndices[0] && Variants[VIdx].ProcIdx) {
+ unsigned Cnt = std::count(ProcIndices.begin(), ProcIndices.end(),
+ Variant.ProcIdx);
+ if (!Cnt)
+ continue;
+ if (Cnt > 1) {
+ const CodeGenProcModel &PM =
+ *(SchedModels.procModelBegin() + Variant.ProcIdx);
+ PrintFatalError(Variant.VarOrSeqDef->getLoc(),
+ "Multiple variants defined for processor " +
+ PM.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
+ }
+ }
+ if (Variant.VarOrSeqDef->isSubClassOf("SchedVar")) {
+ Record *PredDef = Variant.VarOrSeqDef->getValueAsDef("Predicate");
+ if (mutuallyExclusive(PredDef, TransVec[TransIdx].PredTerm))
+ continue;
+ }
+ if (IntersectingVariants.empty()) {
+ // The first variant builds on the existing transition.
+ Variant.TransVecIdx = TransIdx;
+ IntersectingVariants.push_back(Variant);
+ }
+ else {
+ // Push another copy of the current transition for more variants.
+ Variant.TransVecIdx = TransVec.size();
+ IntersectingVariants.push_back(Variant);
+ TransVec.push_back(TransVec[TransIdx]);
}
}
+}
+
+// Push the Reads/Writes selected by this variant onto the PredTransition
+// specified by VInfo.
+void PredTransitions::
+pushVariant(const TransVariant &VInfo, bool IsRead) {
+
+ PredTransition &Trans = TransVec[VInfo.TransVecIdx];
+
+ // If this operand transition is reached through a processor-specific alias,
+ // then the whole transition is specific to this processor.
+ if (VInfo.ProcIdx != 0)
+ Trans.ProcIndices.assign(1, VInfo.ProcIdx);
+
+ IdxVec SelectedRWs;
+ if (VInfo.VarOrSeqDef->isSubClassOf("SchedVar")) {
+ Record *PredDef = VInfo.VarOrSeqDef->getValueAsDef("Predicate");
+ Trans.PredTerm.push_back(PredCheck(IsRead, VInfo.RWIdx,PredDef));
+ RecVec SelectedDefs = VInfo.VarOrSeqDef->getValueAsListOfDefs("Selected");
+ SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead);
+ }
else {
- // If a machine model is defined, the itinerary must be defined within it
- // rather than in the Processor definition itself.
- assert(!hasTopLevelItin && "Itinerary must be defined in SchedModel");
- ItinsDef = ModelDef->getValueAsDef("Itineraries");
+ assert(VInfo.VarOrSeqDef->isSubClassOf("WriteSequence") &&
+ "variant must be a SchedVariant or aliased WriteSequence");
+ SelectedRWs.push_back(SchedModels.getSchedRWIdx(VInfo.VarOrSeqDef, IsRead));
}
- ProcModelMap[getProcModelKey(ProcDef)]= ProcModels.size();
+ const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(VInfo.RWIdx, IsRead);
- ProcModels.push_back(CodeGenProcModel(ModelName, ModelDef, ItinsDef));
+ SmallVectorImpl<SmallVector<unsigned,4> > &RWSequences = IsRead
+ ? Trans.ReadSequences : Trans.WriteSequences;
+ if (SchedRW.IsVariadic) {
+ unsigned OperIdx = RWSequences.size()-1;
+ // Make N-1 copies of this transition's last sequence.
+ for (unsigned i = 1, e = SelectedRWs.size(); i != e; ++i) {
+ RWSequences.push_back(RWSequences[OperIdx]);
+ }
+ // Push each of the N elements of the SelectedRWs onto a copy of the last
+ // sequence (split the current operand into N operands).
+ // Note that write sequences should be expanded within this loop--the entire
+ // sequence belongs to a single operand.
+ for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end();
+ RWI != RWE; ++RWI, ++OperIdx) {
+ IdxVec ExpandedRWs;
+ if (IsRead)
+ ExpandedRWs.push_back(*RWI);
+ else
+ SchedModels.expandRWSequence(*RWI, ExpandedRWs, IsRead);
+ RWSequences[OperIdx].insert(RWSequences[OperIdx].end(),
+ ExpandedRWs.begin(), ExpandedRWs.end());
+ }
+ assert(OperIdx == RWSequences.size() && "missed a sequence");
+ }
+ else {
+ // Push this transition's expanded sequence onto this transition's last
+ // sequence (add to the current operand's sequence).
+ SmallVectorImpl<unsigned> &Seq = RWSequences.back();
+ IdxVec ExpandedRWs;
+ for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end();
+ RWI != RWE; ++RWI) {
+ if (IsRead)
+ ExpandedRWs.push_back(*RWI);
+ else
+ SchedModels.expandRWSequence(*RWI, ExpandedRWs, IsRead);
+ }
+ Seq.insert(Seq.end(), ExpandedRWs.begin(), ExpandedRWs.end());
+ }
+}
+
+// RWSeq is a sequence of all Reads or all Writes for the next read or write
+// operand. StartIdx is an index into TransVec where partial results
+// starts. RWSeq must be applied to all transitions between StartIdx and the end
+// of TransVec.
+void PredTransitions::substituteVariantOperand(
+ const SmallVectorImpl<unsigned> &RWSeq, bool IsRead, unsigned StartIdx) {
- std::vector<Record*> ItinRecords = ItinsDef->getValueAsListOfDefs("IID");
- CollectProcItin(ProcModels.back(), ItinRecords);
+ // Visit each original RW within the current sequence.
+ for (SmallVectorImpl<unsigned>::const_iterator
+ RWI = RWSeq.begin(), RWE = RWSeq.end(); RWI != RWE; ++RWI) {
+ const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(*RWI, IsRead);
+ // Push this RW on all partial PredTransitions or distribute variants.
+ // New PredTransitions may be pushed within this loop which should not be
+ // revisited (TransEnd must be loop invariant).
+ for (unsigned TransIdx = StartIdx, TransEnd = TransVec.size();
+ TransIdx != TransEnd; ++TransIdx) {
+ // In the common case, push RW onto the current operand's sequence.
+ if (!hasAliasedVariants(SchedRW, SchedModels)) {
+ if (IsRead)
+ TransVec[TransIdx].ReadSequences.back().push_back(*RWI);
+ else
+ TransVec[TransIdx].WriteSequences.back().push_back(*RWI);
+ continue;
+ }
+ // Distribute this partial PredTransition across intersecting variants.
+ // This will push copies of TransVec[TransIdx] onto the back of TransVec.
+ std::vector<TransVariant> IntersectingVariants;
+ getIntersectingVariants(SchedRW, TransIdx, IntersectingVariants);
+ if (IntersectingVariants.empty())
+ PrintFatalError(SchedRW.TheDef->getLoc(),
+ "No variant of this type has "
+ "a matching predicate on any processor");
+ // Now expand each variant on top of its copy of the transition.
+ for (std::vector<TransVariant>::const_iterator
+ IVI = IntersectingVariants.begin(),
+ IVE = IntersectingVariants.end();
+ IVI != IVE; ++IVI) {
+ pushVariant(*IVI, IsRead);
+ }
+ }
+ }
}
-// Gather the processor itineraries.
-void CodeGenSchedModels::CollectProcItin(CodeGenProcModel &ProcModel,
- std::vector<Record*> ItinRecords) {
- // Skip empty itinerary.
- if (ItinRecords.empty())
+// For each variant of a Read/Write in Trans, substitute the sequence of
+// Read/Writes guarded by the variant. This is exponential in the number of
+// variant Read/Writes, but in practice detection of mutually exclusive
+// predicates should result in linear growth in the total number of variants.
+//
+// This is one step in a breadth-first search of nested variants.
+void PredTransitions::substituteVariants(const PredTransition &Trans) {
+ // Build up a set of partial results starting at the back of
+ // PredTransitions. Remember the first new transition.
+ unsigned StartIdx = TransVec.size();
+ TransVec.resize(TransVec.size() + 1);
+ TransVec.back().PredTerm = Trans.PredTerm;
+ TransVec.back().ProcIndices = Trans.ProcIndices;
+
+ // Visit each original write sequence.
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = Trans.WriteSequences.begin(), WSE = Trans.WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ // Push a new (empty) write sequence onto all partial Transitions.
+ for (std::vector<PredTransition>::iterator I =
+ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
+ I->WriteSequences.resize(I->WriteSequences.size() + 1);
+ }
+ substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx);
+ }
+ // Visit each original read sequence.
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ RSI = Trans.ReadSequences.begin(), RSE = Trans.ReadSequences.end();
+ RSI != RSE; ++RSI) {
+ // Push a new (empty) read sequence onto all partial Transitions.
+ for (std::vector<PredTransition>::iterator I =
+ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
+ I->ReadSequences.resize(I->ReadSequences.size() + 1);
+ }
+ substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx);
+ }
+}
+
+// Create a new SchedClass for each variant found by inferFromRW.
+static void inferFromTransitions(ArrayRef<PredTransition> LastTransitions,
+ unsigned FromClassIdx,
+ CodeGenSchedModels &SchedModels) {
+ // For each PredTransition, create a new CodeGenSchedTransition, which usually
+ // requires creating a new SchedClass.
+ for (ArrayRef<PredTransition>::iterator
+ I = LastTransitions.begin(), E = LastTransitions.end(); I != E; ++I) {
+ IdxVec OperWritesVariant;
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = I->WriteSequences.begin(), WSE = I->WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ // Create a new write representing the expanded sequence.
+ OperWritesVariant.push_back(
+ SchedModels.findOrInsertRW(*WSI, /*IsRead=*/false));
+ }
+ IdxVec OperReadsVariant;
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ RSI = I->ReadSequences.begin(), RSE = I->ReadSequences.end();
+ RSI != RSE; ++RSI) {
+ // Create a new read representing the expanded sequence.
+ OperReadsVariant.push_back(
+ SchedModels.findOrInsertRW(*RSI, /*IsRead=*/true));
+ }
+ IdxVec ProcIndices(I->ProcIndices.begin(), I->ProcIndices.end());
+ CodeGenSchedTransition SCTrans;
+ SCTrans.ToClassIdx =
+ SchedModels.addSchedClass(OperWritesVariant, OperReadsVariant,
+ ProcIndices);
+ SCTrans.ProcIndices = ProcIndices;
+ // The final PredTerm is the unique set of predicates guarding the transition.
+ RecVec Preds;
+ for (SmallVectorImpl<PredCheck>::const_iterator
+ PI = I->PredTerm.begin(), PE = I->PredTerm.end(); PI != PE; ++PI) {
+ Preds.push_back(PI->Predicate);
+ }
+ RecIter PredsEnd = std::unique(Preds.begin(), Preds.end());
+ Preds.resize(PredsEnd - Preds.begin());
+ SCTrans.PredTerm = Preds;
+ SchedModels.getSchedClass(FromClassIdx).Transitions.push_back(SCTrans);
+ }
+}
+
+// Create new SchedClasses for the given ReadWrite list. If any of the
+// ReadWrites refers to a SchedVariant, create a new SchedClass for each variant
+// of the ReadWrite list, following Aliases if necessary.
+void CodeGenSchedModels::inferFromRW(const IdxVec &OperWrites,
+ const IdxVec &OperReads,
+ unsigned FromClassIdx,
+ const IdxVec &ProcIndices) {
+ DEBUG(dbgs() << "INFER RW: ");
+
+ // Create a seed transition with an empty PredTerm and the expanded sequences
+ // of SchedWrites for the current SchedClass.
+ std::vector<PredTransition> LastTransitions;
+ LastTransitions.resize(1);
+ LastTransitions.back().ProcIndices.append(ProcIndices.begin(),
+ ProcIndices.end());
+
+ for (IdxIter I = OperWrites.begin(), E = OperWrites.end(); I != E; ++I) {
+ IdxVec WriteSeq;
+ expandRWSequence(*I, WriteSeq, /*IsRead=*/false);
+ unsigned Idx = LastTransitions[0].WriteSequences.size();
+ LastTransitions[0].WriteSequences.resize(Idx + 1);
+ SmallVectorImpl<unsigned> &Seq = LastTransitions[0].WriteSequences[Idx];
+ for (IdxIter WI = WriteSeq.begin(), WE = WriteSeq.end(); WI != WE; ++WI)
+ Seq.push_back(*WI);
+ DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
+ }
+ DEBUG(dbgs() << " Reads: ");
+ for (IdxIter I = OperReads.begin(), E = OperReads.end(); I != E; ++I) {
+ IdxVec ReadSeq;
+ expandRWSequence(*I, ReadSeq, /*IsRead=*/true);
+ unsigned Idx = LastTransitions[0].ReadSequences.size();
+ LastTransitions[0].ReadSequences.resize(Idx + 1);
+ SmallVectorImpl<unsigned> &Seq = LastTransitions[0].ReadSequences[Idx];
+ for (IdxIter RI = ReadSeq.begin(), RE = ReadSeq.end(); RI != RE; ++RI)
+ Seq.push_back(*RI);
+ DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
+ }
+ DEBUG(dbgs() << '\n');
+
+ // Collect all PredTransitions for individual operands.
+ // Iterate until no variant writes remain.
+ while (hasVariant(LastTransitions, *this)) {
+ PredTransitions Transitions(*this);
+ for (std::vector<PredTransition>::const_iterator
+ I = LastTransitions.begin(), E = LastTransitions.end();
+ I != E; ++I) {
+ Transitions.substituteVariants(*I);
+ }
+ DEBUG(Transitions.dump());
+ LastTransitions.swap(Transitions.TransVec);
+ }
+ // If the first transition has no variants, nothing to do.
+ if (LastTransitions[0].PredTerm.empty())
return;
- HasProcItineraries = true;
+ // WARNING: We are about to mutate the SchedClasses vector. Do not refer to
+ // OperWrites, OperReads, or ProcIndices after calling inferFromTransitions.
+ inferFromTransitions(LastTransitions, FromClassIdx, *this);
+}
- ProcModel.ItinDefList.resize(NumItineraryClasses+1);
+// Collect and sort WriteRes, ReadAdvance, and ProcResources.
+void CodeGenSchedModels::collectProcResources() {
+ // Add any subtarget-specific SchedReadWrites that are directly associated
+ // with processor resources. Refer to the parent SchedClass's ProcIndices to
+ // determine which processors they apply to.
+ for (SchedClassIter SCI = schedClassBegin(), SCE = schedClassEnd();
+ SCI != SCE; ++SCI) {
+ if (SCI->ItinClassDef)
+ collectItinProcResources(SCI->ItinClassDef);
+ else
+ collectRWResources(SCI->Writes, SCI->Reads, SCI->ProcIndices);
+ }
+ // Add resources separately defined by each subtarget.
+ RecVec WRDefs = Records.getAllDerivedDefinitions("WriteRes");
+ for (RecIter WRI = WRDefs.begin(), WRE = WRDefs.end(); WRI != WRE; ++WRI) {
+ Record *ModelDef = (*WRI)->getValueAsDef("SchedModel");
+ addWriteRes(*WRI, getProcModel(ModelDef).Index);
+ }
+ RecVec RADefs = Records.getAllDerivedDefinitions("ReadAdvance");
+ for (RecIter RAI = RADefs.begin(), RAE = RADefs.end(); RAI != RAE; ++RAI) {
+ Record *ModelDef = (*RAI)->getValueAsDef("SchedModel");
+ addReadAdvance(*RAI, getProcModel(ModelDef).Index);
+ }
+ // Finalize each ProcModel by sorting the record arrays.
+ for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
+ CodeGenProcModel &PM = ProcModels[PIdx];
+ std::sort(PM.WriteResDefs.begin(), PM.WriteResDefs.end(),
+ LessRecord());
+ std::sort(PM.ReadAdvanceDefs.begin(), PM.ReadAdvanceDefs.end(),
+ LessRecord());
+ std::sort(PM.ProcResourceDefs.begin(), PM.ProcResourceDefs.end(),
+ LessRecord());
+ DEBUG(
+ PM.dump();
+ dbgs() << "WriteResDefs: ";
+ for (RecIter RI = PM.WriteResDefs.begin(),
+ RE = PM.WriteResDefs.end(); RI != RE; ++RI) {
+ if ((*RI)->isSubClassOf("WriteRes"))
+ dbgs() << (*RI)->getValueAsDef("WriteType")->getName() << " ";
+ else
+ dbgs() << (*RI)->getName() << " ";
+ }
+ dbgs() << "\nReadAdvanceDefs: ";
+ for (RecIter RI = PM.ReadAdvanceDefs.begin(),
+ RE = PM.ReadAdvanceDefs.end(); RI != RE; ++RI) {
+ if ((*RI)->isSubClassOf("ReadAdvance"))
+ dbgs() << (*RI)->getValueAsDef("ReadType")->getName() << " ";
+ else
+ dbgs() << (*RI)->getName() << " ";
+ }
+ dbgs() << "\nProcResourceDefs: ";
+ for (RecIter RI = PM.ProcResourceDefs.begin(),
+ RE = PM.ProcResourceDefs.end(); RI != RE; ++RI) {
+ dbgs() << (*RI)->getName() << " ";
+ }
+ dbgs() << '\n');
+ }
+}
- // Insert each itinerary data record in the correct position within
- // the processor model's ItinDefList.
- for (unsigned i = 0, N = ItinRecords.size(); i < N; i++) {
- Record *ItinData = ItinRecords[i];
- Record *ItinDef = ItinData->getValueAsDef("TheClass");
- if (!SchedClassIdxMap.count(ItinDef->getName())) {
- DEBUG(dbgs() << ProcModel.ItinsDef->getName()
- << " has unused itinerary class " << ItinDef->getName() << '\n');
- continue;
+// Collect itinerary class resources for each processor.
+void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) {
+ for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
+ const CodeGenProcModel &PM = ProcModels[PIdx];
+ // For all ItinRW entries.
+ bool HasMatch = false;
+ for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end();
+ II != IE; ++II) {
+ RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ if (!std::count(Matched.begin(), Matched.end(), ItinClassDef))
+ continue;
+ if (HasMatch)
+ PrintFatalError((*II)->getLoc(), "Duplicate itinerary class "
+ + ItinClassDef->getName()
+ + " in ItinResources for " + PM.ModelName);
+ HasMatch = true;
+ IdxVec Writes, Reads;
+ findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ IdxVec ProcIndices(1, PIdx);
+ collectRWResources(Writes, Reads, ProcIndices);
+ }
+ }
+}
+
+void CodeGenSchedModels::collectRWResources(unsigned RWIdx, bool IsRead,
+ const IdxVec &ProcIndices) {
+ const CodeGenSchedRW &SchedRW = getSchedRW(RWIdx, IsRead);
+ if (SchedRW.TheDef) {
+ if (!IsRead && SchedRW.TheDef->isSubClassOf("SchedWriteRes")) {
+ for (IdxIter PI = ProcIndices.begin(), PE = ProcIndices.end();
+ PI != PE; ++PI) {
+ addWriteRes(SchedRW.TheDef, *PI);
+ }
+ }
+ else if (IsRead && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) {
+ for (IdxIter PI = ProcIndices.begin(), PE = ProcIndices.end();
+ PI != PE; ++PI) {
+ addReadAdvance(SchedRW.TheDef, *PI);
+ }
+ }
+ }
+ for (RecIter AI = SchedRW.Aliases.begin(), AE = SchedRW.Aliases.end();
+ AI != AE; ++AI) {
+ IdxVec AliasProcIndices;
+ if ((*AI)->getValueInit("SchedModel")->isComplete()) {
+ AliasProcIndices.push_back(
+ getProcModel((*AI)->getValueAsDef("SchedModel")).Index);
+ }
+ else
+ AliasProcIndices = ProcIndices;
+ const CodeGenSchedRW &AliasRW = getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ assert(AliasRW.IsRead == IsRead && "cannot alias reads to writes");
+
+ IdxVec ExpandedRWs;
+ expandRWSequence(AliasRW.Index, ExpandedRWs, IsRead);
+ for (IdxIter SI = ExpandedRWs.begin(), SE = ExpandedRWs.end();
+ SI != SE; ++SI) {
+ collectRWResources(*SI, IsRead, AliasProcIndices);
+ }
+ }
+}
+
+// Collect resources for a set of read/write types and processor indices.
+void CodeGenSchedModels::collectRWResources(const IdxVec &Writes,
+ const IdxVec &Reads,
+ const IdxVec &ProcIndices) {
+
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI)
+ collectRWResources(*WI, /*IsRead=*/false, ProcIndices);
+
+ for (IdxIter RI = Reads.begin(), RE = Reads.end(); RI != RE; ++RI)
+ collectRWResources(*RI, /*IsRead=*/true, ProcIndices);
+}
+
+
+// Find the processor's resource units for this kind of resource.
+Record *CodeGenSchedModels::findProcResUnits(Record *ProcResKind,
+ const CodeGenProcModel &PM) const {
+ if (ProcResKind->isSubClassOf("ProcResourceUnits"))
+ return ProcResKind;
+
+ Record *ProcUnitDef = 0;
+ RecVec ProcResourceDefs =
+ Records.getAllDerivedDefinitions("ProcResourceUnits");
+
+ for (RecIter RI = ProcResourceDefs.begin(), RE = ProcResourceDefs.end();
+ RI != RE; ++RI) {
+
+ if ((*RI)->getValueAsDef("Kind") == ProcResKind
+ && (*RI)->getValueAsDef("SchedModel") == PM.ModelDef) {
+ if (ProcUnitDef) {
+ PrintFatalError((*RI)->getLoc(),
+ "Multiple ProcessorResourceUnits associated with "
+ + ProcResKind->getName());
+ }
+ ProcUnitDef = *RI;
}
- ProcModel.ItinDefList[getItinClassIdx(ItinDef)] = ItinData;
}
+ if (!ProcUnitDef) {
+ PrintFatalError(ProcResKind->getLoc(),
+ "No ProcessorResources associated with "
+ + ProcResKind->getName());
+ }
+ return ProcUnitDef;
+}
+
+// Iteratively add a resource and its super resources.
+void CodeGenSchedModels::addProcResource(Record *ProcResKind,
+ CodeGenProcModel &PM) {
+ for (;;) {
+ Record *ProcResUnits = findProcResUnits(ProcResKind, PM);
+
+ // See if this ProcResource is already associated with this processor.
+ RecIter I = std::find(PM.ProcResourceDefs.begin(),
+ PM.ProcResourceDefs.end(), ProcResUnits);
+ if (I != PM.ProcResourceDefs.end())
+ return;
+
+ PM.ProcResourceDefs.push_back(ProcResUnits);
+ if (!ProcResUnits->getValueInit("Super")->isComplete())
+ return;
+
+ ProcResKind = ProcResUnits->getValueAsDef("Super");
+ }
+}
+
+// Add resources for a SchedWrite to this processor if they don't exist.
+void CodeGenSchedModels::addWriteRes(Record *ProcWriteResDef, unsigned PIdx) {
+ assert(PIdx && "don't add resources to an invalid Processor model");
+
+ RecVec &WRDefs = ProcModels[PIdx].WriteResDefs;
+ RecIter WRI = std::find(WRDefs.begin(), WRDefs.end(), ProcWriteResDef);
+ if (WRI != WRDefs.end())
+ return;
+ WRDefs.push_back(ProcWriteResDef);
+
+ // Visit ProcResourceKinds referenced by the newly discovered WriteRes.
+ RecVec ProcResDefs = ProcWriteResDef->getValueAsListOfDefs("ProcResources");
+ for (RecIter WritePRI = ProcResDefs.begin(), WritePRE = ProcResDefs.end();
+ WritePRI != WritePRE; ++WritePRI) {
+ addProcResource(*WritePRI, ProcModels[PIdx]);
+ }
+}
+
+// Add resources for a ReadAdvance to this processor if they don't exist.
+void CodeGenSchedModels::addReadAdvance(Record *ProcReadAdvanceDef,
+ unsigned PIdx) {
+ RecVec &RADefs = ProcModels[PIdx].ReadAdvanceDefs;
+ RecIter I = std::find(RADefs.begin(), RADefs.end(), ProcReadAdvanceDef);
+ if (I != RADefs.end())
+ return;
+ RADefs.push_back(ProcReadAdvanceDef);
+}
+
+unsigned CodeGenProcModel::getProcResourceIdx(Record *PRDef) const {
+ RecIter PRPos = std::find(ProcResourceDefs.begin(), ProcResourceDefs.end(),
+ PRDef);
+ if (PRPos == ProcResourceDefs.end())
+ PrintFatalError(PRDef->getLoc(), "ProcResource def is not included in "
+ "the ProcResources list for " + ModelName);
+ // Idx=0 is reserved for invalid.
+ return 1 + (PRPos - ProcResourceDefs.begin());
+}
+
#ifndef NDEBUG
- // Check for missing itinerary entries.
- assert(!ProcModel.ItinDefList[0] && "NoItinerary class can't have rec");
- for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
- if (!ProcModel.ItinDefList[i])
- DEBUG(dbgs() << ProcModel.ItinsDef->getName()
- << " missing itinerary for class " << SchedClasses[i].Name << '\n');
+void CodeGenProcModel::dump() const {
+ dbgs() << Index << ": " << ModelName << " "
+ << (ModelDef ? ModelDef->getName() : "inferred") << " "
+ << (ItinsDef ? ItinsDef->getName() : "no itinerary") << '\n';
+}
+
+void CodeGenSchedRW::dump() const {
+ dbgs() << Name << (IsVariadic ? " (V) " : " ");
+ if (IsSequence) {
+ dbgs() << "(";
+ dumpIdxVec(Sequence);
+ dbgs() << ")";
+ }
+}
+
+void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const {
+ dbgs() << "SCHEDCLASS " << Name << '\n'
+ << " Writes: ";
+ for (unsigned i = 0, N = Writes.size(); i < N; ++i) {
+ SchedModels->getSchedWrite(Writes[i]).dump();
+ if (i < N-1) {
+ dbgs() << '\n';
+ dbgs().indent(10);
+ }
+ }
+ dbgs() << "\n Reads: ";
+ for (unsigned i = 0, N = Reads.size(); i < N; ++i) {
+ SchedModels->getSchedRead(Reads[i]).dump();
+ if (i < N-1) {
+ dbgs() << '\n';
+ dbgs().indent(10);
+ }
+ }
+ dbgs() << "\n ProcIdx: "; dumpIdxVec(ProcIndices); dbgs() << '\n';
+}
+
+void PredTransitions::dump() const {
+ dbgs() << "Expanded Variants:\n";
+ for (std::vector<PredTransition>::const_iterator
+ TI = TransVec.begin(), TE = TransVec.end(); TI != TE; ++TI) {
+ dbgs() << "{";
+ for (SmallVectorImpl<PredCheck>::const_iterator
+ PCI = TI->PredTerm.begin(), PCE = TI->PredTerm.end();
+ PCI != PCE; ++PCI) {
+ if (PCI != TI->PredTerm.begin())
+ dbgs() << ", ";
+ dbgs() << SchedModels.getSchedRW(PCI->RWIdx, PCI->IsRead).Name
+ << ":" << PCI->Predicate->getName();
+ }
+ dbgs() << "},\n => {";
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = TI->WriteSequences.begin(), WSE = TI->WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ dbgs() << "(";
+ for (SmallVectorImpl<unsigned>::const_iterator
+ WI = WSI->begin(), WE = WSI->end(); WI != WE; ++WI) {
+ if (WI != WSI->begin())
+ dbgs() << ", ";
+ dbgs() << SchedModels.getSchedWrite(*WI).Name;
+ }
+ dbgs() << "),";
+ }
+ dbgs() << "}\n";
}
-#endif
}
+#endif // NDEBUG
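For intuition about the per-operand expansion implemented above, here is a small, self-contained sketch (standard C++ only; all names are illustrative stand-ins, not code from this patch): two operands whose variants each carry two mutually exclusive predicates expand into at most four PredTransitions, the same cross product that substituteVariants and substituteVariantOperand build up one operand at a time.

// Illustrative only: mirrors the cross-product performed by
// PredTransitions::substituteVariants, using plain std types.
#include <cstdio>
#include <string>
#include <vector>

struct MiniTransition {
  std::vector<std::string> PredTerm;       // conjunction of predicates
  std::vector<std::string> WriteSequence;  // expanded writes, one per operand
};

int main() {
  // Two operands, each a variant with two predicated alternatives.
  const char *Preds[2][2]  = {{"IsPred", "IsNotPred"}, {"IsLoad", "IsNotLoad"}};
  const char *Writes[2][2] = {{"WriteP", "WriteNP"},   {"WriteLd", "WriteNLd"}};

  std::vector<MiniTransition> Trans(1);  // seed: empty predicate term
  for (unsigned Oper = 0; Oper != 2; ++Oper) {
    std::vector<MiniTransition> Next;
    for (const MiniTransition &T : Trans) {
      for (unsigned V = 0; V != 2; ++V) {  // distribute each variant
        MiniTransition Copy = T;
        Copy.PredTerm.push_back(Preds[Oper][V]);
        Copy.WriteSequence.push_back(Writes[Oper][V]);
        Next.push_back(Copy);
      }
    }
    Trans.swap(Next);
  }
  std::printf("expanded transitions: %zu\n", Trans.size());  // prints 4
}

In the pass itself, getIntersectingVariants prunes alternatives whose predicates cannot apply on the transition's processors, which is what keeps the growth close to linear in practice.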
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index 9da0145732b3..eed058971b80 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -15,6 +15,7 @@
#ifndef CODEGEN_SCHEDULE_H
#define CODEGEN_SCHEDULE_H
+#include "SetTheory.h"
#include "llvm/TableGen/Record.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/ADT/DenseMap.h"
@@ -23,21 +24,131 @@
namespace llvm {
class CodeGenTarget;
+class CodeGenSchedModels;
+class CodeGenInstruction;
-// Scheduling class.
-//
-// Each instruction description will be mapped to a scheduling class. It may be
-// an explicitly defined itinerary class, or an inferred class in which case
-// ItinClassDef == NULL.
+typedef std::vector<Record*> RecVec;
+typedef std::vector<Record*>::const_iterator RecIter;
+
+typedef std::vector<unsigned> IdxVec;
+typedef std::vector<unsigned>::const_iterator IdxIter;
+
+void splitSchedReadWrites(const RecVec &RWDefs,
+ RecVec &WriteDefs, RecVec &ReadDefs);
+
+/// We have two kinds of SchedReadWrites: explicitly defined and inferred
+/// sequences. TheDef is nonnull for explicit SchedWrites, but Sequence may or
+/// may not be empty. TheDef is null for inferred sequences, and Sequence must
+/// be nonempty.
+///
+/// IsVariadic controls whether the variants are expanded into multiple operands
+/// or a sequence of writes on one operand.
+struct CodeGenSchedRW {
+ unsigned Index;
+ std::string Name;
+ Record *TheDef;
+ bool IsRead;
+ bool IsAlias;
+ bool HasVariants;
+ bool IsVariadic;
+ bool IsSequence;
+ IdxVec Sequence;
+ RecVec Aliases;
+
+ CodeGenSchedRW(): Index(0), TheDef(0), IsAlias(false), HasVariants(false),
+ IsVariadic(false), IsSequence(false) {}
+ CodeGenSchedRW(unsigned Idx, Record *Def): Index(Idx), TheDef(Def),
+ IsAlias(false), IsVariadic(false) {
+ Name = Def->getName();
+ IsRead = Def->isSubClassOf("SchedRead");
+ HasVariants = Def->isSubClassOf("SchedVariant");
+ if (HasVariants)
+ IsVariadic = Def->getValueAsBit("Variadic");
+
+ // Read records don't currently have sequences, but support could easily be
+ // added. Note that implicit Reads (from ReadVariant) may have a Sequence
+ // (but no record).
+ IsSequence = Def->isSubClassOf("WriteSequence");
+ }
+
+ CodeGenSchedRW(unsigned Idx, bool Read, const IdxVec &Seq,
+ const std::string &Name):
+ Index(Idx), Name(Name), TheDef(0), IsRead(Read), IsAlias(false),
+ HasVariants(false), IsVariadic(false), IsSequence(true), Sequence(Seq) {
+ assert(Sequence.size() > 1 && "implied sequence needs >1 RWs");
+ }
+
+ bool isValid() const {
+ assert((!HasVariants || TheDef) && "Variant write needs record def");
+ assert((!IsVariadic || HasVariants) && "Variadic write needs variants");
+ assert((!IsSequence || !HasVariants) && "Sequence can't have variant");
+ assert((!IsSequence || !Sequence.empty()) && "Sequence should be nonempty");
+ assert((!IsAlias || Aliases.empty()) && "Alias cannot have aliases");
+ return TheDef || !Sequence.empty();
+ }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+/// Represent a transition between SchedClasses induced by SchedVariant.
+struct CodeGenSchedTransition {
+ unsigned ToClassIdx;
+ IdxVec ProcIndices;
+ RecVec PredTerm;
+};
+
+/// Scheduling class.
+///
+/// Each instruction description will be mapped to a scheduling class. There are
+/// four types of classes:
+///
+/// 1) An explicitly defined itinerary class with ItinClassDef set.
+/// Writes and Reads are empty. ProcIndices contains 0 for any processor.
+///
+/// 2) An implied class with a list of SchedWrites and SchedReads that are
+/// defined in an instruction definition and which are common across all
+/// subtargets. ProcIndices contains 0 for any processor.
+///
+/// 3) An implied class with a list of InstRW records that map instructions to
+/// SchedWrites and SchedReads per-processor. InstrClassMap should map the same
+/// instructions to this class. ProcIndices contains all the processors that
+/// provided InstRW records for this class. ItinClassDef or Writes/Reads may
+/// still be defined for processors with no InstRW entry.
+///
+/// 4) An inferred class represents a variant of another class that may be
+/// resolved at runtime. ProcIndices contains the set of processors that may
+/// require the class. ProcIndices are propagated through SchedClasses as
+/// variants are expanded. Multiple SchedClasses may be inferred from an
+/// itinerary class. Each inherits the processor index from the ItinRW record
+/// that mapped the itinerary class to the variant Writes or Reads.
struct CodeGenSchedClass {
std::string Name;
- unsigned Index;
Record *ItinClassDef;
- CodeGenSchedClass(): Index(0), ItinClassDef(0) {}
- CodeGenSchedClass(Record *rec): Index(0), ItinClassDef(rec) {
+ IdxVec Writes;
+ IdxVec Reads;
+ // Sorted list of ProcIdx, where ProcIdx==0 implies any processor.
+ IdxVec ProcIndices;
+
+ std::vector<CodeGenSchedTransition> Transitions;
+
+ // InstRW records associated with this class. These records may refer to an
+ // Instruction no longer mapped to this class by InstrClassMap. These
+ // Instructions should be ignored by this class because they have been split
+ // off to join another inferred class.
+ RecVec InstRWs;
+
+ CodeGenSchedClass(): ItinClassDef(0) {}
+ CodeGenSchedClass(Record *rec): ItinClassDef(rec) {
Name = rec->getName();
+ ProcIndices.push_back(0);
}
+
+#ifndef NDEBUG
+ void dump(const CodeGenSchedModels *SchedModels) const;
+#endif
};
// Processor model.
@@ -55,28 +166,69 @@ struct CodeGenSchedClass {
//
// ItinDefList orders this processor's InstrItinData records by SchedClass idx.
struct CodeGenProcModel {
+ unsigned Index;
std::string ModelName;
Record *ModelDef;
Record *ItinsDef;
- // Array of InstrItinData records indexed by CodeGenSchedClass::Index.
- // The list is empty if the subtarget has no itineraries.
- std::vector<Record *> ItinDefList;
+ // Derived members...
- CodeGenProcModel(const std::string &Name, Record *MDef, Record *IDef):
- ModelName(Name), ModelDef(MDef), ItinsDef(IDef) {}
+ // Array of InstrItinData records indexed by a CodeGenSchedClass index.
+ // This list is empty if the Processor has no value for Itineraries.
+ // Initialized by collectProcItins().
+ RecVec ItinDefList;
+
+ // Map itinerary classes to per-operand resources.
+ // This list is empty if no ItinRW refers to this Processor.
+ RecVec ItinRWDefs;
+
+ // All read/write resources associated with this processor.
+ RecVec WriteResDefs;
+ RecVec ReadAdvanceDefs;
+
+ // Per-operand machine model resources associated with this processor.
+ RecVec ProcResourceDefs;
+
+ CodeGenProcModel(unsigned Idx, const std::string &Name, Record *MDef,
+ Record *IDef) :
+ Index(Idx), ModelName(Name), ModelDef(MDef), ItinsDef(IDef) {}
+
+ bool hasInstrSchedModel() const {
+ return !WriteResDefs.empty() || !ItinRWDefs.empty();
+ }
+
+ unsigned getProcResourceIdx(Record *PRDef) const;
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
};
-// Top level container for machine model data.
+/// Top level container for machine model data.
class CodeGenSchedModels {
RecordKeeper &Records;
const CodeGenTarget &Target;
+ // Map dag expressions to Instruction lists.
+ SetTheory Sets;
+
+ // List of unique processor models.
+ std::vector<CodeGenProcModel> ProcModels;
+
+ // Map Processor's MachineModel or ProcItin to a CodeGenProcModel index.
+ typedef DenseMap<Record*, unsigned> ProcModelMapTy;
+ ProcModelMapTy ProcModelMap;
+
+ // Per-operand SchedReadWrite types.
+ std::vector<CodeGenSchedRW> SchedWrites;
+ std::vector<CodeGenSchedRW> SchedReads;
+
// List of unique SchedClasses.
std::vector<CodeGenSchedClass> SchedClasses;
// Map SchedClass name to itinerary index.
- // These are either explicit itinerary classes or inferred classes.
+ // These are either explicit itinerary classes or classes implied by
+ // instruction definitions with SchedReadWrite lists.
StringMap<unsigned> SchedClassIdxMap;
// SchedClass indices 1 up to and including NumItineraryClasses identify
@@ -84,22 +236,80 @@ class CodeGenSchedModels {
// definitions. NoItinerary always has index 0 regardless of whether it is
// explicitly referenced.
//
- // Any inferred SchedClass have a index greater than NumItineraryClasses.
+ // Any implied SchedClass has an index greater than NumItineraryClasses.
unsigned NumItineraryClasses;
- // List of unique processor models.
- std::vector<CodeGenProcModel> ProcModels;
-
- // Map Processor's MachineModel + ProcItin fields to a CodeGenProcModel index.
- typedef DenseMap<std::pair<Record*, Record*>, unsigned> ProcModelMapTy;
- ProcModelMapTy ProcModelMap;
+ // Any inferred SchedClass has an index greater than NumInstrSchedClasses.
+ unsigned NumInstrSchedClasses;
- // True if any processors have nonempty itineraries.
- bool HasProcItineraries;
+ // Map Instruction to SchedClass index. Only for Instructions mentioned in
+ // InstRW records.
+ typedef DenseMap<Record*, unsigned> InstClassMapTy;
+ InstClassMapTy InstrClassMap;
public:
CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT);
+ Record *getModelOrItinDef(Record *ProcDef) const {
+ Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
+ Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
+ if (!ItinsDef->getValueAsListOfDefs("IID").empty()) {
+ assert(ModelDef->getValueAsBit("NoModel")
+ && "Itineraries must be defined within SchedMachineModel");
+ return ItinsDef;
+ }
+ return ModelDef;
+ }
+
+ const CodeGenProcModel &getModelForProc(Record *ProcDef) const {
+ Record *ModelDef = getModelOrItinDef(ProcDef);
+ ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
+ assert(I != ProcModelMap.end() && "missing machine model");
+ return ProcModels[I->second];
+ }
+
+ const CodeGenProcModel &getProcModel(Record *ModelDef) const {
+ ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
+ assert(I != ProcModelMap.end() && "missing machine model");
+ return ProcModels[I->second];
+ }
+
+ // Iterate over the unique processor models.
+ typedef std::vector<CodeGenProcModel>::const_iterator ProcIter;
+ ProcIter procModelBegin() const { return ProcModels.begin(); }
+ ProcIter procModelEnd() const { return ProcModels.end(); }
+
+ // Get a SchedWrite from its index.
+ const CodeGenSchedRW &getSchedWrite(unsigned Idx) const {
+ assert(Idx < SchedWrites.size() && "bad SchedWrite index");
+ assert(SchedWrites[Idx].isValid() && "invalid SchedWrite");
+ return SchedWrites[Idx];
+ }
+ // Get a SchedRead from its index.
+ const CodeGenSchedRW &getSchedRead(unsigned Idx) const {
+ assert(Idx < SchedReads.size() && "bad SchedRead index");
+ assert(SchedReads[Idx].isValid() && "invalid SchedRead");
+ return SchedReads[Idx];
+ }
+
+ const CodeGenSchedRW &getSchedRW(unsigned Idx, bool IsRead) const {
+ return IsRead ? getSchedRead(Idx) : getSchedWrite(Idx);
+ }
+ CodeGenSchedRW &getSchedRW(Record *Def) {
+ bool IsRead = Def->isSubClassOf("SchedRead");
+ unsigned Idx = getSchedRWIdx(Def, IsRead);
+ return const_cast<CodeGenSchedRW&>(
+ IsRead ? getSchedRead(Idx) : getSchedWrite(Idx));
+ }
+ const CodeGenSchedRW &getSchedRW(Record *Def) const {
+ return const_cast<CodeGenSchedModels&>(*this).getSchedRW(Def);
+ }
+
+ unsigned getSchedRWIdx(Record *Def, bool IsRead, unsigned After = 0) const;
+
+ // Return true if the given write record is referenced by a ReadAdvance.
+ bool hasReadOfWrite(Record *WriteDef) const;
+
// Check if any instructions are assigned to an explicit itinerary class other
// than NoItinerary.
bool hasItineraryClasses() const { return NumItineraryClasses > 0; }
@@ -111,60 +321,90 @@ public:
}
// Get a SchedClass from its index.
- const CodeGenSchedClass &getSchedClass(unsigned Idx) {
+ CodeGenSchedClass &getSchedClass(unsigned Idx) {
assert(Idx < SchedClasses.size() && "bad SchedClass index");
return SchedClasses[Idx];
}
-
- // Get an itinerary class's index. Value indices are '0' for NoItinerary up to
- // and including numItineraryClasses().
- unsigned getItinClassIdx(Record *ItinDef) const {
- assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
- unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
- assert(Idx <= NumItineraryClasses && "bad ItinClass index");
- return Idx;
+ const CodeGenSchedClass &getSchedClass(unsigned Idx) const {
+ assert(Idx < SchedClasses.size() && "bad SchedClass index");
+ return SchedClasses[Idx];
}
- bool hasProcessorItineraries() const {
- return HasProcItineraries;
- }
+ // Get the SchedClass index for an instruction. Instructions with no
+ // itinerary, no SchedReadWrites, and no InstRW references return 0
+ // for NoItinerary.
+ unsigned getSchedClassIdx(const CodeGenInstruction &Inst) const;
+
+ unsigned getSchedClassIdx(const RecVec &RWDefs) const;
- // Get an existing machine model for a processor definition.
- const CodeGenProcModel &getProcModel(Record *ProcDef) const {
- unsigned idx = getProcModelIdx(ProcDef);
- assert(idx < ProcModels.size() && "missing machine model");
- return ProcModels[idx];
+ unsigned getSchedClassIdxForItin(const Record *ItinDef) {
+ return SchedClassIdxMap[ItinDef->getName()];
}
- // Iterate over the unique processor models.
- typedef std::vector<CodeGenProcModel>::const_iterator ProcIter;
- ProcIter procModelBegin() const { return ProcModels.begin(); }
- ProcIter procModelEnd() const { return ProcModels.end(); }
+ typedef std::vector<CodeGenSchedClass>::const_iterator SchedClassIter;
+ SchedClassIter schedClassBegin() const { return SchedClasses.begin(); }
+ SchedClassIter schedClassEnd() const { return SchedClasses.end(); }
-private:
- // Get a key that can uniquely identify a machine model.
- ProcModelMapTy::key_type getProcModelKey(Record *ProcDef) const {
- Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
- Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
- return std::make_pair(ModelDef, ItinsDef);
- }
+ void findRWs(const RecVec &RWDefs, IdxVec &Writes, IdxVec &Reads) const;
+ void findRWs(const RecVec &RWDefs, IdxVec &RWs, bool IsRead) const;
+ void expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, bool IsRead) const;
+ void expandRWSeqForProc(unsigned RWIdx, IdxVec &RWSeq, bool IsRead,
+ const CodeGenProcModel &ProcModel) const;
- // Get the unique index of a machine model.
- unsigned getProcModelIdx(Record *ProcDef) const {
- ProcModelMapTy::const_iterator I =
- ProcModelMap.find(getProcModelKey(ProcDef));
- if (I == ProcModelMap.end())
- return ProcModels.size();
- return I->second;
- }
+ unsigned addSchedClass(const IdxVec &OperWrites, const IdxVec &OperReads,
+ const IdxVec &ProcIndices);
+
+ unsigned findOrInsertRW(ArrayRef<unsigned> Seq, bool IsRead);
+
+ unsigned findSchedClassIdx(const IdxVec &Writes, const IdxVec &Reads) const;
+
+ Record *findProcResUnits(Record *ProcResKind,
+ const CodeGenProcModel &PM) const;
+
+private:
+ void collectProcModels();
// Initialize a new processor model if it is unique.
void addProcModel(Record *ProcDef);
- void CollectSchedClasses();
- void CollectProcModels();
- void CollectProcItin(CodeGenProcModel &ProcModel,
- std::vector<Record*> ItinRecords);
+ void collectSchedRW();
+
+ std::string genRWName(const IdxVec& Seq, bool IsRead);
+ unsigned findRWForSequence(const IdxVec &Seq, bool IsRead);
+
+ void collectSchedClasses();
+
+ std::string createSchedClassName(const IdxVec &OperWrites,
+ const IdxVec &OperReads);
+ std::string createSchedClassName(const RecVec &InstDefs);
+ void createInstRWClass(Record *InstRWDef);
+
+ void collectProcItins();
+
+ void collectProcItinRW();
+
+ void inferSchedClasses();
+
+ void inferFromRW(const IdxVec &OperWrites, const IdxVec &OperReads,
+ unsigned FromClassIdx, const IdxVec &ProcIndices);
+ void inferFromItinClass(Record *ItinClassDef, unsigned FromClassIdx);
+ void inferFromInstRWs(unsigned SCIdx);
+
+ void collectProcResources();
+
+ void collectItinProcResources(Record *ItinClassDef);
+
+ void collectRWResources(unsigned RWIdx, bool IsRead,
+ const IdxVec &ProcIndices);
+
+ void collectRWResources(const IdxVec &Writes, const IdxVec &Reads,
+ const IdxVec &ProcIndices);
+
+ void addProcResource(Record *ProcResourceKind, CodeGenProcModel &PM);
+
+ void addWriteRes(Record *ProcWriteResDef, unsigned PIdx);
+
+ void addReadAdvance(Record *ProcReadAdvanceDef, unsigned PIdx);
};
} // namespace llvm
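A rough usage sketch of the interface declared above, assuming an LLVM TableGen backend that already holds a constructed CodeGenSchedModels (include paths follow the utils/TableGen convention; nothing below is emitted by this patch):

// Sketch: walk the data this header exposes after analysis has run.
#include "CodeGenSchedule.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

static void dumpSchedSummary(const CodeGenSchedModels &SchedModels) {
  for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
                                    PE = SchedModels.procModelEnd();
       PI != PE; ++PI) {
    outs() << PI->ModelName
           << (PI->hasInstrSchedModel() ? " (per-operand model), " : ", ")
           << PI->WriteResDefs.size() << " WriteRes defs\n";
  }
  for (CodeGenSchedModels::SchedClassIter SCI = SchedModels.schedClassBegin(),
                                          SCE = SchedModels.schedClassEnd();
       SCI != SCE; ++SCI)
    outs() << "  sched class " << SCI->Name << "\n";
}

Nothing in the sketch mutates the models; it only uses the const iterators and public members declared above.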
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 1dd2efc4a1bf..c9992eb39228 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -10,13 +10,14 @@
// This class wraps target description classes used by the various code
// generation TableGen backends. This makes it easier to access the data and
// provides a single place that needs to check it for validity. All of these
-// classes throw exceptions on error conditions.
+// classes abort on error conditions.
//
//===----------------------------------------------------------------------===//
#include "CodeGenTarget.h"
#include "CodeGenIntrinsics.h"
#include "CodeGenSchedule.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/STLExtras.h"
@@ -68,22 +69,30 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
case MVT::x86mmx: return "MVT::x86mmx";
case MVT::Glue: return "MVT::Glue";
case MVT::isVoid: return "MVT::isVoid";
+ case MVT::v2i1: return "MVT::v2i1";
+ case MVT::v4i1: return "MVT::v4i1";
+ case MVT::v8i1: return "MVT::v8i1";
+ case MVT::v16i1: return "MVT::v16i1";
case MVT::v2i8: return "MVT::v2i8";
case MVT::v4i8: return "MVT::v4i8";
case MVT::v8i8: return "MVT::v8i8";
case MVT::v16i8: return "MVT::v16i8";
case MVT::v32i8: return "MVT::v32i8";
+ case MVT::v1i16: return "MVT::v1i16";
case MVT::v2i16: return "MVT::v2i16";
case MVT::v4i16: return "MVT::v4i16";
case MVT::v8i16: return "MVT::v8i16";
case MVT::v16i16: return "MVT::v16i16";
+ case MVT::v1i32: return "MVT::v1i32";
case MVT::v2i32: return "MVT::v2i32";
case MVT::v4i32: return "MVT::v4i32";
case MVT::v8i32: return "MVT::v8i32";
+ case MVT::v16i32: return "MVT::v16i32";
case MVT::v1i64: return "MVT::v1i64";
case MVT::v2i64: return "MVT::v2i64";
case MVT::v4i64: return "MVT::v4i64";
case MVT::v8i64: return "MVT::v8i64";
+ case MVT::v16i64: return "MVT::v16i64";
case MVT::v2f16: return "MVT::v2f16";
case MVT::v2f32: return "MVT::v2f32";
case MVT::v4f32: return "MVT::v4f32";
@@ -116,9 +125,9 @@ CodeGenTarget::CodeGenTarget(RecordKeeper &records)
: Records(records), RegBank(0), SchedModels(0) {
std::vector<Record*> Targets = Records.getAllDerivedDefinitions("Target");
if (Targets.size() == 0)
- throw std::string("ERROR: No 'Target' subclasses defined!");
+ PrintFatalError("ERROR: No 'Target' subclasses defined!");
if (Targets.size() != 1)
- throw std::string("ERROR: Multiple subclasses of Target defined!");
+ PrintFatalError("ERROR: Multiple subclasses of Target defined!");
TargetRec = Targets[0];
}
@@ -152,7 +161,7 @@ Record *CodeGenTarget::getInstructionSet() const {
Record *CodeGenTarget::getAsmParser() const {
std::vector<Record*> LI = TargetRec->getValueAsListOfDefs("AssemblyParsers");
if (AsmParserNum >= LI.size())
- throw "Target does not have an AsmParser #" + utostr(AsmParserNum) + "!";
+ PrintFatalError("Target does not have an AsmParser #" + utostr(AsmParserNum) + "!");
return LI[AsmParserNum];
}
@@ -163,7 +172,7 @@ Record *CodeGenTarget::getAsmParserVariant(unsigned i) const {
std::vector<Record*> LI =
TargetRec->getValueAsListOfDefs("AssemblyParserVariants");
if (i >= LI.size())
- throw "Target does not have an AsmParserVariant #" + utostr(i) + "!";
+ PrintFatalError("Target does not have an AsmParserVariant #" + utostr(i) + "!");
return LI[i];
}
@@ -181,7 +190,7 @@ unsigned CodeGenTarget::getAsmParserVariantCount() const {
Record *CodeGenTarget::getAsmWriter() const {
std::vector<Record*> LI = TargetRec->getValueAsListOfDefs("AssemblyWriters");
if (AsmWriterNum >= LI.size())
- throw "Target does not have an AsmWriter #" + utostr(AsmWriterNum) + "!";
+ PrintFatalError("Target does not have an AsmWriter #" + utostr(AsmWriterNum) + "!");
return LI[AsmWriterNum];
}
@@ -199,12 +208,11 @@ void CodeGenTarget::ReadRegAltNameIndices() const {
/// getRegisterByName - If there is a register with the specific AsmName,
/// return it.
const CodeGenRegister *CodeGenTarget::getRegisterByName(StringRef Name) const {
- const std::vector<CodeGenRegister*> &Regs = getRegBank().getRegisters();
- for (unsigned i = 0, e = Regs.size(); i != e; ++i)
- if (Regs[i]->TheDef->getValueAsString("AsmName") == Name)
- return Regs[i];
-
- return 0;
+ const StringMap<CodeGenRegister*> &Regs = getRegBank().getRegistersByName();
+ StringMap<CodeGenRegister*>::const_iterator I = Regs.find(Name);
+ if (I == Regs.end())
+ return 0;
+ return I->second;
}
std::vector<MVT::SimpleValueType> CodeGenTarget::
@@ -249,7 +257,7 @@ CodeGenSchedModels &CodeGenTarget::getSchedModels() const {
void CodeGenTarget::ReadInstructions() const {
std::vector<Record*> Insts = Records.getAllDerivedDefinitions("Instruction");
if (Insts.size() <= 2)
- throw std::string("No 'Instruction' subclasses defined!");
+ PrintFatalError("No 'Instruction' subclasses defined!");
// Parse the instructions defined in the .td file.
for (unsigned i = 0, e = Insts.size(); i != e; ++i)
@@ -265,7 +273,7 @@ GetInstByName(const char *Name,
DenseMap<const Record*, CodeGenInstruction*>::const_iterator
I = Insts.find(Rec);
if (Rec == 0 || I == Insts.end())
- throw std::string("Could not find '") + Name + "' instruction!";
+ PrintFatalError(std::string("Could not find '") + Name + "' instruction!");
return I->second;
}
@@ -300,6 +308,8 @@ void CodeGenTarget::ComputeInstrsByEnum() const {
"REG_SEQUENCE",
"COPY",
"BUNDLE",
+ "LIFETIME_START",
+ "LIFETIME_END",
0
};
const DenseMap<const Record*, CodeGenInstruction*> &Insts = getInstructions();
@@ -334,6 +344,15 @@ bool CodeGenTarget::isLittleEndianEncoding() const {
return getInstructionSet()->getValueAsBit("isLittleEndianEncoding");
}
+/// guessInstructionProperties - Return true if it's OK to guess instruction
+/// properties instead of raising an error.
+///
+/// This is configurable as a temporary migration aid. It will eventually be
+/// permanently false.
+bool CodeGenTarget::guessInstructionProperties() const {
+ return getInstructionSet()->getValueAsBit("guessInstructionProperties");
+}
+
//===----------------------------------------------------------------------===//
// ComplexPattern implementation
//
@@ -401,7 +420,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
if (DefName.size() <= 4 ||
std::string(DefName.begin(), DefName.begin() + 4) != "int_")
- throw "Intrinsic '" + DefName + "' does not start with 'int_'!";
+ PrintFatalError("Intrinsic '" + DefName + "' does not start with 'int_'!");
EnumName = std::string(DefName.begin()+4, DefName.end());
@@ -421,7 +440,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
// Verify it starts with "llvm.".
if (Name.size() <= 5 ||
std::string(Name.begin(), Name.begin() + 5) != "llvm.")
- throw "Intrinsic '" + DefName + "'s name does not start with 'llvm.'!";
+ PrintFatalError("Intrinsic '" + DefName + "'s name does not start with 'llvm.'!");
}
// If TargetPrefix is specified, make sure that Name starts with
@@ -430,8 +449,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
if (Name.size() < 6+TargetPrefix.size() ||
std::string(Name.begin() + 5, Name.begin() + 6 + TargetPrefix.size())
!= (TargetPrefix + "."))
- throw "Intrinsic '" + DefName + "' does not start with 'llvm." +
- TargetPrefix + ".'!";
+ PrintFatalError("Intrinsic '" + DefName + "' does not start with 'llvm." +
+ TargetPrefix + ".'!");
}
// Parse the list of return types.
@@ -463,7 +482,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
// Reject invalid types.
if (VT == MVT::isVoid)
- throw "Intrinsic '" + DefName + " has void in result type list!";
+ PrintFatalError("Intrinsic '" + DefName + " has void in result type list!");
IS.RetVTs.push_back(VT);
IS.RetTypeDefs.push_back(TyEl);
@@ -497,7 +516,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
// Reject invalid types.
if (VT == MVT::isVoid && i != e-1 /*void at end means varargs*/)
- throw "Intrinsic '" + DefName + " has void in result type list!";
+ PrintFatalError("Intrinsic '" + DefName + " has void in result type list!");
IS.ParamVTs.push_back(VT);
IS.ParamTypeDefs.push_back(TyEl);
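The recurring change in this file is the move from throwing std::string to reporting through llvm/TableGen/Error.h. A minimal sketch of that idiom, using only record accessors already visible in this patch (the check itself is hypothetical):

// Hypothetical validity check, shown only to illustrate PrintFatalError
// with a source location attached to the offending record.
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
using namespace llvm;

static void checkProcItineraries(Record *ProcDef) {
  Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
  if (ItinsDef->getValueAsListOfDefs("IID").empty())
    PrintFatalError(ProcDef->getLoc(),
                    "processor " + ProcDef->getName() + " has no itineraries");
}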
diff --git a/utils/TableGen/CodeGenTarget.h b/utils/TableGen/CodeGenTarget.h
index 2f8cee4588a4..ddeecee36fdf 100644
--- a/utils/TableGen/CodeGenTarget.h
+++ b/utils/TableGen/CodeGenTarget.h
@@ -9,8 +9,8 @@
//
// This file defines wrappers for the Target class and related global
// functionality. This makes it easier to access the data and provides a single
-// place that needs to check it for validity. All of these classes throw
-// exceptions on error conditions.
+// place that needs to check it for validity. All of these classes abort
+// on error conditions.
//
//===----------------------------------------------------------------------===//
@@ -177,6 +177,10 @@ public:
///
bool isLittleEndianEncoding() const;
+ /// guessInstructionProperties - should we just guess unset instruction
+ /// properties?
+ bool guessInstructionProperties() const;
+
private:
void ComputeInstrsByEnum() const;
};
diff --git a/utils/TableGen/DAGISelMatcher.h b/utils/TableGen/DAGISelMatcher.h
index 3ca16f04269d..7c6ce3babcd8 100644
--- a/utils/TableGen/DAGISelMatcher.h
+++ b/utils/TableGen/DAGISelMatcher.h
@@ -99,8 +99,6 @@ public:
OwningPtr<Matcher> &getNextPtr() { return Next; }
- static inline bool classof(const Matcher *) { return true; }
-
bool isEqual(const Matcher *M) const {
if (getKind() != M->getKind()) return false;
return isEqualImpl(M);
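Dropping the always-true classof(const Matcher *) overload follows the LLVM-style RTTI convention: isa<> and dyn_cast<> only need the derived-class checks against getKind(). A self-contained illustration of the surviving pattern with made-up classes (nothing here comes from DAGISelMatcher.h):

// Hypothetical Node/AddNode hierarchy showing the classof idiom that
// remains once the redundant base-class overload is removed.
#include "llvm/Support/Casting.h"

class Node {
public:
  enum KindTy { AddKind, MulKind };
  KindTy getKind() const { return Kind; }
protected:
  explicit Node(KindTy K) : Kind(K) {}
private:
  const KindTy Kind;
};

class AddNode : public Node {
public:
  AddNode() : Node(AddKind) {}
  static bool classof(const Node *N) { return N->getKind() == AddKind; }
};

bool isAdd(const Node *N) { return llvm::isa<AddNode>(N); }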
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index 1445edbe720a..713f1743c143 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -598,7 +598,7 @@ EmitMatcherList(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
// Emit pattern predicates.
if (!PatternPredicates.empty()) {
- OS << "bool CheckPatternPredicate(unsigned PredNo) const {\n";
+ OS << "virtual bool CheckPatternPredicate(unsigned PredNo) const {\n";
OS << " switch (PredNo) {\n";
OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n";
for (unsigned i = 0, e = PatternPredicates.size(); i != e; ++i)
@@ -616,7 +616,8 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
PFsByName[I->first->getName()] = I->second;
if (!NodePredicates.empty()) {
- OS << "bool CheckNodePredicate(SDNode *Node, unsigned PredNo) const {\n";
+ OS << "virtual bool CheckNodePredicate(SDNode *Node,\n";
+ OS << " unsigned PredNo) const {\n";
OS << " switch (PredNo) {\n";
OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n";
for (unsigned i = 0, e = NodePredicates.size(); i != e; ++i) {
@@ -635,8 +636,8 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
// Emit CompletePattern matchers.
// FIXME: This should be const.
if (!ComplexPatterns.empty()) {
- OS << "bool CheckComplexPattern(SDNode *Root, SDNode *Parent, SDValue N,\n";
- OS << " unsigned PatternNo,\n";
+ OS << "virtual bool CheckComplexPattern(SDNode *Root, SDNode *Parent,\n";
+ OS << " SDValue N, unsigned PatternNo,\n";
OS << " SmallVectorImpl<std::pair<SDValue, SDNode*> > &Result) {\n";
OS << " unsigned NextRes = Result.size();\n";
OS << " switch (PatternNo) {\n";
@@ -676,7 +677,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
// Emit SDNodeXForm handlers.
// FIXME: This should be const.
if (!NodeXForms.empty()) {
- OS << "SDValue RunSDNodeXForm(SDValue V, unsigned XFormNo) {\n";
+ OS << "virtual SDValue RunSDNodeXForm(SDValue V, unsigned XFormNo) {\n";
OS << " switch (XFormNo) {\n";
OS << " default: llvm_unreachable(\"Invalid xform # in table?\");\n";
diff --git a/utils/TableGen/DAGISelMatcherGen.cpp b/utils/TableGen/DAGISelMatcherGen.cpp
index aed222c09495..573f55875ec6 100644
--- a/utils/TableGen/DAGISelMatcherGen.cpp
+++ b/utils/TableGen/DAGISelMatcherGen.cpp
@@ -10,6 +10,7 @@
#include "DAGISelMatcher.h"
#include "CodeGenDAGPatterns.h"
#include "CodeGenRegisters.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
@@ -172,15 +173,10 @@ void MatcherGen::InferPossibleTypes() {
// diagnostics, which we know are impossible at this point.
TreePattern &TP = *CGP.pf_begin()->second;
- try {
- bool MadeChange = true;
- while (MadeChange)
- MadeChange = PatWithNoTypes->ApplyTypeConstraints(TP,
- true/*Ignore reg constraints*/);
- } catch (...) {
- errs() << "Type constraint application shouldn't fail!";
- abort();
- }
+ bool MadeChange = true;
+ while (MadeChange)
+ MadeChange = PatWithNoTypes->ApplyTypeConstraints(TP,
+ true/*Ignore reg constraints*/);
}
@@ -203,7 +199,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
assert(N->isLeaf() && "Not a leaf?");
// Direct match against an integer constant.
- if (IntInit *II = dynamic_cast<IntInit*>(N->getLeafValue())) {
+ if (IntInit *II = dyn_cast<IntInit>(N->getLeafValue())) {
// If this is the root of the dag we're matching, we emit a redundant opcode
// check to ensure that this gets folded into the normal top-level
// OpcodeSwitch.
@@ -215,7 +211,7 @@ void MatcherGen::EmitLeafMatchCode(const TreePatternNode *N) {
return AddMatcher(new CheckIntegerMatcher(II->getValue()));
}
- DefInit *DI = dynamic_cast<DefInit*>(N->getLeafValue());
+ DefInit *DI = dyn_cast<DefInit>(N->getLeafValue());
if (DI == 0) {
errs() << "Unknown leaf kind: " << *N << "\n";
abort();
@@ -283,7 +279,7 @@ void MatcherGen::EmitOperatorMatchCode(const TreePatternNode *N,
N->getOperator()->getName() == "or") &&
N->getChild(1)->isLeaf() && N->getChild(1)->getPredicateFns().empty() &&
N->getPredicateFns().empty()) {
- if (IntInit *II = dynamic_cast<IntInit*>(N->getChild(1)->getLeafValue())) {
+ if (IntInit *II = dyn_cast<IntInit>(N->getChild(1)->getLeafValue())) {
if (!isPowerOf2_32(II->getValue())) { // Don't bother with single bits.
// If this is at the root of the pattern, we emit a redundant
// CheckOpcode so that the following checks get factored properly under
@@ -572,14 +568,14 @@ void MatcherGen::EmitResultLeafAsOperand(const TreePatternNode *N,
SmallVectorImpl<unsigned> &ResultOps) {
assert(N->isLeaf() && "Must be a leaf");
- if (IntInit *II = dynamic_cast<IntInit*>(N->getLeafValue())) {
+ if (IntInit *II = dyn_cast<IntInit>(N->getLeafValue())) {
AddMatcher(new EmitIntegerMatcher(II->getValue(), N->getType(0)));
ResultOps.push_back(NextRecordedOperandNo++);
return;
}
// If this is an explicit register reference, handle it.
- if (DefInit *DI = dynamic_cast<DefInit*>(N->getLeafValue())) {
+ if (DefInit *DI = dyn_cast<DefInit>(N->getLeafValue())) {
Record *Def = DI->getDef();
if (Def->isSubClassOf("Register")) {
const CodeGenRegister *Reg =
@@ -727,8 +723,7 @@ EmitResultInstructionAsOperand(const TreePatternNode *N,
// Determine what to emit for this operand.
Record *OperandNode = II.Operands[InstOpNo].Rec;
- if ((OperandNode->isSubClassOf("PredicateOperand") ||
- OperandNode->isSubClassOf("OptionalDefOperand")) &&
+ if (OperandNode->isSubClassOf("OperandWithDefaultOps") &&
!CGP.getDefaultOperand(OperandNode).DefaultOps.empty()) {
// This is a predicate or optional def operand; emit the
// 'default ops' operands.
@@ -877,7 +872,7 @@ void MatcherGen::EmitResultOperand(const TreePatternNode *N,
if (OpRec->isSubClassOf("SDNodeXForm"))
return EmitResultSDNodeXFormAsOperand(N, ResultOps);
errs() << "Unknown result node to emit code for: " << *N << '\n';
- throw std::string("Unknown node in result pattern!");
+ PrintFatalError("Unknown node in result pattern!");
}
void MatcherGen::EmitResultCode() {
diff --git a/utils/TableGen/DFAPacketizerEmitter.cpp b/utils/TableGen/DFAPacketizerEmitter.cpp
index 8bfecead6d2e..0ad25a5428db 100644
--- a/utils/TableGen/DFAPacketizerEmitter.cpp
+++ b/utils/TableGen/DFAPacketizerEmitter.cpp
@@ -17,6 +17,7 @@
#include "CodeGenTarget.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <list>
@@ -74,6 +75,8 @@ public:
// Another way of thinking about this transition is we are mapping a NDFA with
// two states [0x01] and [0x10] into a DFA with a single state [0x01, 0x10].
//
+// A State instance also contains a collection of transitions from that state:
+// a map from inputs to new states.
//
namespace {
class State {
@@ -82,10 +85,16 @@ class State {
int stateNum;
bool isInitial;
std::set<unsigned> stateInfo;
+ typedef std::map<unsigned, State *> TransitionMap;
+ TransitionMap Transitions;
State();
State(const State &S);
+ bool operator<(const State &s) const {
+ return stateNum < s.stateNum;
+ }
+
//
// canAddInsnClass - Returns true if an instruction of type InsnClass is a
// valid transition from this state, i.e., can an instruction of type InsnClass
@@ -100,38 +109,18 @@ class State {
// which are possible from this state (PossibleStates).
//
void AddInsnClass(unsigned InsnClass, std::set<unsigned> &PossibleStates);
+ //
+ // addTransition - Add a transition from this state given the input InsnClass
+ //
+ void addTransition(unsigned InsnClass, State *To);
+ //
+ // hasTransition - Returns true if there is a transition from this state
+ // given the input InsnClass
+ //
+ bool hasTransition(unsigned InsnClass);
};
} // End anonymous namespace.
-
-namespace {
-struct Transition {
- public:
- static int currentTransitionNum;
- int transitionNum;
- State *from;
- unsigned input;
- State *to;
-
- Transition(State *from_, unsigned input_, State *to_);
-};
-} // End anonymous namespace.
-
-
-//
-// Comparators to keep set of states sorted.
-//
-namespace {
-struct ltState {
- bool operator()(const State *s1, const State *s2) const;
-};
-
-struct ltTransition {
- bool operator()(const Transition *s1, const Transition *s2) const;
-};
-} // End anonymous namespace.
-
-
//
// class DFA: deterministic finite automaton for processor resource tracking.
//
@@ -139,36 +128,19 @@ namespace {
class DFA {
public:
DFA();
+ ~DFA();
// Set of states. Need to keep this sorted to emit the transition table.
- std::set<State*, ltState> states;
+ typedef std::set<State *, less_ptr<State> > StateSet;
+ StateSet states;
- // Map from a state to the list of transitions with that state as source.
- std::map<State*, std::set<Transition*, ltTransition>, ltState>
- stateTransitions;
State *currentState;
- // Highest valued Input seen.
- unsigned LargestInput;
-
//
// Modify the DFA.
//
void initialize();
void addState(State *);
- void addTransition(Transition *);
-
- //
- // getTransition - Return the state when a transition is made from
- // State From with Input I. If a transition is not found, return NULL.
- //
- State *getTransition(State *, unsigned);
-
- //
- // isValidTransition: Predicate that checks if there is a valid transition
- // from state From on input InsnClass.
- //
- bool isValidTransition(State *From, unsigned InsnClass);
//
// writeTable: Print out a table representing the DFA.
@@ -179,7 +151,7 @@ public:
//
-// Constructors for State, Transition, and DFA
+// Constructors and destructors for State and DFA
//
State::State() :
stateNum(currentStateNum++), isInitial(false) {}
@@ -189,22 +161,27 @@ State::State(const State &S) :
stateNum(currentStateNum++), isInitial(S.isInitial),
stateInfo(S.stateInfo) {}
+DFA::DFA(): currentState(NULL) {}
-Transition::Transition(State *from_, unsigned input_, State *to_) :
- transitionNum(currentTransitionNum++), from(from_), input(input_),
- to(to_) {}
-
-
-DFA::DFA() :
- LargestInput(0) {}
-
+DFA::~DFA() {
+ DeleteContainerPointers(states);
+}
-bool ltState::operator()(const State *s1, const State *s2) const {
- return (s1->stateNum < s2->stateNum);
+//
+// addTransition - Add a transition from this state given the input InsnClass
+//
+void State::addTransition(unsigned InsnClass, State *To) {
+ assert(!Transitions.count(InsnClass) &&
+ "Cannot have multiple transitions for the same input");
+ Transitions[InsnClass] = To;
}
-bool ltTransition::operator()(const Transition *s1, const Transition *s2) const {
- return (s1->input < s2->input);
+//
+// hasTransition - Returns true if there is a transition from this state
+// given the input InsnClass
+//
+bool State::hasTransition(unsigned InsnClass) {
+ return Transitions.count(InsnClass) > 0;
}
//
@@ -272,6 +249,7 @@ bool State::canAddInsnClass(unsigned InsnClass) const {
void DFA::initialize() {
+ assert(currentState && "Missing current state");
currentState->isInitial = true;
}
@@ -282,47 +260,7 @@ void DFA::addState(State *S) {
}
-void DFA::addTransition(Transition *T) {
- // Update LargestInput.
- if (T->input > LargestInput)
- LargestInput = T->input;
-
- // Add the new transition.
- bool Added = stateTransitions[T->from].insert(T).second;
- assert(Added && "Cannot have multiple states for the same input");
- (void)Added;
-}
-
-
-//
-// getTransition - Return the state when a transition is made from
-// State From with Input I. If a transition is not found, return NULL.
-//
-State *DFA::getTransition(State *From, unsigned I) {
- // Do we have a transition from state From?
- if (!stateTransitions.count(From))
- return NULL;
-
- // Do we have a transition from state From with Input I?
- Transition TVal(NULL, I, NULL);
- // Do not count this temporal instance
- Transition::currentTransitionNum--;
- std::set<Transition*, ltTransition>::iterator T =
- stateTransitions[From].find(&TVal);
- if (T != stateTransitions[From].end())
- return (*T)->to;
-
- return NULL;
-}
-
-
-bool DFA::isValidTransition(State *From, unsigned InsnClass) {
- return (getTransition(From, InsnClass) != NULL);
-}
-
-
int State::currentStateNum = 0;
-int Transition::currentTransitionNum = 0;
DFAPacketizerEmitter::DFAPacketizerEmitter(RecordKeeper &R):
TargetName(CodeGenTarget(R).getName()),
@@ -341,7 +279,7 @@ DFAPacketizerEmitter::DFAPacketizerEmitter(RecordKeeper &R):
//
//
void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName) {
- std::set<State*, ltState>::iterator SI = states.begin();
+ DFA::StateSet::iterator SI = states.begin();
// This table provides a map to the beginning of the transitions for State s
// in DFAStateInputTable.
std::vector<int> StateEntry(states.size());
@@ -353,18 +291,16 @@ void DFA::writeTableAndAPI(raw_ostream &OS, const std::string &TargetName) {
// to construct the StateEntry table.
int ValidTransitions = 0;
for (unsigned i = 0; i < states.size(); ++i, ++SI) {
+ assert (((*SI)->stateNum == (int) i) && "Mismatch in state numbers");
StateEntry[i] = ValidTransitions;
- for (unsigned j = 0; j <= LargestInput; ++j) {
- assert (((*SI)->stateNum == (int) i) && "Mismatch in state numbers");
- State *To = getTransition(*SI, j);
- if (To == NULL)
- continue;
-
- OS << "{" << j << ", "
- << To->stateNum
+ for (State::TransitionMap::iterator
+ II = (*SI)->Transitions.begin(), IE = (*SI)->Transitions.end();
+ II != IE; ++II) {
+ OS << "{" << II->first << ", "
+ << II->second->stateNum
<< "}, ";
- ++ValidTransitions;
}
+ ValidTransitions += (*SI)->Transitions.size();
// If there are no valid transitions from this stage, we need a sentinel
// transition.
@@ -539,7 +475,7 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
// If we haven't already created a transition for this input
// and the state can accommodate this InsnClass, create a transition.
//
- if (!D.getTransition(current, InsnClass) &&
+ if (!current->hasTransition(InsnClass) &&
current->canAddInsnClass(InsnClass)) {
State *NewState = NULL;
current->AddInsnClass(InsnClass, NewStateResources);
@@ -559,10 +495,8 @@ void DFAPacketizerEmitter::run(raw_ostream &OS) {
Visited[NewStateResources] = NewState;
WorkList.push_back(NewState);
}
-
- Transition *NewTransition = new Transition(current, InsnClass,
- NewState);
- D.addTransition(NewTransition);
+
+ current->addTransition(InsnClass, NewState);
}
}
}
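
The hunks above remove the standalone Transition objects and the per-DFA stateTransitions set in favor of a per-State map keyed by the input instruction class. A minimal standalone sketch of that data-structure change follows; SketchState and the main driver are illustrative names only, not the emitter's actual classes.

// Standalone sketch of the per-state transition map; names here are
// illustrative and not part of DFAPacketizerEmitter.
#include <cassert>
#include <cstdio>
#include <map>

struct SketchState {
  std::map<unsigned, SketchState *> Transitions; // input class -> successor

  void addTransition(unsigned InsnClass, SketchState *To) {
    assert(!Transitions.count(InsnClass) &&
           "Cannot have multiple transitions for the same input");
    Transitions[InsnClass] = To;
  }
  bool hasTransition(unsigned InsnClass) const {
    return Transitions.count(InsnClass) > 0;
  }
};

int main() {
  SketchState A, B;
  A.addTransition(/*InsnClass=*/1, &B);
  std::printf("has 1: %d, has 2: %d\n", A.hasTransition(1), A.hasTransition(2));
  return 0;
}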
diff --git a/utils/TableGen/DisassemblerEmitter.cpp b/utils/TableGen/DisassemblerEmitter.cpp
index 826465a51642..2d11d2480de4 100644
--- a/utils/TableGen/DisassemblerEmitter.cpp
+++ b/utils/TableGen/DisassemblerEmitter.cpp
@@ -117,11 +117,9 @@ void EmitDisassembler(RecordKeeper &Records, raw_ostream &OS) {
for (unsigned i = 0, e = numberedInstructions.size(); i != e; ++i)
RecognizableInstr::processInstr(Tables, *numberedInstructions[i], i);
- // FIXME: As long as we are using exceptions, might as well drop this to the
- // actual conflict site.
if (Tables.hasConflicts())
- throw TGError(Target.getTargetRecord()->getLoc(),
- "Primary decode conflict");
+ PrintFatalError(Target.getTargetRecord()->getLoc(),
+ "Primary decode conflict");
Tables.emit(OS);
return;
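
As in the rest of the patch, `throw TGError(...)` is replaced with `PrintFatalError(...)`, which reports the diagnostic and terminates instead of unwinding (the Makefile hunk later in the patch drops REQUIRES_EH and REQUIRES_RTTI accordingly). Below is a minimal standalone analogue of that report-and-exit style; reportFatal and emitTables are hypothetical stand-ins, not the real TableGen API.

// Minimal standalone analogue of report-and-exit error handling.
#include <cstdio>
#include <cstdlib>
#include <string>

[[noreturn]] static void reportFatal(const std::string &Msg) {
  std::fprintf(stderr, "error: %s\n", Msg.c_str());
  std::exit(1);
}

static void emitTables(bool HasConflicts) {
  if (HasConflicts)
    reportFatal("Primary decode conflict"); // no throw, no stack unwinding
  std::puts("tables emitted");
}

int main() { emitTables(false); }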
diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp
index 0c8b28d22027..ea2545050bc0 100644
--- a/utils/TableGen/EDEmitter.cpp
+++ b/utils/TableGen/EDEmitter.cpp
@@ -19,6 +19,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <string>
@@ -358,8 +359,8 @@ static int X86TypeFromOpName(LiteralConstantEmitter *type,
/// X86PopulateOperands - Handles all the operands in an X86 instruction, adding
/// the appropriate flags to their descriptors
///
-/// @operandFlags - A reference the array of operand flag objects
-/// @inst - The instruction to use as a source of information
+/// \param operandTypes A reference to the array of operand type objects

+/// \param inst The instruction to use as a source of information
static void X86PopulateOperands(
LiteralConstantEmitter *(&operandTypes)[EDIS_MAX_OPERANDS],
const CodeGenInstruction &inst) {
@@ -385,11 +386,12 @@ static void X86PopulateOperands(
/// decorate1 - Decorates a named operand with a new flag
///
-/// @operandFlags - The array of operand flag objects, which don't have names
-/// @inst - The CodeGenInstruction, which provides a way to translate
-/// between names and operand indices
-/// @opName - The name of the operand
-/// @flag - The name of the flag to add
+/// \param operandFlags The array of operand flag objects, which don't have
+/// names
+/// \param inst The CodeGenInstruction, which provides a way to
+/// translate between names and operand indices
+/// \param opName The name of the operand
+/// \param opFlag The name of the flag to add
static inline void decorate1(
FlagsConstantEmitter *(&operandFlags)[EDIS_MAX_OPERANDS],
const CodeGenInstruction &inst,
@@ -438,9 +440,9 @@ static inline void decorate1(
/// instruction to determine what sort of an instruction it is and then adds
/// the appropriate flags to the instruction and its operands
///
-/// @arg instType - A reference to the type for the instruction as a whole
-/// @arg operandFlags - A reference to the array of operand flag object pointers
-/// @arg inst - A reference to the original instruction
+/// \param instType A reference to the type for the instruction as a whole
+/// \param operandFlags A reference to the array of operand flag object pointers
+/// \param inst A reference to the original instruction
static void X86ExtractSemantics(
LiteralConstantEmitter &instType,
FlagsConstantEmitter *(&operandFlags)[EDIS_MAX_OPERANDS],
@@ -567,8 +569,8 @@ static void X86ExtractSemantics(
/// ARMFlagFromOpName - Processes the name of a single ARM operand (which is
/// actually its type) and translates it into an operand type
///
-/// @arg type - The type object to set
-/// @arg name - The name of the operand
+/// \param type The type object to set
+/// \param name The name of the operand
static int ARMFlagFromOpName(LiteralConstantEmitter *type,
const std::string &name) {
REG("GPR");
@@ -750,8 +752,8 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
/// ARMPopulateOperands - Handles all the operands in an ARM instruction, adding
/// the appropriate flags to their descriptors
///
-/// @operandFlags - A reference the array of operand flag objects
-/// @inst - The instruction to use as a source of information
+/// \param operandTypes A reference to the array of operand type objects
+/// \param inst The instruction to use as a source of information
static void ARMPopulateOperands(
LiteralConstantEmitter *(&operandTypes)[EDIS_MAX_OPERANDS],
const CodeGenInstruction &inst) {
@@ -776,7 +778,7 @@ static void ARMPopulateOperands(
errs() << "Operand type: " << rec.getName() << '\n';
errs() << "Operand name: " << operandInfo.Name << '\n';
errs() << "Instruction name: " << inst.TheDef->getName() << '\n';
- throw("Unhandled type in EDEmitter");
+ PrintFatalError("Unhandled type in EDEmitter");
}
}
}
@@ -790,10 +792,10 @@ static void ARMPopulateOperands(
/// instruction to determine what sort of an instruction it is and then adds
/// the appropriate flags to the instruction and its operands
///
-/// @arg instType - A reference to the type for the instruction as a whole
-/// @arg operandTypes - A reference to the array of operand type object pointers
-/// @arg operandFlags - A reference to the array of operand flag object pointers
-/// @arg inst - A reference to the original instruction
+/// \param instType A reference to the type for the instruction as a whole
+/// \param operandTypes A reference to the array of operand type object pointers
+/// \param operandFlags A reference to the array of operand flag object pointers
+/// \param inst A reference to the original instruction
static void ARMExtractSemantics(
LiteralConstantEmitter &instType,
LiteralConstantEmitter *(&operandTypes)[EDIS_MAX_OPERANDS],
@@ -831,8 +833,8 @@ static void ARMExtractSemantics(
/// populateInstInfo - Fills an array of InstInfos with information about each
/// instruction in a target
///
-/// @arg infoArray - The array of InstInfo objects to populate
-/// @arg target - The CodeGenTarget to use as a source of instructions
+/// \param infoArray The array of InstInfo objects to populate
+/// \param target The CodeGenTarget to use as a source of instructions
static void populateInstInfo(CompoundConstantEmitter &infoArray,
CodeGenTarget &target) {
const std::vector<const CodeGenInstruction*> &numberedInstructions =
diff --git a/utils/TableGen/FastISelEmitter.cpp b/utils/TableGen/FastISelEmitter.cpp
index ca784d0dda92..8b1e7f9256f9 100644
--- a/utils/TableGen/FastISelEmitter.cpp
+++ b/utils/TableGen/FastISelEmitter.cpp
@@ -245,7 +245,7 @@ struct OperandsSignature {
if (Op->getType(0) != VT)
return false;
- DefInit *OpDI = dynamic_cast<DefInit*>(Op->getLeafValue());
+ DefInit *OpDI = dyn_cast<DefInit>(Op->getLeafValue());
if (!OpDI)
return false;
Record *OpLeafRec = OpDI->getDef();
@@ -406,13 +406,12 @@ static std::string PhyRegForNode(TreePatternNode *Op,
if (!Op->isLeaf())
return PhysReg;
- DefInit *OpDI = dynamic_cast<DefInit*>(Op->getLeafValue());
- Record *OpLeafRec = OpDI->getDef();
+ Record *OpLeafRec = cast<DefInit>(Op->getLeafValue())->getDef();
if (!OpLeafRec->isSubClassOf("Register"))
return PhysReg;
- PhysReg += static_cast<StringInit*>(OpLeafRec->getValue( \
- "Namespace")->getValue())->getValue();
+ PhysReg += cast<StringInit>(OpLeafRec->getValue("Namespace")->getValue())
+ ->getValue();
PhysReg += "::";
PhysReg += Target.getRegBank().getReg(OpLeafRec)->getName();
return PhysReg;
@@ -473,7 +472,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
// a bit too complicated for now.
if (!Dst->getChild(1)->isLeaf()) continue;
- DefInit *SR = dynamic_cast<DefInit*>(Dst->getChild(1)->getLeafValue());
+ DefInit *SR = dyn_cast<DefInit>(Dst->getChild(1)->getLeafValue());
if (SR)
SubRegNo = getQualifiedName(SR->getDef());
else
@@ -550,7 +549,7 @@ void FastISelMap::collectPatterns(CodeGenDAGPatterns &CGP) {
};
if (SimplePatterns[Operands][OpcodeName][VT][RetVT].count(PredicateCheck))
- throw TGError(Pattern.getSrcRecord()->getLoc(),
+ PrintFatalError(Pattern.getSrcRecord()->getLoc(),
"Duplicate record in FastISel table!");
SimplePatterns[Operands][OpcodeName][VT][RetVT][PredicateCheck] = Memo;
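
The recurring change in this file (and several below) swaps C++ `dynamic_cast` on the Init hierarchy for LLVM-style `dyn_cast`/`cast`, which dispatch through a `classof` predicate rather than RTTI. A toy sketch of that distinction follows, assuming a hand-rolled hierarchy; Node, Leaf, and the two templates are illustrative and not the real Init classes or Casting.h.

// Sketch of dyn_cast-style (nullable) versus cast-style (asserting) casts.
#include <cassert>
#include <cstdio>

struct Node {
  enum Kind { K_Leaf, K_Pair } TheKind;
  explicit Node(Kind K) : TheKind(K) {}
};
struct Leaf : Node {
  int Value;
  explicit Leaf(int V) : Node(K_Leaf), Value(V) {}
  static bool classof(const Node *N) { return N->TheKind == K_Leaf; }
};

// dyn_cast-like: returns null on mismatch (use when failure is expected).
template <typename T> T *dyn_cast_sketch(Node *N) {
  return T::classof(N) ? static_cast<T *>(N) : nullptr;
}
// cast-like: asserts on mismatch (use when the kind is already known).
template <typename T> T *cast_sketch(Node *N) {
  assert(T::classof(N) && "invalid cast");
  return static_cast<T *>(N);
}

int main() {
  Leaf L(7);
  Node *N = &L;
  if (Leaf *LP = dyn_cast_sketch<Leaf>(N)) // succeeds: N really is a Leaf
    std::printf("leaf value %d\n", LP->Value);
  return 0;
}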
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index e89c393b6a9b..5cabcadabdbc 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -15,6 +15,7 @@
#define DEBUG_TYPE "decoder-emitter"
#include "CodeGenTarget.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallString.h"
@@ -142,7 +143,7 @@ static int Value(bit_value_t V) {
return ValueNotSet(V) ? -1 : (V == BIT_FALSE ? 0 : 1);
}
static bit_value_t bitFromBits(const BitsInit &bits, unsigned index) {
- if (BitInit *bit = dynamic_cast<BitInit*>(bits.getBit(index)))
+ if (BitInit *bit = dyn_cast<BitInit>(bits.getBit(index)))
return bit->getValue() ? BIT_TRUE : BIT_FALSE;
// The bit is uninitialized.
@@ -741,7 +742,7 @@ void FixedLenDecoderEmitter::emitTable(formatted_raw_ostream &OS,
switch (*I) {
default:
- throw "invalid decode table opcode";
+ PrintFatalError("invalid decode table opcode");
case MCD::OPC_ExtractField: {
++I;
unsigned Start = *I++;
@@ -1757,8 +1758,8 @@ static bool populateInstruction(const CodeGenInstruction &CGI, unsigned Opc,
// for decoding register classes.
 // FIXME: This needs to be extended to handle instructions with custom
// decoder methods, and operands with (simple) MIOperandInfo's.
- TypedInit *TI = dynamic_cast<TypedInit*>(NI->first);
- RecordRecTy *Type = dynamic_cast<RecordRecTy*>(TI->getType());
+ TypedInit *TI = cast<TypedInit>(NI->first);
+ RecordRecTy *Type = cast<RecordRecTy>(TI->getType());
Record *TypeRecord = Type->getRecord();
bool isReg = false;
if (TypeRecord->isSubClassOf("RegisterOperand"))
@@ -1770,7 +1771,7 @@ static bool populateInstruction(const CodeGenInstruction &CGI, unsigned Opc,
RecordVal *DecoderString = TypeRecord->getValue("DecoderMethod");
StringInit *String = DecoderString ?
- dynamic_cast<StringInit*>(DecoderString->getValue()) : 0;
+ dyn_cast<StringInit>(DecoderString->getValue()) : 0;
if (!isReg && String && String->getValue() != "")
Decoder = String->getValue();
@@ -1781,11 +1782,11 @@ static bool populateInstruction(const CodeGenInstruction &CGI, unsigned Opc,
for (unsigned bi = 0; bi < Bits.getNumBits(); ++bi) {
VarInit *Var = 0;
- VarBitInit *BI = dynamic_cast<VarBitInit*>(Bits.getBit(bi));
+ VarBitInit *BI = dyn_cast<VarBitInit>(Bits.getBit(bi));
if (BI)
- Var = dynamic_cast<VarInit*>(BI->getVariable());
+ Var = dyn_cast<VarInit>(BI->getBitVar());
else
- Var = dynamic_cast<VarInit*>(Bits.getBit(bi));
+ Var = dyn_cast<VarInit>(Bits.getBit(bi));
if (!Var) {
if (Base != ~0U) {
@@ -1882,7 +1883,7 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " uint64_t Bits = STI.getFeatureBits();\n"
<< "\n"
<< " const uint8_t *Ptr = DecodeTable;\n"
- << " uint32_t CurFieldValue;\n"
+ << " uint32_t CurFieldValue = 0;\n"
<< " DecodeStatus S = MCDisassembler::Success;\n"
<< " for (;;) {\n"
<< " ptrdiff_t Loc = Ptr - DecodeTable;\n"
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index b41ad94aca39..48d41d7b96bd 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -16,8 +16,10 @@
#include "CodeGenDAGPatterns.h"
#include "CodeGenSchedule.h"
#include "CodeGenTarget.h"
+#include "TableGenBackends.h"
#include "SequenceToOffsetTable.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
#include <algorithm>
@@ -89,7 +91,7 @@ InstrInfoEmitter::GetOperandInfo(const CodeGenInstruction &Inst) {
for (unsigned j = 0, e = Inst.Operands[i].MINumOperands; j != e; ++j) {
OperandList.push_back(Inst.Operands[i]);
- Record *OpR = dynamic_cast<DefInit*>(MIOI->getArg(j))->getDef();
+ Record *OpR = cast<DefInit>(MIOI->getArg(j))->getDef();
OperandList.back().Rec = OpR;
}
}
@@ -299,16 +301,15 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
const OperandInfoMapTy &OpInfo,
raw_ostream &OS) {
int MinOperands = 0;
- if (!Inst.Operands.size() == 0)
+ if (!Inst.Operands.empty())
// Each logical operand can be multiple MI operands.
MinOperands = Inst.Operands.back().MIOperandNo +
Inst.Operands.back().MINumOperands;
- Record *ItinDef = Inst.TheDef->getValueAsDef("Itinerary");
OS << " { ";
OS << Num << ",\t" << MinOperands << ",\t"
<< Inst.Operands.NumDefs << ",\t"
- << SchedModels.getItinClassIdx(ItinDef) << ",\t"
+ << SchedModels.getSchedClassIdx(Inst) << ",\t"
<< Inst.TheDef->getValueAsInt("Size") << ",\t0";
 // Emit all of the target independent flags...
@@ -343,13 +344,14 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
// Emit all of the target-specific flags...
BitsInit *TSF = Inst.TheDef->getValueAsBitsInit("TSFlags");
- if (!TSF) throw "no TSFlags?";
+ if (!TSF)
+ PrintFatalError("no TSFlags?");
uint64_t Value = 0;
for (unsigned i = 0, e = TSF->getNumBits(); i != e; ++i) {
- if (BitInit *Bit = dynamic_cast<BitInit*>(TSF->getBit(i)))
+ if (BitInit *Bit = dyn_cast<BitInit>(TSF->getBit(i)))
Value |= uint64_t(Bit->getValue()) << i;
else
- throw "Invalid TSFlags bit in " + Inst.TheDef->getName();
+ PrintFatalError("Invalid TSFlags bit in " + Inst.TheDef->getName());
}
OS << ", 0x";
OS.write_hex(Value);
@@ -416,6 +418,7 @@ namespace llvm {
void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS) {
InstrInfoEmitter(RK).run(OS);
+ EmitMapTable(RK, OS);
}
} // End llvm namespace
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index 155d1abea331..fe55242930b1 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -15,6 +15,7 @@
#include "CodeGenTarget.h"
#include "SequenceToOffsetTable.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/StringMatcher.h"
#include "llvm/TableGen/TableGenBackend.h"
@@ -249,7 +250,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
if (EVT(VT).isInteger()) {
unsigned BitWidth = EVT(VT).getSizeInBits();
switch (BitWidth) {
- default: throw "unhandled integer type width in intrinsic!";
+ default: PrintFatalError("unhandled integer type width in intrinsic!");
case 1: return Sig.push_back(IIT_I1);
case 8: return Sig.push_back(IIT_I8);
case 16: return Sig.push_back(IIT_I16);
@@ -259,7 +260,7 @@ static void EncodeFixedValueType(MVT::SimpleValueType VT,
}
switch (VT) {
- default: throw "unhandled MVT in intrinsic!";
+ default: PrintFatalError("unhandled MVT in intrinsic!");
case MVT::f32: return Sig.push_back(IIT_F32);
case MVT::f64: return Sig.push_back(IIT_F64);
case MVT::Metadata: return Sig.push_back(IIT_METADATA);
@@ -328,7 +329,7 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
if (EVT(VT).isVector()) {
EVT VVT = VT;
switch (VVT.getVectorNumElements()) {
- default: throw "unhandled vector type width in intrinsic!";
+ default: PrintFatalError("unhandled vector type width in intrinsic!");
case 2: Sig.push_back(IIT_V2); break;
case 4: Sig.push_back(IIT_V4); break;
case 8: Sig.push_back(IIT_V8); break;
@@ -510,10 +511,10 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
OS << "// Add parameter attributes that are not common to all intrinsics.\n";
OS << "#ifdef GET_INTRINSIC_ATTRIBUTES\n";
if (TargetOnly)
- OS << "static AttrListPtr getAttributes(" << TargetPrefix
+ OS << "static AttrListPtr getAttributes(LLVMContext &C, " << TargetPrefix
<< "Intrinsic::ID id) {\n";
else
- OS << "AttrListPtr Intrinsic::getAttributes(ID id) {\n";
+ OS << "AttrListPtr Intrinsic::getAttributes(LLVMContext &C, ID id) {\n";
// Compute the maximum number of attribute arguments and the map
typedef std::map<const CodeGenIntrinsic*, unsigned,
@@ -547,6 +548,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
OS << " AttributeWithIndex AWI[" << maxArgAttrs+1 << "];\n";
OS << " unsigned NumAttrs = 0;\n";
OS << " if (id != 0) {\n";
+ OS << " SmallVector<Attributes::AttrVal, 8> AttrVec;\n";
OS << " switch(IntrinsicsToAttributesMap[id - ";
if (TargetOnly)
OS << "Intrinsic::num_intrinsics";
@@ -564,58 +566,49 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
unsigned numAttrs = 0;
 // The argument attributes are already sorted by argument index.
- for (unsigned ai = 0, ae = intrinsic.ArgumentAttributes.size(); ai != ae;) {
- unsigned argNo = intrinsic.ArgumentAttributes[ai].first;
+ unsigned ai = 0, ae = intrinsic.ArgumentAttributes.size();
+ if (ae) {
+ while (ai != ae) {
+ unsigned argNo = intrinsic.ArgumentAttributes[ai].first;
- OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get("
- << argNo+1 << ", ";
+ OS << " AttrVec.clear();\n";
- bool moreThanOne = false;
+ do {
+ switch (intrinsic.ArgumentAttributes[ai].second) {
+ case CodeGenIntrinsic::NoCapture:
+ OS << " AttrVec.push_back(Attributes::NoCapture);\n";
+ break;
+ }
- do {
- if (moreThanOne) OS << '|';
+ ++ai;
+ } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
- switch (intrinsic.ArgumentAttributes[ai].second) {
- case CodeGenIntrinsic::NoCapture:
- OS << "Attribute::NoCapture";
- break;
- }
-
- ++ai;
- moreThanOne = true;
- } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
-
- OS << ");\n";
+ OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get(C, "
+ << argNo+1 << ", AttrVec);\n";
+ }
}
ModRefKind modRef = getModRefKind(intrinsic);
if (!intrinsic.canThrow || modRef || intrinsic.isNoReturn) {
- OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get(~0, ";
- bool Emitted = false;
- if (!intrinsic.canThrow) {
- OS << "Attribute::NoUnwind";
- Emitted = true;
- }
-
- if (intrinsic.isNoReturn) {
- if (Emitted) OS << '|';
- OS << "Attribute::NoReturn";
- Emitted = true;
- }
+ OS << " AttrVec.clear();\n";
+
+ if (!intrinsic.canThrow)
+ OS << " AttrVec.push_back(Attributes::NoUnwind);\n";
+ if (intrinsic.isNoReturn)
+ OS << " AttrVec.push_back(Attributes::NoReturn);\n";
switch (modRef) {
case MRK_none: break;
case MRK_readonly:
- if (Emitted) OS << '|';
- OS << "Attribute::ReadOnly";
+ OS << " AttrVec.push_back(Attributes::ReadOnly);\n";
break;
case MRK_readnone:
- if (Emitted) OS << '|';
- OS << "Attribute::ReadNone";
+ OS << " AttrVec.push_back(Attributes::ReadNone);\n";
break;
}
- OS << ");\n";
+ OS << " AWI[" << numAttrs++ << "] = AttributeWithIndex::get(C, "
+ << "AttrListPtr::FunctionIndex, AttrVec);\n";
}
if (numAttrs) {
@@ -628,7 +621,7 @@ EmitAttributes(const std::vector<CodeGenIntrinsic> &Ints, raw_ostream &OS) {
OS << " }\n";
OS << " }\n";
- OS << " return AttrListPtr::get(ArrayRef<AttributeWithIndex>(AWI, "
+ OS << " return AttrListPtr::get(C, ArrayRef<AttributeWithIndex>(AWI, "
"NumAttrs));\n";
OS << "}\n";
OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n";
@@ -700,8 +693,8 @@ EmitIntrinsicToGCCBuiltinMap(const std::vector<CodeGenIntrinsic> &Ints,
if (!BIM.insert(std::make_pair(Ints[i].GCCBuiltinName,
Ints[i].EnumName)).second)
- throw "Intrinsic '" + Ints[i].TheDef->getName() +
- "': duplicate GCC builtin name!";
+ PrintFatalError("Intrinsic '" + Ints[i].TheDef->getName() +
+ "': duplicate GCC builtin name!");
}
}
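
The generated attribute code now collects attribute values in a vector and hands the whole vector to the builder, instead of OR-ing flag constants into a mask. The sketch below mirrors the shape of that emitted code under stated assumptions: AttrVal and addAttributesAt are illustrative stand-ins, not the LLVM Attributes API.

// Sketch of vector-based attribute building (shape of the emitted code).
#include <cstdio>
#include <vector>

enum class AttrVal { NoUnwind, NoReturn, ReadOnly, ReadNone, NoCapture };

static void addAttributesAt(unsigned Index, const std::vector<AttrVal> &Vals) {
  std::printf("index %u gets %zu attributes\n", Index, Vals.size());
}

int main() {
  std::vector<AttrVal> AttrVec;

  // Per-argument attributes: clear, push, attach (mirrors the emitted loop).
  AttrVec.clear();
  AttrVec.push_back(AttrVal::NoCapture);
  addAttributesAt(/*ArgNo+1=*/1, AttrVec);

  // Function-level attributes.
  AttrVec.clear();
  AttrVec.push_back(AttrVal::NoUnwind);
  AttrVec.push_back(AttrVal::ReadNone);
  addAttributesAt(/*FunctionIndex=*/~0u, AttrVec);
  return 0;
}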
diff --git a/utils/TableGen/Makefile b/utils/TableGen/Makefile
index 0c4619d1a252..9bfd94b7576b 100644
--- a/utils/TableGen/Makefile
+++ b/utils/TableGen/Makefile
@@ -10,8 +10,6 @@
LEVEL = ../..
TOOLNAME = llvm-tblgen
USEDLIBS = LLVMTableGen.a LLVMSupport.a
-REQUIRES_EH := 1
-REQUIRES_RTTI := 1
# This tool has no plugins, optimize startup time.
TOOL_NO_EXPORTS = 1
diff --git a/utils/TableGen/PseudoLoweringEmitter.cpp b/utils/TableGen/PseudoLoweringEmitter.cpp
index 8d9d41954485..64aaee756b1d 100644
--- a/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -74,7 +74,7 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
IndexedMap<OpData> &OperandMap, unsigned BaseIdx) {
unsigned OpsAdded = 0;
for (unsigned i = 0, e = Dag->getNumArgs(); i != e; ++i) {
- if (DefInit *DI = dynamic_cast<DefInit*>(Dag->getArg(i))) {
+ if (DefInit *DI = dyn_cast<DefInit>(Dag->getArg(i))) {
// Physical register reference. Explicit check for the special case
// "zero_reg" definition.
if (DI->getDef()->isSubClassOf("Register") ||
@@ -90,7 +90,7 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
// FIXME: We probably shouldn't ever get a non-zero BaseIdx here.
assert(BaseIdx == 0 && "Named subargument in pseudo expansion?!");
if (DI->getDef() != Insn.Operands[BaseIdx + i].Rec)
- throw TGError(Rec->getLoc(),
+ PrintFatalError(Rec->getLoc(),
"Pseudo operand type '" + DI->getDef()->getName() +
"' does not match expansion operand type '" +
Insn.Operands[BaseIdx + i].Rec->getName() + "'");
@@ -100,11 +100,11 @@ addDagOperandMapping(Record *Rec, DagInit *Dag, CodeGenInstruction &Insn,
for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I)
OperandMap[BaseIdx + i + I].Kind = OpData::Operand;
OpsAdded += Insn.Operands[i].MINumOperands;
- } else if (IntInit *II = dynamic_cast<IntInit*>(Dag->getArg(i))) {
+ } else if (IntInit *II = dyn_cast<IntInit>(Dag->getArg(i))) {
OperandMap[BaseIdx + i].Kind = OpData::Imm;
OperandMap[BaseIdx + i].Data.Imm = II->getValue();
++OpsAdded;
- } else if (DagInit *SubDag = dynamic_cast<DagInit*>(Dag->getArg(i))) {
+ } else if (DagInit *SubDag = dyn_cast<DagInit>(Dag->getArg(i))) {
// Just add the operands recursively. This is almost certainly
// a constant value for a complex operand (> 1 MI operand).
unsigned NewOps =
@@ -127,24 +127,24 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
assert(Dag && "Missing result instruction in pseudo expansion!");
DEBUG(dbgs() << " Result: " << *Dag << "\n");
- DefInit *OpDef = dynamic_cast<DefInit*>(Dag->getOperator());
+ DefInit *OpDef = dyn_cast<DefInit>(Dag->getOperator());
if (!OpDef)
- throw TGError(Rec->getLoc(), Rec->getName() +
+ PrintFatalError(Rec->getLoc(), Rec->getName() +
" has unexpected operator type!");
Record *Operator = OpDef->getDef();
if (!Operator->isSubClassOf("Instruction"))
- throw TGError(Rec->getLoc(), "Pseudo result '" + Operator->getName() +
- "' is not an instruction!");
+ PrintFatalError(Rec->getLoc(), "Pseudo result '" + Operator->getName() +
+ "' is not an instruction!");
CodeGenInstruction Insn(Operator);
if (Insn.isCodeGenOnly || Insn.isPseudo)
- throw TGError(Rec->getLoc(), "Pseudo result '" + Operator->getName() +
- "' cannot be another pseudo instruction!");
+ PrintFatalError(Rec->getLoc(), "Pseudo result '" + Operator->getName() +
+ "' cannot be another pseudo instruction!");
if (Insn.Operands.size() != Dag->getNumArgs())
- throw TGError(Rec->getLoc(), "Pseudo result '" + Operator->getName() +
- "' operand count mismatch");
+ PrintFatalError(Rec->getLoc(), "Pseudo result '" + Operator->getName() +
+ "' operand count mismatch");
unsigned NumMIOperands = 0;
for (unsigned i = 0, e = Insn.Operands.size(); i != e; ++i)
@@ -156,7 +156,7 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
// If there are more operands that weren't in the DAG, they have to
// be operands that have default values, or we have an error. Currently,
- // PredicateOperand and OptionalDefOperand both have default values.
+ // operands that are a subclass of OperandWithDefaultOp have default values.
// Validate that each result pattern argument has a matching (by name)
@@ -179,9 +179,9 @@ void PseudoLoweringEmitter::evaluateExpansion(Record *Rec) {
StringMap<unsigned>::iterator SourceOp =
SourceOperands.find(Dag->getArgName(i));
if (SourceOp == SourceOperands.end())
- throw TGError(Rec->getLoc(),
- "Pseudo output operand '" + Dag->getArgName(i) +
- "' has no matching source operand.");
+ PrintFatalError(Rec->getLoc(),
+ "Pseudo output operand '" + Dag->getArgName(i) +
+ "' has no matching source operand.");
// Map the source operand to the destination operand index for each
// MachineInstr operand.
for (unsigned I = 0, E = Insn.Operands[i].MINumOperands; I != E; ++I)
@@ -267,7 +267,7 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
void PseudoLoweringEmitter::run(raw_ostream &o) {
Record *ExpansionClass = Records.getClass("PseudoInstExpansion");
- Record *InstructionClass = Records.getClass("PseudoInstExpansion");
+ Record *InstructionClass = Records.getClass("Instruction");
assert(ExpansionClass && "PseudoInstExpansion class definition missing!");
assert(InstructionClass && "Instruction class definition missing!");
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 02546dfca715..95b626723830 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -62,6 +62,8 @@ private:
void EmitRegUnitPressure(raw_ostream &OS, const CodeGenRegBank &RegBank,
const std::string &ClassName);
+ void emitComposeSubRegIndices(raw_ostream &OS, CodeGenRegBank &RegBank,
+ const std::string &ClassName);
};
} // End anonymous namespace
@@ -325,7 +327,7 @@ RegisterInfoEmitter::EmitRegMappingTables(raw_ostream &OS,
if (!V || !V->getValue())
continue;
- DefInit *DI = dynamic_cast<DefInit*>(V->getValue());
+ DefInit *DI = cast<DefInit>(V->getValue());
Record *Alias = DI->getDef();
DwarfRegNums[Reg] = DwarfRegNums[Alias];
}
@@ -530,6 +532,102 @@ static void printDiff16(raw_ostream &OS, uint16_t Val) {
OS << Val;
}
+// Try to combine Idx's compose map into Vec if it is compatible.
+// Return false if it's not possible.
+static bool combine(const CodeGenSubRegIndex *Idx,
+ SmallVectorImpl<CodeGenSubRegIndex*> &Vec) {
+ const CodeGenSubRegIndex::CompMap &Map = Idx->getComposites();
+ for (CodeGenSubRegIndex::CompMap::const_iterator
+ I = Map.begin(), E = Map.end(); I != E; ++I) {
+ CodeGenSubRegIndex *&Entry = Vec[I->first->EnumValue - 1];
+ if (Entry && Entry != I->second)
+ return false;
+ }
+
+ // All entries are compatible. Make it so.
+ for (CodeGenSubRegIndex::CompMap::const_iterator
+ I = Map.begin(), E = Map.end(); I != E; ++I)
+ Vec[I->first->EnumValue - 1] = I->second;
+ return true;
+}
+
+static const char *getMinimalTypeForRange(uint64_t Range) {
+ assert(Range < 0xFFFFFFFFULL && "Enum too large");
+ if (Range > 0xFFFF)
+ return "uint32_t";
+ if (Range > 0xFF)
+ return "uint16_t";
+ return "uint8_t";
+}
+
+void
+RegisterInfoEmitter::emitComposeSubRegIndices(raw_ostream &OS,
+ CodeGenRegBank &RegBank,
+ const std::string &ClName) {
+ ArrayRef<CodeGenSubRegIndex*> SubRegIndices = RegBank.getSubRegIndices();
+ OS << "unsigned " << ClName
+ << "::composeSubRegIndicesImpl(unsigned IdxA, unsigned IdxB) const {\n";
+
+ // Many sub-register indexes are composition-compatible, meaning that
+ //
+ // compose(IdxA, IdxB) == compose(IdxA', IdxB)
+ //
+ // for many IdxA, IdxA' pairs. Not all sub-register indexes can be composed.
+ // The illegal entries can be used as wildcards to compress the table further.
+
+ // Map each sub-register index to a compatible table row.
+ SmallVector<unsigned, 4> RowMap;
+ SmallVector<SmallVector<CodeGenSubRegIndex*, 4>, 4> Rows;
+
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
+ unsigned Found = ~0u;
+ for (unsigned r = 0, re = Rows.size(); r != re; ++r) {
+ if (combine(SubRegIndices[i], Rows[r])) {
+ Found = r;
+ break;
+ }
+ }
+ if (Found == ~0u) {
+ Found = Rows.size();
+ Rows.resize(Found + 1);
+ Rows.back().resize(SubRegIndices.size());
+ combine(SubRegIndices[i], Rows.back());
+ }
+ RowMap.push_back(Found);
+ }
+
+ // Output the row map if there are multiple rows.
+ if (Rows.size() > 1) {
+ OS << " static const " << getMinimalTypeForRange(Rows.size())
+ << " RowMap[" << SubRegIndices.size() << "] = {\n ";
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
+ OS << RowMap[i] << ", ";
+ OS << "\n };\n";
+ }
+
+ // Output the rows.
+ OS << " static const " << getMinimalTypeForRange(SubRegIndices.size()+1)
+ << " Rows[" << Rows.size() << "][" << SubRegIndices.size() << "] = {\n";
+ for (unsigned r = 0, re = Rows.size(); r != re; ++r) {
+ OS << " { ";
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i)
+ if (Rows[r][i])
+ OS << Rows[r][i]->EnumValue << ", ";
+ else
+ OS << "0, ";
+ OS << "},\n";
+ }
+ OS << " };\n\n";
+
+ OS << " --IdxA; assert(IdxA < " << SubRegIndices.size() << ");\n"
+ << " --IdxB; assert(IdxB < " << SubRegIndices.size() << ");\n";
+ if (Rows.size() > 1)
+ OS << " return Rows[RowMap[IdxA]][IdxB];\n";
+ else
+ OS << " return Rows[0][IdxB];\n";
+ OS << "}\n\n";
+}
+
//
// runMCDesc - Print out MC register descriptions.
//
@@ -751,7 +849,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
BitsInit *BI = Reg->getValueAsBitsInit("HWEncoding");
uint64_t Value = 0;
for (unsigned b = 0, be = BI->getNumBits(); b != be; ++b) {
- if (BitInit *B = dynamic_cast<BitInit*>(BI->getBit(b)))
+ if (BitInit *B = dyn_cast<BitInit>(BI->getBit(b)))
Value |= (uint64_t)B->getValue() << b;
}
OS << " " << Value << ",\n";
@@ -770,7 +868,7 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
<< TargetName << "RegDiffLists, "
<< TargetName << "RegStrings, "
<< TargetName << "SubRegIdxLists, "
- << SubRegIndices.size() << ",\n"
+ << (SubRegIndices.size() + 1) << ",\n"
<< " " << TargetName << "RegEncodingTable);\n\n";
EmitRegMapping(OS, Regs, false);
@@ -802,16 +900,17 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
<< " virtual bool needsStackRealignment(const MachineFunction &) const\n"
<< " { return false; }\n";
if (!RegBank.getSubRegIndices().empty()) {
- OS << " unsigned composeSubRegIndices(unsigned, unsigned) const;\n"
- << " const TargetRegisterClass *"
+ OS << " virtual unsigned composeSubRegIndicesImpl"
+ << "(unsigned, unsigned) const;\n"
+ << " virtual const TargetRegisterClass *"
"getSubClassWithSubReg(const TargetRegisterClass*, unsigned) const;\n";
}
- OS << " const RegClassWeight &getRegClassWeight("
+ OS << " virtual const RegClassWeight &getRegClassWeight("
<< "const TargetRegisterClass *RC) const;\n"
- << " unsigned getNumRegPressureSets() const;\n"
- << " const char *getRegPressureSetName(unsigned Idx) const;\n"
- << " unsigned getRegPressureSetLimit(unsigned Idx) const;\n"
- << " const int *getRegClassPressureSets("
+ << " virtual unsigned getNumRegPressureSets() const;\n"
+ << " virtual const char *getRegPressureSetName(unsigned Idx) const;\n"
+ << " virtual unsigned getRegPressureSetLimit(unsigned Idx) const;\n"
+ << " virtual const int *getRegClassPressureSets("
<< "const TargetRegisterClass *RC) const;\n"
<< "};\n\n";
@@ -876,15 +975,23 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
VTSeqs.emit(OS, printSimpleValueType, "MVT::Other");
OS << "};\n";
- // Emit SubRegIndex names, skipping 0
- OS << "\nstatic const char *const SubRegIndexTable[] = { \"";
+ // Emit SubRegIndex names, skipping 0.
+ OS << "\nstatic const char *const SubRegIndexNameTable[] = { \"";
for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
OS << SubRegIndices[i]->getName();
- if (i+1 != e)
+ if (i + 1 != e)
OS << "\", \"";
}
OS << "\" };\n\n";
+ // Emit SubRegIndex lane masks, including 0.
+ OS << "\nstatic const unsigned SubRegIndexLaneMaskTable[] = {\n ~0u,\n";
+ for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
+ OS << format(" 0x%08x, // ", SubRegIndices[i]->LaneMask)
+ << SubRegIndices[i]->getName() << '\n';
+ }
+ OS << " };\n\n";
+
OS << "\n";
// Now that all of the structs have been emitted, emit the instances.
@@ -1046,31 +1153,8 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
std::string ClassName = Target.getName() + "GenRegisterInfo";
- // Emit composeSubRegIndices
- if (!SubRegIndices.empty()) {
- OS << "unsigned " << ClassName
- << "::composeSubRegIndices(unsigned IdxA, unsigned IdxB) const {\n"
- << " switch (IdxA) {\n"
- << " default:\n return IdxB;\n";
- for (unsigned i = 0, e = SubRegIndices.size(); i != e; ++i) {
- bool Open = false;
- for (unsigned j = 0; j != e; ++j) {
- if (CodeGenSubRegIndex *Comp =
- SubRegIndices[i]->compose(SubRegIndices[j])) {
- if (!Open) {
- OS << " case " << SubRegIndices[i]->getQualifiedName()
- << ": switch(IdxB) {\n default: return IdxB;\n";
- Open = true;
- }
- OS << " case " << SubRegIndices[j]->getQualifiedName()
- << ": return " << Comp->getQualifiedName() << ";\n";
- }
- }
- if (Open)
- OS << " }\n";
- }
- OS << " }\n}\n\n";
- }
+ if (!SubRegIndices.empty())
+ emitComposeSubRegIndices(OS, RegBank, ClassName);
// Emit getSubClassWithSubReg.
if (!SubRegIndices.empty()) {
@@ -1084,7 +1168,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
else if (RegisterClasses.size() < UINT16_MAX)
OS << " static const uint16_t Table[";
else
- throw "Too many register classes.";
+ PrintFatalError("Too many register classes.");
OS << RegisterClasses.size() << "][" << SubRegIndices.size() << "] = {\n";
for (unsigned rci = 0, rce = RegisterClasses.size(); rci != rce; ++rci) {
const CodeGenRegisterClass &RC = *RegisterClasses[rci];
@@ -1122,7 +1206,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< "(unsigned RA, unsigned DwarfFlavour, unsigned EHFlavour)\n"
<< " : TargetRegisterInfo(" << TargetName << "RegInfoDesc"
<< ", RegisterClasses, RegisterClasses+" << RegisterClasses.size() <<",\n"
- << " SubRegIndexTable) {\n"
+ << " SubRegIndexNameTable, SubRegIndexLaneMaskTable) {\n"
<< " InitMCRegisterInfo(" << TargetName << "RegDesc, "
<< Regs.size()+1 << ", RA,\n " << TargetName
<< "MCRegisterClasses, " << RegisterClasses.size() << ",\n"
@@ -1131,7 +1215,7 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
<< " " << TargetName << "RegDiffLists,\n"
<< " " << TargetName << "RegStrings,\n"
<< " " << TargetName << "SubRegIdxLists,\n"
- << " " << SubRegIndices.size() << ",\n"
+ << " " << SubRegIndices.size() + 1 << ",\n"
<< " " << TargetName << "RegEncodingTable);\n\n";
EmitRegMapping(OS, Regs, true);
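
The new emitComposeSubRegIndices builds its table by letting indices with compatible compose maps share a row: a zero entry acts as a wildcard, so two indices merge whenever their non-zero entries agree, and a RowMap then points each index at its shared row. A self-contained sketch of that compression on toy data follows; the types, data, and function names are illustrative only.

// Standalone sketch of wildcard row compression (0 = "don't care").
#include <cstdio>
#include <vector>

using Row = std::vector<unsigned>;

// Merge Src into Dst if every non-zero entry matches; test first, then write.
static bool combineRows(const Row &Src, Row &Dst) {
  for (size_t i = 0; i != Src.size(); ++i)
    if (Src[i] && Dst[i] && Src[i] != Dst[i])
      return false;
  for (size_t i = 0; i != Src.size(); ++i)
    if (Src[i])
      Dst[i] = Src[i];
  return true;
}

int main() {
  // Per-index compose maps (0 = composition not defined for that pair).
  std::vector<Row> PerIndex = {{2, 0, 0}, {2, 3, 0}, {5, 0, 1}};
  std::vector<Row> Rows;
  std::vector<unsigned> RowMap;

  for (const Row &R : PerIndex) {
    size_t Found = Rows.size();
    for (size_t r = 0; r != Rows.size(); ++r)
      if (combineRows(R, Rows[r])) { Found = r; break; }
    if (Found == Rows.size()) {
      Rows.emplace_back(R.size(), 0u);
      combineRows(R, Rows.back());
    }
    RowMap.push_back(static_cast<unsigned>(Found));
  }
  std::printf("%zu indices compressed into %zu rows\n",
              PerIndex.size(), Rows.size());
  return 0;
}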
diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index d8ab2eeb2581..d4db152a9681 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h
@@ -29,8 +29,8 @@ namespace llvm {
/// Compute the layout of a table that contains all the sequences, possibly by
/// reusing entries.
///
-/// @param SeqT The sequence container. (vector or string).
-/// @param Less A stable comparator for SeqT elements.
+/// @tparam SeqT The sequence container. (vector or string).
+/// @tparam Less A stable comparator for SeqT elements.
template<typename SeqT, typename Less = std::less<typename SeqT::value_type> >
class SequenceToOffsetTable {
typedef typename SeqT::value_type ElemT;
@@ -82,7 +82,7 @@ public:
}
bool empty() const { return Seqs.empty(); }
-
+
/// layout - Computes the final table layout.
void layout() {
assert(Entries == 0 && "Can only call layout() once");
diff --git a/utils/TableGen/SetTheory.cpp b/utils/TableGen/SetTheory.cpp
index 46e6db173ea9..0dd9853843fe 100644
--- a/utils/TableGen/SetTheory.cpp
+++ b/utils/TableGen/SetTheory.cpp
@@ -27,20 +27,20 @@ typedef SetTheory::RecVec RecVec;
// (add a, b, ...) Evaluate and union all arguments.
struct AddOp : public SetTheory::Operator {
- void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
- ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts);
+ void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
+ ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc);
}
};
// (sub Add, Sub, ...) Set difference.
struct SubOp : public SetTheory::Operator {
- void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+ void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (Expr->arg_size() < 2)
- throw "Set difference needs at least two arguments: " +
- Expr->getAsString();
+ PrintFatalError(Loc, "Set difference needs at least two arguments: " +
+ Expr->getAsString());
RecSet Add, Sub;
- ST.evaluate(*Expr->arg_begin(), Add);
- ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Sub);
+ ST.evaluate(*Expr->arg_begin(), Add, Loc);
+ ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Sub, Loc);
for (RecSet::iterator I = Add.begin(), E = Add.end(); I != E; ++I)
if (!Sub.count(*I))
Elts.insert(*I);
@@ -49,12 +49,13 @@ struct SubOp : public SetTheory::Operator {
// (and S1, S2) Set intersection.
struct AndOp : public SetTheory::Operator {
- void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+ void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (Expr->arg_size() != 2)
- throw "Set intersection requires two arguments: " + Expr->getAsString();
+ PrintFatalError(Loc, "Set intersection requires two arguments: " +
+ Expr->getAsString());
RecSet S1, S2;
- ST.evaluate(Expr->arg_begin()[0], S1);
- ST.evaluate(Expr->arg_begin()[1], S2);
+ ST.evaluate(Expr->arg_begin()[0], S1, Loc);
+ ST.evaluate(Expr->arg_begin()[1], S2, Loc);
for (RecSet::iterator I = S1.begin(), E = S1.end(); I != E; ++I)
if (S2.count(*I))
Elts.insert(*I);
@@ -65,17 +66,19 @@ struct AndOp : public SetTheory::Operator {
struct SetIntBinOp : public SetTheory::Operator {
virtual void apply2(SetTheory &ST, DagInit *Expr,
RecSet &Set, int64_t N,
- RecSet &Elts) =0;
+ RecSet &Elts, ArrayRef<SMLoc> Loc) =0;
- void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+ void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (Expr->arg_size() != 2)
- throw "Operator requires (Op Set, Int) arguments: " + Expr->getAsString();
+ PrintFatalError(Loc, "Operator requires (Op Set, Int) arguments: " +
+ Expr->getAsString());
RecSet Set;
- ST.evaluate(Expr->arg_begin()[0], Set);
- IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[1]);
+ ST.evaluate(Expr->arg_begin()[0], Set, Loc);
+ IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[1]);
if (!II)
- throw "Second argument must be an integer: " + Expr->getAsString();
- apply2(ST, Expr, Set, II->getValue(), Elts);
+ PrintFatalError(Loc, "Second argument must be an integer: " +
+ Expr->getAsString());
+ apply2(ST, Expr, Set, II->getValue(), Elts, Loc);
}
};
@@ -83,9 +86,10 @@ struct SetIntBinOp : public SetTheory::Operator {
struct ShlOp : public SetIntBinOp {
void apply2(SetTheory &ST, DagInit *Expr,
RecSet &Set, int64_t N,
- RecSet &Elts) {
+ RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (N < 0)
- throw "Positive shift required: " + Expr->getAsString();
+ PrintFatalError(Loc, "Positive shift required: " +
+ Expr->getAsString());
if (unsigned(N) < Set.size())
Elts.insert(Set.begin() + N, Set.end());
}
@@ -95,9 +99,10 @@ struct ShlOp : public SetIntBinOp {
struct TruncOp : public SetIntBinOp {
void apply2(SetTheory &ST, DagInit *Expr,
RecSet &Set, int64_t N,
- RecSet &Elts) {
+ RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (N < 0)
- throw "Positive length required: " + Expr->getAsString();
+ PrintFatalError(Loc, "Positive length required: " +
+ Expr->getAsString());
if (unsigned(N) > Set.size())
N = Set.size();
Elts.insert(Set.begin(), Set.begin() + N);
@@ -112,7 +117,7 @@ struct RotOp : public SetIntBinOp {
void apply2(SetTheory &ST, DagInit *Expr,
RecSet &Set, int64_t N,
- RecSet &Elts) {
+ RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (Reverse)
N = -N;
// N > 0 -> rotate left, N < 0 -> rotate right.
@@ -131,9 +136,10 @@ struct RotOp : public SetIntBinOp {
struct DecimateOp : public SetIntBinOp {
void apply2(SetTheory &ST, DagInit *Expr,
RecSet &Set, int64_t N,
- RecSet &Elts) {
+ RecSet &Elts, ArrayRef<SMLoc> Loc) {
if (N <= 0)
- throw "Positive stride required: " + Expr->getAsString();
+ PrintFatalError(Loc, "Positive stride required: " +
+ Expr->getAsString());
for (unsigned I = 0; I < Set.size(); I += N)
Elts.insert(Set[I]);
}
@@ -141,12 +147,12 @@ struct DecimateOp : public SetIntBinOp {
// (interleave S1, S2, ...) Interleave elements of the arguments.
struct InterleaveOp : public SetTheory::Operator {
- void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+ void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
// Evaluate the arguments individually.
SmallVector<RecSet, 4> Args(Expr->getNumArgs());
unsigned MaxSize = 0;
for (unsigned i = 0, e = Expr->getNumArgs(); i != e; ++i) {
- ST.evaluate(Expr->getArg(i), Args[i]);
+ ST.evaluate(Expr->getArg(i), Args[i], Loc);
MaxSize = std::max(MaxSize, unsigned(Args[i].size()));
}
// Interleave arguments into Elts.
@@ -159,41 +165,42 @@ struct InterleaveOp : public SetTheory::Operator {
// (sequence "Format", From, To) Generate a sequence of records by name.
struct SequenceOp : public SetTheory::Operator {
- void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) {
+ void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
int Step = 1;
if (Expr->arg_size() > 4)
- throw "Bad args to (sequence \"Format\", From, To): " +
- Expr->getAsString();
+ PrintFatalError(Loc, "Bad args to (sequence \"Format\", From, To): " +
+ Expr->getAsString());
else if (Expr->arg_size() == 4) {
- if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[3])) {
+ if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[3])) {
Step = II->getValue();
} else
- throw "Stride must be an integer: " + Expr->getAsString();
+ PrintFatalError(Loc, "Stride must be an integer: " +
+ Expr->getAsString());
}
std::string Format;
- if (StringInit *SI = dynamic_cast<StringInit*>(Expr->arg_begin()[0]))
+ if (StringInit *SI = dyn_cast<StringInit>(Expr->arg_begin()[0]))
Format = SI->getValue();
else
- throw "Format must be a string: " + Expr->getAsString();
+ PrintFatalError(Loc, "Format must be a string: " + Expr->getAsString());
int64_t From, To;
- if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[1]))
+ if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[1]))
From = II->getValue();
else
- throw "From must be an integer: " + Expr->getAsString();
+ PrintFatalError(Loc, "From must be an integer: " + Expr->getAsString());
if (From < 0 || From >= (1 << 30))
- throw "From out of range";
+ PrintFatalError(Loc, "From out of range");
- if (IntInit *II = dynamic_cast<IntInit*>(Expr->arg_begin()[2]))
+ if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[2]))
To = II->getValue();
else
- throw "From must be an integer: " + Expr->getAsString();
+ PrintFatalError(Loc, "From must be an integer: " + Expr->getAsString());
if (To < 0 || To >= (1 << 30))
- throw "To out of range";
+ PrintFatalError(Loc, "To out of range");
RecordKeeper &Records =
- dynamic_cast<DefInit&>(*Expr->getOperator()).getDef()->getRecords();
+ cast<DefInit>(Expr->getOperator())->getDef()->getRecords();
Step *= From <= To ? 1 : -1;
while (true) {
@@ -206,7 +213,8 @@ struct SequenceOp : public SetTheory::Operator {
OS << format(Format.c_str(), unsigned(From));
Record *Rec = Records.getDef(OS.str());
if (!Rec)
- throw "No def named '" + Name + "': " + Expr->getAsString();
+ PrintFatalError(Loc, "No def named '" + Name + "': " +
+ Expr->getAsString());
// Try to reevaluate Rec in case it is a set.
if (const RecVec *Result = ST.expand(Rec))
Elts.insert(Result->begin(), Result->end());
@@ -225,7 +233,7 @@ struct FieldExpander : public SetTheory::Expander {
FieldExpander(StringRef fn) : FieldName(fn) {}
void expand(SetTheory &ST, Record *Def, RecSet &Elts) {
- ST.evaluate(Def->getValueInit(FieldName), Elts);
+ ST.evaluate(Def->getValueInit(FieldName), Elts, Def->getLoc());
}
};
} // end anonymous namespace
@@ -259,9 +267,9 @@ void SetTheory::addFieldExpander(StringRef ClassName, StringRef FieldName) {
addExpander(ClassName, new FieldExpander(FieldName));
}
-void SetTheory::evaluate(Init *Expr, RecSet &Elts) {
+void SetTheory::evaluate(Init *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) {
 // A def in a list can be just an element, or it may expand.
- if (DefInit *Def = dynamic_cast<DefInit*>(Expr)) {
+ if (DefInit *Def = dyn_cast<DefInit>(Expr)) {
if (const RecVec *Result = expand(Def->getDef()))
return Elts.insert(Result->begin(), Result->end());
Elts.insert(Def->getDef());
@@ -269,20 +277,20 @@ void SetTheory::evaluate(Init *Expr, RecSet &Elts) {
}
// Lists simply expand.
- if (ListInit *LI = dynamic_cast<ListInit*>(Expr))
- return evaluate(LI->begin(), LI->end(), Elts);
+ if (ListInit *LI = dyn_cast<ListInit>(Expr))
+ return evaluate(LI->begin(), LI->end(), Elts, Loc);
// Anything else must be a DAG.
- DagInit *DagExpr = dynamic_cast<DagInit*>(Expr);
+ DagInit *DagExpr = dyn_cast<DagInit>(Expr);
if (!DagExpr)
- throw "Invalid set element: " + Expr->getAsString();
- DefInit *OpInit = dynamic_cast<DefInit*>(DagExpr->getOperator());
+ PrintFatalError(Loc, "Invalid set element: " + Expr->getAsString());
+ DefInit *OpInit = dyn_cast<DefInit>(DagExpr->getOperator());
if (!OpInit)
- throw "Bad set expression: " + Expr->getAsString();
+ PrintFatalError(Loc, "Bad set expression: " + Expr->getAsString());
Operator *Op = Operators.lookup(OpInit->getDef()->getName());
if (!Op)
- throw "Unknown set operator: " + Expr->getAsString();
- Op->apply(*this, DagExpr, Elts);
+ PrintFatalError(Loc, "Unknown set operator: " + Expr->getAsString());
+ Op->apply(*this, DagExpr, Elts, Loc);
}
const RecVec *SetTheory::expand(Record *Set) {
@@ -292,19 +300,19 @@ const RecVec *SetTheory::expand(Record *Set) {
return &I->second;
// This is the first time we see Set. Find a suitable expander.
- try {
- const std::vector<Record*> &SC = Set->getSuperClasses();
- for (unsigned i = 0, e = SC.size(); i != e; ++i)
- if (Expander *Exp = Expanders.lookup(SC[i]->getName())) {
- // This breaks recursive definitions.
- RecVec &EltVec = Expansions[Set];
- RecSet Elts;
- Exp->expand(*this, Set, Elts);
- EltVec.assign(Elts.begin(), Elts.end());
- return &EltVec;
- }
- } catch (const std::string &Error) {
- throw TGError(Set->getLoc(), Error);
+ const std::vector<Record*> &SC = Set->getSuperClasses();
+ for (unsigned i = 0, e = SC.size(); i != e; ++i) {
+ // Skip unnamed superclasses.
+ if (!dyn_cast<StringInit>(SC[i]->getNameInit()))
+ continue;
+ if (Expander *Exp = Expanders.lookup(SC[i]->getName())) {
+ // This breaks recursive definitions.
+ RecVec &EltVec = Expansions[Set];
+ RecSet Elts;
+ Exp->expand(*this, Set, Elts);
+ EltVec.assign(Elts.begin(), Elts.end());
+ return &EltVec;
+ }
}
// Set is not expandable.
diff --git a/utils/TableGen/SetTheory.h b/utils/TableGen/SetTheory.h
index b394058f4c35..122372ab33c0 100644
--- a/utils/TableGen/SetTheory.h
+++ b/utils/TableGen/SetTheory.h
@@ -49,6 +49,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/SourceMgr.h"
#include <map>
#include <vector>
@@ -72,7 +73,8 @@ public:
/// apply - Apply this operator to Expr's arguments and insert the result
/// in Elts.
- virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts) =0;
+ virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts,
+ ArrayRef<SMLoc> Loc) =0;
};
/// Expander - A callback function that can transform a Record representing a
@@ -119,13 +121,13 @@ public:
void addOperator(StringRef Name, Operator*);
/// evaluate - Evaluate Expr and append the resulting set to Elts.
- void evaluate(Init *Expr, RecSet &Elts);
+ void evaluate(Init *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc);
/// evaluate - Evaluate a sequence of Inits and append to Elts.
template<typename Iter>
- void evaluate(Iter begin, Iter end, RecSet &Elts) {
+ void evaluate(Iter begin, Iter end, RecSet &Elts, ArrayRef<SMLoc> Loc) {
while (begin != end)
- evaluate(*begin++, Elts);
+ evaluate(*begin++, Elts, Loc);
}
/// expand - Expand a record into a set of elements if possible. Return a
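
The SetTheory changes in the two files above thread a source location through evaluate() and Operator::apply() so diagnostics point at the record being expanded instead of surfacing as bare thrown strings. A minimal sketch of that pattern, under assumed names (Loc, fail, evalList are illustrative, not the real SMLoc or SetTheory API):

// Sketch of threading a source location through a recursive evaluator.
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

struct Loc { std::string File; unsigned Line; };

[[noreturn]] static void fail(const Loc &L, const std::string &Msg) {
  std::fprintf(stderr, "%s:%u: error: %s\n", L.File.c_str(), L.Line,
               Msg.c_str());
  std::exit(1);
}

// Every evaluation step receives the location of the def being expanded,
// so nested failures still report a useful position.
static int evalList(const std::vector<int> &Args, const Loc &L) {
  if (Args.size() < 2)
    fail(L, "set difference needs at least two arguments");
  int Result = Args[0];
  for (size_t i = 1; i != Args.size(); ++i)
    Result -= Args[i];
  return Result;
}

int main() {
  Loc Where{"Target.td", 42};
  std::printf("result = %d\n", evalList({10, 3, 2}, Where));
  return 0;
}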
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 34723439596e..f1a06bb52887 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -11,13 +11,18 @@
//
//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "subtarget-emitter"
+
#include "CodeGenTarget.h"
#include "CodeGenSchedule.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include <algorithm>
#include <map>
#include <string>
@@ -26,6 +31,32 @@ using namespace llvm;
namespace {
class SubtargetEmitter {
+ // Each processor has a SchedClassDesc table with an entry for each SchedClass.
+ // The SchedClassDesc table indexes into a global write resource table, write
+ // latency table, and read advance table.
+ struct SchedClassTables {
+ std::vector<std::vector<MCSchedClassDesc> > ProcSchedClasses;
+ std::vector<MCWriteProcResEntry> WriteProcResources;
+ std::vector<MCWriteLatencyEntry> WriteLatencies;
+ std::vector<std::string> WriterNames;
+ std::vector<MCReadAdvanceEntry> ReadAdvanceEntries;
+
+ // Reserve an invalid entry at index 0
+ SchedClassTables() {
+ ProcSchedClasses.resize(1);
+ WriteProcResources.resize(1);
+ WriteLatencies.resize(1);
+ WriterNames.push_back("InvalidWrite");
+ ReadAdvanceEntries.resize(1);
+ }
+ };
+
+ struct LessWriteProcResources {
+ bool operator()(const MCWriteProcResEntry &LHS,
+ const MCWriteProcResEntry &RHS) {
+ return LHS.ProcResourceIdx < RHS.ProcResourceIdx;
+ }
+ };
RecordKeeper &Records;
CodeGenSchedModels &SchedModels;
@@ -50,8 +81,18 @@ class SubtargetEmitter {
&ProcItinLists);
void EmitProcessorProp(raw_ostream &OS, const Record *R, const char *Name,
char Separator);
+ void EmitProcessorResources(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
+ Record *FindWriteResources(const CodeGenSchedRW &SchedWrite,
+ const CodeGenProcModel &ProcModel);
+ Record *FindReadAdvance(const CodeGenSchedRW &SchedRead,
+ const CodeGenProcModel &ProcModel);
+ void GenSchedClassTables(const CodeGenProcModel &ProcModel,
+ SchedClassTables &SchedTables);
+ void EmitSchedClassTables(SchedClassTables &SchedTables, raw_ostream &OS);
void EmitProcessorModels(raw_ostream &OS);
void EmitProcessorLookup(raw_ostream &OS);
+ void EmitSchedModelHelpers(std::string ClassName, raw_ostream &OS);
void EmitSchedModel(raw_ostream &OS);
void ParseFeaturesFunction(raw_ostream &OS, unsigned NumFeatures,
unsigned NumProcs);
@@ -521,7 +562,7 @@ EmitItineraries(raw_ostream &OS,
std::vector<std::vector<InstrItinerary> >::iterator
ProcItinListsIter = ProcItinLists.begin();
for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
- PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI, ++ProcItinListsIter) {
Record *ItinsDef = PI->ItinsDef;
if (!ItinsDefSet.insert(ItinsDef))
@@ -532,7 +573,7 @@ EmitItineraries(raw_ostream &OS,
// Get the itinerary list for the processor.
assert(ProcItinListsIter != ProcItinLists.end() && "bad iterator");
- std::vector<InstrItinerary> &ItinList = *ProcItinListsIter++;
+ std::vector<InstrItinerary> &ItinList = *ProcItinListsIter;
OS << "\n";
OS << "static const llvm::InstrItinerary ";
@@ -578,11 +619,488 @@ void SubtargetEmitter::EmitProcessorProp(raw_ostream &OS, const Record *R,
OS << '\n';
}
+void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ char Sep = ProcModel.ProcResourceDefs.empty() ? ' ' : ',';
+
+ OS << "\n// {Name, NumUnits, SuperIdx, IsBuffered}\n";
+ OS << "static const llvm::MCProcResourceDesc "
+ << ProcModel.ModelName << "ProcResources" << "[] = {\n"
+ << " {DBGFIELD(\"InvalidUnit\") 0, 0, 0}" << Sep << "\n";
+
+ for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {
+ Record *PRDef = ProcModel.ProcResourceDefs[i];
+
+ // Find the SuperIdx
+ unsigned SuperIdx = 0;
+ Record *SuperDef = 0;
+ if (PRDef->getValueInit("Super")->isComplete()) {
+ SuperDef =
+ SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"), ProcModel);
+ SuperIdx = ProcModel.getProcResourceIdx(SuperDef);
+ }
+ // Emit the ProcResourceDesc
+ if (i+1 == e)
+ Sep = ' ';
+ OS << " {DBGFIELD(\"" << PRDef->getName() << "\") ";
+ if (PRDef->getName().size() < 15)
+ OS.indent(15 - PRDef->getName().size());
+ OS << PRDef->getValueAsInt("NumUnits") << ", " << SuperIdx << ", "
+ << PRDef->getValueAsBit("Buffered") << "}" << Sep << " // #" << i+1;
+ if (SuperDef)
+ OS << ", Super=" << SuperDef->getName();
+ OS << "\n";
+ }
+ OS << "};\n";
+}
+
+// Find the WriteRes Record that defines processor resources for this
+// SchedWrite.
+Record *SubtargetEmitter::FindWriteResources(
+ const CodeGenSchedRW &SchedWrite, const CodeGenProcModel &ProcModel) {
+
+ // Check if the SchedWrite is already subtarget-specific and directly
+ // specifies a set of processor resources.
+ if (SchedWrite.TheDef->isSubClassOf("SchedWriteRes"))
+ return SchedWrite.TheDef;
+
+ Record *AliasDef = 0;
+ for (RecIter AI = SchedWrite.Aliases.begin(), AE = SchedWrite.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) {
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ if (&SchedModels.getProcModel(ModelDef) != &ProcModel)
+ continue;
+ }
+ if (AliasDef)
+ PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases "
+ "defined for processor " + ProcModel.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
+ AliasDef = AliasRW.TheDef;
+ }
+ if (AliasDef && AliasDef->isSubClassOf("SchedWriteRes"))
+ return AliasDef;
+
+ // Check this processor's list of write resources.
+ Record *ResDef = 0;
+ for (RecIter WRI = ProcModel.WriteResDefs.begin(),
+ WRE = ProcModel.WriteResDefs.end(); WRI != WRE; ++WRI) {
+ if (!(*WRI)->isSubClassOf("WriteRes"))
+ continue;
+ if (AliasDef == (*WRI)->getValueAsDef("WriteType")
+ || SchedWrite.TheDef == (*WRI)->getValueAsDef("WriteType")) {
+ if (ResDef) {
+ PrintFatalError((*WRI)->getLoc(), "Resources are defined for both "
+ "SchedWrite and its alias on processor " +
+ ProcModel.ModelName);
+ }
+ ResDef = *WRI;
+ }
+ }
+ // TODO: If ProcModel has a base model (previous generation processor),
+ // then call FindWriteResources recursively with that model here.
+ if (!ResDef) {
+ PrintFatalError(ProcModel.ModelDef->getLoc(),
+ std::string("Processor does not define resources for ")
+ + SchedWrite.TheDef->getName());
+ }
+ return ResDef;
+}
+
+/// Find the ReadAdvance record for the given SchedRead on this processor or
+/// return NULL.
+Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead,
+ const CodeGenProcModel &ProcModel) {
+ // Check for SchedReads that directly specify a ReadAdvance.
+ if (SchedRead.TheDef->isSubClassOf("SchedReadAdvance"))
+ return SchedRead.TheDef;
+
+ // Check this processor's list of aliases for SchedRead.
+ Record *AliasDef = 0;
+ for (RecIter AI = SchedRead.Aliases.begin(), AE = SchedRead.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ if (AliasRW.TheDef->getValueInit("SchedModel")->isComplete()) {
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ if (&SchedModels.getProcModel(ModelDef) != &ProcModel)
+ continue;
+ }
+ if (AliasDef)
+ PrintFatalError(AliasRW.TheDef->getLoc(), "Multiple aliases "
+ "defined for processor " + ProcModel.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
+ AliasDef = AliasRW.TheDef;
+ }
+ if (AliasDef && AliasDef->isSubClassOf("SchedReadAdvance"))
+ return AliasDef;
+
+ // Check this processor's ReadAdvanceList.
+ Record *ResDef = 0;
+ for (RecIter RAI = ProcModel.ReadAdvanceDefs.begin(),
+ RAE = ProcModel.ReadAdvanceDefs.end(); RAI != RAE; ++RAI) {
+ if (!(*RAI)->isSubClassOf("ReadAdvance"))
+ continue;
+ if (AliasDef == (*RAI)->getValueAsDef("ReadType")
+ || SchedRead.TheDef == (*RAI)->getValueAsDef("ReadType")) {
+ if (ResDef) {
+ PrintFatalError((*RAI)->getLoc(), "Resources are defined for both "
+ "SchedRead and its alias on processor " +
+ ProcModel.ModelName);
+ }
+ ResDef = *RAI;
+ }
+ }
+ // TODO: If ProcModel has a base model (previous generation processor),
+ // then call FindReadAdvance recursively with that model here.
+ if (!ResDef && SchedRead.TheDef->getName() != "ReadDefault") {
+ PrintFatalError(ProcModel.ModelDef->getLoc(),
+ std::string("Processor does not define resources for ")
+ + SchedRead.TheDef->getName());
+ }
+ return ResDef;
+}
+
+// Generate the SchedClass table for this processor and update global
+// tables. Must be called for each processor in order.
+void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
+ SchedClassTables &SchedTables) {
+ SchedTables.ProcSchedClasses.resize(SchedTables.ProcSchedClasses.size() + 1);
+ if (!ProcModel.hasInstrSchedModel())
+ return;
+
+ std::vector<MCSchedClassDesc> &SCTab = SchedTables.ProcSchedClasses.back();
+ for (CodeGenSchedModels::SchedClassIter SCI = SchedModels.schedClassBegin(),
+ SCE = SchedModels.schedClassEnd(); SCI != SCE; ++SCI) {
+ DEBUG(SCI->dump(&SchedModels));
+
+ SCTab.resize(SCTab.size() + 1);
+ MCSchedClassDesc &SCDesc = SCTab.back();
+ // SCDesc.Name is guarded by NDEBUG
+ SCDesc.NumMicroOps = 0;
+ SCDesc.BeginGroup = false;
+ SCDesc.EndGroup = false;
+ SCDesc.WriteProcResIdx = 0;
+ SCDesc.WriteLatencyIdx = 0;
+ SCDesc.ReadAdvanceIdx = 0;
+
+ // A Variant SchedClass has no resources of its own.
+ if (!SCI->Transitions.empty()) {
+ SCDesc.NumMicroOps = MCSchedClassDesc::VariantNumMicroOps;
+ continue;
+ }
+
+    // Determine if the SchedClass is actually reachable on this processor. If
+    // not, don't try to locate the processor resources; that lookup will fail.
+ // If ProcIndices contains 0, this class applies to all processors.
+ assert(!SCI->ProcIndices.empty() && "expect at least one procidx");
+ if (SCI->ProcIndices[0] != 0) {
+ IdxIter PIPos = std::find(SCI->ProcIndices.begin(),
+ SCI->ProcIndices.end(), ProcModel.Index);
+ if (PIPos == SCI->ProcIndices.end())
+ continue;
+ }
+ IdxVec Writes = SCI->Writes;
+ IdxVec Reads = SCI->Reads;
+ if (SCI->ItinClassDef) {
+ assert(SCI->InstRWs.empty() && "ItinClass should not have InstRWs");
+ // Check this processor's itinerary class resources.
+ for (RecIter II = ProcModel.ItinRWDefs.begin(),
+ IE = ProcModel.ItinRWDefs.end(); II != IE; ++II) {
+ RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ if (std::find(Matched.begin(), Matched.end(), SCI->ItinClassDef)
+ != Matched.end()) {
+ SchedModels.findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"),
+ Writes, Reads);
+ break;
+ }
+ }
+ if (Writes.empty()) {
+ DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " does not have resources for itinerary class "
+ << SCI->ItinClassDef->getName() << '\n');
+ }
+ }
+ else if (!SCI->InstRWs.empty()) {
+      // This class may have a default ReadWrite list which can be overridden
+      // by InstRW definitions.
+ Record *RWDef = 0;
+ for (RecIter RWI = SCI->InstRWs.begin(), RWE = SCI->InstRWs.end();
+ RWI != RWE; ++RWI) {
+ Record *RWModelDef = (*RWI)->getValueAsDef("SchedModel");
+ if (&ProcModel == &SchedModels.getProcModel(RWModelDef)) {
+ RWDef = *RWI;
+ break;
+ }
+ }
+ if (RWDef) {
+ Writes.clear();
+ Reads.clear();
+ SchedModels.findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"),
+ Writes, Reads);
+ }
+ }
+ // Sum resources across all operand writes.
+ std::vector<MCWriteProcResEntry> WriteProcResources;
+ std::vector<MCWriteLatencyEntry> WriteLatencies;
+ std::vector<std::string> WriterNames;
+ std::vector<MCReadAdvanceEntry> ReadAdvanceEntries;
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI) {
+ IdxVec WriteSeq;
+ SchedModels.expandRWSeqForProc(*WI, WriteSeq, /*IsRead=*/false,
+ ProcModel);
+
+ // For each operand, create a latency entry.
+ MCWriteLatencyEntry WLEntry;
+ WLEntry.Cycles = 0;
+ unsigned WriteID = WriteSeq.back();
+ WriterNames.push_back(SchedModels.getSchedWrite(WriteID).Name);
+ // If this Write is not referenced by a ReadAdvance, don't distinguish it
+ // from other WriteLatency entries.
+ if (!SchedModels.hasReadOfWrite(SchedModels.getSchedWrite(WriteID).TheDef)) {
+ WriteID = 0;
+ }
+ WLEntry.WriteResourceID = WriteID;
+
+ for (IdxIter WSI = WriteSeq.begin(), WSE = WriteSeq.end();
+ WSI != WSE; ++WSI) {
+
+ Record *WriteRes =
+ FindWriteResources(SchedModels.getSchedWrite(*WSI), ProcModel);
+
+ // Mark the parent class as invalid for unsupported write types.
+ if (WriteRes->getValueAsBit("Unsupported")) {
+ SCDesc.NumMicroOps = MCSchedClassDesc::InvalidNumMicroOps;
+ break;
+ }
+ WLEntry.Cycles += WriteRes->getValueAsInt("Latency");
+ SCDesc.NumMicroOps += WriteRes->getValueAsInt("NumMicroOps");
+ SCDesc.BeginGroup |= WriteRes->getValueAsBit("BeginGroup");
+ SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup");
+
+ // Create an entry for each ProcResource listed in WriteRes.
+ RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources");
+ std::vector<int64_t> Cycles =
+ WriteRes->getValueAsListOfInts("ResourceCycles");
+ for (unsigned PRIdx = 0, PREnd = PRVec.size();
+ PRIdx != PREnd; ++PRIdx) {
+ MCWriteProcResEntry WPREntry;
+ WPREntry.ProcResourceIdx = ProcModel.getProcResourceIdx(PRVec[PRIdx]);
+ assert(WPREntry.ProcResourceIdx && "Bad ProcResourceIdx");
+ if (Cycles.size() > PRIdx)
+ WPREntry.Cycles = Cycles[PRIdx];
+ else
+ WPREntry.Cycles = 1;
+ WriteProcResources.push_back(WPREntry);
+ }
+ }
+ WriteLatencies.push_back(WLEntry);
+ }
+ // Create an entry for each operand Read in this SchedClass.
+ // Entries must be sorted first by UseIdx then by WriteResourceID.
+ for (unsigned UseIdx = 0, EndIdx = Reads.size();
+ UseIdx != EndIdx; ++UseIdx) {
+ Record *ReadAdvance =
+ FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel);
+ if (!ReadAdvance)
+ continue;
+
+      // Mark the parent class as invalid for unsupported read types.
+ if (ReadAdvance->getValueAsBit("Unsupported")) {
+ SCDesc.NumMicroOps = MCSchedClassDesc::InvalidNumMicroOps;
+ break;
+ }
+ RecVec ValidWrites = ReadAdvance->getValueAsListOfDefs("ValidWrites");
+ IdxVec WriteIDs;
+ if (ValidWrites.empty())
+ WriteIDs.push_back(0);
+ else {
+ for (RecIter VWI = ValidWrites.begin(), VWE = ValidWrites.end();
+ VWI != VWE; ++VWI) {
+ WriteIDs.push_back(SchedModels.getSchedRWIdx(*VWI, /*IsRead=*/false));
+ }
+ }
+ std::sort(WriteIDs.begin(), WriteIDs.end());
+ for(IdxIter WI = WriteIDs.begin(), WE = WriteIDs.end(); WI != WE; ++WI) {
+ MCReadAdvanceEntry RAEntry;
+ RAEntry.UseIdx = UseIdx;
+ RAEntry.WriteResourceID = *WI;
+ RAEntry.Cycles = ReadAdvance->getValueAsInt("Cycles");
+ ReadAdvanceEntries.push_back(RAEntry);
+ }
+ }
+ if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
+ WriteProcResources.clear();
+ WriteLatencies.clear();
+ ReadAdvanceEntries.clear();
+ }
+ // Add the information for this SchedClass to the global tables using basic
+ // compression.
+ //
+  // WriteProcRes entries are sorted by ProcResIdx.
+ std::sort(WriteProcResources.begin(), WriteProcResources.end(),
+ LessWriteProcResources());
+
+ SCDesc.NumWriteProcResEntries = WriteProcResources.size();
+ std::vector<MCWriteProcResEntry>::iterator WPRPos =
+ std::search(SchedTables.WriteProcResources.begin(),
+ SchedTables.WriteProcResources.end(),
+ WriteProcResources.begin(), WriteProcResources.end());
+ if (WPRPos != SchedTables.WriteProcResources.end())
+ SCDesc.WriteProcResIdx = WPRPos - SchedTables.WriteProcResources.begin();
+ else {
+ SCDesc.WriteProcResIdx = SchedTables.WriteProcResources.size();
+ SchedTables.WriteProcResources.insert(WPRPos, WriteProcResources.begin(),
+ WriteProcResources.end());
+ }
+ // Latency entries must remain in operand order.
+ SCDesc.NumWriteLatencyEntries = WriteLatencies.size();
+ std::vector<MCWriteLatencyEntry>::iterator WLPos =
+ std::search(SchedTables.WriteLatencies.begin(),
+ SchedTables.WriteLatencies.end(),
+ WriteLatencies.begin(), WriteLatencies.end());
+ if (WLPos != SchedTables.WriteLatencies.end()) {
+ unsigned idx = WLPos - SchedTables.WriteLatencies.begin();
+ SCDesc.WriteLatencyIdx = idx;
+ for (unsigned i = 0, e = WriteLatencies.size(); i < e; ++i)
+ if (SchedTables.WriterNames[idx + i].find(WriterNames[i]) ==
+ std::string::npos) {
+ SchedTables.WriterNames[idx + i] += std::string("_") + WriterNames[i];
+ }
+ }
+ else {
+ SCDesc.WriteLatencyIdx = SchedTables.WriteLatencies.size();
+ SchedTables.WriteLatencies.insert(SchedTables.WriteLatencies.end(),
+ WriteLatencies.begin(),
+ WriteLatencies.end());
+ SchedTables.WriterNames.insert(SchedTables.WriterNames.end(),
+ WriterNames.begin(), WriterNames.end());
+ }
+ // ReadAdvanceEntries must remain in operand order.
+ SCDesc.NumReadAdvanceEntries = ReadAdvanceEntries.size();
+ std::vector<MCReadAdvanceEntry>::iterator RAPos =
+ std::search(SchedTables.ReadAdvanceEntries.begin(),
+ SchedTables.ReadAdvanceEntries.end(),
+ ReadAdvanceEntries.begin(), ReadAdvanceEntries.end());
+ if (RAPos != SchedTables.ReadAdvanceEntries.end())
+ SCDesc.ReadAdvanceIdx = RAPos - SchedTables.ReadAdvanceEntries.begin();
+ else {
+ SCDesc.ReadAdvanceIdx = SchedTables.ReadAdvanceEntries.size();
+ SchedTables.ReadAdvanceEntries.insert(RAPos, ReadAdvanceEntries.begin(),
+ ReadAdvanceEntries.end());
+ }
+ }
+}
+
+// Emit SchedClass tables for all processors and associated global tables.
+void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
+ raw_ostream &OS) {
+ // Emit global WriteProcResTable.
+ OS << "\n// {ProcResourceIdx, Cycles}\n"
+ << "extern const llvm::MCWriteProcResEntry "
+ << Target << "WriteProcResTable[] = {\n"
+ << " { 0, 0}, // Invalid\n";
+ for (unsigned WPRIdx = 1, WPREnd = SchedTables.WriteProcResources.size();
+ WPRIdx != WPREnd; ++WPRIdx) {
+ MCWriteProcResEntry &WPREntry = SchedTables.WriteProcResources[WPRIdx];
+ OS << " {" << format("%2d", WPREntry.ProcResourceIdx) << ", "
+ << format("%2d", WPREntry.Cycles) << "}";
+ if (WPRIdx + 1 < WPREnd)
+ OS << ',';
+ OS << " // #" << WPRIdx << '\n';
+ }
+ OS << "}; // " << Target << "WriteProcResTable\n";
+
+ // Emit global WriteLatencyTable.
+ OS << "\n// {Cycles, WriteResourceID}\n"
+ << "extern const llvm::MCWriteLatencyEntry "
+ << Target << "WriteLatencyTable[] = {\n"
+ << " { 0, 0}, // Invalid\n";
+ for (unsigned WLIdx = 1, WLEnd = SchedTables.WriteLatencies.size();
+ WLIdx != WLEnd; ++WLIdx) {
+ MCWriteLatencyEntry &WLEntry = SchedTables.WriteLatencies[WLIdx];
+ OS << " {" << format("%2d", WLEntry.Cycles) << ", "
+ << format("%2d", WLEntry.WriteResourceID) << "}";
+ if (WLIdx + 1 < WLEnd)
+ OS << ',';
+ OS << " // #" << WLIdx << " " << SchedTables.WriterNames[WLIdx] << '\n';
+ }
+ OS << "}; // " << Target << "WriteLatencyTable\n";
+
+ // Emit global ReadAdvanceTable.
+ OS << "\n// {UseIdx, WriteResourceID, Cycles}\n"
+ << "extern const llvm::MCReadAdvanceEntry "
+ << Target << "ReadAdvanceTable[] = {\n"
+ << " {0, 0, 0}, // Invalid\n";
+ for (unsigned RAIdx = 1, RAEnd = SchedTables.ReadAdvanceEntries.size();
+ RAIdx != RAEnd; ++RAIdx) {
+ MCReadAdvanceEntry &RAEntry = SchedTables.ReadAdvanceEntries[RAIdx];
+ OS << " {" << RAEntry.UseIdx << ", "
+ << format("%2d", RAEntry.WriteResourceID) << ", "
+ << format("%2d", RAEntry.Cycles) << "}";
+ if (RAIdx + 1 < RAEnd)
+ OS << ',';
+ OS << " // #" << RAIdx << '\n';
+ }
+ OS << "}; // " << Target << "ReadAdvanceTable\n";
+
+ // Emit a SchedClass table for each processor.
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ if (!PI->hasInstrSchedModel())
+ continue;
+
+ std::vector<MCSchedClassDesc> &SCTab =
+ SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())];
+
+ OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup,"
+ << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n";
+ OS << "static const llvm::MCSchedClassDesc "
+ << PI->ModelName << "SchedClasses[] = {\n";
+
+    // The first class is always invalid. We have no way to distinguish it
+    // except by name and position.
+ assert(SchedModels.getSchedClass(0).Name == "NoItinerary"
+ && "invalid class not first");
+ OS << " {DBGFIELD(\"InvalidSchedClass\") "
+ << MCSchedClassDesc::InvalidNumMicroOps
+ << ", 0, 0, 0, 0, 0, 0, 0, 0},\n";
+
+ for (unsigned SCIdx = 1, SCEnd = SCTab.size(); SCIdx != SCEnd; ++SCIdx) {
+ MCSchedClassDesc &MCDesc = SCTab[SCIdx];
+ const CodeGenSchedClass &SchedClass = SchedModels.getSchedClass(SCIdx);
+ OS << " {DBGFIELD(\"" << SchedClass.Name << "\") ";
+ if (SchedClass.Name.size() < 18)
+ OS.indent(18 - SchedClass.Name.size());
+ OS << MCDesc.NumMicroOps
+ << ", " << MCDesc.BeginGroup << ", " << MCDesc.EndGroup
+ << ", " << format("%2d", MCDesc.WriteProcResIdx)
+ << ", " << MCDesc.NumWriteProcResEntries
+ << ", " << format("%2d", MCDesc.WriteLatencyIdx)
+ << ", " << MCDesc.NumWriteLatencyEntries
+ << ", " << format("%2d", MCDesc.ReadAdvanceIdx)
+ << ", " << MCDesc.NumReadAdvanceEntries << "}";
+ if (SCIdx + 1 < SCEnd)
+ OS << ',';
+ OS << " // #" << SCIdx << '\n';
+ }
+ OS << "}; // " << PI->ModelName << "SchedClasses\n";
+ }
+}
+
void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
// For each processor model.
for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
- // Skip default
+ // Emit processor resource table.
+ if (PI->hasInstrSchedModel())
+ EmitProcessorResources(*PI, OS);
+    else if (!PI->ProcResourceDefs.empty())
+ PrintFatalError(PI->ModelDef->getLoc(), "SchedMachineModel defines "
+                    "ProcResources without defining WriteRes or SchedWriteRes");
+
// Begin processor itinerary properties
OS << "\n";
OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
@@ -591,11 +1109,19 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
+ OS << " " << PI->Index << ", // Processor ID\n";
+ if (PI->hasInstrSchedModel())
+ OS << " " << PI->ModelName << "ProcResources" << ",\n"
+ << " " << PI->ModelName << "SchedClasses" << ",\n"
+ << " " << PI->ProcResourceDefs.size()+1 << ",\n"
+ << " " << (SchedModels.schedClassEnd()
+ - SchedModels.schedClassBegin()) << ",\n";
+ else
+ OS << " 0, 0, 0, 0, // No instruction-level machine model.\n";
if (SchedModels.hasItineraryClasses())
- OS << " " << PI->ItinsDef->getName();
+ OS << " " << PI->ItinsDef->getName() << ");\n";
else
- OS << " 0";
- OS << ");\n";
+ OS << " 0); // No Itinerary\n";
}
}
@@ -621,14 +1147,10 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
const std::string &Name = Processor->getValueAsString("Name");
const std::string &ProcModelName =
- SchedModels.getProcModel(Processor).ModelName;
+ SchedModels.getModelForProc(Processor).ModelName;
// Emit as { "cpu", procinit },
- OS << " { "
- << "\"" << Name << "\", "
- << "(void *)&" << ProcModelName;
-
- OS << " }";
+ OS << " { \"" << Name << "\", (const void *)&" << ProcModelName << " }";
// Depending on ''if more in the list'' emit comma
if (++i < N) OS << ",";
@@ -644,16 +1166,116 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
// EmitSchedModel - Emits all scheduling model tables, folding common patterns.
//
void SubtargetEmitter::EmitSchedModel(raw_ostream &OS) {
+ OS << "#ifdef DBGFIELD\n"
+ << "#error \"<target>GenSubtargetInfo.inc requires a DBGFIELD macro\"\n"
+ << "#endif\n"
+ << "#ifndef NDEBUG\n"
+ << "#define DBGFIELD(x) x,\n"
+ << "#else\n"
+ << "#define DBGFIELD(x)\n"
+ << "#endif\n";
+
if (SchedModels.hasItineraryClasses()) {
std::vector<std::vector<InstrItinerary> > ProcItinLists;
// Emit the stage data
EmitStageAndOperandCycleData(OS, ProcItinLists);
EmitItineraries(OS, ProcItinLists);
}
+ OS << "\n// ===============================================================\n"
+ << "// Data tables for the new per-operand machine model.\n";
+
+ SchedClassTables SchedTables;
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ GenSchedClassTables(*PI, SchedTables);
+ }
+ EmitSchedClassTables(SchedTables, OS);
+
// Emit the processor machine model
EmitProcessorModels(OS);
// Emit the processor lookup data
EmitProcessorLookup(OS);
+
+ OS << "#undef DBGFIELD";
+}
+
+void SubtargetEmitter::EmitSchedModelHelpers(std::string ClassName,
+ raw_ostream &OS) {
+ OS << "unsigned " << ClassName
+ << "\n::resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,"
+ << " const TargetSchedModel *SchedModel) const {\n";
+
+ std::vector<Record*> Prologs = Records.getAllDerivedDefinitions("PredicateProlog");
+ std::sort(Prologs.begin(), Prologs.end(), LessRecord());
+ for (std::vector<Record*>::const_iterator
+ PI = Prologs.begin(), PE = Prologs.end(); PI != PE; ++PI) {
+ OS << (*PI)->getValueAsString("Code") << '\n';
+ }
+ IdxVec VariantClasses;
+ for (CodeGenSchedModels::SchedClassIter SCI = SchedModels.schedClassBegin(),
+ SCE = SchedModels.schedClassEnd(); SCI != SCE; ++SCI) {
+ if (SCI->Transitions.empty())
+ continue;
+ VariantClasses.push_back(SCI - SchedModels.schedClassBegin());
+ }
+ if (!VariantClasses.empty()) {
+ OS << " switch (SchedClass) {\n";
+ for (IdxIter VCI = VariantClasses.begin(), VCE = VariantClasses.end();
+ VCI != VCE; ++VCI) {
+ const CodeGenSchedClass &SC = SchedModels.getSchedClass(*VCI);
+ OS << " case " << *VCI << ": // " << SC.Name << '\n';
+ IdxVec ProcIndices;
+ for (std::vector<CodeGenSchedTransition>::const_iterator
+ TI = SC.Transitions.begin(), TE = SC.Transitions.end();
+ TI != TE; ++TI) {
+ IdxVec PI;
+ std::set_union(TI->ProcIndices.begin(), TI->ProcIndices.end(),
+ ProcIndices.begin(), ProcIndices.end(),
+ std::back_inserter(PI));
+ ProcIndices.swap(PI);
+ }
+ for (IdxIter PI = ProcIndices.begin(), PE = ProcIndices.end();
+ PI != PE; ++PI) {
+ OS << " ";
+ if (*PI != 0)
+ OS << "if (SchedModel->getProcessorID() == " << *PI << ") ";
+ OS << "{ // " << (SchedModels.procModelBegin() + *PI)->ModelName
+ << '\n';
+ for (std::vector<CodeGenSchedTransition>::const_iterator
+ TI = SC.Transitions.begin(), TE = SC.Transitions.end();
+ TI != TE; ++TI) {
+ OS << " if (";
+ if (*PI != 0 && !std::count(TI->ProcIndices.begin(),
+ TI->ProcIndices.end(), *PI)) {
+ continue;
+ }
+ for (RecIter RI = TI->PredTerm.begin(), RE = TI->PredTerm.end();
+ RI != RE; ++RI) {
+ if (RI != TI->PredTerm.begin())
+ OS << "\n && ";
+ OS << "(" << (*RI)->getValueAsString("Predicate") << ")";
+ }
+ OS << ")\n"
+ << " return " << TI->ToClassIdx << "; // "
+ << SchedModels.getSchedClass(TI->ToClassIdx).Name << '\n';
+ }
+ OS << " }\n";
+ if (*PI == 0)
+ break;
+ }
+ unsigned SCIdx = 0;
+ if (SC.ItinClassDef)
+ SCIdx = SchedModels.getSchedClassIdxForItin(SC.ItinClassDef);
+ else
+ SCIdx = SchedModels.findSchedClassIdx(SC.Writes, SC.Reads);
+ if (SCIdx != *VCI)
+ OS << " return " << SCIdx << ";\n";
+ OS << " break;\n";
+ }
+ OS << " };\n";
+ }
+ OS << " report_fatal_error(\"Expected a variant SchedClass\");\n"
+ << "} // " << ClassName << "::resolveSchedClass\n";
}
//
@@ -680,7 +1302,8 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS,
return;
}
- OS << " uint64_t Bits = ReInitMCSubtargetInfo(CPU, FS);\n";
+ OS << " InitMCProcessorInfo(CPU, FS);\n"
+ << " uint64_t Bits = getFeatureBits();\n";
for (unsigned i = 0; i < Features.size(); i++) {
// Next record
@@ -747,13 +1370,18 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "0, ";
+ OS << '\n'; OS.indent(22);
+ OS << Target << "ProcSchedKV, "
+ << Target << "WriteProcResTable, "
+ << Target << "WriteLatencyTable, "
+ << Target << "ReadAdvanceTable, ";
if (SchedModels.hasItineraryClasses()) {
- OS << Target << "ProcSchedKV, "
- << Target << "Stages, "
+ OS << '\n'; OS.indent(22);
+ OS << Target << "Stages, "
<< Target << "OperandCycles, "
<< Target << "ForwardingPaths, ";
} else
- OS << "0, 0, 0, 0, ";
+ OS << "0, 0, 0, ";
OS << NumFeatures << ", " << NumProcs << ");\n}\n\n";
OS << "} // End llvm namespace \n";
@@ -780,6 +1408,8 @@ void SubtargetEmitter::run(raw_ostream &OS) {
<< " explicit " << ClassName << "(StringRef TT, StringRef CPU, "
<< "StringRef FS);\n"
<< "public:\n"
+ << " unsigned resolveSchedClass(unsigned SchedClass, const MachineInstr *DefMI,"
+ << " const TargetSchedModel *SchedModel) const;\n"
<< " DFAPacketizer *createDFAPacketizer(const InstrItineraryData *IID)"
<< " const;\n"
<< "};\n";
@@ -790,11 +1420,19 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "\n#ifdef GET_SUBTARGETINFO_CTOR\n";
OS << "#undef GET_SUBTARGETINFO_CTOR\n";
+ OS << "#include \"llvm/CodeGen/TargetSchedule.h\"\n";
OS << "namespace llvm {\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "FeatureKV[];\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "SubTypeKV[];\n";
+ OS << "extern const llvm::SubtargetInfoKV " << Target << "ProcSchedKV[];\n";
+ OS << "extern const llvm::MCWriteProcResEntry "
+ << Target << "WriteProcResTable[];\n";
+ OS << "extern const llvm::MCWriteLatencyEntry "
+ << Target << "WriteLatencyTable[];\n";
+ OS << "extern const llvm::MCReadAdvanceEntry "
+ << Target << "ReadAdvanceTable[];\n";
+
if (SchedModels.hasItineraryClasses()) {
- OS << "extern const llvm::SubtargetInfoKV " << Target << "ProcSchedKV[];\n";
OS << "extern const llvm::InstrStage " << Target << "Stages[];\n";
OS << "extern const unsigned " << Target << "OperandCycles[];\n";
OS << "extern const unsigned " << Target << "ForwardingPaths[];\n";
@@ -812,14 +1450,22 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "0, ";
+ OS << '\n'; OS.indent(22);
+ OS << Target << "ProcSchedKV, "
+ << Target << "WriteProcResTable, "
+ << Target << "WriteLatencyTable, "
+ << Target << "ReadAdvanceTable, ";
+ OS << '\n'; OS.indent(22);
if (SchedModels.hasItineraryClasses()) {
- OS << Target << "ProcSchedKV, "
- << Target << "Stages, "
+ OS << Target << "Stages, "
<< Target << "OperandCycles, "
<< Target << "ForwardingPaths, ";
} else
- OS << "0, 0, 0, 0, ";
+ OS << "0, 0, 0, ";
OS << NumFeatures << ", " << NumProcs << ");\n}\n\n";
+
+ EmitSchedModelHelpers(ClassName, OS);
+
OS << "} // End llvm namespace \n";
OS << "#endif // GET_SUBTARGETINFO_CTOR\n\n";
diff --git a/utils/TableGen/TGValueTypes.cpp b/utils/TableGen/TGValueTypes.cpp
index af0d9f44cf43..3ac71a49147f 100644
--- a/utils/TableGen/TGValueTypes.cpp
+++ b/utils/TableGen/TGValueTypes.cpp
@@ -15,13 +15,25 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Casting.h"
#include <map>
using namespace llvm;
namespace llvm {
class Type {
+protected:
+ enum TypeKind {
+ TK_ExtendedIntegerType,
+ TK_ExtendedVectorType
+ };
+private:
+ TypeKind Kind;
public:
+ TypeKind getKind() const {
+ return Kind;
+ }
+ Type(TypeKind K) : Kind(K) {}
virtual unsigned getSizeInBits() const = 0;
virtual ~Type() {}
};
@@ -32,7 +44,10 @@ class ExtendedIntegerType : public Type {
unsigned BitWidth;
public:
explicit ExtendedIntegerType(unsigned bits)
- : BitWidth(bits) {}
+ : Type(TK_ExtendedIntegerType), BitWidth(bits) {}
+ static bool classof(const Type *T) {
+ return T->getKind() == TK_ExtendedIntegerType;
+ }
unsigned getSizeInBits() const {
return getBitWidth();
}
@@ -46,7 +61,10 @@ class ExtendedVectorType : public Type {
unsigned NumElements;
public:
ExtendedVectorType(EVT elty, unsigned num)
- : ElementType(elty), NumElements(num) {}
+ : Type(TK_ExtendedVectorType), ElementType(elty), NumElements(num) {}
+ static bool classof(const Type *T) {
+ return T->getKind() == TK_ExtendedVectorType;
+ }
unsigned getSizeInBits() const {
return getNumElements() * getElementType().getSizeInBits();
}
@@ -71,12 +89,12 @@ bool EVT::isExtendedFloatingPoint() const {
bool EVT::isExtendedInteger() const {
assert(isExtended() && "Type is not extended!");
- return dynamic_cast<const ExtendedIntegerType *>(LLVMTy) != 0;
+ return isa<ExtendedIntegerType>(LLVMTy);
}
bool EVT::isExtendedVector() const {
assert(isExtended() && "Type is not extended!");
- return dynamic_cast<const ExtendedVectorType *>(LLVMTy) != 0;
+ return isa<ExtendedVectorType>(LLVMTy);
}
bool EVT::isExtended64BitVector() const {
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 9695b4a351c7..49efe7ed7374 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -20,7 +20,6 @@
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Main.h"
#include "llvm/TableGen/Record.h"
-#include "llvm/TableGen/TableGenAction.h"
using namespace llvm;
@@ -90,86 +89,83 @@ namespace {
Class("class", cl::desc("Print Enum list for this class"),
cl::value_desc("class name"));
- class LLVMTableGenAction : public TableGenAction {
- public:
- bool operator()(raw_ostream &OS, RecordKeeper &Records) {
- switch (Action) {
- case PrintRecords:
- OS << Records; // No argument, dump all contents
- break;
- case GenEmitter:
- EmitCodeEmitter(Records, OS);
- break;
- case GenRegisterInfo:
- EmitRegisterInfo(Records, OS);
- break;
- case GenInstrInfo:
- EmitInstrInfo(Records, OS);
- break;
- case GenCallingConv:
- EmitCallingConv(Records, OS);
- break;
- case GenAsmWriter:
- EmitAsmWriter(Records, OS);
- break;
- case GenAsmMatcher:
- EmitAsmMatcher(Records, OS);
- break;
- case GenDisassembler:
- EmitDisassembler(Records, OS);
- break;
- case GenPseudoLowering:
- EmitPseudoLowering(Records, OS);
- break;
- case GenDAGISel:
- EmitDAGISel(Records, OS);
- break;
- case GenDFAPacketizer:
- EmitDFAPacketizer(Records, OS);
- break;
- case GenFastISel:
- EmitFastISel(Records, OS);
- break;
- case GenSubtarget:
- EmitSubtarget(Records, OS);
- break;
- case GenIntrinsic:
- EmitIntrinsics(Records, OS);
- break;
- case GenTgtIntrinsic:
- EmitIntrinsics(Records, OS, true);
- break;
- case GenEDInfo:
- EmitEnhancedDisassemblerInfo(Records, OS);
- break;
- case PrintEnums:
- {
- std::vector<Record*> Recs = Records.getAllDerivedDefinitions(Class);
- for (unsigned i = 0, e = Recs.size(); i != e; ++i)
- OS << Recs[i]->getName() << ", ";
- OS << "\n";
- break;
- }
- case PrintSets:
- {
- SetTheory Sets;
- Sets.addFieldExpander("Set", "Elements");
- std::vector<Record*> Recs = Records.getAllDerivedDefinitions("Set");
- for (unsigned i = 0, e = Recs.size(); i != e; ++i) {
- OS << Recs[i]->getName() << " = [";
- const std::vector<Record*> *Elts = Sets.expand(Recs[i]);
- assert(Elts && "Couldn't expand Set instance");
- for (unsigned ei = 0, ee = Elts->size(); ei != ee; ++ei)
- OS << ' ' << (*Elts)[ei]->getName();
- OS << " ]\n";
- }
- break;
- }
- }
-
- return false;
+bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
+ switch (Action) {
+ case PrintRecords:
+ OS << Records; // No argument, dump all contents
+ break;
+ case GenEmitter:
+ EmitCodeEmitter(Records, OS);
+ break;
+ case GenRegisterInfo:
+ EmitRegisterInfo(Records, OS);
+ break;
+ case GenInstrInfo:
+ EmitInstrInfo(Records, OS);
+ break;
+ case GenCallingConv:
+ EmitCallingConv(Records, OS);
+ break;
+ case GenAsmWriter:
+ EmitAsmWriter(Records, OS);
+ break;
+ case GenAsmMatcher:
+ EmitAsmMatcher(Records, OS);
+ break;
+ case GenDisassembler:
+ EmitDisassembler(Records, OS);
+ break;
+ case GenPseudoLowering:
+ EmitPseudoLowering(Records, OS);
+ break;
+ case GenDAGISel:
+ EmitDAGISel(Records, OS);
+ break;
+ case GenDFAPacketizer:
+ EmitDFAPacketizer(Records, OS);
+ break;
+ case GenFastISel:
+ EmitFastISel(Records, OS);
+ break;
+ case GenSubtarget:
+ EmitSubtarget(Records, OS);
+ break;
+ case GenIntrinsic:
+ EmitIntrinsics(Records, OS);
+ break;
+ case GenTgtIntrinsic:
+ EmitIntrinsics(Records, OS, true);
+ break;
+ case GenEDInfo:
+ EmitEnhancedDisassemblerInfo(Records, OS);
+ break;
+ case PrintEnums:
+ {
+ std::vector<Record*> Recs = Records.getAllDerivedDefinitions(Class);
+ for (unsigned i = 0, e = Recs.size(); i != e; ++i)
+ OS << Recs[i]->getName() << ", ";
+ OS << "\n";
+ break;
+ }
+ case PrintSets:
+ {
+ SetTheory Sets;
+ Sets.addFieldExpander("Set", "Elements");
+ std::vector<Record*> Recs = Records.getAllDerivedDefinitions("Set");
+ for (unsigned i = 0, e = Recs.size(); i != e; ++i) {
+ OS << Recs[i]->getName() << " = [";
+ const std::vector<Record*> *Elts = Sets.expand(Recs[i]);
+ assert(Elts && "Couldn't expand Set instance");
+ for (unsigned ei = 0, ee = Elts->size(); ei != ee; ++ei)
+ OS << ' ' << (*Elts)[ei]->getName();
+ OS << " ]\n";
}
-  };
+ break;
+ }
+ }
+
+ return false;
+}
}
int main(int argc, char **argv) {
@@ -177,6 +173,5 @@ int main(int argc, char **argv) {
PrettyStackTraceProgram X(argc, argv);
cl::ParseCommandLineOptions(argc, argv);
- LLVMTableGenAction Action;
- return TableGenMain(argv[0], Action);
+ return TableGenMain(argv[0], &LLVMTableGenMain);
}
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 2c00c40cfef9..f0d25d8a2c81 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -74,5 +74,6 @@ void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS);
void EmitPseudoLowering(RecordKeeper &RK, raw_ostream &OS);
void EmitRegisterInfo(RecordKeeper &RK, raw_ostream &OS);
void EmitSubtarget(RecordKeeper &RK, raw_ostream &OS);
+void EmitMapTable(RecordKeeper &RK, raw_ostream &OS);
} // End llvm namespace
diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index f3bd373708f7..468a1f81c719 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp
@@ -209,6 +209,7 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
bool satisfiesOneEntry = true;
bool satisfiesSplitRM = true;
bool satisfiesSplitReg = true;
+ bool satisfiesSplitMisc = true;
for (unsigned index = 0; index < 256; ++index) {
if (decision.instructionIDs[index] != decision.instructionIDs[0])
@@ -228,7 +229,7 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
if (((index & 0xc0) != 0xc0) &&
(decision.instructionIDs[index] != decision.instructionIDs[index&0x38]))
- satisfiesSplitReg = false;
+ satisfiesSplitMisc = false;
}
if (satisfiesOneEntry)
@@ -237,9 +238,12 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
if (satisfiesSplitRM)
return MODRM_SPLITRM;
- if (satisfiesSplitReg)
+ if (satisfiesSplitReg && satisfiesSplitMisc)
return MODRM_SPLITREG;
+ if (satisfiesSplitMisc)
+ return MODRM_SPLITMISC;
+
return MODRM_FULL;
}
@@ -332,6 +336,12 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
for (unsigned index = 0xc0; index < 256; index += 8)
emitOneID(o1, i1, decision.instructionIDs[index], true);
break;
+ case MODRM_SPLITMISC:
+ for (unsigned index = 0; index < 64; index += 8)
+ emitOneID(o1, i1, decision.instructionIDs[index], true);
+ for (unsigned index = 0xc0; index < 256; ++index)
+ emitOneID(o1, i1, decision.instructionIDs[index], true);
+ break;
case MODRM_FULL:
for (unsigned index = 0; index < 256; ++index)
emitOneID(o1, i1, decision.instructionIDs[index], true);
@@ -361,11 +371,18 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
case MODRM_SPLITREG:
sEntryNumber += 16;
break;
+ case MODRM_SPLITMISC:
+ sEntryNumber += 8 + 64;
+ break;
case MODRM_FULL:
sEntryNumber += 256;
break;
}
+ // We assume that the index can fit into uint16_t.
+ assert(sEntryNumber < 65536U &&
+ "Index into ModRMDecision is too large for uint16_t!");
+
++sTableNumber;
}
diff --git a/utils/TableGen/X86ModRMFilters.h b/utils/TableGen/X86ModRMFilters.h
index 19fecbc3a0a5..2cbaf7985db2 100644
--- a/utils/TableGen/X86ModRMFilters.h
+++ b/utils/TableGen/X86ModRMFilters.h
@@ -70,7 +70,7 @@ class ModFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @r - True if the mod bits of the ModR/M byte must be 11; false
+ /// \param r True if the mod bits of the ModR/M byte must be 11; false
/// otherwise. The name r derives from the fact that the mod
/// bits indicate whether the R/M bits [bits 2-0] signify a
/// register or a memory operand.
@@ -98,11 +98,12 @@ class EscapeFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @c0_ff - True if the ModR/M byte must fall between 0xc0 and 0xff;
- /// false otherwise.
- /// @nnn_or_modRM - If c0_ff is true, the required value of the entire ModR/M
- /// byte. If c0_ff is false, the required value of the nnn
- /// field.
+ /// \param c0_ff True if the ModR/M byte must fall between 0xc0 and 0xff;
+ /// false otherwise.
+ ///
+ /// \param nnn_or_modRM If c0_ff is true, the required value of the entire
+ /// ModR/M byte. If c0_ff is false, the required value
+ /// of the nnn field.
EscapeFilter(bool c0_ff, uint8_t nnn_or_modRM) :
ModRMFilter(),
C0_FF(c0_ff),
@@ -128,8 +129,8 @@ class AddRegEscapeFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @modRM - The value of the ModR/M byte when the register operand
- /// refers to the first register in the register set.
+ /// \param modRM The value of the ModR/M byte when the register operand
+ /// refers to the first register in the register set.
AddRegEscapeFilter(uint8_t modRM) : ModRM(modRM) {
}
@@ -150,9 +151,9 @@ class ExtendedFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @r - True if the mod field must be set to 11; false otherwise.
- /// The name is explained at ModFilter.
- /// @nnn - The required value of the nnn field.
+ /// \param r True if the mod field must be set to 11; false otherwise.
+ /// The name is explained at ModFilter.
+ /// \param nnn The required value of the nnn field.
ExtendedFilter(bool r, uint8_t nnn) :
ModRMFilter(),
R(r),
@@ -177,7 +178,7 @@ class ExactFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @modRM - The required value of the full ModR/M byte.
+ /// \param modRM The required value of the full ModR/M byte.
ExactFilter(uint8_t modRM) :
ModRMFilter(),
ModRM(modRM) {
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 7ac2336d7321..d6ed2fe2c615 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -38,14 +38,15 @@ using namespace llvm;
MAP(D0, 45) \
MAP(D1, 46) \
MAP(D4, 47) \
- MAP(D8, 48) \
- MAP(D9, 49) \
- MAP(DA, 50) \
- MAP(DB, 51) \
- MAP(DC, 52) \
- MAP(DD, 53) \
- MAP(DE, 54) \
- MAP(DF, 55)
+ MAP(D5, 48) \
+ MAP(D8, 49) \
+ MAP(D9, 50) \
+ MAP(DA, 51) \
+ MAP(DB, 52) \
+ MAP(DC, 53) \
+ MAP(DD, 54) \
+ MAP(DE, 55) \
+ MAP(DF, 56)
// A clone of X86 since we can't depend on something that is generated.
namespace X86Local {
@@ -244,7 +245,7 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
IsSSE = (HasOpSizePrefix && (Name.find("16") == Name.npos)) ||
(Name.find("CRC32") != Name.npos);
HasFROperands = hasFROperands();
- HasVEX_LPrefix = has256BitOperands() || Rec->getValueAsBit("hasVEX_L");
+ HasVEX_LPrefix = Rec->getValueAsBit("hasVEX_L");
// Check for 64-bit inst which does not require REX
Is32Bit = false;
@@ -479,20 +480,6 @@ bool RecognizableInstr::hasFROperands() const {
return false;
}
-bool RecognizableInstr::has256BitOperands() const {
- const std::vector<CGIOperandList::OperandInfo> &OperandList = *Operands;
- unsigned numOperands = OperandList.size();
-
- for (unsigned operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
- const std::string &recName = OperandList[operandIndex].Rec->getName();
-
- if (!recName.compare("VR256")) {
- return true;
- }
- }
- return false;
-}
-
void RecognizableInstr::handleOperand(bool optional, unsigned &operandIndex,
unsigned &physicalOperandIndex,
unsigned &numPhysicalOperands,
@@ -1145,6 +1132,8 @@ OperandEncoding RecognizableInstr::immediateEncodingFromString
// register IDs in 8-bit immediates nowadays.
ENCODING("VR256", ENCODING_IB)
ENCODING("VR128", ENCODING_IB)
+ ENCODING("FR32", ENCODING_IB)
+ ENCODING("FR64", ENCODING_IB)
errs() << "Unhandled immediate encoding " << s << "\n";
llvm_unreachable("Unhandled immediate encoding");
}
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 542e510c6033..9feb3c3c7d3b 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -127,10 +127,7 @@ private:
/// hasFROperands - Returns true if any operand is a FR operand.
bool hasFROperands() const;
-
- /// has256BitOperands - Returns true if any operand is a 256-bit SSE operand.
- bool has256BitOperands() const;
-
+
/// typeFromString - Translates an operand type from the string provided in
/// the LLVM tables to an OperandType for use in the operand specifier.
///
@@ -143,7 +140,7 @@ private:
/// @param hasREX_WPrefix - Indicates whether the instruction has a REX.W
/// prefix. If it does, 32-bit register operands stay
/// 32-bit regardless of the operand size.
- /// @param hasOpSizePrefix- Indicates whether the instruction has an OpSize
+ /// @param hasOpSizePrefix Indicates whether the instruction has an OpSize
/// prefix. If it does not, then 16-bit register
/// operands stay 16-bit.
/// @return - The operand's type.
@@ -225,23 +222,23 @@ private:
/// emitInstructionSpecifier - Loads the instruction specifier for the current
/// instruction into a DisassemblerTables.
///
- /// @arg tables - The DisassemblerTables to populate with the specifier for
+ /// \param tables The DisassemblerTables to populate with the specifier for
/// the current instruction.
void emitInstructionSpecifier(DisassemblerTables &tables);
/// emitDecodePath - Populates the proper fields in the decode tables
/// corresponding to the decode paths for this instruction.
///
- /// @arg tables - The DisassemblerTables to populate with the decode
+  /// \param tables The DisassemblerTables to populate with the
/// decode information for the current instruction.
void emitDecodePath(DisassemblerTables &tables) const;
/// Constructor - Initializes a RecognizableInstr with the appropriate fields
/// from a CodeGenInstruction.
///
- /// @arg tables - The DisassemblerTables that the specifier will be added to.
- /// @arg insn - The CodeGenInstruction to extract information from.
- /// @arg uid - The unique ID of the current instruction.
+ /// \param tables The DisassemblerTables that the specifier will be added to.
+ /// \param insn The CodeGenInstruction to extract information from.
+ /// \param uid The unique ID of the current instruction.
RecognizableInstr(DisassemblerTables &tables,
const CodeGenInstruction &insn,
InstrUID uid);
@@ -249,11 +246,11 @@ public:
/// processInstr - Accepts a CodeGenInstruction and loads decode information
/// for it into a DisassemblerTables if appropriate.
///
- /// @arg tables - The DiassemblerTables to be populated with decode
+  /// \param tables The DisassemblerTables to be populated with decode
/// information.
- /// @arg insn - The CodeGenInstruction to be used as a source for this
+ /// \param insn The CodeGenInstruction to be used as a source for this
/// information.
- /// @uid - The unique ID of the instruction.
+ /// \param uid The unique ID of the instruction.
static void processInstr(DisassemblerTables &tables,
const CodeGenInstruction &insn,
InstrUID uid);
diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll b/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll
deleted file mode 100644
index 3017b13e48c0..000000000000
--- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/Bar/bar-test.ll
+++ /dev/null
@@ -1,3 +0,0 @@
-; RUN: true
-; XFAIL: *
-; XTARGET: darwin
diff --git a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg
index e9df1e5b53bf..3fdd63c22459 100644
--- a/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg
+++ b/utils/lit/lit/ExampleTests/LLVM.InTree/test/lit.cfg
@@ -77,7 +77,7 @@ for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')):
excludes = []
-# Provide target_triple for use in XFAIL and XTARGET.
+# Provide target_triple for use in XFAIL.
config.target_triple = site_exp['target_triplet']
# Provide llvm_supports_target for use in local configs.
diff --git a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg
index e9df1e5b53bf..3fdd63c22459 100644
--- a/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg
+++ b/utils/lit/lit/ExampleTests/LLVM.OutOfTree/src/test/lit.cfg
@@ -77,7 +77,7 @@ for line in open(os.path.join(config.llvm_obj_root, 'test', 'site.exp')):
excludes = []
-# Provide target_triple for use in XFAIL and XTARGET.
+# Provide target_triple for use in XFAIL.
config.target_triple = site_exp['target_triplet']
# Provide llvm_supports_target for use in local configs.
diff --git a/utils/lit/lit/ExampleTests/lit.cfg b/utils/lit/lit/ExampleTests/lit.cfg
index 20ee37dcef27..2629918d9f6d 100644
--- a/utils/lit/lit/ExampleTests/lit.cfg
+++ b/utils/lit/lit/ExampleTests/lit.cfg
@@ -23,4 +23,4 @@ config.test_exec_root = None
config.target_triple = 'foo'
# available_features: Used by ShTest and TclTest formats for REQUIRES checks.
-config.available_features = ['some-feature-name']
+config.available_features.add('some-feature-name')
diff --git a/utils/lit/lit/ExampleTests/vg-fail.c b/utils/lit/lit/ExampleTests/vg-fail.c
new file mode 100644
index 000000000000..e3339ff91aab
--- /dev/null
+++ b/utils/lit/lit/ExampleTests/vg-fail.c
@@ -0,0 +1,4 @@
+// This test should XPASS when run without valgrind.
+
+// RUN: true
+// XFAIL: valgrind
diff --git a/utils/lit/lit/ExampleTests/xfail-feature.c b/utils/lit/lit/ExampleTests/xfail-feature.c
new file mode 100644
index 000000000000..3444bf870080
--- /dev/null
+++ b/utils/lit/lit/ExampleTests/xfail-feature.c
@@ -0,0 +1,4 @@
+// This test should XPASS.
+
+// RUN: true
+// XFAIL: some-feature-name
diff --git a/utils/lit/lit/LitConfig.py b/utils/lit/lit/LitConfig.py
index c71c0ccdea9b..0a359a3db8eb 100644
--- a/utils/lit/lit/LitConfig.py
+++ b/utils/lit/lit/LitConfig.py
@@ -42,14 +42,11 @@ class LitConfig:
self.numWarnings = 0
self.valgrindArgs = []
- self.valgrindTriple = ""
if self.useValgrind:
- self.valgrindTriple = "-vg"
self.valgrindArgs = ['valgrind', '-q', '--run-libc-freeres=no',
'--tool=memcheck', '--trace-children=yes',
'--error-exitcode=123']
if self.valgrindLeakCheck:
- self.valgrindTriple += "_leak"
self.valgrindArgs.append('--leak-check=full')
else:
# The default is 'summary'.
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 71882b76f8b9..0c1911ed3560 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -370,27 +370,27 @@ def executeScript(test, litConfig, tmpBase, commands, cwd):
return executeCommand(command, cwd=cwd, env=test.config.environment)
-def isExpectedFail(xfails, xtargets, target_triple):
- # Check if any xfail matches this target.
+def isExpectedFail(test, xfails):
+ # Check if any of the xfails match an available feature or the target.
for item in xfails:
- if item == '*' or item in target_triple:
- break
- else:
- return False
+ # If this is the wildcard, it always fails.
+ if item == '*':
+ return True
- # If so, see if it is expected to pass on this target.
- #
- # FIXME: Rename XTARGET to something that makes sense, like XPASS.
- for item in xtargets:
- if item == '*' or item in target_triple:
- return False
+ # If this is an exact match for one of the features, it fails.
+ if item in test.config.available_features:
+ return True
+
+ # If this is a part of the target triple, it fails.
+ if item in test.suite.config.target_triple:
+ return True
- return True
+ return False
def parseIntegratedTestScript(test, normalize_slashes=False,
extra_substitutions=[]):
"""parseIntegratedTestScript - Scan an LLVM/Clang style integrated test
- script and extract the lines to 'RUN' as well as 'XFAIL' and 'XTARGET'
+ script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES'
information. The RUN lines also will have variable substitution performed.
"""
@@ -431,7 +431,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
# Collect the test lines from the script.
script = []
xfails = []
- xtargets = []
requires = []
for ln in open(sourcepath):
if 'RUN:' in ln:
@@ -450,9 +449,6 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
elif 'XFAIL:' in ln:
items = ln[ln.index('XFAIL:') + 6:].split(',')
xfails.extend([s.strip() for s in items])
- elif 'XTARGET:' in ln:
- items = ln[ln.index('XTARGET:') + 8:].split(',')
- xtargets.extend([s.strip() for s in items])
elif 'REQUIRES:' in ln:
items = ln[ln.index('REQUIRES:') + 9:].split(',')
requires.extend([s.strip() for s in items])
@@ -491,7 +487,7 @@ def parseIntegratedTestScript(test, normalize_slashes=False,
return (Test.UNSUPPORTED,
"Test requires the following features: %s" % msg)
- isXFail = isExpectedFail(xfails, xtargets, test.suite.config.target_triple)
+ isXFail = isExpectedFail(test, xfails)
return script,isXFail,tmpBase,execdir
def formatTestOutput(status, out, err, exitCode, failDueToStderr, script):
diff --git a/utils/lit/lit/TestingConfig.py b/utils/lit/lit/TestingConfig.py
index 223120c4fe22..a1f79a3bfc4e 100644
--- a/utils/lit/lit/TestingConfig.py
+++ b/utils/lit/lit/TestingConfig.py
@@ -16,6 +16,7 @@ class TestingConfig:
'PATH' : os.pathsep.join(litConfig.path +
[os.environ.get('PATH','')]),
'SYSTEMROOT' : os.environ.get('SYSTEMROOT',''),
+ 'TERM' : os.environ.get('TERM',''),
'LLVM_DISABLE_CRASH_REPORT' : '1',
}
@@ -28,6 +29,13 @@ class TestingConfig:
'TMP' : os.environ.get('TMP',''),
})
+ # Set the default available features based on the LitConfig.
+ available_features = []
+ if litConfig.useValgrind:
+ available_features.append('valgrind')
+ if litConfig.valgrindLeakCheck:
+ available_features.append('vg_leak')
+
config = TestingConfig(parent,
name = '<unnamed>',
suffixes = set(),
@@ -39,7 +47,7 @@ class TestingConfig:
test_exec_root = None,
test_source_root = None,
excludes = [],
- available_features = [])
+ available_features = available_features)
if os.path.exists(path):
# FIXME: Improve detection and error reporting of errors in the
diff --git a/utils/lit/lit/Util.py b/utils/lit/lit/Util.py
index 226e453f2859..f29480900ce7 100644
--- a/utils/lit/lit/Util.py
+++ b/utils/lit/lit/Util.py
@@ -56,7 +56,7 @@ def which(command, paths = None):
paths = os.environ.get('PATH','')
# Check for absolute match first.
- if os.path.exists(command):
+ if os.path.isfile(command):
return command
# Would be nice if Python had a lib function for this.
diff --git a/utils/lldbDataFormatters.py b/utils/lldbDataFormatters.py
index 18b407a02a63..1baf398aa533 100644
--- a/utils/lldbDataFormatters.py
+++ b/utils/lldbDataFormatters.py
@@ -2,6 +2,7 @@
Load into LLDB with:
script import lldbDataFormatters
type synthetic add -x "^llvm::SmallVectorImpl<.+>$" -l lldbDataFormatters.SmallVectorSynthProvider
+type synthetic add -x "^llvm::SmallVector<.+,.+>$" -l lldbDataFormatters.SmallVectorSynthProvider
"""
# Pretty printer for llvm::SmallVector/llvm::SmallVectorImpl
@@ -32,22 +33,15 @@ class SmallVectorSynthProvider:
return self.begin.CreateChildAtOffset('['+str(index)+']',
offset, self.data_type)
- def get_type_from_name(self):
- import re
- name = self.valobj.GetType().GetName()
- # This class works with both SmallVectors and SmallVectorImpls.
- res = re.match("^(llvm::)?SmallVectorImpl<(.+)>$", name)
- if res:
- return res.group(2)
- res = re.match("^(llvm::)?SmallVector<(.+), \d+>$", name)
- if res:
- return res.group(2)
- return None
-
def update(self):
self.begin = self.valobj.GetChildMemberWithName('BeginX')
self.end = self.valobj.GetChildMemberWithName('EndX')
- data_type = self.get_type_from_name()
- # FIXME: this sometimes returns an invalid type.
- self.data_type = self.valobj.GetTarget().FindFirstType(data_type)
+ the_type = self.valobj.GetType()
+ # If this is a reference type we have to dereference it to get to the
+ # template parameter.
+ if the_type.IsReferenceType():
+ the_type = the_type.GetDereferencedType()
+
+ self.data_type = the_type.GetTemplateArgumentType(0)
self.type_size = self.data_type.GetByteSize()
+ assert self.type_size != 0
diff --git a/utils/llvm-lit/llvm-lit.in b/utils/llvm-lit/llvm-lit.in
index 879d18bdc84b..768dc5103c8b 100644
--- a/utils/llvm-lit/llvm-lit.in
+++ b/utils/llvm-lit/llvm-lit.in
@@ -18,10 +18,15 @@ builtin_parameters = {
'llvm_site_config' : os.path.join(llvm_obj_root, 'test', 'lit.site.cfg')
}
-clang_site_config = os.path.join(llvm_obj_root, 'tools', 'clang', 'test',
- 'lit.site.cfg')
-if os.path.exists(clang_site_config):
- builtin_parameters['clang_site_config'] = clang_site_config
+clang_obj_root = os.path.join(llvm_obj_root, 'tools', 'clang')
+
+if os.path.exists(clang_obj_root):
+ builtin_parameters['clang_site_config'] = \
+ os.path.join(clang_obj_root, 'test', 'lit.site.cfg')
+ clang_tools_extra_obj_root = os.path.join(clang_obj_root, 'tools', 'extra')
+ if os.path.exists(clang_tools_extra_obj_root):
+ builtin_parameters['clang_tools_extra_site_config'] = \
+ os.path.join(clang_tools_extra_obj_root, 'test', 'lit.site.cfg')
if __name__=='__main__':
import lit
diff --git a/utils/llvm.grm b/utils/llvm.grm
index ad2799f2c596..322036b2c209 100644
--- a/utils/llvm.grm
+++ b/utils/llvm.grm
@@ -175,7 +175,6 @@ FuncAttr ::= noreturn
| returns_twice
| nonlazybind
| address_safety
- | ia_nsdialect
;
OptFuncAttrs ::= + _ | OptFuncAttrs FuncAttr ;
diff --git a/utils/unittest/googletest/gtest-port.cc b/utils/unittest/googletest/gtest-port.cc
index 07e5bb3c0d6c..3c32ff1ac1ec 100644
--- a/utils/unittest/googletest/gtest-port.cc
+++ b/utils/unittest/googletest/gtest-port.cc
@@ -505,6 +505,10 @@ class CapturedStream {
GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
<< temp_file_path;
filename_ = temp_file_path;
+#elif GTEST_OS_LINUX_ANDROID
+ char name_template[] = "/sdcard/captured_stderr.XXXXXX";
+ const int captured_fd = mkstemp(name_template);
+ filename_ = name_template;
# else
// There's no guarantee that a test has write access to the
// current directory, so we create the temporary file in the /tmp
diff --git a/utils/unittest/googletest/include/gtest/internal/gtest-port.h b/utils/unittest/googletest/include/gtest/internal/gtest-port.h
index 8ef5d7dd26a2..58f6cafa75fb 100644
--- a/utils/unittest/googletest/include/gtest/internal/gtest-port.h
+++ b/utils/unittest/googletest/include/gtest/internal/gtest-port.h
@@ -230,7 +230,7 @@
# define GTEST_OS_MAC 1
#elif defined __linux__
# define GTEST_OS_LINUX 1
-# ifdef ANDROID
+# if defined(ANDROID) || defined(__ANDROID__)
# define GTEST_OS_LINUX_ANDROID 1
# endif // ANDROID
#elif defined __MVS__
diff --git a/utils/vim/llvm.vim b/utils/vim/llvm.vim
index c83e8caf8b19..c16274ba2f41 100644
--- a/utils/vim/llvm.vim
+++ b/utils/vim/llvm.vim
@@ -1,7 +1,7 @@
" Vim syntax file
" Language: llvm
" Maintainer: The LLVM team, http://llvm.org/
-" Version: $Revision: 156080 $
+" Version: $Revision: 166305 $
if version < 600
syntax clear
@@ -79,7 +79,6 @@ syn match llvmSpecialComment /;\s*RUN:.*$/
syn match llvmSpecialComment /;\s*PR\d*\s*$/
syn match llvmSpecialComment /;\s*END\.\s*$/
syn match llvmSpecialComment /;\s*XFAIL:.*$/
-syn match llvmSpecialComment /;\s*XTARGET:.*$/
if version >= 508 || !exists("did_c_syn_inits")
if version < 508
diff --git a/utils/yaml2obj/yaml2obj.cpp b/utils/yaml2obj/yaml2obj.cpp
index c3b3e5499cde..4fc620f4ea9b 100644
--- a/utils/yaml2obj/yaml2obj.cpp
+++ b/utils/yaml2obj/yaml2obj.cpp
@@ -148,7 +148,7 @@ struct COFFParser {
return false;
}
if (KeyValue == "Machine") {
- uint16_t Machine;
+ uint16_t Machine = COFF::MT_Invalid;
if (!getAs(Value, Machine)) {
// It's not a raw number, try matching the string.
StringRef ValueValue = Value->getValue(Storage);